Add crawler Use Sitemap option to Browsertrix Cloud (#978)

* Add user-guide docs for Use Sitemap option
---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
This commit is contained in:
Tessa Walsh 2023-07-19 13:57:52 -04:00 committed by GitHub
parent db851b8360
commit d5c3a8519f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 37 additions and 3 deletions

View File

@ -90,6 +90,8 @@ class RawCrawlConfig(BaseModel):
generateWACZ: Optional[bool]
combineWARC: Optional[bool]
useSitemap: Optional[bool] = False
logging: Optional[str]
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"

View File

@ -74,10 +74,18 @@ This can be useful for crawling websites that span multiple domains such as `exa
`Seeded Crawl`{ .badge-orange }
When Enabled, the crawler will visit all the links it finds within each page, regardless of the _Start URL Scope_ setting.
When enabled, the crawler will visit all the links it finds within each page, regardless of the _Start URL Scope_ setting.
This can be useful for capturing links on a page that lead outside the website that is being crawled but should still be included in the archive for context.
### Check For Sitemap
`Seeded Crawl`{ .badge-orange }
When enabled, the crawler will check for a sitemap at `/sitemap.xml` and, if one is found, use it to discover pages to crawl. It will not crawl pages found in the sitemap that do not meet the crawl's scope settings or limits.
This can be useful for discovering and capturing pages on a website that are not linked from the seed and might not otherwise be captured.
### Exclusions
`URL List`{ .badge-blue } `Seeded Crawl`{ .badge-orange }

View File

@ -353,6 +353,10 @@ export class ConfigDetails extends LiteElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(primarySeedConfig.extraHops ?? seedsConfig.extraHops)
)}
${this.renderSetting(
msg("Check For Sitemap"),
Boolean(seedsConfig.useSitemap)
)}
${this.renderSetting(
msg("List of Additional URLs"),
additionalUrlList?.length

View File

@ -76,6 +76,7 @@ type FormState = {
primarySeedUrl: string;
urlList: string;
includeLinkedPages: boolean;
useSitemap: boolean;
customIncludeUrlList: string;
crawlTimeoutMinutes: number | null;
behaviorTimeoutSeconds: number | null;
@ -149,6 +150,7 @@ const getDefaultFormState = (): FormState => ({
primarySeedUrl: "",
urlList: "",
includeLinkedPages: false,
useSitemap: true,
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
behaviorTimeoutSeconds: null,
@ -442,6 +444,7 @@ export class CrawlConfigEditor extends LiteElement {
if (additionalSeeds.length) {
formState.urlList = mapSeedToUrl(additionalSeeds).join("\n");
}
formState.useSitemap = seedsConfig.useSitemap;
} else {
// Treat "custom" like URL list
formState.urlList = mapSeedToUrl(seeds).join("\n");
@ -518,6 +521,7 @@ export class CrawlConfigEditor extends LiteElement {
exclusions: seedsConfig.exclude,
includeLinkedPages:
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
useSitemap: defaultFormState.useSitemap,
pageLimit:
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: this.initialWorkflow.config.behaviors
@ -1115,6 +1119,18 @@ https://example.net`}
Start URL Scope.`),
false
)}
${this.renderFormCol(html`
<sl-checkbox
name="useSitemap"
?checked=${this.formState.useSitemap}
>
${msg("Check For Sitemap")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
msg(`If checked, the crawler will check for a sitemap at /sitemap.xml and use it to discover pages to crawl if present.`),
false
)}
<div class="col-span-5">
<btrix-details ?open=${exclusions.length > 0}>
<span slot="title"
@ -2118,7 +2134,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops"
"seeds" | "scopeType" | "extraHops" | "useSitemap"
> {
const config = {
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@ -2127,6 +2143,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
}),
scopeType: "page" as FormState["scopeType"],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
useSitemap: false,
};
return config;
@ -2134,7 +2151,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseSeededConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType"
"seeds" | "scopeType" | "useSitemap"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@ -2167,6 +2184,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
const config = {
seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: this.formState.scopeType,
useSitemap: this.formState.useSitemap,
};
return config;
}

View File

@ -23,6 +23,7 @@ const defaultValue = {
behaviorTimeout: null,
pageLoadTimeout: null,
pageExtraDelay: null,
useSitemap: false,
},
tags: [],
crawlTimeout: null,

View File

@ -29,6 +29,7 @@ export type SeedConfig = Pick<
pageExtraDelay: number | null;
behaviors?: string | null;
extraHops?: number | null;
useSitemap: boolean;
depth?: number | null;
};