Add crawler Use Sitemap option to Browsertrix Cloud (#978)
* Add user-guide docs for Use Sitemap option

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
parent db851b8360
commit d5c3a8519f
@@ -90,6 +90,8 @@ class RawCrawlConfig(BaseModel):
     generateWACZ: Optional[bool]
     combineWARC: Optional[bool]
 
+    useSitemap: Optional[bool] = False
+
     logging: Optional[str]
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
 
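For context, the new `useSitemap` field sits alongside the other crawler options on `RawCrawlConfig`, so it travels with the rest of a crawl workflow's config. Below is a minimal sketch of what a config object carrying the flag might look like on the client side, using only field names visible in this commit (`seeds`, `scopeType`, `extraHops`, `useSitemap`, `behaviors`, `generateWACZ`); the seed object shape and the concrete values are illustrative assumptions, not taken from the change itself.

```ts
// Sketch only: a crawl workflow config carrying the new useSitemap flag.
// Field names appear in this diff; the seed shape and values are assumptions.
const exampleSeededConfig = {
  seeds: [{ url: "https://example.com/" }],
  scopeType: "prefix",
  extraHops: 0,
  useSitemap: true, // check /sitemap.xml for additional pages to crawl
  behaviors: "autoscroll,autoplay,autofetch,siteSpecific",
  generateWACZ: true,
};
```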
@@ -74,10 +74,18 @@ This can be useful for crawling websites that span multiple domains such as `exa
 
 `Seeded Crawl`{ .badge-orange }
 
-When Enabled, the crawler will visit all the links it finds within each page, regardless of the _Start URL Scope_ setting.
+When enabled, the crawler will visit all the links it finds within each page, regardless of the _Start URL Scope_ setting.
 
 This can be useful for capturing links on a page that lead outside the website that is being crawled but should still be included in the archive for context.
 
+### Check For Sitemap
+
+`Seeded Crawl`{ .badge-orange }
+
+When enabled, the crawler will check for a sitemap at /sitemap.xml and use it to discover pages to crawl if found. It will not crawl pages found in the sitemap that do not meet the crawl's scope settings or limits.
+
+This can be useful for discovering and capturing pages on a website that aren't linked to from the seed and which might not otherwise be captured.
+
 ### Exclusions
 
 `URL List`{ .badge-blue } `Seeded Crawl`{ .badge-orange }
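To make the documented behavior concrete, here is a rough TypeScript sketch of sitemap-based page discovery as the user-guide text above describes it: fetch `/sitemap.xml` relative to the seed, collect the listed URLs, and leave scope and limit filtering to the normal crawl settings. This is an illustration only, not the crawler's actual implementation; the function name and parsing approach are assumptions.

```ts
// Illustration of the documented behavior, not Browsertrix's implementation.
async function discoverFromSitemap(seedUrl: string): Promise<string[]> {
  const sitemapUrl = new URL("/sitemap.xml", seedUrl).href;
  const resp = await fetch(sitemapUrl);
  if (!resp.ok) return []; // no sitemap found: nothing extra to queue
  const xml = await resp.text();
  // Collect <loc> entries; URLs outside the crawl's scope settings or
  // limits would still be excluded before crawling.
  return Array.from(xml.matchAll(/<loc>\s*([^<]+?)\s*<\/loc>/g), (m) => m[1]);
}
```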
@@ -353,6 +353,10 @@ export class ConfigDetails extends LiteElement {
           msg("Include Any Linked Page (“one hop out”)"),
           Boolean(primarySeedConfig.extraHops ?? seedsConfig.extraHops)
         )}
+        ${this.renderSetting(
+          msg("Check For Sitemap"),
+          Boolean(seedsConfig.useSitemap)
+        )}
         ${this.renderSetting(
           msg("List of Additional URLs"),
           additionalUrlList?.length
@@ -76,6 +76,7 @@ type FormState = {
   primarySeedUrl: string;
   urlList: string;
   includeLinkedPages: boolean;
+  useSitemap: boolean;
   customIncludeUrlList: string;
   crawlTimeoutMinutes: number | null;
   behaviorTimeoutSeconds: number | null;
@@ -149,6 +150,7 @@ const getDefaultFormState = (): FormState => ({
   primarySeedUrl: "",
   urlList: "",
   includeLinkedPages: false,
+  useSitemap: true,
   customIncludeUrlList: "",
   crawlTimeoutMinutes: null,
   behaviorTimeoutSeconds: null,
@@ -442,6 +444,7 @@ export class CrawlConfigEditor extends LiteElement {
       if (additionalSeeds.length) {
         formState.urlList = mapSeedToUrl(additionalSeeds).join("\n");
       }
+      formState.useSitemap = seedsConfig.useSitemap;
     } else {
       // Treat "custom" like URL list
       formState.urlList = mapSeedToUrl(seeds).join("\n");
@@ -518,6 +521,7 @@ export class CrawlConfigEditor extends LiteElement {
       exclusions: seedsConfig.exclude,
       includeLinkedPages:
         Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
+      useSitemap: defaultFormState.useSitemap,
       pageLimit:
         this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
       autoscrollBehavior: this.initialWorkflow.config.behaviors
@@ -1115,6 +1119,18 @@ https://example.net`}
             Start URL Scope.`),
           false
         )}
+        ${this.renderFormCol(html`
+          <sl-checkbox
+            name="useSitemap"
+            ?checked=${this.formState.useSitemap}
+          >
+            ${msg("Check For Sitemap")}
+          </sl-checkbox>
+        `)}
+        ${this.renderHelpTextCol(
+          msg(`If checked, the crawler will check for a sitemap at /sitemap.xml and use it to discover pages to crawl if present.`),
+          false
+        )}
         <div class="col-span-5">
           <btrix-details ?open=${exclusions.length > 0}>
             <span slot="title"
@@ -2118,7 +2134,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
 
   private parseUrlListConfig(): Pick<
     NewCrawlConfigParams["config"],
-    "seeds" | "scopeType" | "extraHops"
+    "seeds" | "scopeType" | "extraHops" | "useSitemap"
   > {
     const config = {
       seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@@ -2127,6 +2143,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       }),
       scopeType: "page" as FormState["scopeType"],
       extraHops: this.formState.includeLinkedPages ? 1 : 0,
+      useSitemap: false,
     };
 
     return config;
@@ -2134,7 +2151,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
 
   private parseSeededConfig(): Pick<
     NewCrawlConfigParams["config"],
-    "seeds" | "scopeType"
+    "seeds" | "scopeType" | "useSitemap"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -2167,6 +2184,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     const config = {
       seeds: [primarySeed, ...additionalSeedUrlList],
       scopeType: this.formState.scopeType,
+      useSitemap: this.formState.useSitemap,
     };
     return config;
   }
 
@@ -23,6 +23,7 @@ const defaultValue = {
     behaviorTimeout: null,
     pageLoadTimeout: null,
     pageExtraDelay: null,
+    useSitemap: false,
   },
   tags: [],
   crawlTimeout: null,
||||
@ -29,6 +29,7 @@ export type SeedConfig = Pick<
|
||||
pageExtraDelay: number | null;
|
||||
behaviors?: string | null;
|
||||
extraHops?: number | null;
|
||||
useSitemap: boolean;
|
||||
depth?: number | null;
|
||||
};
|
||||
|
||||
|
||||