Add --failOnFailedSeed checkbox to URL list workflows (#1236)

- If set, and any of the seeds fails, the entire crawl is marked as a failure.
- Add checkbox which sets --failOnFailedSeed for URL list workflows
- Add 'Fail Crawl On Failed URL' to crawl workflow setup docs
This commit is contained in:
Tessa Walsh 2023-10-03 21:46:09 -04:00 committed by GitHub
parent 4f36a94bc6
commit b1ead614ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 35 additions and 2 deletions

View File

@ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel):
combineWARC: Optional[bool] combineWARC: Optional[bool]
useSitemap: Optional[bool] = False useSitemap: Optional[bool] = False
failOnFailedSeed: Optional[bool] = False
logging: Optional[str] logging: Optional[str]
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"

View File

@ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def
??? example "Crawling tags & search queries with URL List crawls" ??? example "Crawling tags & search queries with URL List crawls"
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page. This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
### Fail Crawl on Failed URL
`URL List`{ .badge-blue }
When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
### Crawl Start URL ### Crawl Start URL
`Seeded Crawl`{ .badge-orange } `Seeded Crawl`{ .badge-orange }

View File

@ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement {
msg("Include Any Linked Page"), msg("Include Any Linked Page"),
Boolean(crawlConfig?.config.extraHops) Boolean(crawlConfig?.config.extraHops)
)} )}
${this.renderSetting(
msg("Fail Crawl On Failed URL"),
Boolean(crawlConfig?.config.failOnFailedSeed)
)}
`; `;
}; };

View File

@ -82,6 +82,7 @@ type FormState = {
urlList: string; urlList: string;
includeLinkedPages: boolean; includeLinkedPages: boolean;
useSitemap: boolean; useSitemap: boolean;
failOnFailedSeed: boolean;
customIncludeUrlList: string; customIncludeUrlList: string;
crawlTimeoutMinutes: number; crawlTimeoutMinutes: number;
behaviorTimeoutSeconds: number | null; behaviorTimeoutSeconds: number | null;
@ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({
urlList: "", urlList: "",
includeLinkedPages: false, includeLinkedPages: false,
useSitemap: true, useSitemap: true,
failOnFailedSeed: false,
customIncludeUrlList: "", customIncludeUrlList: "",
crawlTimeoutMinutes: 0, crawlTimeoutMinutes: 0,
maxCrawlSizeGB: 0, maxCrawlSizeGB: 0,
@ -467,6 +469,8 @@ export class CrawlConfigEditor extends LiteElement {
if (this.initialWorkflow.jobType === "custom") { if (this.initialWorkflow.jobType === "custom") {
formState.scopeType = seedsConfig.scopeType || "page"; formState.scopeType = seedsConfig.scopeType || "page";
} }
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
} }
if (this.initialWorkflow.schedule) { if (this.initialWorkflow.schedule) {
@ -543,6 +547,8 @@ export class CrawlConfigEditor extends LiteElement {
includeLinkedPages: includeLinkedPages:
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true, Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
useSitemap: defaultFormState.useSitemap, useSitemap: defaultFormState.useSitemap,
failOnFailedSeed:
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
pageLimit: pageLimit:
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit, this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: this.initialWorkflow.config.behaviors autoscrollBehavior: this.initialWorkflow.config.behaviors
@ -994,6 +1000,18 @@ https://example.com/path`}
URL.`), URL.`),
false false
)} )}
${this.renderFormCol(html`<sl-checkbox
name="failOnFailedSeed"
?checked=${this.formState.failOnFailedSeed}
>
${msg("Fail Crawl on Failed URL")}
</sl-checkbox>`)}
${this.renderHelpTextCol(
msg(
`If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`
),
false
)}
${when( ${when(
this.formState.includeLinkedPages || this.jobType === "custom", this.formState.includeLinkedPages || this.jobType === "custom",
() => html` () => html`
@ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseUrlListConfig(): Pick< private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"], NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops" | "useSitemap" "seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
> { > {
const config = { const config = {
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => { seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@ -2283,6 +2301,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
scopeType: "page" as FormState["scopeType"], scopeType: "page" as FormState["scopeType"],
extraHops: this.formState.includeLinkedPages ? 1 : 0, extraHops: this.formState.includeLinkedPages ? 1 : 0,
useSitemap: false, useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed,
}; };
return config; return config;
@ -2290,7 +2309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseSeededConfig(): Pick< private parseSeededConfig(): Pick<
NewCrawlConfigParams["config"], NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "useSitemap" "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
> { > {
const primarySeedUrl = this.formState.primarySeedUrl; const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList const includeUrlList = this.formState.customIncludeUrlList
@ -2325,6 +2344,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
seeds: [primarySeed, ...additionalSeedUrlList], seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: this.formState.scopeType, scopeType: this.formState.scopeType,
useSitemap: this.formState.useSitemap, useSitemap: this.formState.useSitemap,
failOnFailedSeed: false,
}; };
return config; return config;
} }

View File

@ -23,6 +23,7 @@ const defaultValue = {
pageLoadTimeout: null, pageLoadTimeout: null,
pageExtraDelay: null, pageExtraDelay: null,
useSitemap: false, useSitemap: false,
failOnFailedSeed: false,
}, },
tags: [], tags: [],
crawlTimeout: null, crawlTimeout: null,

View File

@ -29,6 +29,7 @@ export type SeedConfig = Pick<
behaviors?: string | null; behaviors?: string | null;
extraHops?: number | null; extraHops?: number | null;
useSitemap: boolean; useSitemap: boolean;
failOnFailedSeed: boolean;
depth?: number | null; depth?: number | null;
}; };