Add --failOnFailedSeed checkbox to URL list workflows (#1236)

- If set, and any of the seeds fails, the entire crawl is marked as a failure.
- Add checkbox which sets --failOnFailedSeed on URL list workflows
- Add 'Fail Crawl On Failed URL' to crawl workflow setup docs
This commit is contained in:
Tessa Walsh 2023-10-03 21:46:09 -04:00 committed by GitHub
parent 4f36a94bc6
commit b1ead614ee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 35 additions and 2 deletions

View File

@ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel):
combineWARC: Optional[bool]
useSitemap: Optional[bool] = False
failOnFailedSeed: Optional[bool] = False
logging: Optional[str]
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"

View File

@ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def
??? example "Crawling tags & search queries with URL List crawls"
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
### Fail Crawl on Failed URL
`URL List`{ .badge-blue }
When enabled, the crawler will fail the entire crawl if any of the provided URLs is invalid or cannot be crawled successfully. The resulting archived item will have a status of "Failed".
### Crawl Start URL
`Seeded Crawl`{ .badge-orange }

View File

@ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement {
msg("Include Any Linked Page"),
Boolean(crawlConfig?.config.extraHops)
)}
${this.renderSetting(
msg("Fail Crawl On Failed URL"),
Boolean(crawlConfig?.config.failOnFailedSeed)
)}
`;
};

View File

@ -82,6 +82,7 @@ type FormState = {
urlList: string;
includeLinkedPages: boolean;
useSitemap: boolean;
failOnFailedSeed: boolean;
customIncludeUrlList: string;
crawlTimeoutMinutes: number;
behaviorTimeoutSeconds: number | null;
@ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({
urlList: "",
includeLinkedPages: false,
useSitemap: true,
failOnFailedSeed: false,
customIncludeUrlList: "",
crawlTimeoutMinutes: 0,
maxCrawlSizeGB: 0,
@ -467,6 +469,8 @@ export class CrawlConfigEditor extends LiteElement {
if (this.initialWorkflow.jobType === "custom") {
formState.scopeType = seedsConfig.scopeType || "page";
}
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
}
if (this.initialWorkflow.schedule) {
@ -543,6 +547,8 @@ export class CrawlConfigEditor extends LiteElement {
includeLinkedPages:
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
useSitemap: defaultFormState.useSitemap,
failOnFailedSeed:
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
pageLimit:
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: this.initialWorkflow.config.behaviors
@ -994,6 +1000,18 @@ https://example.com/path`}
URL.`),
false
)}
${this.renderFormCol(html`<sl-checkbox
name="failOnFailedSeed"
?checked=${this.formState.failOnFailedSeed}
>
${msg("Fail Crawl on Failed URL")}
</sl-checkbox>`)}
${this.renderHelpTextCol(
msg(
`If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`
),
false
)}
${when(
this.formState.includeLinkedPages || this.jobType === "custom",
() => html`
@ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops" | "useSitemap"
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
> {
const config = {
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@ -2283,6 +2301,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
scopeType: "page" as FormState["scopeType"],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed,
};
return config;
@ -2290,7 +2309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseSeededConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "useSitemap"
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@ -2325,6 +2344,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: this.formState.scopeType,
useSitemap: this.formState.useSitemap,
failOnFailedSeed: false,
};
return config;
}

View File

@ -23,6 +23,7 @@ const defaultValue = {
pageLoadTimeout: null,
pageExtraDelay: null,
useSitemap: false,
failOnFailedSeed: false,
},
tags: [],
crawlTimeout: null,

View File

@ -29,6 +29,7 @@ export type SeedConfig = Pick<
behaviors?: string | null;
extraHops?: number | null;
useSitemap: boolean;
failOnFailedSeed: boolean;
depth?: number | null;
};