diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index fb7c4240..8c220ec7 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel):
     combineWARC: Optional[bool]
 
     useSitemap: Optional[bool] = False
+    failOnFailedSeed: Optional[bool] = False
 
     logging: Optional[str]
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md
index 21d568d4..5c2eb4d3 100644
--- a/docs/user-guide/workflow-setup.md
+++ b/docs/user-guide/workflow-setup.md
@@ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def
 ??? example "Crawling tags & search queries with URL List crawls"
     This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
 
+### Fail Crawl on Failed URL
+
+`URL List`{ .badge-blue }
+
+When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
+
 ### Crawl Start URL
 
 `Seeded Crawl`{ .badge-orange }
diff --git a/frontend/src/components/config-details.ts b/frontend/src/components/config-details.ts
index 7748b669..c9053064 100644
--- a/frontend/src/components/config-details.ts
+++ b/frontend/src/components/config-details.ts
@@ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement {
           msg("Include Any Linked Page"),
           Boolean(crawlConfig?.config.extraHops)
         )}
+        ${this.renderSetting(
+          msg("Fail Crawl on Failed URL"),
+          Boolean(crawlConfig?.config.failOnFailedSeed)
+        )}
       `;
     };
 
diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts
index 562a38b5..0125852c 100644
--- a/frontend/src/pages/org/workflow-editor.ts
+++ b/frontend/src/pages/org/workflow-editor.ts
@@ -82,6 +82,7 @@ type FormState = {
   urlList: string;
   includeLinkedPages: boolean;
   useSitemap: boolean;
+  failOnFailedSeed: boolean;
   customIncludeUrlList: string;
   crawlTimeoutMinutes: number;
   behaviorTimeoutSeconds: number | null;
@@ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({
  urlList: "",
  includeLinkedPages: false,
  useSitemap: true,
+  failOnFailedSeed: false,
  customIncludeUrlList: "",
  crawlTimeoutMinutes: 0,
  maxCrawlSizeGB: 0,
@@ -467,6 +469,8 @@
       if (this.initialWorkflow.jobType === "custom") {
         formState.scopeType = seedsConfig.scopeType || "page";
       }
+
+      formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
     }
 
     if (this.initialWorkflow.schedule) {
@@ -543,6 +547,8 @@
       includeLinkedPages:
         Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
       useSitemap: defaultFormState.useSitemap,
+      failOnFailedSeed:
+        seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
       pageLimit: this.initialWorkflow.config.limit ??
         defaultFormState.pageLimit,
       autoscrollBehavior: this.initialWorkflow.config.behaviors
@@ -994,6 +1000,18 @@ https://example.com/path`}
             URL.`),
             false
           )}
+          ${this.renderFormCol(html`<sl-checkbox
+            name="failOnFailedSeed"
+            ?checked=${this.formState.failOnFailedSeed}
+          >
+            ${msg("Fail Crawl on Failed URL")}
+          </sl-checkbox>`)}
+          ${this.renderHelpTextCol(
+            msg(
+              `If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`
+            ),
+            false
+          )}
           ${when(
             this.formState.includeLinkedPages || this.jobType === "custom",
             () => html`
@@ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
 
   private parseUrlListConfig(): Pick<
     NewCrawlConfigParams["config"],
-    "seeds" | "scopeType" | "extraHops" | "useSitemap"
+    "seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
   > {
     const config = {
       seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
@@ -2283,6 +2301,7 @@
       scopeType: "page" as FormState["scopeType"],
       extraHops: this.formState.includeLinkedPages ? 1 : 0,
       useSitemap: false,
+      failOnFailedSeed: this.formState.failOnFailedSeed,
     };
 
     return config;
@@ -2290,7 +2309,7 @@
 
   private parseSeededConfig(): Pick<
     NewCrawlConfigParams["config"],
-    "seeds" | "scopeType" | "useSitemap"
+    "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -2325,6 +2344,7 @@
       seeds: [primarySeed, ...additionalSeedUrlList],
       scopeType: this.formState.scopeType,
       useSitemap: this.formState.useSitemap,
+      failOnFailedSeed: false,
     };
     return config;
   }
diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts
index be36d27d..12dc54c5 100644
--- a/frontend/src/pages/org/workflows-new.ts
+++ b/frontend/src/pages/org/workflows-new.ts
@@ -23,6 +23,7 @@ const defaultValue = {
     pageLoadTimeout: null,
     pageExtraDelay: null,
     useSitemap: false,
+    failOnFailedSeed: false,
   },
   tags: [],
   crawlTimeout: null,
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index d72c443e..0a1b2821 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -29,6 +29,7 @@ export type SeedConfig = Pick<
   behaviors?: string | null;
   extraHops?: number | null;
   useSitemap: boolean;
+  failOnFailedSeed: boolean;
   depth?: number | null;
 };
 
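Reviewer note, not part of the diff: the sketch below traces how the new `failOnFailedSeed` flag flows from the form state into the workflow payload, mirroring `parseUrlListConfig()` and `parseSeededConfig()` above. The type and function names here are illustrative stand-ins, not the real exports from `frontend/src/types/crawler.ts`.

```ts
// Trimmed stand-in for the SeedConfig slice this change touches.
type SeedConfigSketch = {
  seeds: { url: string; scopeType: string }[];
  scopeType: string;
  extraHops?: number | null;
  useSitemap: boolean;
  failOnFailedSeed: boolean;
};

// Mirrors parseUrlListConfig(): URL List workflows pass the checkbox value through.
function urlListConfig(
  urls: string[],
  failOnFailedSeed: boolean
): SeedConfigSketch {
  return {
    seeds: urls.map((url) => ({ url, scopeType: "page" })),
    scopeType: "page",
    extraHops: 0,
    useSitemap: false,
    failOnFailedSeed, // from formState.failOnFailedSeed
  };
}

// Mirrors parseSeededConfig(): seeded crawls always send false, since the
// option is only rendered in the URL List section of the editor.
function seededConfig(
  primarySeedUrl: string,
  scopeType: string
): SeedConfigSketch {
  return {
    seeds: [{ url: primarySeedUrl, scopeType }],
    scopeType,
    useSitemap: true,
    failOnFailedSeed: false,
  };
}

console.log(urlListConfig(["https://example.com/search?q=tag"], true));
console.log(seededConfig("https://example.com/", "prefix"));
```

Sending an explicit `false` for seeded crawls, rather than omitting the key, keeps the saved config unambiguous and lines up with the backend default `failOnFailedSeed: Optional[bool] = False`.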
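A second sketch, also an assumption rather than part of the diff, covers the read path: workflows saved before this change carry no stored `failOnFailedSeed`, which is why the editor resolves it with `??` when building its initial form state.

```ts
// Hypothetical helper illustrating the fallback in getInitialFormState():
// stored configs that predate this change lack the field entirely.
type StoredSeedConfig = { failOnFailedSeed?: boolean };

// Matches the new entry in getDefaultFormState().
const DEFAULT_FAIL_ON_FAILED_SEED = false;

function resolveFailOnFailedSeed(stored: StoredSeedConfig): boolean {
  return stored.failOnFailedSeed ?? DEFAULT_FAIL_ON_FAILED_SEED;
}

console.log(resolveFailOnFailedSeed({})); // false: legacy workflow
console.log(resolveFailOnFailedSeed({ failOnFailedSeed: true })); // true
```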