Add --failOnFailedSeed checkbox to URL list workflows (#1236)
- If set, and any of the seeds fails, the entire crawl is marked as a failure. - Add checkbox which sets --failOnFailedSeed for URL list workflows - Add 'Fail Crawl On Failed URL' to crawl workflow setup docs
This commit is contained in:
		
							parent
							
								
									4f36a94bc6
								
							
						
					
					
						commit
						b1ead614ee
					
				| @ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel): | ||||
|     combineWARC: Optional[bool] | ||||
| 
 | ||||
|     useSitemap: Optional[bool] = False | ||||
|     failOnFailedSeed: Optional[bool] = False | ||||
| 
 | ||||
|     logging: Optional[str] | ||||
|     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" | ||||
|  | ||||
| @ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def | ||||
| ??? example "Crawling tags & search queries with URL List crawls" | ||||
|     This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page. | ||||
| 
 | ||||
| ### Fail Crawl on Failed URL | ||||
| 
 | ||||
| `URL List`{ .badge-blue } | ||||
| 
 | ||||
| When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed". | ||||
| 
 | ||||
| ### Crawl Start URL | ||||
| 
 | ||||
| `Seeded Crawl`{ .badge-orange } | ||||
|  | ||||
| @ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement { | ||||
|         msg("Include Any Linked Page"), | ||||
|         Boolean(crawlConfig?.config.extraHops) | ||||
|       )} | ||||
|       ${this.renderSetting( | ||||
|         msg("Fail Crawl On Failed URL"), | ||||
|         Boolean(crawlConfig?.config.failOnFailedSeed) | ||||
|       )} | ||||
|     `;
 | ||||
|   }; | ||||
| 
 | ||||
|  | ||||
| @ -82,6 +82,7 @@ type FormState = { | ||||
|   urlList: string; | ||||
|   includeLinkedPages: boolean; | ||||
|   useSitemap: boolean; | ||||
|   failOnFailedSeed: boolean; | ||||
|   customIncludeUrlList: string; | ||||
|   crawlTimeoutMinutes: number; | ||||
|   behaviorTimeoutSeconds: number | null; | ||||
| @ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({ | ||||
|   urlList: "", | ||||
|   includeLinkedPages: false, | ||||
|   useSitemap: true, | ||||
|   failOnFailedSeed: false, | ||||
|   customIncludeUrlList: "", | ||||
|   crawlTimeoutMinutes: 0, | ||||
|   maxCrawlSizeGB: 0, | ||||
| @ -467,6 +469,8 @@ export class CrawlConfigEditor extends LiteElement { | ||||
|       if (this.initialWorkflow.jobType === "custom") { | ||||
|         formState.scopeType = seedsConfig.scopeType || "page"; | ||||
|       } | ||||
| 
 | ||||
|       formState.failOnFailedSeed = seedsConfig.failOnFailedSeed; | ||||
|     } | ||||
| 
 | ||||
|     if (this.initialWorkflow.schedule) { | ||||
| @ -543,6 +547,8 @@ export class CrawlConfigEditor extends LiteElement { | ||||
|       includeLinkedPages: | ||||
|         Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true, | ||||
|       useSitemap: defaultFormState.useSitemap, | ||||
|       failOnFailedSeed: | ||||
|         seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed, | ||||
|       pageLimit: | ||||
|         this.initialWorkflow.config.limit ?? defaultFormState.pageLimit, | ||||
|       autoscrollBehavior: this.initialWorkflow.config.behaviors | ||||
| @ -994,6 +1000,18 @@ https://example.com/path`} | ||||
|         URL.`),
 | ||||
|         false | ||||
|       )} | ||||
|       ${this.renderFormCol(html`<sl-checkbox
 | ||||
|         name="failOnFailedSeed" | ||||
|         ?checked=${this.formState.failOnFailedSeed} | ||||
|       > | ||||
|         ${msg("Fail Crawl on Failed URL")} | ||||
|       </sl-checkbox>`)} | ||||
|       ${this.renderHelpTextCol( | ||||
|         msg( | ||||
|           `If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.` | ||||
|         ), | ||||
|         false | ||||
|       )} | ||||
|       ${when( | ||||
|         this.formState.includeLinkedPages || this.jobType === "custom", | ||||
|         () => html` | ||||
| @ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`} | ||||
| 
 | ||||
|   private parseUrlListConfig(): Pick< | ||||
|     NewCrawlConfigParams["config"], | ||||
|     "seeds" | "scopeType" | "extraHops" | "useSitemap" | ||||
|     "seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed" | ||||
|   > { | ||||
|     const config = { | ||||
|       seeds: urlListToArray(this.formState.urlList).map((seedUrl) => { | ||||
| @ -2283,6 +2301,7 @@ https://archiveweb.page/images/${"logo.svg"}`} | ||||
|       scopeType: "page" as FormState["scopeType"], | ||||
|       extraHops: this.formState.includeLinkedPages ? 1 : 0, | ||||
|       useSitemap: false, | ||||
|       failOnFailedSeed: this.formState.failOnFailedSeed, | ||||
|     }; | ||||
| 
 | ||||
|     return config; | ||||
| @ -2290,7 +2309,7 @@ https://archiveweb.page/images/${"logo.svg"}`} | ||||
| 
 | ||||
|   private parseSeededConfig(): Pick< | ||||
|     NewCrawlConfigParams["config"], | ||||
|     "seeds" | "scopeType" | "useSitemap" | ||||
|     "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed" | ||||
|   > { | ||||
|     const primarySeedUrl = this.formState.primarySeedUrl; | ||||
|     const includeUrlList = this.formState.customIncludeUrlList | ||||
| @ -2325,6 +2344,7 @@ https://archiveweb.page/images/${"logo.svg"}`} | ||||
|       seeds: [primarySeed, ...additionalSeedUrlList], | ||||
|       scopeType: this.formState.scopeType, | ||||
|       useSitemap: this.formState.useSitemap, | ||||
|       failOnFailedSeed: false, | ||||
|     }; | ||||
|     return config; | ||||
|   } | ||||
|  | ||||
| @ -23,6 +23,7 @@ const defaultValue = { | ||||
|     pageLoadTimeout: null, | ||||
|     pageExtraDelay: null, | ||||
|     useSitemap: false, | ||||
|     failOnFailedSeed: false, | ||||
|   }, | ||||
|   tags: [], | ||||
|   crawlTimeout: null, | ||||
|  | ||||
| @ -29,6 +29,7 @@ export type SeedConfig = Pick< | ||||
|   behaviors?: string | null; | ||||
|   extraHops?: number | null; | ||||
|   useSitemap: boolean; | ||||
|   failOnFailedSeed: boolean; | ||||
|   depth?: number | null; | ||||
| }; | ||||
| 
 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user