Add --failOnFailedSeed checkbox to URL list workflows (#1236)
- If set, and any of the seeds fails, the entire crawl is marked as a failure. - Add checkbox which passes --failOnFailedSeed to the crawler for URL list workflows - Add 'Fail Crawl On Failed URL' to crawl workflow setup docs
This commit is contained in:
parent
4f36a94bc6
commit
b1ead614ee
@ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel):
|
||||
combineWARC: Optional[bool]
|
||||
|
||||
useSitemap: Optional[bool] = False
|
||||
failOnFailedSeed: Optional[bool] = False
|
||||
|
||||
logging: Optional[str]
|
||||
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
|
||||
|
@ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def
|
||||
??? example "Crawling tags & search queries with URL List crawls"
|
||||
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g. `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
|
||||
|
||||
### Fail Crawl on Failed URL
|
||||
|
||||
`URL List`{ .badge-blue }
|
||||
|
||||
When enabled, the crawler will fail the entire crawl if any of the provided URLs is invalid or cannot be crawled successfully. The resulting archived item will have a status of "Failed".
|
||||
|
||||
### Crawl Start URL
|
||||
|
||||
`Seeded Crawl`{ .badge-orange }
|
||||
|
@ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement {
|
||||
msg("Include Any Linked Page"),
|
||||
Boolean(crawlConfig?.config.extraHops)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Fail Crawl On Failed URL"),
|
||||
Boolean(crawlConfig?.config.failOnFailedSeed)
|
||||
)}
|
||||
`;
|
||||
};
|
||||
|
||||
|
@ -82,6 +82,7 @@ type FormState = {
|
||||
urlList: string;
|
||||
includeLinkedPages: boolean;
|
||||
useSitemap: boolean;
|
||||
failOnFailedSeed: boolean;
|
||||
customIncludeUrlList: string;
|
||||
crawlTimeoutMinutes: number;
|
||||
behaviorTimeoutSeconds: number | null;
|
||||
@ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({
|
||||
urlList: "",
|
||||
includeLinkedPages: false,
|
||||
useSitemap: true,
|
||||
failOnFailedSeed: false,
|
||||
customIncludeUrlList: "",
|
||||
crawlTimeoutMinutes: 0,
|
||||
maxCrawlSizeGB: 0,
|
||||
@ -467,6 +469,8 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
if (this.initialWorkflow.jobType === "custom") {
|
||||
formState.scopeType = seedsConfig.scopeType || "page";
|
||||
}
|
||||
|
||||
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
|
||||
}
|
||||
|
||||
if (this.initialWorkflow.schedule) {
|
||||
@ -543,6 +547,8 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
includeLinkedPages:
|
||||
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
|
||||
useSitemap: defaultFormState.useSitemap,
|
||||
failOnFailedSeed:
|
||||
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
|
||||
pageLimit:
|
||||
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
|
||||
autoscrollBehavior: this.initialWorkflow.config.behaviors
|
||||
@ -994,6 +1000,18 @@ https://example.com/path`}
|
||||
URL.`),
|
||||
false
|
||||
)}
|
||||
${this.renderFormCol(html`<sl-checkbox
|
||||
name="failOnFailedSeed"
|
||||
?checked=${this.formState.failOnFailedSeed}
|
||||
>
|
||||
${msg("Fail Crawl on Failed URL")}
|
||||
</sl-checkbox>`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`
|
||||
),
|
||||
false
|
||||
)}
|
||||
${when(
|
||||
this.formState.includeLinkedPages || this.jobType === "custom",
|
||||
() => html`
|
||||
@ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
|
||||
private parseUrlListConfig(): Pick<
|
||||
NewCrawlConfigParams["config"],
|
||||
"seeds" | "scopeType" | "extraHops" | "useSitemap"
|
||||
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
|
||||
> {
|
||||
const config = {
|
||||
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
||||
@ -2283,6 +2301,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
scopeType: "page" as FormState["scopeType"],
|
||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||
useSitemap: false,
|
||||
failOnFailedSeed: this.formState.failOnFailedSeed,
|
||||
};
|
||||
|
||||
return config;
|
||||
@ -2290,7 +2309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
|
||||
private parseSeededConfig(): Pick<
|
||||
NewCrawlConfigParams["config"],
|
||||
"seeds" | "scopeType" | "useSitemap"
|
||||
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
|
||||
> {
|
||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||
const includeUrlList = this.formState.customIncludeUrlList
|
||||
@ -2325,6 +2344,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
seeds: [primarySeed, ...additionalSeedUrlList],
|
||||
scopeType: this.formState.scopeType,
|
||||
useSitemap: this.formState.useSitemap,
|
||||
failOnFailedSeed: false,
|
||||
};
|
||||
return config;
|
||||
}
|
||||
|
@ -23,6 +23,7 @@ const defaultValue = {
|
||||
pageLoadTimeout: null,
|
||||
pageExtraDelay: null,
|
||||
useSitemap: false,
|
||||
failOnFailedSeed: false,
|
||||
},
|
||||
tags: [],
|
||||
crawlTimeout: null,
|
||||
|
@ -29,6 +29,7 @@ export type SeedConfig = Pick<
|
||||
behaviors?: string | null;
|
||||
extraHops?: number | null;
|
||||
useSitemap: boolean;
|
||||
failOnFailedSeed: boolean;
|
||||
depth?: number | null;
|
||||
};
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user