Add --failOnFailedSeed checkbox to URL list workflows (#1236)
- If set, and any of the seeds fails, the entire crawl is marked as a failure.
- Add a checkbox that enables --failOnFailedSeed for URL list workflows
- Add 'Fail Crawl On Failed URL' to crawl workflow setup docs
This commit is contained in:
parent
4f36a94bc6
commit
b1ead614ee
@ -103,6 +103,7 @@ class RawCrawlConfig(BaseModel):
|
|||||||
combineWARC: Optional[bool]
|
combineWARC: Optional[bool]
|
||||||
|
|
||||||
useSitemap: Optional[bool] = False
|
useSitemap: Optional[bool] = False
|
||||||
|
failOnFailedSeed: Optional[bool] = False
|
||||||
|
|
||||||
logging: Optional[str]
|
logging: Optional[str]
|
||||||
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
|
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
|
||||||
|
|||||||
@ -29,6 +29,12 @@ When enabled, the crawler will visit all the links it finds within each page def
|
|||||||
??? example "Crawling tags & search queries with URL List crawls"
|
??? example "Crawling tags & search queries with URL List crawls"
|
||||||
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
|
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _List of URLs_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
|
||||||
|
|
||||||
|
### Fail Crawl on Failed URL
|
||||||
|
|
||||||
|
`URL List`{ .badge-blue }
|
||||||
|
|
||||||
|
When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or cannot be crawled successfully. The resulting archived item will have a status of "Failed".
|
||||||
|
|
||||||
### Crawl Start URL
|
### Crawl Start URL
|
||||||
|
|
||||||
`Seeded Crawl`{ .badge-orange }
|
`Seeded Crawl`{ .badge-orange }
|
||||||
|
|||||||
@ -304,6 +304,10 @@ export class ConfigDetails extends LiteElement {
|
|||||||
msg("Include Any Linked Page"),
|
msg("Include Any Linked Page"),
|
||||||
Boolean(crawlConfig?.config.extraHops)
|
Boolean(crawlConfig?.config.extraHops)
|
||||||
)}
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Fail Crawl On Failed URL"),
|
||||||
|
Boolean(crawlConfig?.config.failOnFailedSeed)
|
||||||
|
)}
|
||||||
`;
|
`;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@ -82,6 +82,7 @@ type FormState = {
|
|||||||
urlList: string;
|
urlList: string;
|
||||||
includeLinkedPages: boolean;
|
includeLinkedPages: boolean;
|
||||||
useSitemap: boolean;
|
useSitemap: boolean;
|
||||||
|
failOnFailedSeed: boolean;
|
||||||
customIncludeUrlList: string;
|
customIncludeUrlList: string;
|
||||||
crawlTimeoutMinutes: number;
|
crawlTimeoutMinutes: number;
|
||||||
behaviorTimeoutSeconds: number | null;
|
behaviorTimeoutSeconds: number | null;
|
||||||
@ -157,6 +158,7 @@ const getDefaultFormState = (): FormState => ({
|
|||||||
urlList: "",
|
urlList: "",
|
||||||
includeLinkedPages: false,
|
includeLinkedPages: false,
|
||||||
useSitemap: true,
|
useSitemap: true,
|
||||||
|
failOnFailedSeed: false,
|
||||||
customIncludeUrlList: "",
|
customIncludeUrlList: "",
|
||||||
crawlTimeoutMinutes: 0,
|
crawlTimeoutMinutes: 0,
|
||||||
maxCrawlSizeGB: 0,
|
maxCrawlSizeGB: 0,
|
||||||
@ -467,6 +469,8 @@ export class CrawlConfigEditor extends LiteElement {
|
|||||||
if (this.initialWorkflow.jobType === "custom") {
|
if (this.initialWorkflow.jobType === "custom") {
|
||||||
formState.scopeType = seedsConfig.scopeType || "page";
|
formState.scopeType = seedsConfig.scopeType || "page";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.initialWorkflow.schedule) {
|
if (this.initialWorkflow.schedule) {
|
||||||
@ -543,6 +547,8 @@ export class CrawlConfigEditor extends LiteElement {
|
|||||||
includeLinkedPages:
|
includeLinkedPages:
|
||||||
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
|
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
|
||||||
useSitemap: defaultFormState.useSitemap,
|
useSitemap: defaultFormState.useSitemap,
|
||||||
|
failOnFailedSeed:
|
||||||
|
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
|
||||||
pageLimit:
|
pageLimit:
|
||||||
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
|
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
|
||||||
autoscrollBehavior: this.initialWorkflow.config.behaviors
|
autoscrollBehavior: this.initialWorkflow.config.behaviors
|
||||||
@ -994,6 +1000,18 @@ https://example.com/path`}
|
|||||||
URL.`),
|
URL.`),
|
||||||
false
|
false
|
||||||
)}
|
)}
|
||||||
|
${this.renderFormCol(html`<sl-checkbox
|
||||||
|
name="failOnFailedSeed"
|
||||||
|
?checked=${this.formState.failOnFailedSeed}
|
||||||
|
>
|
||||||
|
${msg("Fail Crawl on Failed URL")}
|
||||||
|
</sl-checkbox>`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
msg(
|
||||||
|
`If checked, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled.`
|
||||||
|
),
|
||||||
|
false
|
||||||
|
)}
|
||||||
${when(
|
${when(
|
||||||
this.formState.includeLinkedPages || this.jobType === "custom",
|
this.formState.includeLinkedPages || this.jobType === "custom",
|
||||||
() => html`
|
() => html`
|
||||||
@ -2273,7 +2291,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
|
|
||||||
private parseUrlListConfig(): Pick<
|
private parseUrlListConfig(): Pick<
|
||||||
NewCrawlConfigParams["config"],
|
NewCrawlConfigParams["config"],
|
||||||
"seeds" | "scopeType" | "extraHops" | "useSitemap"
|
"seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
|
||||||
> {
|
> {
|
||||||
const config = {
|
const config = {
|
||||||
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
||||||
@ -2283,6 +2301,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
scopeType: "page" as FormState["scopeType"],
|
scopeType: "page" as FormState["scopeType"],
|
||||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||||
useSitemap: false,
|
useSitemap: false,
|
||||||
|
failOnFailedSeed: this.formState.failOnFailedSeed,
|
||||||
};
|
};
|
||||||
|
|
||||||
return config;
|
return config;
|
||||||
@ -2290,7 +2309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
|
|
||||||
private parseSeededConfig(): Pick<
|
private parseSeededConfig(): Pick<
|
||||||
NewCrawlConfigParams["config"],
|
NewCrawlConfigParams["config"],
|
||||||
"seeds" | "scopeType" | "useSitemap"
|
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
|
||||||
> {
|
> {
|
||||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||||
const includeUrlList = this.formState.customIncludeUrlList
|
const includeUrlList = this.formState.customIncludeUrlList
|
||||||
@ -2325,6 +2344,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
seeds: [primarySeed, ...additionalSeedUrlList],
|
seeds: [primarySeed, ...additionalSeedUrlList],
|
||||||
scopeType: this.formState.scopeType,
|
scopeType: this.formState.scopeType,
|
||||||
useSitemap: this.formState.useSitemap,
|
useSitemap: this.formState.useSitemap,
|
||||||
|
failOnFailedSeed: false,
|
||||||
};
|
};
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -23,6 +23,7 @@ const defaultValue = {
|
|||||||
pageLoadTimeout: null,
|
pageLoadTimeout: null,
|
||||||
pageExtraDelay: null,
|
pageExtraDelay: null,
|
||||||
useSitemap: false,
|
useSitemap: false,
|
||||||
|
failOnFailedSeed: false,
|
||||||
},
|
},
|
||||||
tags: [],
|
tags: [],
|
||||||
crawlTimeout: null,
|
crawlTimeout: null,
|
||||||
|
|||||||
@ -29,6 +29,7 @@ export type SeedConfig = Pick<
|
|||||||
behaviors?: string | null;
|
behaviors?: string | null;
|
||||||
extraHops?: number | null;
|
extraHops?: number | null;
|
||||||
useSitemap: boolean;
|
useSitemap: boolean;
|
||||||
|
failOnFailedSeed: boolean;
|
||||||
depth?: number | null;
|
depth?: number | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user