Add saveStorage option to workflow (#2757)
Fixes #2753
- Adds `saveStorage` to `RawCrawlConfig` model in backend
- Adds option to Browser Settings pane of workflow editor
- Adds option to config details component
- Adds setting to docs
This commit is contained in:
parent
5a4add84a8
commit
3a05002491
@ -370,6 +370,8 @@ class RawCrawlConfig(BaseModel):
|
|||||||
selectLinks: List[str] = ["a[href]->href"]
|
selectLinks: List[str] = ["a[href]->href"]
|
||||||
clickSelector: str = "a"
|
clickSelector: str = "a"
|
||||||
|
|
||||||
|
saveStorage: Optional[bool] = False
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class CrawlConfigIn(BaseModel):
|
class CrawlConfigIn(BaseModel):
|
||||||
|
@ -262,6 +262,10 @@ This setting will only be shown if multiple different release channels are avail
|
|||||||
|
|
||||||
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
|
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
|
||||||
|
|
||||||
|
### Save Local and Session Storage
|
||||||
|
|
||||||
|
When enabled, instructs the crawler to save the browser's `localStorage` and `sessionStorage` data for each page in the web archive as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information.
|
||||||
|
|
||||||
### User Agent
|
### User Agent
|
||||||
|
|
||||||
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).
|
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).
|
||||||
|
@ -270,6 +270,10 @@ export class ConfigDetails extends BtrixElement {
|
|||||||
msg("Block Ads by Domain"),
|
msg("Block Ads by Domain"),
|
||||||
seedsConfig?.blockAds,
|
seedsConfig?.blockAds,
|
||||||
)}
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Save Local and Session Storage"),
|
||||||
|
seedsConfig?.saveStorage,
|
||||||
|
)}
|
||||||
${this.renderSetting(
|
${this.renderSetting(
|
||||||
msg("User Agent"),
|
msg("User Agent"),
|
||||||
seedsConfig?.userAgent
|
seedsConfig?.userAgent
|
||||||
|
@ -1980,6 +1980,19 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
</sl-checkbox>
|
</sl-checkbox>
|
||||||
`)}
|
`)}
|
||||||
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
|
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
|
||||||
|
${inputCol(html`
|
||||||
|
<sl-checkbox name="saveStorage" ?checked=${this.formState.saveStorage}>
|
||||||
|
${msg("Save local and session storage")}
|
||||||
|
</sl-checkbox>
|
||||||
|
`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
html`${infoTextFor["saveStorage"]}
|
||||||
|
${this.renderUserGuideLink({
|
||||||
|
hash: "save-local-and-session-storage",
|
||||||
|
content: msg("Implications for shared archives"),
|
||||||
|
})}.`,
|
||||||
|
false,
|
||||||
|
)}
|
||||||
${inputCol(html`
|
${inputCol(html`
|
||||||
<sl-input
|
<sl-input
|
||||||
name="userAgent"
|
name="userAgent"
|
||||||
@ -3064,6 +3077,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
| "useSitemap"
|
| "useSitemap"
|
||||||
| "failOnFailedSeed"
|
| "failOnFailedSeed"
|
||||||
| "failOnContentCheck"
|
| "failOnContentCheck"
|
||||||
|
| "saveStorage"
|
||||||
> {
|
> {
|
||||||
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
|
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
|
||||||
|
|
||||||
@ -3082,6 +3096,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
useSitemap: false,
|
useSitemap: false,
|
||||||
failOnFailedSeed: this.formState.failOnFailedSeed,
|
failOnFailedSeed: this.formState.failOnFailedSeed,
|
||||||
failOnContentCheck: this.formState.failOnContentCheck,
|
failOnContentCheck: this.formState.failOnContentCheck,
|
||||||
|
saveStorage: this.formState.saveStorage,
|
||||||
};
|
};
|
||||||
|
|
||||||
return config;
|
return config;
|
||||||
@ -3094,6 +3109,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
| "useSitemap"
|
| "useSitemap"
|
||||||
| "failOnFailedSeed"
|
| "failOnFailedSeed"
|
||||||
| "failOnContentCheck"
|
| "failOnContentCheck"
|
||||||
|
| "saveStorage"
|
||||||
> {
|
> {
|
||||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||||
const includeUrlList = this.formState.customIncludeUrlList
|
const includeUrlList = this.formState.customIncludeUrlList
|
||||||
@ -3125,6 +3141,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
useSitemap: this.formState.useSitemap,
|
useSitemap: this.formState.useSitemap,
|
||||||
failOnFailedSeed: false,
|
failOnFailedSeed: false,
|
||||||
failOnContentCheck: this.formState.failOnContentCheck,
|
failOnContentCheck: this.formState.failOnContentCheck,
|
||||||
|
saveStorage: this.formState.saveStorage,
|
||||||
};
|
};
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
@ -79,6 +79,9 @@ export const infoTextFor = {
|
|||||||
failOnContentCheck: msg(
|
failOnContentCheck: msg(
|
||||||
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
|
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
|
||||||
),
|
),
|
||||||
|
saveStorage: msg(
|
||||||
|
`Include data from the browser's local and session storage in the web archive.`,
|
||||||
|
),
|
||||||
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
|
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
|
||||||
|
|
||||||
export default infoTextFor;
|
export default infoTextFor;
|
||||||
|
@ -50,6 +50,7 @@ export type SeedConfig = Expand<
|
|||||||
selectLinks: string[];
|
selectLinks: string[];
|
||||||
customBehaviors: string[];
|
customBehaviors: string[];
|
||||||
clickSelector: string;
|
clickSelector: string;
|
||||||
|
saveStorage?: boolean;
|
||||||
}
|
}
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
@ -154,6 +154,7 @@ export type FormState = {
|
|||||||
proxyId: string | null;
|
proxyId: string | null;
|
||||||
selectLinks: string[];
|
selectLinks: string[];
|
||||||
clickSelector: string;
|
clickSelector: string;
|
||||||
|
saveStorage: WorkflowParams["config"]["saveStorage"];
|
||||||
};
|
};
|
||||||
|
|
||||||
export type FormStateField = keyof FormState;
|
export type FormStateField = keyof FormState;
|
||||||
@ -215,6 +216,7 @@ export const getDefaultFormState = (): FormState => ({
|
|||||||
selectLinks: DEFAULT_SELECT_LINKS,
|
selectLinks: DEFAULT_SELECT_LINKS,
|
||||||
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
|
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
|
||||||
customBehavior: false,
|
customBehavior: false,
|
||||||
|
saveStorage: false,
|
||||||
});
|
});
|
||||||
|
|
||||||
export const mapSeedToUrl = (arr: Seed[]) =>
|
export const mapSeedToUrl = (arr: Seed[]) =>
|
||||||
@ -379,6 +381,7 @@ export function getInitialFormState(params: {
|
|||||||
crawlerChannel:
|
crawlerChannel:
|
||||||
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
|
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
|
||||||
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
|
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
|
||||||
|
saveStorage: params.initialWorkflow.config.saveStorage,
|
||||||
...formState,
|
...formState,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user