Add saveStorage option to workflow (#2757)
Fixes #2753. Adds `saveStorage` to the `RawCrawlConfig` model in the backend; adds the option to the Browser Settings pane of the workflow editor; adds the option to the config details component; and adds the setting to the docs.
This commit is contained in:
parent
5a4add84a8
commit
3a05002491
@ -370,6 +370,8 @@ class RawCrawlConfig(BaseModel):
|
||||
selectLinks: List[str] = ["a[href]->href"]
|
||||
clickSelector: str = "a"
|
||||
|
||||
saveStorage: Optional[bool] = False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class CrawlConfigIn(BaseModel):
|
||||
|
@ -262,6 +262,10 @@ This setting will only be shown if multiple different release channels are avail
|
||||
|
||||
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
|
||||
|
||||
### Save Local and Session Storage
|
||||
|
||||
When enabled, the crawler saves the browser's `localStorage` and `sessionStorage` data for each page to the web archive, as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information.
|
||||
|
||||
### User Agent
|
||||
|
||||
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).
|
||||
|
@ -270,6 +270,10 @@ export class ConfigDetails extends BtrixElement {
|
||||
msg("Block Ads by Domain"),
|
||||
seedsConfig?.blockAds,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Save Local and Session Storage"),
|
||||
seedsConfig?.saveStorage,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("User Agent"),
|
||||
seedsConfig?.userAgent
|
||||
|
@ -1980,6 +1980,19 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
</sl-checkbox>
|
||||
`)}
|
||||
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
|
||||
${inputCol(html`
|
||||
<sl-checkbox name="saveStorage" ?checked=${this.formState.saveStorage}>
|
||||
${msg("Save local and session storage")}
|
||||
</sl-checkbox>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
html`${infoTextFor["saveStorage"]}
|
||||
${this.renderUserGuideLink({
|
||||
hash: "save-local-and-session-storage",
|
||||
content: msg("Implications for shared archives"),
|
||||
})}.`,
|
||||
false,
|
||||
)}
|
||||
${inputCol(html`
|
||||
<sl-input
|
||||
name="userAgent"
|
||||
@ -3064,6 +3077,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
| "useSitemap"
|
||||
| "failOnFailedSeed"
|
||||
| "failOnContentCheck"
|
||||
| "saveStorage"
|
||||
> {
|
||||
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
|
||||
|
||||
@ -3082,6 +3096,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
useSitemap: false,
|
||||
failOnFailedSeed: this.formState.failOnFailedSeed,
|
||||
failOnContentCheck: this.formState.failOnContentCheck,
|
||||
saveStorage: this.formState.saveStorage,
|
||||
};
|
||||
|
||||
return config;
|
||||
@ -3094,6 +3109,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
| "useSitemap"
|
||||
| "failOnFailedSeed"
|
||||
| "failOnContentCheck"
|
||||
| "saveStorage"
|
||||
> {
|
||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||
const includeUrlList = this.formState.customIncludeUrlList
|
||||
@ -3125,6 +3141,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
useSitemap: this.formState.useSitemap,
|
||||
failOnFailedSeed: false,
|
||||
failOnContentCheck: this.formState.failOnContentCheck,
|
||||
saveStorage: this.formState.saveStorage,
|
||||
};
|
||||
return config;
|
||||
}
|
||||
|
@ -79,6 +79,9 @@ export const infoTextFor = {
|
||||
failOnContentCheck: msg(
|
||||
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
|
||||
),
|
||||
saveStorage: msg(
|
||||
`Include data from the browser's local and session storage in the web archive.`,
|
||||
),
|
||||
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
|
||||
|
||||
export default infoTextFor;
|
||||
|
@ -50,6 +50,7 @@ export type SeedConfig = Expand<
|
||||
selectLinks: string[];
|
||||
customBehaviors: string[];
|
||||
clickSelector: string;
|
||||
saveStorage?: boolean;
|
||||
}
|
||||
>;
|
||||
|
||||
|
@ -154,6 +154,7 @@ export type FormState = {
|
||||
proxyId: string | null;
|
||||
selectLinks: string[];
|
||||
clickSelector: string;
|
||||
saveStorage: WorkflowParams["config"]["saveStorage"];
|
||||
};
|
||||
|
||||
export type FormStateField = keyof FormState;
|
||||
@ -215,6 +216,7 @@ export const getDefaultFormState = (): FormState => ({
|
||||
selectLinks: DEFAULT_SELECT_LINKS,
|
||||
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
|
||||
customBehavior: false,
|
||||
saveStorage: false,
|
||||
});
|
||||
|
||||
export const mapSeedToUrl = (arr: Seed[]) =>
|
||||
@ -379,6 +381,7 @@ export function getInitialFormState(params: {
|
||||
crawlerChannel:
|
||||
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
|
||||
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
|
||||
saveStorage: params.initialWorkflow.config.saveStorage,
|
||||
...formState,
|
||||
};
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user