diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e31ac097..7996a67a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -370,6 +370,8 @@ class RawCrawlConfig(BaseModel): selectLinks: List[str] = ["a[href]->href"] clickSelector: str = "a" + saveStorage: Optional[bool] = False + # ============================================================================ class CrawlConfigIn(BaseModel): diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md index a80b4355..8ea1afd3 100644 --- a/frontend/docs/docs/user-guide/workflow-setup.md +++ b/frontend/docs/docs/user-guide/workflow-setup.md @@ -262,6 +262,10 @@ This setting will only be shown if multiple different release channels are avail Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler. +### Save Local and Session Storage + +When enabled, instructs the crawler to save the browser's `localStorage` and `sessionStorage` data for each page in the web archive as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information. + ### User Agent Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/). 
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index a9f6eeb1..6ce67e69 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -270,6 +270,10 @@ export class ConfigDetails extends BtrixElement { msg("Block Ads by Domain"), seedsConfig?.blockAds, )} + ${this.renderSetting( + msg("Save Local and Session Storage"), + seedsConfig?.saveStorage, + )} ${this.renderSetting( msg("User Agent"), seedsConfig?.userAgent diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index d342a772..5ccb05f4 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1980,6 +1980,19 @@ https://archiveweb.page/images/${"logo.svg"}`} `)} ${this.renderHelpTextCol(infoTextFor["blockAds"], false)} + ${inputCol(html` + + ${msg("Save local and session storage")} + + `)} + ${this.renderHelpTextCol( + html`${infoTextFor["saveStorage"]} + ${this.renderUserGuideLink({ + hash: "save-local-and-session-storage", + content: msg("Implications for shared archives"), + })}.`, + false, + )} ${inputCol(html` { const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON; @@ -3082,6 +3096,7 @@ https://archiveweb.page/images/${"logo.svg"}`} useSitemap: false, failOnFailedSeed: this.formState.failOnFailedSeed, failOnContentCheck: this.formState.failOnContentCheck, + saveStorage: this.formState.saveStorage, }; return config; @@ -3094,6 +3109,7 @@ https://archiveweb.page/images/${"logo.svg"}`} | "useSitemap" | "failOnFailedSeed" | "failOnContentCheck" + | "saveStorage" > { const primarySeedUrl = this.formState.primarySeedUrl; const includeUrlList = this.formState.customIncludeUrlList @@ -3125,6 +3141,7 @@ https://archiveweb.page/images/${"logo.svg"}`} useSitemap: this.formState.useSitemap, failOnFailedSeed: false, failOnContentCheck: 
this.formState.failOnContentCheck, + saveStorage: this.formState.saveStorage, }; return config; } diff --git a/frontend/src/strings/crawl-workflows/infoText.ts b/frontend/src/strings/crawl-workflows/infoText.ts index ccc4f40b..63366a23 100644 --- a/frontend/src/strings/crawl-workflows/infoText.ts +++ b/frontend/src/strings/crawl-workflows/infoText.ts @@ -79,6 +79,9 @@ export const infoTextFor = { failOnContentCheck: msg( `Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`, ), + saveStorage: msg( + `Include data from the browser's local and session storage in the web archive.`, + ), } as const satisfies Partial>; export default infoTextFor; diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 0fefb915..b00b81fe 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -50,6 +50,7 @@ export type SeedConfig = Expand< selectLinks: string[]; customBehaviors: string[]; clickSelector: string; + saveStorage?: boolean; } >; diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index 6d877ab6..f6a40d65 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -154,6 +154,7 @@ export type FormState = { proxyId: string | null; selectLinks: string[]; clickSelector: string; + saveStorage: WorkflowParams["config"]["saveStorage"]; }; export type FormStateField = keyof FormState; @@ -215,6 +216,7 @@ export const getDefaultFormState = (): FormState => ({ selectLinks: DEFAULT_SELECT_LINKS, clickSelector: DEFAULT_AUTOCLICK_SELECTOR, customBehavior: false, + saveStorage: false, }); export const mapSeedToUrl = (arr: Seed[]) => @@ -379,6 +381,7 @@ export function getInitialFormState(params: { crawlerChannel: params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel, proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId, + saveStorage: params.initialWorkflow.config.saveStorage ?? defaultFormState.saveStorage, ...formState, }; }