Add saveStorage option to workflow (#2757)

Fixes #2753 

- Adds `saveStorage` to `RawCrawlConfig` model in backend
- Adds option to Browser Settings pane of workflow editor
- Adds option to config details component
- Adds setting to docs
This commit is contained in:
Tessa Walsh 2025-08-01 01:58:15 -04:00 committed by GitHub
parent 5a4add84a8
commit 3a05002491
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 34 additions and 0 deletions

View File

@ -370,6 +370,8 @@ class RawCrawlConfig(BaseModel):
selectLinks: List[str] = ["a[href]->href"]
clickSelector: str = "a"
saveStorage: Optional[bool] = False
# ============================================================================
class CrawlConfigIn(BaseModel):

View File

@ -262,6 +262,10 @@ This setting will only be shown if multiple different release channels are avail
Prevents any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
### Save Local and Session Storage
When enabled, instructs the crawler to save each page's browser `localStorage` and `sessionStorage` data in the web archive, as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information.
### User Agent
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).

View File

@ -270,6 +270,10 @@ export class ConfigDetails extends BtrixElement {
msg("Block Ads by Domain"),
seedsConfig?.blockAds,
)}
${this.renderSetting(
msg("Save Local and Session Storage"),
seedsConfig?.saveStorage,
)}
${this.renderSetting(
msg("User Agent"),
seedsConfig?.userAgent

View File

@ -1980,6 +1980,19 @@ https://archiveweb.page/images/${"logo.svg"}`}
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
${inputCol(html`
<sl-checkbox name="saveStorage" ?checked=${this.formState.saveStorage}>
${msg("Save local and session storage")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["saveStorage"]}
${this.renderUserGuideLink({
hash: "save-local-and-session-storage",
content: msg("Implications for shared archives"),
})}.`,
false,
)}
${inputCol(html`
<sl-input
name="userAgent"
@ -3064,6 +3077,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
> {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@ -3082,6 +3096,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
};
return config;
@ -3094,6 +3109,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
| "saveStorage"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@ -3125,6 +3141,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
useSitemap: this.formState.useSitemap,
failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
};
return config;
}

View File

@ -79,6 +79,9 @@ export const infoTextFor = {
failOnContentCheck: msg(
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
),
saveStorage: msg(
`Include data from the browser's local and session storage in the web archive.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoTextFor;

View File

@ -50,6 +50,7 @@ export type SeedConfig = Expand<
selectLinks: string[];
customBehaviors: string[];
clickSelector: string;
saveStorage?: boolean;
}
>;

View File

@ -154,6 +154,7 @@ export type FormState = {
proxyId: string | null;
selectLinks: string[];
clickSelector: string;
saveStorage: WorkflowParams["config"]["saveStorage"];
};
export type FormStateField = keyof FormState;
@ -215,6 +216,7 @@ export const getDefaultFormState = (): FormState => ({
selectLinks: DEFAULT_SELECT_LINKS,
clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
customBehavior: false,
saveStorage: false,
});
export const mapSeedToUrl = (arr: Seed[]) =>
@ -379,6 +381,7 @@ export function getInitialFormState(params: {
crawlerChannel:
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
saveStorage: params.initialWorkflow.config.saveStorage,
...formState,
};
}