Add saveStorage option to workflow (#2757)

Fixes #2753 

- Adds `saveStorage` to `RawCrawlConfig` model in backend
- Adds option to Browser Settings pane of workflow editor
- Adds option to config details component
- Adds setting to docs
This commit is contained in:
Tessa Walsh 2025-08-01 01:58:15 -04:00 committed by GitHub
parent 5a4add84a8
commit 3a05002491
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 34 additions and 0 deletions

View File

@ -370,6 +370,8 @@ class RawCrawlConfig(BaseModel):
selectLinks: List[str] = ["a[href]->href"] selectLinks: List[str] = ["a[href]->href"]
clickSelector: str = "a" clickSelector: str = "a"
saveStorage: Optional[bool] = False
# ============================================================================ # ============================================================================
class CrawlConfigIn(BaseModel): class CrawlConfigIn(BaseModel):

View File

@ -262,6 +262,10 @@ This setting will only be shown if multiple different release channels are avail
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler. Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
### Save Local and Session Storage
When enabled, the crawler saves each page's `localStorage` and `sessionStorage` data in the web archive as part of the `WARC-JSON-Metadata` field. This option may be necessary to properly archive and replay certain websites. Use caution when sharing WACZ files created with this option enabled, as the saved browser storage may contain sensitive information.
### User Agent ### User Agent
Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/). Sets the browser's [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) in outgoing requests to the specified value. If left blank, the crawler will use the Brave browser's default user agent. For a list of common user agents see [useragents.me](https://www.useragents.me/).

View File

@ -270,6 +270,10 @@ export class ConfigDetails extends BtrixElement {
msg("Block Ads by Domain"), msg("Block Ads by Domain"),
seedsConfig?.blockAds, seedsConfig?.blockAds,
)} )}
${this.renderSetting(
msg("Save Local and Session Storage"),
seedsConfig?.saveStorage,
)}
${this.renderSetting( ${this.renderSetting(
msg("User Agent"), msg("User Agent"),
seedsConfig?.userAgent seedsConfig?.userAgent

View File

@ -1980,6 +1980,19 @@ https://archiveweb.page/images/${"logo.svg"}`}
</sl-checkbox> </sl-checkbox>
`)} `)}
${this.renderHelpTextCol(infoTextFor["blockAds"], false)} ${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
${inputCol(html`
<sl-checkbox name="saveStorage" ?checked=${this.formState.saveStorage}>
${msg("Save local and session storage")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["saveStorage"]}
${this.renderUserGuideLink({
hash: "save-local-and-session-storage",
content: msg("Implications for shared archives"),
})}.`,
false,
)}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="userAgent" name="userAgent"
@ -3064,6 +3077,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "useSitemap" | "useSitemap"
| "failOnFailedSeed" | "failOnFailedSeed"
| "failOnContentCheck" | "failOnContentCheck"
| "saveStorage"
> { > {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON; const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@ -3082,6 +3096,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
useSitemap: false, useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed, failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck, failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
}; };
return config; return config;
@ -3094,6 +3109,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "useSitemap" | "useSitemap"
| "failOnFailedSeed" | "failOnFailedSeed"
| "failOnContentCheck" | "failOnContentCheck"
| "saveStorage"
> { > {
const primarySeedUrl = this.formState.primarySeedUrl; const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList const includeUrlList = this.formState.customIncludeUrlList
@ -3125,6 +3141,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
useSitemap: this.formState.useSitemap, useSitemap: this.formState.useSitemap,
failOnFailedSeed: false, failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck, failOnContentCheck: this.formState.failOnContentCheck,
saveStorage: this.formState.saveStorage,
}; };
return config; return config;
} }

View File

@ -79,6 +79,9 @@ export const infoTextFor = {
failOnContentCheck: msg( failOnContentCheck: msg(
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`, `Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
), ),
saveStorage: msg(
`Include data from the browser's local and session storage in the web archive.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>; } as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoTextFor; export default infoTextFor;

View File

@ -50,6 +50,7 @@ export type SeedConfig = Expand<
selectLinks: string[]; selectLinks: string[];
customBehaviors: string[]; customBehaviors: string[];
clickSelector: string; clickSelector: string;
saveStorage?: boolean;
} }
>; >;

View File

@ -154,6 +154,7 @@ export type FormState = {
proxyId: string | null; proxyId: string | null;
selectLinks: string[]; selectLinks: string[];
clickSelector: string; clickSelector: string;
saveStorage: WorkflowParams["config"]["saveStorage"];
}; };
export type FormStateField = keyof FormState; export type FormStateField = keyof FormState;
@ -215,6 +216,7 @@ export const getDefaultFormState = (): FormState => ({
selectLinks: DEFAULT_SELECT_LINKS, selectLinks: DEFAULT_SELECT_LINKS,
clickSelector: DEFAULT_AUTOCLICK_SELECTOR, clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
customBehavior: false, customBehavior: false,
saveStorage: false,
}); });
export const mapSeedToUrl = (arr: Seed[]) => export const mapSeedToUrl = (arr: Seed[]) =>
@ -379,6 +381,7 @@ export function getInitialFormState(params: {
crawlerChannel: crawlerChannel:
params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel, params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId, proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
saveStorage: params.initialWorkflow.config.saveStorage,
...formState, ...formState,
}; };
} }