Add custom user agent to workflows (#1465)

Fixes #1341

Adds a "User Agent" field to the workflow editor under the Browser Settings
tab. If not set, the crawler will use the browser's default user agent.

Also added to the docs and to the workflow details page, which shows "Default User Agent" when no custom value is set.

---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
Tessa Walsh 2024-01-17 17:33:50 -05:00 committed by GitHub
parent 7282274502
commit 07fa46d9aa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 41 additions and 9 deletions

View File

@ -255,6 +255,8 @@ class RawCrawlConfig(BaseModel):
logging: Optional[str] = None
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
userAgent: Optional[str] = None
# ============================================================================
class CrawlConfigIn(BaseModel):

View File

@ -59,7 +59,7 @@ metadata:
data:
CRAWL_ARGS: >-
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --userAgentSuffix {{ .Values.user_agent_suffix | quote }} --userAgent {{ .Values.user_agent | quote }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }}
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }}
---
apiVersion: v1

View File

@ -26,13 +26,6 @@ crawler_extract_full_text: false
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 50000
# User Agent Options
# set to add suffix to default browser User Agent
# user_agent_suffix:
# set to override User Agent completely (also overrides user_agent_suffix if both are set)
# user_agent:
# default template for generating WACZ files
# supports following interpolated vars:

View File

@ -162,6 +162,10 @@ This setting will only be shown if multiple different release channels are avail
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
### User Agent
Sets the browser's user agent in outgoing requests to the specified value. If left blank, the crawler will use the browser's default user agent.
### Language
Sets the browser's language setting. Useful for crawling websites that detect the browser's language setting and serve content accordingly.

View File

@ -227,6 +227,12 @@ export class ConfigDetails extends LiteElement {
msg("Block Ads by Domain"),
crawlConfig?.config.blockAds
)}
${this.renderSetting(
msg("User Agent"),
crawlConfig?.config.userAgent
? crawlConfig?.config.userAgent
: msg("Default User Agent")
)}
${crawlConfig?.config.lang
? this.renderSetting(
msg("Language"),

View File

@ -118,6 +118,7 @@ type FormState = {
autoAddCollections: string[];
description: WorkflowParams["description"];
autoscrollBehavior: boolean;
userAgent: string | null;
crawlerChannel: string;
};
@ -195,6 +196,7 @@ const getDefaultFormState = (): FormState => ({
autoAddCollections: [],
description: null,
autoscrollBehavior: true,
userAgent: null,
crawlerChannel: "default",
});
const defaultProgressState = getDefaultProgressState();
@ -587,6 +589,8 @@ export class CrawlConfigEditor extends LiteElement {
autoscrollBehavior: this.initialWorkflow.config.behaviors
? this.initialWorkflow.config.behaviors.includes("autoscroll")
: defaultFormState.autoscrollBehavior,
userAgent:
this.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
crawlerChannel:
this.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
...formState,
@ -1662,6 +1666,27 @@ https://archiveweb.page/images/${"logo.svg"}`}
>.`),
false
)}
${this.renderFormCol(html`
<sl-input
name="userAgent"
label=${msg("User Agent")}
autocomplete="off"
placeholder=${msg("Default")}
value=${this.formState.userAgent}
>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(html`Set custom user agent for crawler browsers to use in requests.
For common user agents see
<a
href="https://www.useragents.me/"
class="text-blue-600 hover:text-blue-500"
target="_blank"
rel="noopener noreferrer nofollow"
>Useragents.me</a
>.`)
)}
${this.renderFormCol(html`
<btrix-language-select
.value=${this.formState.lang as LanguageCode}
@ -2385,7 +2410,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
pageExtraDelay: this.formState.pageExtraDelaySeconds,
userAgent: this.formState.userAgent,
limit: this.formState.pageLimit,
lang: this.formState.lang || "",
blockAds: this.formState.blockAds,

View File

@ -23,6 +23,7 @@ const defaultValue = {
pageExtraDelay: null,
useSitemap: false,
failOnFailedSeed: false,
userAgent: null,
},
tags: [],
crawlTimeout: null,

View File

@ -31,6 +31,7 @@ export type SeedConfig = Expand<
useSitemap: boolean;
failOnFailedSeed: boolean;
depth?: number | null;
userAgent?: string | null;
}
>;