Add custom user agent to workflows (#1465)
Fixes #1341. Adds a "User Agent" field to the workflow editor under the Browser Settings tab. If it is not set, the crawler uses the browser's default user agent. The field is also documented in the user guide and shown on the workflow details page (if set). --------- Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics> Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
7282274502
commit
07fa46d9aa
@ -255,6 +255,8 @@ class RawCrawlConfig(BaseModel):
|
||||
logging: Optional[str] = None
|
||||
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
|
||||
|
||||
userAgent: Optional[str] = None
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class CrawlConfigIn(BaseModel):
|
||||
|
@ -59,7 +59,7 @@ metadata:
|
||||
|
||||
data:
|
||||
CRAWL_ARGS: >-
|
||||
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --userAgentSuffix {{ .Values.user_agent_suffix | quote }} --userAgent {{ .Values.user_agent | quote }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }}
|
||||
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }}
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
|
@ -26,13 +26,6 @@ crawler_extract_full_text: false
|
||||
# if set, each workflow can have a lower limit, but not higher
|
||||
max_pages_per_crawl: 50000
|
||||
|
||||
# User Agent Options
|
||||
# set to add suffix to default browser User Agent
|
||||
# user_agent_suffix:
|
||||
|
||||
# set to override User Agent completely (also overrides user_agent_suffix if both are set)
|
||||
# user_agent:
|
||||
|
||||
|
||||
# default template for generate wacz files
|
||||
# supports following interpolated vars:
|
||||
|
@ -162,6 +162,10 @@ This setting will only be shown if multiple different release channels are avail
|
||||
|
||||
Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler.
|
||||
|
||||
### User Agent
|
||||
|
||||
Sets the browser's user agent in outgoing requests to the specified value. If left blank, the crawler will use the browser's default user agent.
|
||||
|
||||
### Language
|
||||
|
||||
Sets the browser's language setting. Useful for crawling websites that detect the browser's language setting and serve content accordingly.
|
||||
|
@ -227,6 +227,12 @@ export class ConfigDetails extends LiteElement {
|
||||
msg("Block Ads by Domain"),
|
||||
crawlConfig?.config.blockAds
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("User Agent"),
|
||||
crawlConfig?.config.userAgent
|
||||
? crawlConfig?.config.userAgent
|
||||
: msg("Default User Agent")
|
||||
)}
|
||||
${crawlConfig?.config.lang
|
||||
? this.renderSetting(
|
||||
msg("Language"),
|
||||
|
@ -118,6 +118,7 @@ type FormState = {
|
||||
autoAddCollections: string[];
|
||||
description: WorkflowParams["description"];
|
||||
autoscrollBehavior: boolean;
|
||||
userAgent: string | null;
|
||||
crawlerChannel: string;
|
||||
};
|
||||
|
||||
@ -195,6 +196,7 @@ const getDefaultFormState = (): FormState => ({
|
||||
autoAddCollections: [],
|
||||
description: null,
|
||||
autoscrollBehavior: true,
|
||||
userAgent: null,
|
||||
crawlerChannel: "default",
|
||||
});
|
||||
const defaultProgressState = getDefaultProgressState();
|
||||
@ -587,6 +589,8 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
autoscrollBehavior: this.initialWorkflow.config.behaviors
|
||||
? this.initialWorkflow.config.behaviors.includes("autoscroll")
|
||||
: defaultFormState.autoscrollBehavior,
|
||||
userAgent:
|
||||
this.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
|
||||
crawlerChannel:
|
||||
this.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
|
||||
...formState,
|
||||
@ -1662,6 +1666,27 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
>.`),
|
||||
false
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="userAgent"
|
||||
label=${msg("User Agent")}
|
||||
autocomplete="off"
|
||||
placeholder=${msg("Default")}
|
||||
value=${this.formState.userAgent}
|
||||
>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(html`Set custom user agent for crawler browsers to use in requests.
|
||||
For common user agents see
|
||||
<a
|
||||
href="https://www.useragents.me/"
|
||||
class="text-blue-600 hover:text-blue-500"
|
||||
target="_blank"
|
||||
rel="noopener noreferrer nofollow"
|
||||
>Useragents.me</a
|
||||
>.`)
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<btrix-language-select
|
||||
.value=${this.formState.lang as LanguageCode}
|
||||
@ -2385,7 +2410,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
|
||||
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
|
||||
pageExtraDelay: this.formState.pageExtraDelaySeconds,
|
||||
|
||||
userAgent: this.formState.userAgent,
|
||||
limit: this.formState.pageLimit,
|
||||
lang: this.formState.lang || "",
|
||||
blockAds: this.formState.blockAds,
|
||||
|
@ -23,6 +23,7 @@ const defaultValue = {
|
||||
pageExtraDelay: null,
|
||||
useSitemap: false,
|
||||
failOnFailedSeed: false,
|
||||
userAgent: null,
|
||||
},
|
||||
tags: [],
|
||||
crawlTimeout: null,
|
||||
|
@ -31,6 +31,7 @@ export type SeedConfig = Expand<
|
||||
useSitemap: boolean;
|
||||
failOnFailedSeed: boolean;
|
||||
depth?: number | null;
|
||||
userAgent?: string | null;
|
||||
}
|
||||
>;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user