From 07fa46d9aad0bfb0c8145b5b0e64a8f70960687e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 17 Jan 2024 17:33:50 -0500 Subject: [PATCH] Add custom user agent to workflows (#1465) Fixes #1341 Adds "User Agent" field to workflow editor under the Browser Settings tab. If not set, the crawler will use the browser's default user agent. Also added to docs and to the workflow details page (if set). --------- Co-authored-by: Henry Wilkinson Co-authored-by: Ilya Kreymer --- backend/btrixcloud/models.py | 2 ++ chart/templates/configmap.yaml | 2 +- chart/values.yaml | 7 ----- docs/user-guide/workflow-setup.md | 4 +++ frontend/src/components/ui/config-details.ts | 6 +++++ frontend/src/pages/org/workflow-editor.ts | 27 +++++++++++++++++++- frontend/src/pages/org/workflows-new.ts | 1 + frontend/src/types/crawler.ts | 1 + 8 files changed, 41 insertions(+), 9 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 49e745aa..9a71f5b7 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -255,6 +255,8 @@ class RawCrawlConfig(BaseModel): logging: Optional[str] = None behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" + userAgent: Optional[str] = None + # ============================================================================ class CrawlConfigIn(BaseModel): diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 1be43bcf..9eb8d9e4 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -59,7 +59,7 @@ metadata: data: CRAWL_ARGS: >- - --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --userAgentSuffix {{ .Values.user_agent_suffix | quote }} --userAgent {{ .Values.user_agent | quote }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }} + --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --restartsOnError --headless {{ .Values.crawler_extra_args }} --- apiVersion: v1 diff --git a/chart/values.yaml b/chart/values.yaml index 062734b3..2471243e 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -26,13 +26,6 @@ crawler_extract_full_text: false # if set, each workflow can have a lower limit, but not higher max_pages_per_crawl: 50000 -# User Agent Options -# set to add suffix to default browser User Agent -# user_agent_suffix: - -# set to override User Agent completely (also overrides user_agent_suffix if both are set) -# user_agent: - # default template for generate wacz files # supports following interpolated vars: diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index 62928b0a..ff767007 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -162,6 +162,10 @@ This setting will only be shown if multiple different release channels are avail Will prevent any content from the domains listed in [Steven Black's Unified Hosts file](https://github.com/StevenBlack/hosts) (ads & malware) from being captured by the crawler. +### User Agent + +Sets the browser's user agent in outgoing requests to the specified value. If left blank, the crawler will use the browser's default user agent. + ### Language Sets the browser's language setting. Useful for crawling websites that detect the browser's language setting and serve content accordingly. diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index bffae5d2..33987c90 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -227,6 +227,12 @@ export class ConfigDetails extends LiteElement { msg("Block Ads by Domain"), crawlConfig?.config.blockAds )} + ${this.renderSetting( + msg("User Agent"), + crawlConfig?.config.userAgent + ? crawlConfig?.config.userAgent + : msg("Default User Agent") + )} ${crawlConfig?.config.lang ? this.renderSetting( msg("Language"), diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 16274c1e..3c73c571 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -118,6 +118,7 @@ type FormState = { autoAddCollections: string[]; description: WorkflowParams["description"]; autoscrollBehavior: boolean; + userAgent: string | null; crawlerChannel: string; }; @@ -195,6 +196,7 @@ const getDefaultFormState = (): FormState => ({ autoAddCollections: [], description: null, autoscrollBehavior: true, + userAgent: null, crawlerChannel: "default", }); const defaultProgressState = getDefaultProgressState(); @@ -587,6 +589,8 @@ export class CrawlConfigEditor extends LiteElement { autoscrollBehavior: this.initialWorkflow.config.behaviors ? this.initialWorkflow.config.behaviors.includes("autoscroll") : defaultFormState.autoscrollBehavior, + userAgent: + this.initialWorkflow.config.userAgent ?? defaultFormState.userAgent, crawlerChannel: this.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel, ...formState, @@ -1662,6 +1666,27 @@ https://archiveweb.page/images/${"logo.svg"}`} >.`), false )} + ${this.renderFormCol(html` + + + `)} + ${this.renderHelpTextCol( + msg(html`Set custom user agent for crawler browsers to use in requests. + For common user agents see + Useragents.me.`) + )} ${this.renderFormCol(html` ;