From 5684e896af36553f4159c02d1ec07a7b667adcc0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 16 Jan 2025 15:44:00 -0500 Subject: [PATCH] Add support for autoclick (#2313) Fixes #2259 This PR brings backend and frontend support for the new autoclick behavior in Browsertrix, introduced in Browsertrix 1.5.0+ On the backend, we introduce `min_autoclick_crawler_image` to `values.yaml`, with a default value of `"docker.io/webrecorder/browsertrix-crawler:1.5.0"`. If this is set and the crawler version for a new crawl is less than this value, the autoclick behavior is removed from the behaviors list in the configmap created for the crawl. The one caveat for this is that a crawler image tag like "latest" will always be parsed as greater than `min_autoclick_crawler_image`, so there is the potential for the crawler to run into issues if using a non-numeric image tag with an older version of the crawler. For production we use hardcoded specific versions of the crawler except for the dev channel, which from here on out will include autoclick support, so I think this should be okay (and is also true of the existing implementation for checking `min_qa_crawler_image`). On the frontend, I've added a checkbox (unchecked by default) in the "Limits" section just below the current checkbox for autoscroll. We might want to move these to a different section eventually - I'm not sure Limits is the right place for them - but I wanted to be consistent with things as they are. 
--------- Co-authored-by: Ilya Kreymer --- backend/btrixcloud/operator/crawls.py | 33 ++++++++++++++++++- chart/templates/configmap.yaml | 2 ++ chart/values.yaml | 3 ++ .../crawl-workflows/workflow-editor.ts | 33 ++++++++++++++++--- frontend/src/utils/workflow.ts | 5 +++ 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 737397fb..9c2a696b 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -340,6 +340,31 @@ class CrawlOperator(BaseOperator): return self.load_from_yaml("redis.yaml", params) + def _filter_autoclick_behavior( + self, behaviors: Optional[str], crawler_image: str + ) -> Optional[str]: + """Remove autoclick behavior if crawler version doesn't support it""" + min_autoclick_crawler_image = os.environ.get("MIN_AUTOCLICK_CRAWLER_IMAGE") + + if ( + min_autoclick_crawler_image + and behaviors + and "autoclick" in behaviors + and crawler_image + and crawler_image < min_autoclick_crawler_image + ): + print( + "Crawler version < min_autoclick_crawler_image, removing autoclick behavior", + flush=True, + ) + behaviors_list = behaviors.split(",") + filtered_behaviors = [ + behavior for behavior in behaviors_list if behavior != "autoclick" + ] + return ",".join(filtered_behaviors) + + return behaviors + async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params): name = f"crawl-config-{crawl.id}" @@ -357,7 +382,13 @@ class CrawlOperator(BaseOperator): crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid) - params["config"] = json.dumps(crawlconfig.get_raw_config()) + raw_config = crawlconfig.get_raw_config() + + raw_config["behaviors"] = self._filter_autoclick_behavior( + raw_config["behaviors"], params["crawler_image"] + ) + + params["config"] = json.dumps(raw_config) return self.load_from_yaml("crawl_configmap.yaml", params) diff --git a/chart/templates/configmap.yaml 
b/chart/templates/configmap.yaml index f29f7788..eec66c33 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -61,6 +61,8 @@ data: MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}" + MIN_AUTOCLICK_CRAWLER_IMAGE: "{{ .Values.min_autoclick_crawler_image }}" + NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}" MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" diff --git a/chart/values.yaml b/chart/values.yaml index d4c068ba..28c08736 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -250,6 +250,9 @@ crawler_namespace: "crawlers" # if set, will restrict QA to image names that are >= than this value # min_qa_crawler_image: "" +# if set, will restrict autoclick behavior to image names that are >= than this value +min_autoclick_crawler_image: "docker.io/webrecorder/browsertrix-crawler:1.5.0" + # optional: enable to use a persist volume claim for all crawls # can be enabled to use a multi-write shared filesystem # crawler_pv_claim: "nfs-shared-crawls" diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 9c5ae5a9..d0023fb5 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1299,6 +1299,20 @@ https://archiveweb.page/images/${"logo.svg"}`} ), false, )} + ${inputCol( + html` + ${msg("Autoclick behavior")} + `, + )} + ${this.renderHelpTextCol( + msg( + `When enabled the browser will automatically click on links that don't navigate to other pages.`, + ), + false, + )} ${inputCol(html` ({ autoAddCollections: [], description: null, autoscrollBehavior: true, + autoclickBehavior: false, userAgent: null, crawlerChannel: "default", proxyId: null, @@ -286,6 +288,9 @@ export function getInitialFormState(params: { autoscrollBehavior: params.initialWorkflow.config.behaviors ? 
params.initialWorkflow.config.behaviors.includes("autoscroll") : defaultFormState.autoscrollBehavior, + autoclickBehavior: params.initialWorkflow.config.behaviors + ? params.initialWorkflow.config.behaviors.includes("autoclick") + : defaultFormState.autoclickBehavior, userAgent: params.initialWorkflow.config.userAgent ?? defaultFormState.userAgent, crawlerChannel: