From 8ea16393c5e121a228ce10ad4dec9c1c2fb3006a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 10 Jun 2025 19:13:57 +0000 Subject: [PATCH] Optimize single-page crawl workflows (#2656) For single page crawls: - Always force 1 browser to be used, ignoring browser windows/scale setting - Don't use custom PVC volumes in crawler / redis, just use emptyDir - no chance of crawler being interrupted and restarted on different machine for a single page. Adds a 'is_single_page' check to CrawlConfig, checking for either limit or scopeType / no extra hops. Fixes #2655 --- backend/btrixcloud/crawlconfigs.py | 28 ++++++++++++++++++--- backend/btrixcloud/crawlmanager.py | 2 ++ backend/btrixcloud/k8sapi.py | 4 +++ backend/btrixcloud/operator/crawls.py | 17 ++++++++----- backend/btrixcloud/operator/cronjobs.py | 1 + backend/btrixcloud/operator/models.py | 1 + backend/test/conftest.py | 2 +- backend/test/test_crawlconfigs.py | 32 ++++++++++++++++++++++++ backend/test/test_filter_sort_results.py | 12 ++++----- chart/app-templates/crawl_job.yaml | 2 ++ chart/app-templates/crawler.yaml | 7 +++++- chart/app-templates/redis.yaml | 6 +++++ chart/values.yaml | 6 ++--- 13 files changed, 100 insertions(+), 20 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 09f0b911..dd74e4cc 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -45,6 +45,7 @@ from .models import ( CrawlerProxy, CrawlerProxies, ValidateCustomBehavior, + RawCrawlConfig, ) from .utils import ( dt_now, @@ -223,15 +224,18 @@ class CrawlConfigOps: ) -> CrawlConfigAddedResponse: """Add new crawl config""" + # ensure crawlChannel is valid + if not self.get_channel_crawler_image(config_in.crawlerChannel): + raise HTTPException(status_code=404, detail="crawler_not_found") + # Overrides scale if set if config_in.browserWindows is None: config_in.browserWindows = browser_windows_from_scale( cast(int, config_in.scale) ) - # ensure crawlChannel is valid - if not self.get_channel_crawler_image(config_in.crawlerChannel): - raise HTTPException(status_code=404, detail="crawler_not_found") + if self.is_single_page(config_in.config): + config_in.browserWindows = 1 profileid = None if isinstance(config_in.profileid, UUID): @@ -321,6 +325,19 @@ class CrawlConfigOps: execMinutesQuotaReached=exec_mins_quota_reached, ) + def is_single_page(self, config: RawCrawlConfig): + """return true if this config represents a single page crawl""" + if not config.seeds or len(config.seeds) != 1: + return False + + if config.limit == 1: + return True + + extra_hops = config.seeds[0].extraHops or config.extraHops + scope_type = config.seeds[0].scopeType or config.scopeType + + return extra_hops == 0 and scope_type == "page" + def _validate_link_selectors(self, link_selectors: List[str]): """Validate link selectors @@ -435,6 +452,10 @@ class CrawlConfigOps: if update.config and update.config.lang: validate_language_code(update.config.lang) + if update.config or update.browserWindows: + if self.is_single_page(update.config or orig_crawl_config.config): + update.browserWindows = 1 + # indicates if any k8s crawl config settings changed changed = False changed = changed or ( @@ -1021,6 +1042,7 @@ class CrawlConfigOps: warc_prefix=self.get_warc_prefix(org, crawlconfig), storage_filename=storage_filename, profile_filename=profile_filename or "", + is_single_page=self.is_single_page(crawlconfig.config), ) await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True) return crawl_id diff 
--git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 94dfcce1..d4d6681d 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -220,6 +220,7 @@ class CrawlManager(K8sAPI): warc_prefix: str, storage_filename: str, profile_filename: str, + is_single_page: bool, ) -> str: """create new crawl job from config""" cid = str(crawlconfig.id) @@ -244,6 +245,7 @@ class CrawlManager(K8sAPI): storage_filename=storage_filename, profile_filename=profile_filename, proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID, + is_single_page=is_single_page, ) async def reload_running_crawl_config(self, crawl_id: str): diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 41495f60..e25f0b99 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -95,6 +95,7 @@ class K8sAPI: profile_filename: str = "", qa_source: str = "", proxy_id: str = "", + is_single_page: bool = False, ): """load job template from yaml""" if not crawl_id: @@ -119,6 +120,7 @@ class K8sAPI: "profile_filename": profile_filename, "qa_source": qa_source, "proxy_id": proxy_id, + "is_single_page": "1" if is_single_page else "0", } data = self.templates.env.get_template("crawl_job.yaml").render(params) @@ -142,6 +144,7 @@ class K8sAPI: profile_filename: str = "", qa_source: str = "", proxy_id: str = "", + is_single_page: bool = False, ) -> str: """load and init crawl job via k8s api""" crawl_id, data = self.new_crawl_job_yaml( @@ -161,6 +164,7 @@ class K8sAPI: profile_filename=profile_filename, qa_source=qa_source, proxy_id=proxy_id, + is_single_page=is_single_page, ) # create job directly diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index aed9cf66..c52314d0 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -101,6 +101,8 @@ class CrawlOperator(BaseOperator): paused_expires_delta: timedelta + num_browsers_per_pod: int + def __init__(self, *args): super().__init__(*args) @@ -125,6 +127,8 @@ class CrawlOperator(BaseOperator): self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes) + self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + def init_routes(self, app): """init routes for this operator""" @@ -181,6 +185,7 @@ class CrawlOperator(BaseOperator): max_crawl_size=int(spec.get("maxCrawlSize") or 0), scheduled=spec.get("manual") != "1", qa_source_crawl_id=spec.get("qaSourceCrawlId"), + is_single_page=spec.get("isSinglePage") == "1", ) if crawl.qa_source_crawl_id: @@ -301,7 +306,7 @@ class CrawlOperator(BaseOperator): status.stopReason = stop_reason await self.mark_finished(crawl, status, state) - children = self._load_redis(params, status, data.children) + children = self._load_redis(params, status, crawl, data.children) storage_path = crawl.storage.get_storage_extra_path(oid) storage_secret = crawl.storage.get_storage_secret_name(oid) @@ -368,10 +373,8 @@ class CrawlOperator(BaseOperator): # crawl_scale is the number of pods to create crawler_scale = scale_from_browser_windows(crawl.browser_windows) - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - for i in range(0, crawler_scale): - if status.pagesFound < i * browsers_per_pod: + if status.pagesFound < i * self.num_browsers_per_pod: break children.extend( @@ -392,7 +395,7 @@ class CrawlOperator(BaseOperator): "resyncAfterSeconds": status.resync_after, } - def _load_redis(self, params, status: CrawlStatus, children): + def _load_redis(self, params, 
status: CrawlStatus, crawl: CrawlSpec, children): name = f"redis-{params['id']}" has_pod = name in children[POD] @@ -400,6 +403,8 @@ class CrawlOperator(BaseOperator): params["name"] = name params["cpu"] = pod_info.newCpu or params.get("redis_cpu") params["memory"] = pod_info.newMemory or params.get("redis_memory") + params["no_pvc"] = crawl.is_single_page + restart_reason = None if has_pod: restart_reason = pod_info.should_restart_pod() @@ -870,7 +875,7 @@ class CrawlOperator(BaseOperator): if redis_pod in pods: # if has other pods, keep redis pod until they are removed if len(pods) > 1: - new_children = self._load_redis(params, status, children) + new_children = self._load_redis(params, status, crawl, children) await self.increment_pod_exec_time(pods, crawl, status) # keep pvs until pods are removed diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index cb515c4b..fdd76c44 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -140,6 +140,7 @@ class CronJobOperator(BaseOperator): storage_filename=self.crawl_config_ops.default_filename_template, profile_filename=profile_filename or "", proxy_id=crawlconfig.proxyId or "", + is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config), ) return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob))) diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 02f4d2a2..d98db940 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -86,6 +86,7 @@ class CrawlSpec(BaseModel): max_crawl_size: int = 0 qa_source_crawl_id: Optional[str] = "" proxy_id: Optional[str] = None + is_single_page: bool = False @property def db_crawl_id(self) -> str: diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 31dd92dd..fc8b767c 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -331,7 +331,7 @@ def sample_crawl_data(): return { "runNow": False, "name": "Test Crawl", - "config": {"seeds": [{"url": "https://example.com/"}]}, + "config": {"seeds": [{"url": "https://example.com/"}], "extraHops": 1}, "tags": ["tag1", "tag2"], } diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index b4a70a12..c821bab9 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -6,6 +6,7 @@ from .conftest import API_PREFIX cid = None +cid_single_page = None UPDATED_NAME = "Updated name" UPDATED_DESCRIPTION = "Updated description" UPDATED_TAGS = ["tag3", "tag4"] @@ -67,6 +68,37 @@ def test_verify_default_browser_windows( assert data["browserWindows"] == 2 +def test_add_crawl_config_single_page( + crawler_auth_headers, default_org_id, sample_crawl_data +): + # Create crawl config + sample_crawl_data["config"]["limit"] = 1 + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + + data = r.json() + global cid_single_page + cid_single_page = data["id"] + + +def test_verify_default_browser_windows_single_page( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 1 + + def test_custom_browser_windows( crawler_auth_headers, 
default_org_id, sample_crawl_data ): diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py index 77c7e185..239531cb 100644 --- a/backend/test/test_filter_sort_results.py +++ b/backend/test/test_filter_sort_results.py @@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 7 - assert r.json()["total"] == 7 + assert len(r.json()["items"]) == 8 + assert r.json()["total"] == 8 def test_get_config_by_modified_by( @@ -23,8 +23,8 @@ def test_get_config_by_modified_by( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 7 - assert r.json()["total"] == 7 + assert len(r.json()["items"]) == 8 + assert r.json()["total"] == 8 def test_get_configs_by_first_seed( @@ -362,9 +362,9 @@ def test_sort_crawl_configs( headers=crawler_auth_headers, ) data = r.json() - assert data["total"] == 13 + assert data["total"] == 14 items = data["items"] - assert len(items) == 13 + assert len(items) == 14 last_created = None for config in items: diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 4b749fab..c921c830 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -39,3 +39,5 @@ spec: pausedAt: "{{ pausedAt }}" + isSinglePage: "{{ is_single_page }}" + diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index be8992ad..10a2a223 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -1,3 +1,4 @@ +{% if not no_pvc %} # ------- # PVC # ------- @@ -23,7 +24,7 @@ spec: storageClassName: {{ volume_storage_class }} {% endif %} - +{% endif %} # ------- # CRAWLER @@ -67,8 +68,12 @@ spec: name: qa-replay-{{ qa_source_crawl_id }} {% endif %} - name: crawl-data + {% if not no_pvc %} persistentVolumeClaim: claimName: {{ name }} + {% else %} + emptyDir: {} + {% endif %} {% if proxy_id %} - name: proxies secret: diff --git a/chart/app-templates/redis.yaml b/chart/app-templates/redis.yaml index 8d31e210..e366fbca 100644 --- a/chart/app-templates/redis.yaml +++ b/chart/app-templates/redis.yaml @@ -1,3 +1,4 @@ +{% if not no_pvc %} # ------- # PVC # ------- @@ -22,6 +23,7 @@ spec: {% if volume_storage_class %} storageClassName: {{ volume_storage_class }} {% endif %} +{% endif %} # -------- # REDIS @@ -51,8 +53,12 @@ spec: path: redis.conf - name: redis-data + {% if not no_pvc %} persistentVolumeClaim: claimName: {{ name }} + {% else %} + emptyDir: {} + {% endif %} affinity: nodeAffinity: diff --git a/chart/values.yaml b/chart/values.yaml index d2c6d58c..b9466dfa 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -277,11 +277,11 @@ crawler_memory_base: 1024Mi # number of browser workers per crawler instances crawler_browser_instances: 2 -# number of browser workers per crawler instances for QA runs +# number of browser workers per QA pod to run for QA runs # defaults to 'crawler_browser_instances' if not set -# qa_browser_instances: 2 +qa_browser_instances: 1 -# fixed scale (number of crawler pods) for QA runs +# fixed scale (number of QA pods) to run qa_scale: 1 # this value is added to crawler_cpu_base, for each additional browser
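
For orientation, the new flag travels end to end as follows: is_single_page(config) is evaluated when a crawl job is created (crawlconfigs.py for on-demand runs, cronjobs.py for scheduled ones) and also at config add/update time, where it forces browserWindows to 1; it is then passed through crawlmanager.py and k8sapi.py into crawl_job.yaml as the string "1"/"0" (isSinglePage), parsed back into CrawlSpec.is_single_page by the operator, and finally exposed to the redis/crawler templates as the no_pvc parameter, which swaps the per-crawl PVC for an emptyDir. The sketch below is a minimal illustration of that last toggle, assuming only the jinja2 package; the template text is a trimmed stand-in for the volume stanza in chart/app-templates, and "crawl-abc123" is a made-up claim name, not anything from the patch.

# Minimal sketch of the PVC-vs-emptyDir toggle; a simplified stand-in, not the actual chart contents.
from jinja2 import Template

volume_stanza = Template(
    """\
- name: crawl-data
{%- if not no_pvc %}
  persistentVolumeClaim:
    claimName: {{ name }}
{%- else %}
  emptyDir: {}
{%- endif %}
"""
)

# Single-page crawl: is_single_page -> isSinglePage: "1" -> no_pvc=True,
# so crawl data lives in a pod-local emptyDir and no PVC is created.
print(volume_stanza.render(name="crawl-abc123", no_pvc=True))

# Multi-page crawl: keep the per-crawl PVC so an interrupted crawler pod
# can be rescheduled on another node without losing its data.
print(volume_stanza.render(name="crawl-abc123", no_pvc=False))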