From 8ea16393c5e121a228ce10ad4dec9c1c2fb3006a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 10 Jun 2025 19:13:57 +0000 Subject: [PATCH] Optimize single-page crawl workflows (#2656) For single page crawls: - Always force 1 browser to be used, ignoring browser windows/scale setting - Don't use custom PVC volumes in crawler / redis, just use emptyDir - no chance of crawler being interrupted and restarted on different machine for a single page. Adds a 'is_single_page' check to CrawlConfig, checking for either limit or scopeType / no extra hops. Fixes #2655 --- backend/btrixcloud/crawlconfigs.py | 28 ++++++++++++++++++--- backend/btrixcloud/crawlmanager.py | 2 ++ backend/btrixcloud/k8sapi.py | 4 +++ backend/btrixcloud/operator/crawls.py | 17 ++++++++----- backend/btrixcloud/operator/cronjobs.py | 1 + backend/btrixcloud/operator/models.py | 1 + backend/test/conftest.py | 2 +- backend/test/test_crawlconfigs.py | 32 ++++++++++++++++++++++++ backend/test/test_filter_sort_results.py | 12 ++++----- chart/app-templates/crawl_job.yaml | 2 ++ chart/app-templates/crawler.yaml | 7 +++++- chart/app-templates/redis.yaml | 6 +++++ chart/values.yaml | 6 ++--- 13 files changed, 100 insertions(+), 20 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 09f0b911..dd74e4cc 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -45,6 +45,7 @@ from .models import ( CrawlerProxy, CrawlerProxies, ValidateCustomBehavior, + RawCrawlConfig, ) from .utils import ( dt_now, @@ -223,15 +224,18 @@ class CrawlConfigOps: ) -> CrawlConfigAddedResponse: """Add new crawl config""" + # ensure crawlChannel is valid + if not self.get_channel_crawler_image(config_in.crawlerChannel): + raise HTTPException(status_code=404, detail="crawler_not_found") + # Overrides scale if set if config_in.browserWindows is None: config_in.browserWindows = browser_windows_from_scale( cast(int, config_in.scale) ) - # ensure crawlChannel is valid - if not self.get_channel_crawler_image(config_in.crawlerChannel): - raise HTTPException(status_code=404, detail="crawler_not_found") + if self.is_single_page(config_in.config): + config_in.browserWindows = 1 profileid = None if isinstance(config_in.profileid, UUID): @@ -321,6 +325,19 @@ class CrawlConfigOps: execMinutesQuotaReached=exec_mins_quota_reached, ) + def is_single_page(self, config: RawCrawlConfig): + """return true if this config represents a single page crawl""" + if not config.seeds or len(config.seeds) != 1: + return False + + if config.limit == 1: + return True + + extra_hops = config.seeds[0].extraHops or config.extraHops + scope_type = config.seeds[0].scopeType or config.scopeType + + return extra_hops == 0 and scope_type == "page" + def _validate_link_selectors(self, link_selectors: List[str]): """Validate link selectors @@ -435,6 +452,10 @@ class CrawlConfigOps: if update.config and update.config.lang: validate_language_code(update.config.lang) + if update.config or update.browserWindows: + if self.is_single_page(update.config or orig_crawl_config.config): + update.browserWindows = 1 + # indicates if any k8s crawl config settings changed changed = False changed = changed or ( @@ -1021,6 +1042,7 @@ class CrawlConfigOps: warc_prefix=self.get_warc_prefix(org, crawlconfig), storage_filename=storage_filename, profile_filename=profile_filename or "", + is_single_page=self.is_single_page(crawlconfig.config), ) await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True) return crawl_id diff 
--git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 94dfcce1..d4d6681d 100644 --- a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -220,6 +220,7 @@ class CrawlManager(K8sAPI): warc_prefix: str, storage_filename: str, profile_filename: str, + is_single_page: bool, ) -> str: """create new crawl job from config""" cid = str(crawlconfig.id) @@ -244,6 +245,7 @@ class CrawlManager(K8sAPI): storage_filename=storage_filename, profile_filename=profile_filename, proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID, + is_single_page=is_single_page, ) async def reload_running_crawl_config(self, crawl_id: str): diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 41495f60..e25f0b99 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -95,6 +95,7 @@ class K8sAPI: profile_filename: str = "", qa_source: str = "", proxy_id: str = "", + is_single_page: bool = False, ): """load job template from yaml""" if not crawl_id: @@ -119,6 +120,7 @@ class K8sAPI: "profile_filename": profile_filename, "qa_source": qa_source, "proxy_id": proxy_id, + "is_single_page": "1" if is_single_page else "0", } data = self.templates.env.get_template("crawl_job.yaml").render(params) @@ -142,6 +144,7 @@ class K8sAPI: profile_filename: str = "", qa_source: str = "", proxy_id: str = "", + is_single_page: bool = False, ) -> str: """load and init crawl job via k8s api""" crawl_id, data = self.new_crawl_job_yaml( @@ -161,6 +164,7 @@ class K8sAPI: profile_filename=profile_filename, qa_source=qa_source, proxy_id=proxy_id, + is_single_page=is_single_page, ) # create job directly diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index aed9cf66..c52314d0 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -101,6 +101,8 @@ class CrawlOperator(BaseOperator): paused_expires_delta: timedelta + num_browsers_per_pod: int + def __init__(self, *args): super().__init__(*args) @@ -125,6 +127,8 @@ class CrawlOperator(BaseOperator): self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes) + self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) + def init_routes(self, app): """init routes for this operator""" @@ -181,6 +185,7 @@ class CrawlOperator(BaseOperator): max_crawl_size=int(spec.get("maxCrawlSize") or 0), scheduled=spec.get("manual") != "1", qa_source_crawl_id=spec.get("qaSourceCrawlId"), + is_single_page=spec.get("isSinglePage") == "1", ) if crawl.qa_source_crawl_id: @@ -301,7 +306,7 @@ class CrawlOperator(BaseOperator): status.stopReason = stop_reason await self.mark_finished(crawl, status, state) - children = self._load_redis(params, status, data.children) + children = self._load_redis(params, status, crawl, data.children) storage_path = crawl.storage.get_storage_extra_path(oid) storage_secret = crawl.storage.get_storage_secret_name(oid) @@ -368,10 +373,8 @@ class CrawlOperator(BaseOperator): # crawl_scale is the number of pods to create crawler_scale = scale_from_browser_windows(crawl.browser_windows) - browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) - for i in range(0, crawler_scale): - if status.pagesFound < i * browsers_per_pod: + if status.pagesFound < i * self.num_browsers_per_pod: break children.extend( @@ -392,7 +395,7 @@ class CrawlOperator(BaseOperator): "resyncAfterSeconds": status.resync_after, } - def _load_redis(self, params, status: CrawlStatus, children): + def _load_redis(self, params, 
status: CrawlStatus, crawl: CrawlSpec, children): name = f"redis-{params['id']}" has_pod = name in children[POD] @@ -400,6 +403,8 @@ class CrawlOperator(BaseOperator): params["name"] = name params["cpu"] = pod_info.newCpu or params.get("redis_cpu") params["memory"] = pod_info.newMemory or params.get("redis_memory") + params["no_pvc"] = crawl.is_single_page + restart_reason = None if has_pod: restart_reason = pod_info.should_restart_pod() @@ -870,7 +875,7 @@ class CrawlOperator(BaseOperator): if redis_pod in pods: # if has other pods, keep redis pod until they are removed if len(pods) > 1: - new_children = self._load_redis(params, status, children) + new_children = self._load_redis(params, status, crawl, children) await self.increment_pod_exec_time(pods, crawl, status) # keep pvs until pods are removed diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index cb515c4b..fdd76c44 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -140,6 +140,7 @@ class CronJobOperator(BaseOperator): storage_filename=self.crawl_config_ops.default_filename_template, profile_filename=profile_filename or "", proxy_id=crawlconfig.proxyId or "", + is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config), ) return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob))) diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 02f4d2a2..d98db940 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -86,6 +86,7 @@ class CrawlSpec(BaseModel): max_crawl_size: int = 0 qa_source_crawl_id: Optional[str] = "" proxy_id: Optional[str] = None + is_single_page: bool = False @property def db_crawl_id(self) -> str: diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 31dd92dd..fc8b767c 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -331,7 +331,7 @@ def sample_crawl_data(): return { "runNow": False, "name": "Test Crawl", - "config": {"seeds": [{"url": "https://example.com/"}]}, + "config": {"seeds": [{"url": "https://example.com/"}], "extraHops": 1}, "tags": ["tag1", "tag2"], } diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index b4a70a12..c821bab9 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -6,6 +6,7 @@ from .conftest import API_PREFIX cid = None +cid_single_page = None UPDATED_NAME = "Updated name" UPDATED_DESCRIPTION = "Updated description" UPDATED_TAGS = ["tag3", "tag4"] @@ -67,6 +68,37 @@ def test_verify_default_browser_windows( assert data["browserWindows"] == 2 +def test_add_crawl_config_single_page( + crawler_auth_headers, default_org_id, sample_crawl_data +): + # Create crawl config + sample_crawl_data["config"]["limit"] = 1 + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + + data = r.json() + global cid_single_page + cid_single_page = data["id"] + + +def test_verify_default_browser_windows_single_page( + crawler_auth_headers, default_org_id, sample_crawl_data +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + + data = r.json() + assert data.get("scale") is None + assert data["browserWindows"] == 1 + + def test_custom_browser_windows( crawler_auth_headers, 
default_org_id, sample_crawl_data ): diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py index 77c7e185..239531cb 100644 --- a/backend/test/test_filter_sort_results.py +++ b/backend/test/test_filter_sort_results.py @@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 7 - assert r.json()["total"] == 7 + assert len(r.json()["items"]) == 8 + assert r.json()["total"] == 8 def test_get_config_by_modified_by( @@ -23,8 +23,8 @@ def test_get_config_by_modified_by( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 7 - assert r.json()["total"] == 7 + assert len(r.json()["items"]) == 8 + assert r.json()["total"] == 8 def test_get_configs_by_first_seed( @@ -362,9 +362,9 @@ def test_sort_crawl_configs( headers=crawler_auth_headers, ) data = r.json() - assert data["total"] == 13 + assert data["total"] == 14 items = data["items"] - assert len(items) == 13 + assert len(items) == 14 last_created = None for config in items: diff --git a/chart/app-templates/crawl_job.yaml b/chart/app-templates/crawl_job.yaml index 4b749fab..c921c830 100644 --- a/chart/app-templates/crawl_job.yaml +++ b/chart/app-templates/crawl_job.yaml @@ -39,3 +39,5 @@ spec: pausedAt: "{{ pausedAt }}" + isSinglePage: "{{ is_single_page }}" + diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index be8992ad..10a2a223 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -1,3 +1,4 @@ +{% if not no_pvc %} # ------- # PVC # ------- @@ -23,7 +24,7 @@ spec: storageClassName: {{ volume_storage_class }} {% endif %} - +{% endif %} # ------- # CRAWLER @@ -67,8 +68,12 @@ spec: name: qa-replay-{{ qa_source_crawl_id }} {% endif %} - name: crawl-data + {% if not no_pvc %} persistentVolumeClaim: claimName: {{ name }} + {% else %} + emptyDir: {} + {% endif %} {% if proxy_id %} - name: proxies secret: diff --git a/chart/app-templates/redis.yaml b/chart/app-templates/redis.yaml index 8d31e210..e366fbca 100644 --- a/chart/app-templates/redis.yaml +++ b/chart/app-templates/redis.yaml @@ -1,3 +1,4 @@ +{% if not no_pvc %} # ------- # PVC # ------- @@ -22,6 +23,7 @@ spec: {% if volume_storage_class %} storageClassName: {{ volume_storage_class }} {% endif %} +{% endif %} # -------- # REDIS @@ -51,8 +53,12 @@ spec: path: redis.conf - name: redis-data + {% if not no_pvc %} persistentVolumeClaim: claimName: {{ name }} + {% else %} + emptyDir: {} + {% endif %} affinity: nodeAffinity: diff --git a/chart/values.yaml b/chart/values.yaml index d2c6d58c..b9466dfa 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -277,11 +277,11 @@ crawler_memory_base: 1024Mi # number of browser workers per crawler instances crawler_browser_instances: 2 -# number of browser workers per crawler instances for QA runs +# number of browser workers per QA pod to run for QA runs # defaults to 'crawler_browser_instances' if not set -# qa_browser_instances: 2 +qa_browser_instances: 1 -# fixed scale (number of crawler pods) for QA runs +# fixed scale (number of QA pods) to run qa_scale: 1 # this value is added to crawler_cpu_base, for each additional browser
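
For orientation, the new flag travels end to end as follows: is_single_page(config) is evaluated when a crawl job is created (crawlconfigs.py for on-demand runs, cronjobs.py for scheduled ones) and also at config add/update time, where it forces browserWindows to 1; it is then passed through crawlmanager.py and k8sapi.py into crawl_job.yaml as the string "1"/"0" (isSinglePage), parsed back into CrawlSpec.is_single_page by the operator, and finally exposed to the redis/crawler templates as the no_pvc parameter, which swaps the per-crawl PVC for an emptyDir. The sketch below is a minimal illustration of that last toggle, assuming only the jinja2 package; the template text is a trimmed stand-in for the volume stanza in chart/app-templates, and "crawl-abc123" is a made-up claim name, not anything from the patch.

# Minimal sketch of the PVC-vs-emptyDir toggle; a simplified stand-in, not the actual chart contents.
from jinja2 import Template

volume_stanza = Template(
    """\
- name: crawl-data
{%- if not no_pvc %}
  persistentVolumeClaim:
    claimName: {{ name }}
{%- else %}
  emptyDir: {}
{%- endif %}
"""
)

# Single-page crawl: is_single_page -> isSinglePage: "1" -> no_pvc=True,
# so crawl data lives in a pod-local emptyDir and no PVC is created.
print(volume_stanza.render(name="crawl-abc123", no_pvc=True))

# Multi-page crawl: keep the per-crawl PVC so an interrupted crawler pod
# can be rescheduled on another node without losing its data.
print(volume_stanza.render(name="crawl-abc123", no_pvc=False))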