crawlconfig: fix default filename template, make configurable (#835)

* crawlconfig: fix default filename template, make configurable
- make the default crawl filename template configurable via the 'default_crawl_filename_template' value in values.yaml
- set to '@ts-@hostsuffix.wacz' by default
- allow updating it per crawl config via 'crawlFilenameTemplate' in the crawlconfig patch endpoint, which also updates the crawl's configmap
- tests: add a test for a custom 'default_crawl_filename_template'
Ilya Kreymer 2023-05-08 14:03:27 -07:00 committed by GitHub
parent fd7e81b8b7
commit 70319594c2
7 changed files with 50 additions and 24 deletions
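To make the new field concrete, here is a minimal sketch of setting a crawl config's filename template through the API. The deployment URL, ids, token, and the org-scoped path are placeholders and assumptions for illustration, not values taken from this commit:

import requests

API_BASE = "https://btrix.example.com/api"   # hypothetical deployment URL
ORG_ID = "<org-uuid>"                        # placeholder
CID = "<crawlconfig-uuid>"                   # placeholder
TOKEN = "<access-token>"                     # placeholder

# PATCH only crawlFilenameTemplate; per this commit, the backend detects the
# change and rewrites STORE_FILENAME in the crawl's configmap.
resp = requests.patch(
    f"{API_BASE}/orgs/{ORG_ID}/crawlconfigs/{CID}",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={"crawlFilenameTemplate": "@ts-@hostsuffix.wacz"},
)
resp.raise_for_status()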


@@ -7,6 +7,7 @@ from enum import Enum
import uuid
import asyncio
import re
import os
from datetime import datetime
import urllib.parse
@@ -116,6 +117,8 @@ class CrawlConfigIn(BaseModel):
crawlTimeout: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
crawlFilenameTemplate: Optional[str]
# ============================================================================
class ConfigRevision(BaseMongoModel):
@@ -233,6 +236,7 @@ class UpdateCrawlConfig(BaseModel):
profileid: Optional[str]
crawlTimeout: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
crawlFilenameTemplate: Optional[str]
config: Optional[RawCrawlConfig]
@@ -252,6 +256,7 @@ class CrawlConfigOps:
self.profiles = profiles
self.profiles.set_crawlconfigs(self)
self.crawl_ops = None
self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
self.router = APIRouter(
prefix="/crawlconfigs",
@@ -331,10 +336,9 @@ class CrawlConfigOps:
crawlconfig = CrawlConfig.from_dict(data)
suffix = f"{self.sanitize(str(crawlconfig.id))}-{self.sanitize(user.name)}"
# pylint: disable=line-too-long
out_filename = f"data/{self.sanitize(str(crawlconfig.id))}-@id/{suffix}-@ts-@hostsuffix.wacz"
out_filename = (
data.get("crawlFilenameTemplate") or self.default_filename_template
)
crawl_id = await self.crawl_manager.add_crawl_config(
crawlconfig=crawlconfig,
@@ -382,6 +386,9 @@ class CrawlConfigOps:
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "schedule")
)


@@ -97,12 +97,12 @@ class CrawlManager(K8sAPI):
# Create Config Map
await self._create_config_map(
crawlconfig,
STORE_PATH=storage_path,
STORE_FILENAME=out_filename,
STORAGE_NAME=storage_name,
USER_ID=str(crawlconfig.modifiedBy),
ORG_ID=str(crawlconfig.oid),
CRAWL_CONFIG_ID=str(crawlconfig.id),
STORE_PATH=storage_path,
STORE_FILENAME=out_filename,
STORAGE_NAME=storage_name,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout)
@@ -147,9 +147,8 @@
):
await self._update_config_map(
crawlconfig,
update.scale,
update,
profile_filename,
update.crawlTimeout,
has_config_update,
)
@@ -397,22 +396,24 @@
async def _update_config_map(
self,
crawlconfig,
scale=None,
update,
profile_filename=None,
crawl_timeout=None,
update_config=False,
):
config_map = await self.get_configmap(crawlconfig.id)
if scale is not None:
config_map.data["INITIAL_SCALE"] = str(scale)
if update.scale is not None:
config_map.data["INITIAL_SCALE"] = str(update.scale)
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate
if profile_filename is not None:
config_map.data["PROFILE_FILENAME"] = profile_filename
if crawl_timeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(crawl_timeout)
if update_config:
config_map.data["crawl-config.json"] = json.dumps(
crawlconfig.get_raw_config()
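Propagating a changed template to an existing crawl config ultimately amounts to patching one key in the crawl's ConfigMap; a minimal standalone sketch with the official kubernetes Python client (the namespace and ConfigMap naming scheme are assumptions for illustration, not the project's actual helpers):

from kubernetes import client, config

def patch_store_filename(cid: str, template: str, namespace: str = "crawlers") -> None:
    """Patch only the STORE_FILENAME key on the crawl's ConfigMap."""
    config.load_incluster_config()  # use load_kube_config() when running outside a cluster
    core = client.CoreV1Api()
    core.patch_namespaced_config_map(
        name=f"crawl-config-{cid}",   # hypothetical naming scheme
        namespace=namespace,
        body={"data": {"STORE_FILENAME": template}},
    )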


@@ -124,19 +124,19 @@ spec:
env:
- name: CRAWL_ID
value: {{ id }}
value: "{{ id }}"
- name: WEBHOOK_URL
value: {{ redis_url }}/crawls-done
value: "{{ redis_url }}/crawls-done"
- name: STORE_PATH
value: {{ store_path }}
value: "{{ store_path }}"
- name: STORE_FILENAME
value: {{ store_filename }}
value: "{{ store_filename }}"
- name: STORE_USER
value: {{ userid }}
value: "{{ userid }}"
resources:
limits:
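The quoting added to the env values above is presumably about keeping substituted values as YAML strings: a value that happens to look numeric would otherwise be re-parsed as an int, which Kubernetes rejects for container env values. A tiny PyYAML illustration (the sample value is made up):

import yaml

# Unquoted, a numeric-looking substitution becomes an int after parsing;
# quoted, it stays a string, as Kubernetes env values require.
unquoted = yaml.safe_load("value: 20230508140327")
quoted = yaml.safe_load('value: "20230508140327"')

print(type(unquoted["value"]))  # <class 'int'>
print(type(quoted["value"]))    # <class 'str'>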


@@ -3,6 +3,7 @@ import hashlib
import time
import io
import zipfile
import re
from .conftest import API_PREFIX, HOST_PREFIX
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -58,6 +59,10 @@ def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
assert len(data["resources"]) == 1
assert data["resources"][0]["path"]
# ensure filename matches specified pattern
# set in default_crawl_filename_template
assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", data["resources"][0]["path"])
assert data["tags"] == ["wr-test-1", "wr-test-2"]
global wacz_path
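To make the assertion above concrete, this is the kind of resolved path the pattern is meant to accept when the test chart sets '@ts-testing-@hostsuffix.wacz' as the default template (the sample path is invented for illustration):

import re

# hypothetical resolved WACZ path for the template "@ts-testing-@hostsuffix.wacz"
sample = "data/manual-20230508-abc123/20230508140327-testing-crawl-abc123-0.wacz"
assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", sample)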


@@ -54,6 +54,8 @@ data:
DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"


@@ -4,6 +4,8 @@
backend_pull_policy: "Never"
frontend_pull_policy: "Never"
default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
operator_resync_seconds: 5
mongo_auth:


@@ -34,15 +34,20 @@ max_pages_per_crawl: 0
# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"
# number of workers for backend api
backend_workers: 4
# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800
# base url for replayweb.page
rwp_base_url: "https://replayweb.page/"
# default template for generated wacz files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
superuser:
# set this to enable a superuser admin
email: admin@example.com
@@ -62,8 +67,12 @@ backend_pull_policy: "Always"
backend_password_secret: "c9085f33ecce4347aa1d69339e16c499"
# number of backend pods
backend_num_replicas: 1
# number of workers per pod
backend_workers: 2
backend_requests_cpu: "10m"
backend_limits_cpu: "768m"
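For reference, the placeholder semantics documented in values.yaml above can be shown with a naive expansion. The real substitution happens inside the crawler, not in this repository, so this is only a sketch of what the documented vars mean:

from datetime import datetime, timezone

def fill_template(template: str, crawl_id: str, hostname: str) -> str:
    """Illustrative expansion of @ts, @hostname, @hostsuffix and @id."""
    return (
        template.replace("@ts", datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"))
        .replace("@hostsuffix", hostname[-14:])  # last 14 characters of the hostname
        .replace("@hostname", hostname)
        .replace("@id", crawl_id)
    )

# e.g. "20230508140327-crawl-abc123-0.wacz" for the default "@ts-@hostsuffix.wacz"
print(fill_template("@ts-@hostsuffix.wacz", "manual-20230508", "crawl-abc123-0"))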