More friendly WARC prefix inside WACZ based on Org slug + Crawl Name / First Seed URL. (#1537)

Supports setting the WARC prefix for WARCs inside a WACZ to `<org slug>-<slug
[crawl name | first seed host]>`.
- Prefix is set via the WARC_PREFIX env var, supported in browsertrix-crawler
1.0.0-beta.4 or higher.
- If a crawl name is provided, it is used; otherwise, the hostname of the first
seed is used. The name is slugified into lowercase alphanumeric characters
separated by dashes.

Ex: in an organization called `Default Org`, a crawl of
`https://specs.webrecorder.net/` with no name will have WARCs named
`default-org-specs-webrecorder-net-....warc.gz`.
If the crawl is given the name `SPECS`, the WARCs will be named
`default-org-specs-manual-....warc.gz`.
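
To illustrate the slugification, here is a minimal hypothetical sketch. The real `slug_from_name` lives in `btrixcloud.utils` and is not shown in this diff; the regex below is an assumption based on the "lowercase alphanumeric characters separated by dashes" description:

    import re
    from urllib.parse import urlsplit

    def slug_from_name(name: str) -> str:
        # Assumed behavior: lowercase alphanumeric runs separated by dashes.
        return re.sub(r"[^a-z0-9]+", "-", name.lower()).strip("-")

    org_slug = "default-org"

    # No crawl name: fall back to the hostname of the first seed.
    host = urlsplit("https://specs.webrecorder.net/").netloc
    print(org_slug + "-" + slug_from_name(host))     # default-org-specs-webrecorder-net

    # Crawl name provided: the name wins; the seed URL is ignored.
    print(org_slug + "-" + slug_from_name("SPECS"))  # default-org-specs

Per the examples above, the crawler appends the rest of the filename (e.g. `-manual-...`) after this prefix.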

Fixes #412 by providing a sensible default.
Authored by Ilya Kreymer on 2024-02-22 14:00:19 -08:00; committed by Ilya Kreymer
parent ba18abc063
commit 8ae032ff88
6 changed files with 64 additions and 22 deletions

View File

@@ -33,7 +33,7 @@ from .models import (
     CrawlerChannel,
     CrawlerChannels,
 )
-from .utils import dt_now
+from .utils import dt_now, slug_from_name
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -232,6 +232,7 @@ class CrawlConfigOps:
             run_now=run_now,
             out_filename=out_filename,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
         if crawl_id and run_now:
@@ -298,6 +299,7 @@ class CrawlConfigOps:
             run_now=False,
             out_filename=self.default_filename_template,
             profile_filename=profile_filename or "",
+            warc_prefix=self.get_warc_prefix(org, crawlconfig),
         )
 
     async def update_crawl_config(
@@ -841,7 +843,10 @@ class CrawlConfigOps:
         try:
             crawl_id = await self.crawl_manager.create_crawl_job(
-                crawlconfig, org.storage, userid=str(user.id)
+                crawlconfig,
+                org.storage,
+                userid=str(user.id),
+                warc_prefix=self.get_warc_prefix(org, crawlconfig),
             )
             await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
             return crawl_id
@@ -897,6 +902,21 @@ class CrawlConfigOps:
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_warc_prefix(self, org: Organization, crawlconfig: CrawlConfig) -> str:
+        """Generate WARC prefix slug from org slug, name or url
+        if no name is provided, hostname is used from url, otherwise
+        url is ignored"""
+        name = crawlconfig.name
+        if not name:
+            if crawlconfig.config.seeds and len(crawlconfig.config.seeds):
+                url = crawlconfig.config.seeds[0].url
+                parts = urllib.parse.urlsplit(url)
+                name = parts.netloc
+
+        name = slug_from_name(name or "")
+        prefix = org.slug + "-" + name
+        return prefix[:80]
+
 
 # ============================================================================
 # pylint: disable=too-many-locals

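As a quick sanity check of `get_warc_prefix` above, a hypothetical usage sketch with stand-in objects (the field names mirror what the method reads; the values are invented for illustration):

    from types import SimpleNamespace

    org = SimpleNamespace(slug="default-org")
    seed = SimpleNamespace(url="https://specs.webrecorder.net/")
    cfg = SimpleNamespace(name="", config=SimpleNamespace(seeds=[seed]))

    # No name: the hostname of the first seed is slugified.
    #   get_warc_prefix(org, cfg) -> "default-org-specs-webrecorder-net"
    # With cfg.name = "My Crawl!", the seed URL is ignored:
    #   get_warc_prefix(org, cfg) -> "default-org-my-crawl"
    # Either way, the result is truncated to 80 characters (prefix[:80]).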
View File

@@ -119,6 +119,7 @@ class CrawlManager(K8sAPI):
         run_now: bool,
         out_filename: str,
         profile_filename: str,
+        warc_prefix: str,
     ) -> Optional[str]:
         """add new crawl, store crawl config in configmap"""
@@ -139,7 +140,10 @@ class CrawlManager(K8sAPI):
         if run_now:
             crawl_id = await self.create_crawl_job(
-                crawlconfig, storage, str(crawlconfig.modifiedBy)
+                crawlconfig,
+                storage,
+                str(crawlconfig.modifiedBy),
+                warc_prefix,
             )
 
         await self._update_scheduled_job(crawlconfig)
@@ -151,6 +155,7 @@
         crawlconfig: CrawlConfig,
         storage: StorageRef,
         userid: str,
+        warc_prefix: str,
     ) -> str:
         """create new crawl job from config"""
         cid = str(crawlconfig.id)
@@ -169,6 +174,7 @@
             crawlconfig.crawlTimeout,
             crawlconfig.maxCrawlSize,
             manual=True,
+            warc_prefix=warc_prefix,
         )
 
     async def update_crawl_config(

View File

@@ -86,6 +86,7 @@ class K8sAPI:
         max_crawl_size=0,
         manual=True,
         crawl_id=None,
+        warc_prefix="",
     ):
         """load job template from yaml"""
         if not crawl_id:
@@ -104,6 +105,7 @@ class K8sAPI:
             "storage_name": str(storage),
             "manual": "1" if manual else "0",
             "crawler_channel": crawler_channel,
+            "warc_prefix": warc_prefix,
         }
 
         data = self.templates.env.get_template("crawl_job.yaml").render(params)

View File

@@ -538,6 +538,8 @@ class BtrixOperator(K8sAPI):
         params["storage_filename"] = configmap["STORE_FILENAME"]
         params["restart_time"] = spec.get("restartTime")
 
+        params["warc_prefix"] = spec.get("warcPrefix")
+
         params["redis_url"] = redis_url
 
         if spec.get("restartTime") != status.restartTime:
@@ -1651,26 +1653,10 @@
         org = await self.org_ops.get_org_by_id(UUID(oid))
 
-        crawl_id, crawljob = self.new_crawl_job_yaml(
-            cid,
-            userid=userid,
-            oid=oid,
-            storage=org.storage,
-            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
-            scale=int(configmap.get("INITIAL_SCALE", 1)),
-            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
-            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
-            manual=False,
-            crawl_id=crawl_id,
-        )
-
-        attachments = list(yaml.safe_load_all(crawljob))
-
-        if crawl_id in crawljobs:
-            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+        warc_prefix = None
 
         if not actual_state:
             # pylint: disable=duplicate-code
             # cronjob doesn't exist yet
             crawlconfig = await self.crawl_config_ops.get_crawl_config(
                 UUID(cid), UUID(oid)
             )
@@ -1686,11 +1672,35 @@
                 print(f"error: missing user for id {userid}")
                 return {"attachments": []}
 
+            warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
+
             await self.crawl_config_ops.add_new_crawl(
-                crawl_id, crawlconfig, user, manual=False
+                crawl_id,
+                crawlconfig,
+                user,
+                manual=False,
             )
             print("Scheduled Crawl Created: " + crawl_id)
 
+        crawl_id, crawljob = self.new_crawl_job_yaml(
+            cid,
+            userid=userid,
+            oid=oid,
+            storage=org.storage,
+            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
+            scale=int(configmap.get("INITIAL_SCALE", 1)),
+            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
+            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
+            manual=False,
+            crawl_id=crawl_id,
+            warc_prefix=warc_prefix,
+        )
+
+        attachments = list(yaml.safe_load_all(crawljob))
+
+        if crawl_id in crawljobs:
+            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
+
         return {
             "attachments": attachments,
         }

View File

@@ -24,6 +24,7 @@ spec:
   manual: {{ manual }}
   crawlerChannel: "{{ crawler_channel }}"
   ttlSecondsAfterFinished: 30
+  warcPrefix: "{{ warc_prefix }}"
 
   storageName: "{{ storage_name }}"

View File

@@ -149,6 +149,9 @@ spec:
     - name: STORE_USER
       value: "{{ userid }}"
 
+    - name: WARC_PREFIX
+      value: "{{ warc_prefix }}"
+
     {% if crawler_socks_proxy_host %}
     - name: SOCKS_HOST
       value: "{{ crawler_socks_proxy_host }}"
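
With the `Default Org` / `https://specs.webrecorder.net/` example from the commit message, the rendered crawler spec would contain an env entry along these lines (hypothetical rendering of the template above):

    - name: WARC_PREFIX
      value: "default-org-specs-webrecorder-net"

browsertrix-crawler 1.0.0-beta.4 or higher reads WARC_PREFIX and uses it as the filename prefix for the WARCs it writes into the WACZ.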