backend work:

- support {configname}-{username}-@ts-@hostsuffix.wacz as the output filename, sanitizing the config name and username (see the sketch after this list)
- return 'starting' as the crawl status when no pod IPs are available yet or when 0/0 pages have been found
- fix updating scale via the POST crawlconfig update endpoint
- fix duplicate user error on superuser init
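A minimal sketch of the filename handling from the first bullet, inferred from the crawlconfigs changes below; build_out_filename is an illustrative helper name (the real code inlines the f-string), and @ts / @hostsuffix are left as literal placeholders for the crawler to substitute later:

import re

_file_rx = re.compile(r"\W+")

def sanitize(string=""):
    # lowercase, then collapse each run of non-word characters to a single "-"
    return _file_rx.sub("-", string.lower())

def build_out_filename(config_name, user_name):
    # mirrors the out_filename f-string added in add_crawl_config below
    return f"{sanitize(config_name)}-{sanitize(user_name)}-@ts-@hostsuffix.wacz"

# build_out_filename("Weekly News", "Jane Doe")
#   -> "weekly-news-jane-doe-@ts-@hostsuffix.wacz"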
Ilya Kreymer 2022-03-15 18:18:22 -07:00
parent 4b2f89db91
commit e6467c3374
5 changed files with 57 additions and 34 deletions

View File

@ -6,6 +6,7 @@ from typing import List, Union, Optional
from enum import Enum
import uuid
import asyncio
import re
from datetime import datetime
import pymongo
@ -169,6 +170,7 @@ class CrawlConfigOps:
)
self.coll_ops = None
self._file_rx = re.compile("\\W+")
asyncio.create_task(self.init_index())
@ -182,6 +184,10 @@ class CrawlConfigOps:
""" set collection ops """
self.coll_ops = coll_ops
def sanitize(self, string=""):
""" sanitize string for use in wacz filename"""
return self._file_rx.sub("-", string.lower())
async def add_crawl_config(
self, config: CrawlConfigIn, archive: Archive, user: User
):
@ -216,8 +222,14 @@ class CrawlConfigOps:
crawlconfig = CrawlConfig.from_dict(data)
# pylint: disable=line-too-long
out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
new_name = await self.crawl_manager.add_crawl_config(
crawlconfig=crawlconfig,
storage=archive.storage,
run_now=config.runNow,
out_filename=out_filename,
)
return result, new_name
@ -234,10 +246,10 @@ class CrawlConfigOps:
raise HTTPException(status_code=400, detail="no_update_data")
# update schedule in crawl manager first
if update.schedule is not None or update.scale is not None:
try:
await self.crawl_manager.update_crawl_schedule_or_scale(
str(cid), update.schedule, update.scale
)
except Exception:
# pylint: disable=raise-missing-from
@ -443,14 +455,6 @@ def init_crawl_config_api(
):
return await ops.update_crawl_config(uuid.UUID(cid), update)
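A hypothetical usage sketch of the scale fix: with the combined check above, a caller can send a partial update containing only scale and the change is forwarded to the crawl manager. UpdateCrawlConfig is assumed to expose optional schedule and scale fields (matching the update.schedule / update.scale checks), and the helper name below is illustrative, not from this diff:

import uuid

async def set_crawl_scale(ops, cid: str, scale: int):
    # build a partial update; unset fields stay None and are skipped above
    update = UpdateCrawlConfig(scale=scale)
    # per the code above, this forwards to
    # update_crawl_schedule_or_scale(str(cid), None, scale)
    return await ops.update_crawl_config(uuid.UUID(cid), update)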
@router.post("/{cid}/run")
async def run_now(
cid: str,

View File

@ -422,6 +422,8 @@ class CrawlOps:
for crawl, (done, total) in zip(crawl_list, pairwise(results)):
crawl.stats = {"done": done, "found": total}
if total == 0 and done == 0 and crawl.state == "running":
crawl.state = "starting"
async def cache_ips(self, crawl: CrawlOut):
""" cache ips for ws auth check """

View File

@ -170,7 +170,8 @@ class DockerManager:
async def update_archive_storage(self, aid, userid, storage):
""" No storage kept for docker manager """
# pylint: disable=unused-argument
async def add_crawl_config(self, crawlconfig, storage, run_now, out_filename):
""" Add new crawl config """
cid = str(crawlconfig.id)
userid = str(crawlconfig.userid)
@ -212,7 +213,7 @@ class DockerManager:
return ""
async def update_crawl_schedule_or_scale(self, cid, schedule=None, scale=None):
""" Update the schedule for existing crawl config """
if schedule:

View File

@ -164,7 +164,7 @@ class K8SManager:
name=archive_storage_name, namespace=self.namespace, body=crawl_secret
)
async def add_crawl_config(self, crawlconfig, storage, run_now, out_filename):
"""add new crawl as cron job, store crawl config in configmap"""
cid = str(crawlconfig.id)
userid = str(crawlconfig.userid)
@ -209,6 +209,7 @@ class K8SManager:
storage_path,
labels,
annotations,
out_filename,
crawlconfig.crawlTimeout,
crawlconfig.scale,
)
@ -242,8 +243,8 @@ class K8SManager:
return ""
async def update_crawl_schedule_or_scale(self, cid, schedule=None, scale=None):
""" Update the schedule or scale for existing crawl config """
cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
@ -254,16 +255,25 @@ class K8SManager:
cron_job = cron_jobs.items[0]
updated = False
if schedule is not None:
real_schedule = schedule or DEFAULT_NO_SCHEDULE
if real_schedule != cron_job.spec.schedule:
cron_job.spec.schedule = real_schedule
cron_job.spec.suspend = not schedule
cron_job.spec.job_template.metadata.annotations[
"btrix.run.schedule"
] = schedule
updated = True
if scale is not None:
cron_job.spec.job_template.spec.parallelism = scale
updated = True
if updated:
await self.batch_beta_api.patch_namespaced_cron_job(
name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
)
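A hedged usage sketch of the new scale path in the k8s manager: a scale-only call leaves the schedule untouched and only bumps the job template's parallelism before patching the CronJob. The wrapper below is illustrative; only the method and spec fields it touches are taken from the code above:

async def rescale_crawl_config(k8s_manager, cid: str, scale: int):
    # schedule=None skips the schedule branch, so only
    # spec.job_template.spec.parallelism is updated before the CronJob is patched
    await k8s_manager.update_crawl_schedule_or_scale(cid, schedule=None, scale=scale)

Jobs created from the patched CronJob then run up to scale crawler pods in parallel.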
@ -397,17 +407,17 @@ class K8SManager:
if not status:
return None
pods = await self.core_api.list_namespaced_pod(
namespace=self.namespace,
label_selector=f"job-name={name},btrix.archive={aid}",
)
watch_ips = [pod.status.pod_ip for pod in pods.items if pod.status.pod_ip]
if status == "running" and not watch_ips:
status = "starting"
return self._make_crawl_for_job(job, status, False, CrawlOut, watch_ips)
# pylint: disable=broad-except
except Exception:
@ -517,7 +527,9 @@ class K8SManager:
return None
# pylint: disable=no-self-use
def _make_crawl_for_job(
self, job, state, finish_now=False, crawl_cls=Crawl, watch_ips=None
):
""" Make a crawl object from a job"""
return crawl_cls(
id=job.metadata.name,
@ -529,10 +541,11 @@ class K8SManager:
# schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
manual=job.metadata.annotations.get("btrix.run.manual") == "1",
started=job.status.start_time.replace(tzinfo=None),
watchIPs=watch_ips or [],
colls=json.loads(job.metadata.annotations.get("btrix.colls", [])),
finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)
if finish_now
else None,
)
async def _delete_job(self, name):
@ -669,6 +682,7 @@ class K8SManager:
storage_path,
labels,
annotations,
out_filename,
crawl_timeout,
parallel,
):
@ -731,7 +745,7 @@ class K8SManager:
{"name": "STORE_PATH", "value": storage_path},
{
"name": "STORE_FILENAME",
"value": "@ts-@hostname.wacz",
"value": out_filename,
},
],
"resources": resources,

View File

@ -14,6 +14,8 @@ import passlib.pwd
from fastapi import Request, Response, HTTPException, Depends, WebSocket
from fastapi.security import OAuth2PasswordBearer
from pymongo.errors import DuplicateKeyError
from fastapi_users import FastAPIUsers, models, BaseUserManager
from fastapi_users.manager import UserAlreadyExists
from fastapi_users.authentication import (
@ -174,7 +176,7 @@ class UserManager(BaseUserManager[UserCreate, UserDB]):
print(f"Super user {email} created", flush=True)
print(res, flush=True)
except (DuplicateKeyError, UserAlreadyExists):
print(f"User {email} already exists", flush=True)
async def on_after_register_custom(
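A self-contained sketch of the failure the broadened except clause guards against: fastapi-users raises UserAlreadyExists when its own lookup finds the user, but a racing init (for example two API workers starting at once) can get past that lookup and hit Mongo's duplicate-key protection instead, which surfaces as DuplicateKeyError. The helper below is illustrative and assumes the module's UserCreate model:

from pymongo.errors import DuplicateKeyError
from fastapi_users.manager import UserAlreadyExists

async def ensure_superuser(user_manager, email: str, password: str):
    # mirrors the pattern above: treat both exceptions as "already exists"
    try:
        res = await user_manager.create(
            UserCreate(email=email, password=password, is_superuser=True)
        )
        print(f"Super user {email} created", flush=True)
        print(res, flush=True)
    except (DuplicateKeyError, UserAlreadyExists):
        print(f"User {email} already exists", flush=True)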