Dynamically calculate crawl stats for crawlconfig endpoints (#623)

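Crawl workflow stats (crawlCount, lastCrawlId, lastCrawlTime, lastCrawlState) are no longer incremented on the crawl config document when a crawl completes. Instead, they are computed dynamically from the crawls collection whenever crawl configs are fetched, so the stats stay accurate when crawls are deleted. The fields remain on the CrawlConfig model to avoid a migration, and crawlAttemptCount is still stored and updated as before.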
Tessa Walsh 2023-02-22 22:17:45 -05:00 committed by GitHub
parent cbab425fec
commit 567e851235
4 changed files with 157 additions and 16 deletions


@@ -212,28 +212,15 @@ class CrawlJob(ABC):
         await self.update_crawl(state=state, finished=self.finished)

         if completed:
-            await self.inc_crawl_complete_stats(state)
+            await self.inc_crawl_complete_stats()

-    async def inc_crawl_complete_stats(self, state):
+    async def inc_crawl_complete_stats(self):
         """Increment Crawl Stats"""

         duration = int((self.finished - self.started).total_seconds())
         print(f"Duration: {duration}", flush=True)

-        # init crawl config stats
-        await self.crawl_configs.find_one_and_update(
-            {"_id": self.cid, "inactive": {"$ne": True}},
-            {
-                "$inc": {"crawlCount": 1},
-                "$set": {
-                    "lastCrawlId": self.job_id,
-                    "lastCrawlTime": self.finished,
-                    "lastCrawlState": state,
-                },
-            },
-        )
-
         # init org crawl stats
         yymm = datetime.utcnow().strftime("%Y-%m")
         await self.orgs.find_one_and_update(
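
With the per-config $inc/$set removed above, nothing writes crawlCount or the lastCrawl* fields at completion time anymore; only the org-level stats update remains. A toy sketch (not from the commit) of the failure mode this avoids: a stored counter survives deletions, while a count derived from the crawls themselves cannot drift.

    # Illustrative only: stored counters drift after deletions; derived counts do not.
    crawls = [{"id": "first", "finished": True}, {"id": "second", "finished": True}]
    stored_crawl_count = 2  # incremented once per completed crawl
    crawls = [c for c in crawls if c["id"] != "second"]  # the second crawl is deleted
    assert stored_crawl_count == 2  # stale: still counts the deleted crawl
    assert len(crawls) == 1  # derived dynamically: matches reality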


@@ -139,8 +139,11 @@ class CrawlConfig(BaseMongoModel):
     crawlAttemptCount: Optional[int] = 0

+    # These fields would ideally be in CrawlConfigOut, but are being
+    # kept here to prevent the need for a migration. Eventually, we
+    # may want to add a migration and move them, as these values are
+    # now generated dynamically in API endpoints as needed.
     crawlCount: Optional[int] = 0
     lastCrawlId: Optional[str]
     lastCrawlTime: Optional[datetime]
     lastCrawlState: Optional[str]
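
The comment above sketches a possible future cleanup. A hypothetical post-migration layout might look like the following; CrawlConfigOut appears elsewhere in this diff, but its definition and its relationship to CrawlConfig are assumptions here.

    # Hypothetical sketch, not part of this commit: derived, output-only fields
    # moved onto the response model after a migration drops them from storage.
    class CrawlConfigOut(CrawlConfig):
        currCrawlId: Optional[str]  # set per request from running crawls
        crawlCount: Optional[int] = 0  # computed per request, not stored
        lastCrawlId: Optional[str]
        lastCrawlTime: Optional[datetime]
        lastCrawlState: Optional[str]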
@@ -400,6 +403,7 @@ class CrawlConfigOps:
         configs = []
         for res in results:
             config = CrawlConfigOut.from_dict(res)
+            config = await self._annotate_with_crawl_stats(config)
             # pylint: disable=invalid-name
             config.currCrawlId = running.get(config.id)
             configs.append(config)
@@ -430,6 +434,25 @@ class CrawlConfigOps:

         return None

+    async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
+        """Annotate crawlconfig with information about associated crawls"""
+        crawls = await self.crawl_ops.list_crawls(cid=crawlconfig.id)
+
+        crawlconfig.crawlCount = len(crawls)
+
+        finished_crawls = [crawl for crawl in crawls if crawl.finished]
+        if not finished_crawls:
+            return crawlconfig
+
+        sorted_crawls = sorted(finished_crawls, key=lambda crawl: crawl.finished)
+        last_crawl = sorted_crawls[-1]
+        crawlconfig.lastCrawlId = str(last_crawl.id)
+        crawlconfig.lastCrawlTime = last_crawl.finished
+        crawlconfig.lastCrawlState = last_crawl.state
+
+        return crawlconfig
+
     async def get_crawl_config_out(self, cid: uuid.UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
         also include inactive crawl configs"""
@@ -455,6 +478,8 @@ class CrawlConfigOps:
                 crawlconfig.profileid, org
             )

+        crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
+
         return crawlconfig

     async def get_crawl_config(
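
Note that _annotate_with_crawl_stats issues one list_crawls query per config, so the list endpoint above does N+1 lookups. A hypothetical alternative (not in the commit) could derive the same stats for all configs in a single MongoDB aggregation; the cid, finished, and state fields are taken from this diff, while the collection handle and the $match choice are assumptions.

    # Hypothetical batch version, assuming a Motor collection of crawl documents.
    # Unlike the per-config code above, crawlCount here counts finished crawls only.
    async def crawl_stats_by_cid(crawls_collection):
        pipeline = [
            {"$match": {"finished": {"$ne": None}}},
            {"$sort": {"finished": 1}},
            {
                "$group": {
                    "_id": "$cid",
                    "crawlCount": {"$sum": 1},
                    "lastCrawlId": {"$last": "$_id"},
                    "lastCrawlTime": {"$last": "$finished"},
                    "lastCrawlState": {"$last": "$state"},
                }
            },
        ]
        return {doc["_id"]: doc async for doc in crawls_collection.aggregate(pipeline)}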


@@ -105,3 +105,48 @@ def crawl_id_wr_specs(admin_auth_headers, default_org_id):
         if data["state"] == "complete":
             return crawl_id
         time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_config_info(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Crawl config test",
+        "config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_config_id = data["added"]
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            break
+        time.sleep(5)
+
+    # Run second crawl from crawlconfig and return info when it finishes
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    second_crawl_id = data["started"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return (crawl_config_id, crawl_id, second_crawl_id)
+        time.sleep(5)
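
The fixture polls replay.json the same way for both crawls; a hypothetical helper (not in the commit, reusing conftest's existing requests/time imports and API_PREFIX) could factor the loop out:

    # Hypothetical refactor of the duplicated polling loops above.
    def wait_until_complete(org_id, crawl_id, headers, poll_seconds=5):
        """Poll the crawl's replay.json until its state is 'complete'."""
        while True:
            r = requests.get(
                f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
                headers=headers,
            )
            if r.json()["state"] == "complete":
                return
            time.sleep(poll_seconds)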


@@ -0,0 +1,84 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_config_info):
+    crawl_config_id, crawl_id, second_crawl_id = crawl_config_info
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    first_crawl_finished = data["finished"]
+    assert first_crawl_finished
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    second_crawl_finished = data["finished"]
+    assert second_crawl_finished
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 2
+    assert data["lastCrawlId"] == second_crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == second_crawl_finished
+
+    # Delete second crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [second_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 1
+    assert data["lastCrawlId"] == crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == first_crawl_finished
+
+    # Delete first crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 0
+    assert not data["lastCrawlId"]
+    assert not data["lastCrawlState"]
+    assert not data["lastCrawlTime"]
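
The assertions walk crawlCount down from 2 to 1 to 0 as crawls are deleted, with lastCrawlId/lastCrawlState/lastCrawlTime tracking the most recent remaining finished crawl, while crawlAttemptCount stays at 2 throughout, since it is still stored on the config document rather than derived. With the fixture added to conftest above, the test can be selected by name, e.g. python -m pytest -k test_crawlconfig_crawl_stats.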