diff --git a/backend/btrixcloud/crawl_job.py b/backend/btrixcloud/crawl_job.py
index 0064bc05..be0e945c 100644
--- a/backend/btrixcloud/crawl_job.py
+++ b/backend/btrixcloud/crawl_job.py
@@ -212,28 +212,15 @@ class CrawlJob(ABC):
         await self.update_crawl(state=state, finished=self.finished)
 
         if completed:
-            await self.inc_crawl_complete_stats(state)
+            await self.inc_crawl_complete_stats()
 
-    async def inc_crawl_complete_stats(self, state):
+    async def inc_crawl_complete_stats(self):
         """Increment Crawl Stats"""
 
         duration = int((self.finished - self.started).total_seconds())
 
         print(f"Duration: {duration}", flush=True)
 
-        # init crawl config stats
-        await self.crawl_configs.find_one_and_update(
-            {"_id": self.cid, "inactive": {"$ne": True}},
-            {
-                "$inc": {"crawlCount": 1},
-                "$set": {
-                    "lastCrawlId": self.job_id,
-                    "lastCrawlTime": self.finished,
-                    "lastCrawlState": state,
-                },
-            },
-        )
-
         # init org crawl stats
         yymm = datetime.utcnow().strftime("%Y-%m")
         await self.orgs.find_one_and_update(
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 06dc442c..461d7a18 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -139,8 +139,11 @@ class CrawlConfig(BaseMongoModel):
 
     crawlAttemptCount: Optional[int] = 0
 
+    # These fields would ideally be in CrawlConfigOut, but are being
+    # kept here to prevent the need for a migration. Eventually, we
+    # may want to add a migration and move them, as these values are
+    # now generated dynamically in API endpoints as needed.
     crawlCount: Optional[int] = 0
-    lastCrawlId: Optional[str]
     lastCrawlTime: Optional[datetime]
     lastCrawlState: Optional[str]
 
@@ -400,6 +403,7 @@ class CrawlConfigOps:
         configs = []
         for res in results:
             config = CrawlConfigOut.from_dict(res)
+            config = await self._annotate_with_crawl_stats(config)
             # pylint: disable=invalid-name
             config.currCrawlId = running.get(config.id)
             configs.append(config)
@@ -430,6 +434,25 @@ class CrawlConfigOps:
 
         return None
 
+    async def _annotate_with_crawl_stats(self, crawlconfig: CrawlConfigOut):
+        """Annotate crawlconfig with information about associated crawls"""
+        crawls = await self.crawl_ops.list_crawls(cid=crawlconfig.id)
+
+        crawlconfig.crawlCount = len(crawls)
+
+        finished_crawls = [crawl for crawl in crawls if crawl.finished]
+        if not finished_crawls:
+            return crawlconfig
+
+        sorted_crawls = sorted(finished_crawls, key=lambda crawl: crawl.finished)
+        last_crawl = sorted_crawls[-1]
+
+        crawlconfig.lastCrawlId = str(last_crawl.id)
+        crawlconfig.lastCrawlTime = last_crawl.finished
+        crawlconfig.lastCrawlState = last_crawl.state
+
+        return crawlconfig
+
     async def get_crawl_config_out(self, cid: uuid.UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl,
         if active also include inactive crawl configs"""
@@ -455,6 +478,8 @@ class CrawlConfigOps:
                 crawlconfig.profileid, org
             )
 
+        crawlconfig = await self._annotate_with_crawl_stats(crawlconfig)
+
         return crawlconfig
 
     async def get_crawl_config(
diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index dfa39a62..b876218a 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -105,3 +105,48 @@ def crawl_id_wr_specs(admin_auth_headers, default_org_id):
         if data["state"] == "complete":
             return crawl_id
         time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawl_config_info(admin_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Crawl config test",
+        "config": {"seeds": ["https://specs.webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_config_id = data["added"]
+    crawl_id = data["run_now_job"]
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            break
+        time.sleep(5)
+
+    # Run second crawl from crawlconfig and return info when it finishes
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
+        headers=admin_auth_headers,
+    )
+    data = r.json()
+    second_crawl_id = data["started"]
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return (crawl_config_id, crawl_id, second_crawl_id)
+        time.sleep(5)
diff --git a/backend/test_nightly/test_crawlconfig_crawl_stats.py b/backend/test_nightly/test_crawlconfig_crawl_stats.py
new file mode 100644
index 00000000..7a47cf40
--- /dev/null
+++ b/backend/test_nightly/test_crawlconfig_crawl_stats.py
@@ -0,0 +1,84 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_config_info):
+    crawl_config_id, crawl_id, second_crawl_id = crawl_config_info
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    first_crawl_finished = data["finished"]
+    assert first_crawl_finished
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    second_crawl_finished = data["finished"]
+    assert second_crawl_finished
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 2
+    assert data["lastCrawlId"] == second_crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == second_crawl_finished
+
+    # Delete second crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [second_crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 1
+    assert data["lastCrawlId"] == crawl_id
+    assert data["lastCrawlState"] == "complete"
+    assert data["lastCrawlTime"] == first_crawl_finished
+
+    # Delete first crawl
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
+        headers=admin_auth_headers,
+        json={"crawl_ids": [crawl_id]},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["deleted"]
+
+    # Verify crawl stats from /crawlconfigs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["crawlAttemptCount"] == 2
+    assert data["crawlCount"] == 0
+    assert not data["lastCrawlId"]
+    assert not data["lastCrawlState"]
+    assert not data["lastCrawlTime"]