From 9499ebfbbaee2567b1bf4fdadf8a0d1a76fa18f7 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Sat, 29 Jan 2022 12:08:02 -0800
Subject: [PATCH] Crawls API improvements (#117)

* crawls api improvements (fixes #110)
- add GET /crawls/{crawlid} api to return single crawl
- resolve crawlconfig name, add as `configName` to crawl model
- add 'created' date for crawlconfigs
- flatten list to single 'crawls' list, instead of separate 'finished' and 'running' (running crawls added first)
- include 'fileCount' and 'fileSize', remove files
- remove `files` from crawl list response, also remove `aid`
- remove `schedule` from crawl data altogether (available in crawl config)
- add ListCrawls response model
---
 backend/crawlconfigs.py |   6 ++-
 backend/crawls.py       | 117 ++++++++++++++++++++++++++++++++++------
 backend/k8sman.py       |  22 +++++++-
 3 files changed, 126 insertions(+), 19 deletions(-)

diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py
index bd4e568d..45806b5c 100644
--- a/backend/crawlconfigs.py
+++ b/backend/crawlconfigs.py
@@ -47,8 +47,6 @@ class RawCrawlConfig(BaseModel):

     seeds: List[Union[str, Seed]]

-    # collection: Optional[str] = "my-web-archive"
-
     scopeType: Optional[ScopeType] = ScopeType.PREFIX
     scope: Union[str, List[str], None] = ""
     exclude: Union[str, List[str], None] = ""
@@ -97,6 +95,8 @@ class CrawlConfig(BaseMongoModel):

     name: Optional[str]

+    created: Optional[datetime]
+
     colls: Optional[List[str]] = []

     crawlTimeout: Optional[int] = 0
@@ -169,6 +169,8 @@ class CrawlOps:

         result = await self.crawl_configs.insert_one(data)

+        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
         crawlconfig = CrawlConfig.from_dict(data)

         new_name = await self.crawl_manager.add_crawl_config(
diff --git a/backend/crawls.py b/backend/crawls.py
index 6825283b..528979d6 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -47,7 +47,7 @@ class Crawl(BaseMongoModel):
     aid: str
     cid: str

-    schedule: Optional[str]
+    # schedule: Optional[str]
     manual: Optional[bool]

     started: datetime
@@ -65,6 +65,38 @@ class Crawl(BaseMongoModel):
     colls: Optional[List[str]] = []


+# ============================================================================
+class ListCrawlOut(BaseMongoModel):
+    """ Crawl output model for list view """
+    id: str
+
+    user: str
+    username: Optional[str]
+
+    cid: str
+    configName: Optional[str]
+
+    manual: Optional[bool]
+
+    started: datetime
+    finished: Optional[datetime]
+
+    state: str
+
+    stats: Optional[Dict[str, str]]
+
+    fileSize: int = 0
+    fileCount: int = 0
+
+    colls: Optional[List[str]] = []
+
+
+# ============================================================================
+class ListCrawls(BaseModel):
+    """ Response model for list of crawls """
+    crawls: List[ListCrawlOut]
+
+
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
     """ Completed Crawl Webhook POST message """
@@ -183,25 +215,71 @@ class CrawlOps:
         if collid:
             query["colls"] = collid

-        cursor = self.crawls.find(query)
-        results = await cursor.to_list(length=1000)
-        return [Crawl.from_dict(res) for res in results]
+        # cursor = self.crawls.find(query)
+        cursor = self.crawls.aggregate(
+            [
+                {"$match": query},
+                {
+                    "$lookup": {
+                        "from": "crawl_configs",
+                        "localField": "cid",
+                        "foreignField": "_id",
+                        "as": "configName",
+                    },
+                },
+                {"$set": {"configName": {"$arrayElemAt": ["$configName.name", 0]}}},
+                {"$set": {"fileSize": {"$sum": "$files.size"}}},
+                {"$set": {"fileCount": {"$size": "$files"}}},
+                {"$unset": ["files"]},
+            ]
+        )

-    async def list_crawls(self, aid: str):
+        results = await cursor.to_list(length=1000)
+        return [ListCrawlOut.from_dict(res) for res in results]
+
+    async def list_crawls(self, archive: Archive):
         """ list finished and running crawl data """
-        running_crawls = await self.crawl_manager.list_running_crawls(aid=aid)
+        running_crawls = await self.crawl_manager.list_running_crawls(aid=archive.id)

         await self.get_redis_stats(running_crawls)

-        finished_crawls = await self.list_finished_crawls(aid=aid)
+        finished_crawls = await self.list_finished_crawls(aid=archive.id)

-        return {
-            "running": [
-                crawl.dict(exclude_none=True, exclude_unset=True)
-                for crawl in running_crawls
-            ],
-            "finished": finished_crawls,
-        }
+        crawls = []
+
+        for crawl in running_crawls:
+            list_crawl = ListCrawlOut(**crawl.dict())
+            crawls.append(await self._resolve_crawl(list_crawl, archive))
+
+        crawls.extend(finished_crawls)
+
+        return ListCrawls(crawls=crawls)
+
+    async def get_crawl(self, crawlid: str, archive: Archive):
+        """ Get data for single crawl """
+        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id)
+        if crawl:
+            await self.get_redis_stats([crawl])
+
+        else:
+            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
+            if not res:
+                raise HTTPException(
+                    status_code=404, detail=f"Crawl not found: {crawlid}"
+                )
+
+            crawl = Crawl.from_dict(res)
+
+        return await self._resolve_crawl(crawl, archive)
+
+    async def _resolve_crawl(self, crawl, archive):
+        """ Resolve running crawl data """
+        config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
+
+        if config:
+            crawl.configName = config.name
+
+        return crawl

     # pylint: disable=too-many-arguments
     async def get_redis_stats(self, crawl_list):
@@ -240,9 +318,9 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv

     archive_crawl_dep = archives.archive_crawl_dep

-    @app.get("/archives/{aid}/crawls", tags=["crawls"])
+    @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
     async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.list_crawls(archive.id)
+        return await ops.list_crawls(archive)

     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -304,6 +382,13 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv

         return {"deleted": res}

+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}",
+        tags=["crawls"],
+    )
+    async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl(crawl_id, archive)
+
     @app.get(
         "/archives/{aid}/crawls/{crawl_id}/running",
         tags=["crawls"],
diff --git a/backend/k8sman.py b/backend/k8sman.py
index 2df34234..4faccd5a 100644
--- a/backend/k8sman.py
+++ b/backend/k8sman.py
@@ -285,6 +285,26 @@ class K8SManager:
             if job.status.active
         ]

+    async def get_running_crawl(self, name, aid):
+        """Get running crawl (job) with given name, or none
+        if not found/not running"""
+        try:
+            job = await self.batch_api.read_namespaced_job(
+                name=name, namespace=self.namespace
+            )
+
+            if not job or job.metadata.labels["btrix.archive"] != aid:
+                return None
+
+            if job.status.active:
+                return self._make_crawl_for_job(job, "running")
+
+        # pylint: disable=broad-except
+        except Exception:
+            pass
+
+        return None
+
     async def init_crawl_screencast(self, crawl_id, aid):
         """ Init service for this job/crawl_id to support screencasting """
         labels = {"btrix.archive": aid}
@@ -475,7 +495,7 @@ class K8SManager:
             user=job.metadata.labels["btrix.user"],
             aid=job.metadata.labels["btrix.archive"],
             cid=job.metadata.labels["btrix.crawlconfig"],
-            schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
+            # schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
             manual=job.metadata.annotations.get("btrix.run.manual") == "1",
             started=job.status.start_time.replace(tzinfo=None),
             finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)
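
Example of consuming the updated API, as a minimal client sketch rather than part of the patch: it reads the flattened `crawls` list (running crawls first, each with `configName`, `fileCount` and `fileSize` resolved) and then fetches a single crawl via the new GET /archives/{aid}/crawls/{crawl_id} endpoint. The base URL, bearer token and archive id below are assumed placeholders, not values defined by this change.

    # Minimal client sketch (illustrative only, not part of the patch).
    # BASE_URL, TOKEN and AID are assumed placeholders for a running deployment.
    import requests

    BASE_URL = "http://localhost:8000"   # assumed backend address
    TOKEN = "<access token>"             # assumed bearer token from login
    AID = "<archive id>"                 # assumed archive id

    headers = {"Authorization": f"Bearer {TOKEN}"}

    # List crawls: the response is now a single flattened "crawls" array.
    resp = requests.get(f"{BASE_URL}/archives/{AID}/crawls", headers=headers)
    resp.raise_for_status()
    crawls = resp.json()["crawls"]

    for crawl in crawls:
        print(crawl["id"], crawl["state"], crawl.get("configName"),
              crawl["fileCount"], crawl["fileSize"])

    # Fetch a single crawl via the new GET /archives/{aid}/crawls/{crawl_id} api.
    if crawls:
        detail = requests.get(
            f"{BASE_URL}/archives/{AID}/crawls/{crawls[0]['id']}", headers=headers
        )
        detail.raise_for_status()
        print(detail.json())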