Crawls API improvements (#117)

* Crawls API improvements (fixes #110)
- add GET /crawls/{crawl_id} API to return a single crawl
- resolve crawlconfig name, add as `configName` to crawl model
- add 'created' date for crawlconfigs
- flatten response to a single 'crawls' list instead of separate 'finished' and 'running' lists (running crawls listed first)
- include 'fileCount' and 'fileSize'; remove `files` and `aid` from the crawl list response
- remove `schedule` from crawl data altogether (still available in the crawl config)
- add ListCrawls response model (see the response sketch below)
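
For reference, here is a minimal sketch of the flattened response described above: a single `crawls` list with running entries first, `configName` resolved, and `fileSize`/`fileCount` in place of `files`. All field values are illustrative, not taken from the commit.

```python
# Hypothetical GET /archives/{aid}/crawls response, per the new ListCrawls model.
example_response = {
    "crawls": [
        {   # running crawls are listed first
            "id": "manual-20220129-example",
            "user": "user-uuid",
            "cid": "config-uuid",
            "configName": "Example Config",  # resolved from the crawlconfig
            "manual": True,
            "started": "2022-01-29T20:00:00",
            "finished": None,
            "state": "running",
            "stats": {"found": "120", "done": "40"},
            "fileSize": 0,
            "fileCount": 0,
        },
        {   # finished crawls follow
            "id": "sched-20220128-example",
            "user": "user-uuid",
            "cid": "config-uuid",
            "configName": "Example Config",
            "manual": False,
            "started": "2022-01-28T10:00:00",
            "finished": "2022-01-28T11:00:00",
            "state": "complete",
            "fileSize": 1024000,  # summed from the files list, which is omitted
            "fileCount": 2,
        },
    ]
}
```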
Ilya Kreymer 2022-01-29 12:08:02 -08:00 committed by GitHub
parent 2636f33123
commit 9499ebfbba
3 changed files with 126 additions and 19 deletions


@@ -47,8 +47,6 @@ class RawCrawlConfig(BaseModel):
    seeds: List[Union[str, Seed]]

    # collection: Optional[str] = "my-web-archive"

    scopeType: Optional[ScopeType] = ScopeType.PREFIX
    scope: Union[str, List[str], None] = ""
    exclude: Union[str, List[str], None] = ""
@@ -97,6 +95,8 @@ class CrawlConfig(BaseMongoModel):
    name: Optional[str]

+    created: Optional[datetime]
+
    colls: Optional[List[str]] = []
    crawlTimeout: Optional[int] = 0
@@ -169,6 +169,8 @@ class CrawlOps:
        result = await self.crawl_configs.insert_one(data)

+        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
        crawlconfig = CrawlConfig.from_dict(data)

        new_name = await self.crawl_manager.add_crawl_config(
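
The new `created` field above is stamped with a normalized timestamp. A standalone sketch of that normalization, which drops sub-second precision and timezone info so the stored value serializes cleanly:

```python
from datetime import datetime

# Same normalization the commit applies to a crawlconfig's "created" date:
# naive UTC, truncated to whole seconds.
created = datetime.utcnow().replace(microsecond=0, tzinfo=None)
print(created.isoformat())  # e.g. 2022-01-29T20:08:02
```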


@@ -47,7 +47,7 @@ class Crawl(BaseMongoModel):
    aid: str
    cid: str

-    schedule: Optional[str]
+    # schedule: Optional[str]
    manual: Optional[bool]

    started: datetime
@@ -65,6 +65,38 @@
    colls: Optional[List[str]] = []


+# ============================================================================
+class ListCrawlOut(BaseMongoModel):
+    """ Crawl output model for list view """
+
+    id: str
+
+    user: str
+    username: Optional[str]
+
+    cid: str
+    configName: Optional[str]
+
+    manual: Optional[bool]
+    started: datetime
+    finished: Optional[datetime]
+
+    state: str
+
+    stats: Optional[Dict[str, str]]
+
+    fileSize: int = 0
+    fileCount: int = 0
+
+    colls: Optional[List[str]] = []
+
+
+# ============================================================================
+class ListCrawls(BaseModel):
+    """ Response model for list of crawls """
+
+    crawls: List[ListCrawlOut]


# ============================================================================
class CrawlCompleteIn(BaseModel):
    """ Completed Crawl Webhook POST message """
@@ -183,25 +215,71 @@ class CrawlOps:
        if collid:
            query["colls"] = collid

-        cursor = self.crawls.find(query)
-        results = await cursor.to_list(length=1000)
-        return [Crawl.from_dict(res) for res in results]
+        # cursor = self.crawls.find(query)
+        cursor = self.crawls.aggregate(
+            [
+                {"$match": query},
+                {
+                    "$lookup": {
+                        "from": "crawl_configs",
+                        "localField": "cid",
+                        "foreignField": "_id",
+                        "as": "configName",
+                    },
+                },
+                {"$set": {"configName": {"$arrayElemAt": ["$configName.name", 0]}}},
+                {"$set": {"fileSize": {"$sum": "$files.size"}}},
+                {"$set": {"fileCount": {"$size": "$files"}}},
+                {"$unset": ["files"]},
+            ]
+        )

-    async def list_crawls(self, aid: str):
+        results = await cursor.to_list(length=1000)
+        return [ListCrawlOut.from_dict(res) for res in results]
+
+    async def list_crawls(self, archive: Archive):
        """ list finished and running crawl data """
-        running_crawls = await self.crawl_manager.list_running_crawls(aid=aid)
+        running_crawls = await self.crawl_manager.list_running_crawls(aid=archive.id)

        await self.get_redis_stats(running_crawls)

-        finished_crawls = await self.list_finished_crawls(aid=aid)
+        finished_crawls = await self.list_finished_crawls(aid=archive.id)

-        return {
-            "running": [
-                crawl.dict(exclude_none=True, exclude_unset=True)
-                for crawl in running_crawls
-            ],
-            "finished": finished_crawls,
-        }
+        crawls = []
+
+        for crawl in running_crawls:
+            list_crawl = ListCrawlOut(**crawl.dict())
+            crawls.append(await self._resolve_crawl(list_crawl, archive))
+
+        crawls.extend(finished_crawls)
+
+        return ListCrawls(crawls=crawls)
+
+    async def get_crawl(self, crawlid: str, archive: Archive):
+        """ Get data for single crawl """
+        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id)
+        if crawl:
+            await self.get_redis_stats([crawl])
+
+        else:
+            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
+            if not res:
+                raise HTTPException(
+                    status_code=404, detail=f"Crawl not found: {crawlid}"
+                )
+
+            crawl = Crawl.from_dict(res)
+
+        return await self._resolve_crawl(crawl, archive)
+
+    async def _resolve_crawl(self, crawl, archive):
+        """ Resolve running crawl data """
+        config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
+
+        if config:
+            crawl.configName = config.name
+
+        return crawl

    # pylint: disable=too-many-arguments
    async def get_redis_stats(self, crawl_list):
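
The aggregation above replaces the earlier `find()`: `$lookup` joins each crawl to its crawlconfig by `cid`, `$arrayElemAt` unwraps the single joined document into a `configName` string, `$sum` and `$size` reduce the `files` array to totals, and `$unset` drops the array itself. A sketch of one document before and after the pipeline, with illustrative values:

```python
# Illustrative document as stored in the "crawls" collection:
crawl_doc = {
    "_id": "manual-20220128-example",
    "cid": "config-uuid",
    "state": "complete",
    "files": [{"size": 600000}, {"size": 424000}],
}

# Assuming the joined crawlconfig is {"_id": "config-uuid", "name": "Example Config"},
# the $set/$unset stages leave roughly this for ListCrawlOut.from_dict():
list_crawl_doc = {
    "_id": "manual-20220128-example",
    "cid": "config-uuid",
    "state": "complete",
    "configName": "Example Config",  # $arrayElemAt: ["$configName.name", 0]
    "fileSize": 1024000,             # $sum: "$files.size"
    "fileCount": 2,                  # $size: "$files"
    # "files" removed by $unset
}
```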
@@ -240,9 +318,9 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
    archive_crawl_dep = archives.archive_crawl_dep

-    @app.get("/archives/{aid}/crawls", tags=["crawls"])
+    @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
    async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.list_crawls(archive.id)
+        return await ops.list_crawls(archive)

    @app.post(
        "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -304,6 +382,13 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archives):
        return {"deleted": res}

+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}",
+        tags=["crawls"],
+    )
+    async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl(crawl_id, archive)
+
    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/running",
        tags=["crawls"],


@@ -285,6 +285,26 @@ class K8SManager:
            if job.status.active
        ]

+    async def get_running_crawl(self, name, aid):
+        """Get running crawl (job) with given name, or none
+        if not found/not running"""
+        try:
+            job = await self.batch_api.read_namespaced_job(
+                name=name, namespace=self.namespace
+            )
+
+            if not job or job.metadata.labels["btrix.archive"] != aid:
+                return None
+
+            if job.status.active:
+                return self._make_crawl_for_job(job, "running")
+
+        # pylint: disable=broad-except
+        except Exception:
+            pass
+
+        return None
+
    async def init_crawl_screencast(self, crawl_id, aid):
        """ Init service for this job/crawl_id to support screencasting """
        labels = {"btrix.archive": aid}
@@ -475,7 +495,7 @@ class K8SManager:
            user=job.metadata.labels["btrix.user"],
            aid=job.metadata.labels["btrix.archive"],
            cid=job.metadata.labels["btrix.crawlconfig"],
-            schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
+            # schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
            manual=job.metadata.annotations.get("btrix.run.manual") == "1",
            started=job.status.start_time.replace(tzinfo=None),
            finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)
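
`get_running_crawl` above reads a single Job by name and then verifies its `btrix.archive` label, so a crawl can never be fetched through another archive's route. A minimal sketch of that guard, assuming kubernetes_asyncio's async client; the helper name is hypothetical:

```python
from kubernetes_asyncio import client


async def job_for_archive(batch_api: client.BatchV1Api, name: str,
                          namespace: str, aid: str):
    """Read a Job by name; return it only if it belongs to archive `aid`
    and is still active, mirroring the ownership check above."""
    try:
        job = await batch_api.read_namespaced_job(name=name, namespace=namespace)
    except client.exceptions.ApiException:
        return None  # not found or API error: treat as not running

    if job.metadata.labels.get("btrix.archive") != aid:
        return None  # exists, but owned by a different archive

    return job if job.status.active else None
```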