Crawls API improvements (#117)
* crawls api improvements (fixes #110)
  - add GET /crawls/{crawlid} api to return single crawl
  - resolve crawlconfig name, add as `configName` to crawl model
  - add 'created' date for crawlconfigs
  - flatten list to single 'crawls' list, instead of separate 'finished' and 'running' (running crawls added first)
  - include 'fileCount' and 'fileSize', remove files
  - remove `files` from crawl list response, also remove `aid`
  - remove `schedule` from crawl data altogether (available in crawl config)
  - add ListCrawls response model (see the example response sketch below)
parent 2636f33123
commit 9499ebfbba
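For reference, a rough sketch (not part of this commit) of the flattened GET /archives/{aid}/crawls response implied by the new ListCrawlOut / ListCrawls models in the diff below; every value is an illustrative placeholder:

# Hypothetical example of the flattened ListCrawls response body;
# field names follow ListCrawlOut, values are placeholders only.
example_list_crawls_response = {
    "crawls": [
        {
            "id": "example-crawl-id",
            "user": "example-user-id",
            "username": "example-user",
            "cid": "example-crawlconfig-id",
            "configName": "Example Crawl Config",  # resolved from the crawl config
            "manual": True,
            "started": "2021-11-01T12:00:00",
            "finished": "2021-11-01T12:30:00",  # unset while a crawl is still running
            "state": "complete",
            "stats": {"done": "10", "found": "10"},
            "fileSize": 1024,
            "fileCount": 1,
            "colls": [],
        }
    ]
}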
@@ -47,8 +47,6 @@ class RawCrawlConfig(BaseModel):

    seeds: List[Union[str, Seed]]

    # collection: Optional[str] = "my-web-archive"

    scopeType: Optional[ScopeType] = ScopeType.PREFIX
    scope: Union[str, List[str], None] = ""
    exclude: Union[str, List[str], None] = ""
@@ -97,6 +95,8 @@ class CrawlConfig(BaseMongoModel):

    name: Optional[str]

    created: Optional[datetime]

    colls: Optional[List[str]] = []

    crawlTimeout: Optional[int] = 0
@@ -169,6 +169,8 @@ class CrawlOps:

        result = await self.crawl_configs.insert_one(data)

        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)

        crawlconfig = CrawlConfig.from_dict(data)

        new_name = await self.crawl_manager.add_crawl_config(
@@ -47,7 +47,7 @@ class Crawl(BaseMongoModel):
    aid: str
    cid: str

    schedule: Optional[str]
    # schedule: Optional[str]
    manual: Optional[bool]

    started: datetime
@@ -65,6 +65,38 @@ class Crawl(BaseMongoModel):
    colls: Optional[List[str]] = []


# ============================================================================
class ListCrawlOut(BaseMongoModel):
    """ Crawl output model for list view """
    id: str

    user: str
    username: Optional[str]

    cid: str
    configName: Optional[str]

    manual: Optional[bool]

    started: datetime
    finished: Optional[datetime]

    state: str

    stats: Optional[Dict[str, str]]

    fileSize: int = 0
    fileCount: int = 0

    colls: Optional[List[str]] = []


# ============================================================================
class ListCrawls(BaseModel):
    """ Response model for list of crawls """
    crawls: List[ListCrawlOut]


# ============================================================================
class CrawlCompleteIn(BaseModel):
    """ Completed Crawl Webhook POST message """
@@ -183,25 +215,71 @@ class CrawlOps:
        if collid:
            query["colls"] = collid

        cursor = self.crawls.find(query)
        results = await cursor.to_list(length=1000)
        return [Crawl.from_dict(res) for res in results]
        # cursor = self.crawls.find(query)
        cursor = self.crawls.aggregate(
            [
                {"$match": query},
                {
                    "$lookup": {
                        "from": "crawl_configs",
                        "localField": "cid",
                        "foreignField": "_id",
                        "as": "configName",
                    },
                },
                {"$set": {"configName": {"$arrayElemAt": ["$configName.name", 0]}}},
                {"$set": {"fileSize": {"$sum": "$files.size"}}},
                {"$set": {"fileCount": {"$size": "$files"}}},
                {"$unset": ["files"]},
            ]
        )

    async def list_crawls(self, aid: str):
        results = await cursor.to_list(length=1000)
        return [ListCrawlOut.from_dict(res) for res in results]

    async def list_crawls(self, archive: Archive):
        """ list finished and running crawl data """
        running_crawls = await self.crawl_manager.list_running_crawls(aid=aid)
        running_crawls = await self.crawl_manager.list_running_crawls(aid=archive.id)

        await self.get_redis_stats(running_crawls)

        finished_crawls = await self.list_finished_crawls(aid=aid)
        finished_crawls = await self.list_finished_crawls(aid=archive.id)

        return {
            "running": [
                crawl.dict(exclude_none=True, exclude_unset=True)
                for crawl in running_crawls
            ],
            "finished": finished_crawls,
        }
        crawls = []

        for crawl in running_crawls:
            list_crawl = ListCrawlOut(**crawl.dict())
            crawls.append(await self._resolve_crawl(list_crawl, archive))

        crawls.extend(finished_crawls)

        return ListCrawls(crawls=crawls)

    async def get_crawl(self, crawlid: str, archive: Archive):
        """ Get data for single crawl """
        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id)
        if crawl:
            await self.get_redis_stats([crawl])

        else:
            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
            if not res:
                raise HTTPException(
                    status_code=404, detail=f"Crawl not found: {crawlid}"
                )

            crawl = Crawl.from_dict(res)

        return await self._resolve_crawl(crawl, archive)

    async def _resolve_crawl(self, crawl, archive):
        """ Resolve running crawl data """
        config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)

        if config:
            crawl.configName = config.name

        return crawl

    # pylint: disable=too-many-arguments
    async def get_redis_stats(self, crawl_list):
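In the aggregation above, $lookup joins each crawl to its crawl config by cid, $arrayElemAt takes the single matched config's name as configName, the $sum / $size stages compute fileSize and fileCount, and $unset drops the raw files array from the list response. A minimal before/after sketch of a crawl document, using only field names from this diff and placeholder values:

# Hypothetical crawl document as stored in Mongo (placeholder values)
stored_crawl = {
    "_id": "example-crawl-id",
    "cid": "example-crawlconfig-id",
    "files": [{"size": 600}, {"size": 424}],
}

# Approximate shape after the aggregation stages above:
# files removed, configName resolved from the joined crawl_configs document
aggregated_crawl = {
    "_id": "example-crawl-id",
    "cid": "example-crawlconfig-id",
    "configName": "Example Crawl Config",
    "fileSize": 1024,  # $sum of files.size
    "fileCount": 2,    # $size of files
}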
@@ -240,9 +318,9 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv

    archive_crawl_dep = archives.archive_crawl_dep

    @app.get("/archives/{aid}/crawls", tags=["crawls"])
    @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
    async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
        return await ops.list_crawls(archive.id)
        return await ops.list_crawls(archive)

    @app.post(
        "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -304,6 +382,13 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv

        return {"deleted": res}

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}",
        tags=["crawls"],
    )
    async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
        return await ops.get_crawl(crawl_id, archive)

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/running",
        tags=["crawls"],
@@ -285,6 +285,26 @@ class K8SManager:
            if job.status.active
        ]

    async def get_running_crawl(self, name, aid):
        """Get running crawl (job) with given name, or none
        if not found/not running"""
        try:
            job = await self.batch_api.read_namespaced_job(
                name=name, namespace=self.namespace
            )

            if not job or job.metadata.labels["btrix.archive"] != aid:
                return None

            if job.status.active:
                return self._make_crawl_for_job(job, "running")

        # pylint: disable=broad-except
        except Exception:
            pass

        return None

    async def init_crawl_screencast(self, crawl_id, aid):
        """ Init service for this job/crawl_id to support screencasting """
        labels = {"btrix.archive": aid}
@@ -475,7 +495,7 @@ class K8SManager:
            user=job.metadata.labels["btrix.user"],
            aid=job.metadata.labels["btrix.archive"],
            cid=job.metadata.labels["btrix.crawlconfig"],
            schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
            # schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
            manual=job.metadata.annotations.get("btrix.run.manual") == "1",
            started=job.status.start_time.replace(tzinfo=None),
            finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)