Crawls API improvements (#117)
* crawls api improvements (fixes #110)

  - add GET /crawls/{crawlid} api to return a single crawl
  - resolve crawlconfig name, add as `configName` to crawl model
  - add 'created' date for crawlconfigs
  - flatten list to a single 'crawls' list, instead of separate 'finished' and 'running' (running crawls added first)
  - include 'fileCount' and 'fileSize', remove files
  - remove `files` from crawl list response, also remove `aid`
  - remove `schedule` from crawl data altogether (available in the crawl config)
  - add ListCrawls response model
parent 2636f33123
commit 9499ebfbba
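For orientation, here is a sketch of the flattened response shape that GET /archives/{aid}/crawls produces after this change, following the new ListCrawls/ListCrawlOut models in the diff below. All ids and values are invented placeholders:

# Illustrative response body (all values invented) for:
#   GET /archives/{aid}/crawls
example_list_response = {
    "crawls": [
        {   # running crawls come first
            "id": "manual-20211025-abc123",
            "user": "user-uuid",
            "cid": "config-uuid",
            "configName": "example config",
            "manual": True,
            "started": "2021-10-25T18:00:00",
            "state": "running",
            "fileCount": 0,
            "fileSize": 0,
        },
        {   # finished crawls follow, with `files` summarized, not listed
            "id": "sched-20211024-def456",
            "user": "user-uuid",
            "cid": "config-uuid",
            "configName": "example config",
            "started": "2021-10-24T12:00:00",
            "finished": "2021-10-24T12:30:00",
            "state": "complete",
            "fileCount": 1,
            "fileSize": 482304,
        },
    ]
}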
@@ -47,8 +47,6 @@ class RawCrawlConfig(BaseModel):
 
     seeds: List[Union[str, Seed]]
 
-    # collection: Optional[str] = "my-web-archive"
-
     scopeType: Optional[ScopeType] = ScopeType.PREFIX
     scope: Union[str, List[str], None] = ""
     exclude: Union[str, List[str], None] = ""
@@ -97,6 +95,8 @@ class CrawlConfig(BaseMongoModel):
 
     name: Optional[str]
 
+    created: Optional[datetime]
+
     colls: Optional[List[str]] = []
 
     crawlTimeout: Optional[int] = 0
@@ -169,6 +169,8 @@ class CrawlOps:
 
         result = await self.crawl_configs.insert_one(data)
 
+        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
+
         crawlconfig = CrawlConfig.from_dict(data)
 
         new_name = await self.crawl_manager.add_crawl_config(
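A side note on the timestamp expression added above: truncating to whole seconds and dropping tzinfo keeps the stored value as naive UTC at a precision that survives the round trip through BSON (which stores datetimes at millisecond precision) and JSON. A minimal sketch:

from datetime import datetime

# Same expression as the added line above: naive UTC, whole-second precision
created = datetime.utcnow().replace(microsecond=0, tzinfo=None)
print(created.isoformat())  # e.g. "2021-10-25T18:00:00" -- no microseconds, no offset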
@@ -47,7 +47,7 @@ class Crawl(BaseMongoModel):
     aid: str
     cid: str
 
-    schedule: Optional[str]
+    # schedule: Optional[str]
     manual: Optional[bool]
 
     started: datetime
@@ -65,6 +65,38 @@ class Crawl(BaseMongoModel):
     colls: Optional[List[str]] = []
 
 
+# ============================================================================
+class ListCrawlOut(BaseMongoModel):
+    """ Crawl output model for list view """
+
+    id: str
+
+    user: str
+    username: Optional[str]
+
+    cid: str
+    configName: Optional[str]
+
+    manual: Optional[bool]
+
+    started: datetime
+    finished: Optional[datetime]
+
+    state: str
+
+    stats: Optional[Dict[str, str]]
+
+    fileSize: int = 0
+    fileCount: int = 0
+
+    colls: Optional[List[str]] = []
+
+
+# ============================================================================
+class ListCrawls(BaseModel):
+    """ Response model for list of crawls """
+
+    crawls: List[ListCrawlOut]
+
+
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
     """ Completed Crawl Webhook POST message """
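One detail these models rely on: pydantic v1 (the version FastAPI apps used at the time) ignores unexpected keyword arguments by default, which is what lets the list code further down down-cast a full Crawl with ListCrawlOut(**crawl.dict()). A self-contained sketch, with plain BaseModel standing in for the project's BaseMongoModel and a cut-down field set:

from datetime import datetime
from typing import Optional

from pydantic import BaseModel


class ListCrawlOutSketch(BaseModel):
    """ Cut-down stand-in for ListCrawlOut above """

    id: str
    user: str
    cid: str
    configName: Optional[str]
    started: datetime
    state: str
    fileSize: int = 0
    fileCount: int = 0


full_crawl = {
    "id": "manual-20211025-abc123",
    "user": "user-uuid",
    "cid": "config-uuid",
    "started": datetime.utcnow(),
    "state": "running",
    "files": [],      # present on Crawl, silently dropped by the list model
    "schedule": "",   # no longer part of crawl output, likewise dropped
}

slim = ListCrawlOutSketch(**full_crawl)
print(slim.json(exclude_none=True))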
@@ -183,25 +215,71 @@ class CrawlOps:
         if collid:
             query["colls"] = collid
 
-        cursor = self.crawls.find(query)
-        results = await cursor.to_list(length=1000)
-        return [Crawl.from_dict(res) for res in results]
+        # cursor = self.crawls.find(query)
+        cursor = self.crawls.aggregate(
+            [
+                {"$match": query},
+                {
+                    "$lookup": {
+                        "from": "crawl_configs",
+                        "localField": "cid",
+                        "foreignField": "_id",
+                        "as": "configName",
+                    },
+                },
+                {"$set": {"configName": {"$arrayElemAt": ["$configName.name", 0]}}},
+                {"$set": {"fileSize": {"$sum": "$files.size"}}},
+                {"$set": {"fileCount": {"$size": "$files"}}},
+                {"$unset": ["files"]},
+            ]
+        )
+
+        results = await cursor.to_list(length=1000)
+        return [ListCrawlOut.from_dict(res) for res in results]
 
-    async def list_crawls(self, aid: str):
+    async def list_crawls(self, archive: Archive):
         """ list finished and running crawl data """
-        running_crawls = await self.crawl_manager.list_running_crawls(aid=aid)
+        running_crawls = await self.crawl_manager.list_running_crawls(aid=archive.id)
 
         await self.get_redis_stats(running_crawls)
 
-        finished_crawls = await self.list_finished_crawls(aid=aid)
+        finished_crawls = await self.list_finished_crawls(aid=archive.id)
 
-        return {
-            "running": [
-                crawl.dict(exclude_none=True, exclude_unset=True)
-                for crawl in running_crawls
-            ],
-            "finished": finished_crawls,
-        }
+        crawls = []
+
+        for crawl in running_crawls:
+            list_crawl = ListCrawlOut(**crawl.dict())
+            crawls.append(await self._resolve_crawl(list_crawl, archive))
+
+        crawls.extend(finished_crawls)
+
+        return ListCrawls(crawls=crawls)
+
+    async def get_crawl(self, crawlid: str, archive: Archive):
+        """ Get data for single crawl """
+        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id)
+        if crawl:
+            await self.get_redis_stats([crawl])
+
+        else:
+            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
+            if not res:
+                raise HTTPException(
+                    status_code=404, detail=f"Crawl not found: {crawlid}"
+                )
+
+            crawl = Crawl.from_dict(res)
+
+        return await self._resolve_crawl(crawl, archive)
+
+    async def _resolve_crawl(self, crawl, archive):
+        """ Resolve running crawl data """
+        config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
+
+        if config:
+            crawl.configName = config.name
+
+        return crawl
 
     # pylint: disable=too-many-arguments
     async def get_redis_stats(self, crawl_list):
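For reference, the new $lookup pipeline can be exercised against MongoDB directly. A minimal sketch using motor; the connection string, database name, and the exact $match filter are assumptions for illustration (the real query is built up earlier in the method). Note that the $set/$unset pipeline stages require MongoDB 4.2+:

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def list_finished_crawls_sketch(aid: str):
    # Assumed connection string and db name, for illustration only
    db = AsyncIOMotorClient("mongodb://localhost:27017")["browsertrixcloud"]

    cursor = db["crawls"].aggregate(
        [
            {"$match": {"aid": aid}},
            # join each crawl to its crawl config by id...
            {
                "$lookup": {
                    "from": "crawl_configs",
                    "localField": "cid",
                    "foreignField": "_id",
                    "as": "configName",
                }
            },
            # ...then collapse the joined array to just the config name
            {"$set": {"configName": {"$arrayElemAt": ["$configName.name", 0]}}},
            # summarize files server-side instead of shipping them to clients
            {"$set": {"fileSize": {"$sum": "$files.size"}}},
            {"$set": {"fileCount": {"$size": "$files"}}},
            {"$unset": ["files"]},
        ]
    )
    for doc in await cursor.to_list(length=1000):
        print(doc["_id"], doc.get("configName"), doc["fileCount"], doc["fileSize"])


asyncio.run(list_finished_crawls_sketch("archive-uuid"))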
@@ -240,9 +318,9 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
 
     archive_crawl_dep = archives.archive_crawl_dep
 
-    @app.get("/archives/{aid}/crawls", tags=["crawls"])
+    @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
     async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.list_crawls(archive.id)
+        return await ops.list_crawls(archive)
 
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -304,6 +382,13 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
 
         return {"deleted": res}
 
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}",
+        tags=["crawls"],
+    )
+    async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl(crawl_id, archive)
+
     @app.get(
         "/archives/{aid}/crawls/{crawl_id}/running",
         tags=["crawls"],
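Client-side, the list endpoint and the new single-crawl endpoint can be exercised as below; the base URL, archive id, crawl id, and bearer token are placeholders, not values from this change:

import httpx

BASE = "http://localhost:8000"               # assumed deployment address
HEADERS = {"Authorization": "Bearer TOKEN"}  # placeholder auth token

# One flat list now, running crawls first (no separate running/finished keys)
resp = httpx.get(f"{BASE}/archives/AID/crawls", headers=HEADERS)
resp.raise_for_status()
for crawl in resp.json()["crawls"]:
    print(crawl["id"], crawl["state"], crawl.get("configName"))

# New in this change: fetch a single crawl, running or finished; 404 otherwise
one = httpx.get(f"{BASE}/archives/AID/crawls/CRAWL_ID", headers=HEADERS)
print(one.status_code, one.json())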
@@ -285,6 +285,26 @@ class K8SManager:
             if job.status.active
         ]
 
+    async def get_running_crawl(self, name, aid):
+        """Get running crawl (job) with given name, or none
+        if not found/not running"""
+        try:
+            job = await self.batch_api.read_namespaced_job(
+                name=name, namespace=self.namespace
+            )
+
+            if not job or job.metadata.labels["btrix.archive"] != aid:
+                return None
+
+            if job.status.active:
+                return self._make_crawl_for_job(job, "running")
+
+        # pylint: disable=broad-except
+        except Exception:
+            pass
+
+        return None
+
     async def init_crawl_screencast(self, crawl_id, aid):
         """ Init service for this job/crawl_id to support screencasting """
         labels = {"btrix.archive": aid}
@@ -475,7 +495,7 @@ class K8SManager:
             user=job.metadata.labels["btrix.user"],
             aid=job.metadata.labels["btrix.archive"],
             cid=job.metadata.labels["btrix.crawlconfig"],
-            schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
+            # schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
             manual=job.metadata.annotations.get("btrix.run.manual") == "1",
             started=job.status.start_time.replace(tzinfo=None),
             finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)