Add single crawl info API at /crawls/{crawl_id} (#418)

* backend: crawl info APIs:
- add a /crawls/{crawl_id} endpoint that returns just the crawl info, without resolving the individual files
- move /crawls/{crawl_id}.json -> /crawls/{crawl_id}/replay.json for clarity that it is used for replay

* frontend: update API calls to use the new replay.json endpoint
Ilya Kreymer, 2022-12-19 14:54:48 -08:00 (committed by GitHub)
parent ad07b6ab43
commit dfca09fc9c
3 changed files with 46 additions and 5 deletions
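
For reference, a minimal sketch of how a client might call the two endpoints after this change (not part of the commit; the base URL, ids, and token are hypothetical placeholders, while the field names come from the tests and frontend code below):

import requests

# hypothetical placeholder values, not from this commit
api_prefix = "https://btrix.example.com/api"
archive_id = "<archive-uuid>"
crawl_id = "<crawl-id>"
headers = {"Authorization": "Bearer <access-token>"}

# new endpoint: crawl info only, individual files are not resolved
r = requests.get(
    f"{api_prefix}/archives/{archive_id}/crawls/{crawl_id}",
    headers=headers,
)
print(r.json()["fileSize"])

# renamed endpoint: full crawl record with resolved resources, used for replay
r = requests.get(
    f"{api_prefix}/archives/{archive_id}/crawls/{crawl_id}/replay.json",
    headers=headers,
)
print(r.json()["resources"][0]["size"])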


@@ -173,6 +173,7 @@ class CrawlOps:
         archive: Optional[Archive] = None,
         cid: uuid.UUID = None,
         collid: uuid.UUID = None,
+        crawl_id: str = None,
         exclude_files=True,
         running_only=False,
     ):
@@ -193,6 +194,9 @@ class CrawlOps:
         if running_only:
             query["state"] = {"$in": ["running", "starting", "stopping"]}

+        if crawl_id:
+            query["_id"] = crawl_id
+
         # pylint: disable=duplicate-code
         aggregate = [
             {"$match": query},
@@ -613,7 +617,7 @@ def init_crawls_api(
         return {"deleted": res}

     @app.get(
-        "/archives/all/crawls/{crawl_id}.json",
+        "/archives/all/crawls/{crawl_id}/replay.json",
         tags=["crawls"],
         response_model=CrawlOut,
     )
@@ -624,13 +628,43 @@ def init_crawls_api(
         return await ops.get_crawl(crawl_id, None)

     @app.get(
-        "/archives/{aid}/crawls/{crawl_id}.json",
+        "/archives/{aid}/crawls/{crawl_id}/replay.json",
         tags=["crawls"],
         response_model=CrawlOut,
     )
     async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
         return await ops.get_crawl(crawl_id, archive)

+    @app.get(
+        "/archives/all/crawls/{crawl_id}",
+        tags=["crawls"],
+        response_model=ListCrawlOut,
+    )
+    async def list_single_crawl_admin(crawl_id, user: User = Depends(user_dep)):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        crawls = await ops.list_crawls(crawl_id=crawl_id)
+        print("crawls", crawls)
+        if len(crawls) < 1:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
+        return crawls[0]
+
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}",
+        tags=["crawls"],
+        response_model=ListCrawlOut,
+    )
+    async def list_single_crawl(
+        crawl_id, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        crawls = await ops.list_crawls(archive, crawl_id=crawl_id)
+        if len(crawls) < 1:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
+        return crawls[0]
+
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/scale",
         tags=["crawls"],

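Note that the new single-crawl endpoints fail loudly rather than returning empty results: the /archives/all/ variant requires a superuser token (403 otherwise), and an unknown crawl_id yields a 404 with detail "crawl_not_found". A short sketch of handling both cases, reusing the hypothetical placeholders from the sketch above:

import requests

# hypothetical placeholders, as in the earlier sketch
api_prefix = "https://btrix.example.com/api"
headers = {"Authorization": "Bearer <access-token>"}

# superuser-only variant that looks up a crawl across all archives
r = requests.get(f"{api_prefix}/archives/all/crawls/<crawl-id>", headers=headers)
if r.status_code == 403:
    print("requires a superuser token")  # detail: "Not Allowed"
elif r.status_code == 404:
    print("no crawl with that id")  # detail: "crawl_not_found"
else:
    print(r.json()["fileSize"])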

@@ -81,7 +81,7 @@ def test_wait_for_complete():
     while True:
         r = requests.get(
-            f"{api_prefix}/archives/{archive_id}/crawls/{crawl_id}.json",
+            f"{api_prefix}/archives/{archive_id}/crawls/{crawl_id}/replay.json",
             headers=headers,
         )
         data = r.json()
@@ -105,6 +105,13 @@ def test_wait_for_complete():
     wacz_size = data["resources"][0]["size"]
     wacz_hash = data["resources"][0]["hash"]


+def test_crawl_info():
+    r = requests.get(
+        f"{api_prefix}/archives/{archive_id}/crawls/{crawl_id}",
+        headers=headers,
+    )
+    data = r.json()
+    assert data["fileSize"] == wacz_size


 def test_download_wacz():
     r = requests.get(host_prefix + wacz_path)


@@ -560,7 +560,7 @@ export class CrawlDetail extends LiteElement {
    const bearer = this.authState?.headers?.Authorization?.split(" ", 2)[1];

    // for now, just use the first file until multi-wacz support is fully implemented
-    const replaySource = `/api/archives/${this.crawl?.aid}/crawls/${this.crawlId}.json?auth_bearer=${bearer}`;
+    const replaySource = `/api/archives/${this.crawl?.aid}/crawls/${this.crawlId}/replay.json?auth_bearer=${bearer}`;
    //const replaySource = this.crawl?.resources?.[0]?.path;

    const canReplay = replaySource && this.hasFiles;
@@ -881,7 +881,7 @@ export class CrawlDetail extends LiteElement {
  private async getCrawl(): Promise<Crawl> {
    const data: Crawl = await this.apiFetch(
-      `${this.crawlsAPIBaseUrl || this.crawlsBaseUrl}/${this.crawlId}.json`,
+      `${this.crawlsAPIBaseUrl || this.crawlsBaseUrl}/${this.crawlId}/replay.json`,
      this.authState!
    );