Add API endpoints for crawl statistics (#1461)
Fixes #1158

Introduces two new API endpoints that stream crawl statistics as CSV (with a suggested attachment filename header):

- `GET /api/orgs/all/crawls/stats` - crawls from all orgs (superuser only)
- `GET /api/orgs/{oid}/crawls/stats` - crawls from a single org (available to org crawler/admin users as well as superusers)

Also includes tests for both endpoints.
This commit is contained in: parent 99dd9b4acb, commit 38a01860b8
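For orientation, here is a minimal client-side sketch of consuming the org-scoped endpoint, assuming bearer-token auth as used by the test fixtures; `API_BASE`, `ORG_ID`, and `TOKEN` are placeholders, not part of this change. It mirrors the `csv.DictReader` approach the new tests use:

```python
# Illustrative only: download and parse the org-level crawl stats CSV.
# API_BASE, ORG_ID, and TOKEN are placeholders for a real deployment and auth token.
import codecs
import csv

import requests

API_BASE = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
TOKEN = "<api-token>"

with requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/crawls/stats",
    headers={"Authorization": f"Bearer {TOKEN}"},
    stream=True,
) as resp:
    resp.raise_for_status()
    # Decode the byte stream line by line and read it as CSV rows.
    for row in csv.DictReader(codecs.iterdecode(resp.iter_lines(), "utf-8")):
        print(row["id"], row["state"], row["pages"], row["duration"])
```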
Crawls API module (CrawlOps):

@@ -6,7 +6,7 @@ import re
 import urllib.parse
 from uuid import UUID

-from typing import Optional, List
+from typing import Optional, List, Dict, Union

 from fastapi import Depends, HTTPException
 from fastapi.responses import StreamingResponse

@@ -14,7 +14,7 @@ from redis import asyncio as exceptions
 import pymongo

 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
-from .utils import dt_now, parse_jsonl_error_messages
+from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv
 from .basecrawls import BaseCrawlOps
 from .models import (
     UpdateCrawl,
@@ -497,6 +497,74 @@ class CrawlOps(BaseCrawlOps):
         except Exception:
             return [], 0

+    async def get_crawl_stats(
+        self, org: Optional[Organization] = None
+    ) -> List[Dict[str, Union[str, int]]]:
+        """Return crawl statistics"""
+        # pylint: disable=too-many-locals
+        org_slugs = await self.orgs.get_org_slugs_by_ids()
+        user_emails = await self.user_manager.get_user_emails_by_ids()
+
+        crawls_data: List[Dict[str, Union[str, int]]] = []
+
+        query: Dict[str, Union[str, UUID]] = {"type": "crawl"}
+        if org:
+            query["oid"] = org.id
+
+        async for crawl in self.crawls.find(query):
+            data: Dict[str, Union[str, int]] = {}
+            data["id"] = str(crawl.get("_id"))
+
+            oid = crawl.get("oid")
+            data["oid"] = str(oid)
+            data["org"] = org_slugs[oid]
+
+            data["cid"] = str(crawl.get("cid"))
+            crawl_name = crawl.get("name")
+            data["name"] = f'"{crawl_name}"' if crawl_name else ""
+            data["state"] = crawl.get("state")
+
+            userid = crawl.get("userid")
+            data["userid"] = str(userid)
+            data["user"] = user_emails.get(userid)
+
+            started = crawl.get("started")
+            finished = crawl.get("finished")
+
+            data["started"] = str(started)
+            data["finished"] = str(finished)
+
+            data["duration"] = 0
+            if started and finished:
+                duration = finished - started
+                duration_seconds = int(duration.total_seconds())
+                if duration_seconds:
+                    data["duration"] = duration_seconds
+
+            done_stats = None
+            if crawl.get("stats") and crawl.get("stats").get("done"):
+                done_stats = crawl["stats"]["done"]
+
+            data["pages"] = 0
+            if done_stats:
+                data["pages"] = done_stats
+
+            data["filesize"] = crawl.get("fileSize", 0)
+
+            data["avg_page_time"] = 0
+            if (
+                done_stats
+                and done_stats != 0
+                and started
+                and finished
+                and duration_seconds
+            ):
+                data["avg_page_time"] = int(duration_seconds / done_stats)
+
+            crawls_data.append(data)
+
+        return crawls_data
+
+
 # ============================================================================
 async def recompute_crawl_file_count_and_size(crawls, crawl_id):
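To make the derived duration and avg_page_time columns concrete, here is a small standalone sketch of the same arithmetic get_crawl_stats performs; the timestamps and page count below are invented:

```python
# Illustrative only: how duration and avg_page_time are derived for one crawl.
from datetime import datetime

started = datetime(2023, 12, 1, 10, 0, 0)
finished = datetime(2023, 12, 1, 10, 10, 0)
pages_done = 120  # corresponds to crawl["stats"]["done"]

duration_seconds = int((finished - started).total_seconds())  # 600
avg_page_time = int(duration_seconds / pages_done)            # 5 seconds per page

print(duration_seconds, avg_page_time)
```

Because int() truncates, crawls that average under one second per page report an avg_page_time of 0, and sub-second crawls report a duration of 0.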
@@ -646,6 +714,23 @@ def init_crawls_api(app, user_dep, *args):
     ):
         return await ops.delete_crawls(org, delete_list, "crawl", user)

+    @app.get("/orgs/all/crawls/stats", tags=["crawls"])
+    async def get_all_orgs_crawl_stats(
+        user: User = Depends(user_dep),
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        crawl_stats = await ops.get_crawl_stats()
+        return stream_dict_list_as_csv(crawl_stats, "crawling-stats.csv")
+
+    @app.get("/orgs/{oid}/crawls/stats", tags=["crawls"])
+    async def get_org_crawl_stats(
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        crawl_stats = await ops.get_crawl_stats(org)
+        return stream_dict_list_as_csv(crawl_stats, f"crawling-stats-{org.id}.csv")
+
     @app.get(
         "/orgs/all/crawls/{crawl_id}/replay.json",
         tags=["crawls"],
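A sketch of the superuser-only all-orgs export saved to disk under the filename suggested by the Content-Disposition header; `API_BASE` and the admin token are placeholders:

```python
# Illustrative only: save the all-orgs stats CSV under the server-suggested filename.
import re

import requests

API_BASE = "https://app.example.com/api"          # placeholder deployment URL
HEADERS = {"Authorization": "Bearer <admin-jwt>"}  # placeholder superuser token

with requests.get(
    f"{API_BASE}/orgs/all/crawls/stats", headers=HEADERS, stream=True
) as resp:
    resp.raise_for_status()  # non-superusers get a 403 from this endpoint
    disposition = resp.headers.get("Content-Disposition", "")
    match = re.search(r"filename=(.+)", disposition)
    filename = match.group(1) if match else "crawling-stats.csv"
    with open(filename, "wb") as out:
        for chunk in resp.iter_content(chunk_size=8192):
            out.write(chunk)
```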
Organization API module (OrgOps):

@@ -1,6 +1,7 @@
 """
 Organization API handling
 """
+# pylint: disable=too-many-lines
 import math
 import os
 import time

@@ -653,7 +654,7 @@ class OrgOps:
         slugs = await self.orgs.distinct("slug", {})
         return {"slugs": slugs}

-    async def get_all_org_slugs_with_ids(self):
+    async def get_org_slugs_by_ids(self):
         """Return dict with {id: slug} for all orgs."""
         slug_id_map = {}
         async for org in self.orgs.find({}):

@@ -933,6 +934,6 @@ def init_orgs_api(app, mdb, user_manager, invites, user_dep):
     async def get_all_org_slugs_with_ids(user: User = Depends(user_dep)):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
-        return await ops.get_all_org_slugs_with_ids()
+        return await ops.get_org_slugs_by_ids()

     return ops
User management module (UserManager):

@@ -213,6 +213,13 @@ class UserManager:
         )
         return await cursor.to_list(length=1000)

+    async def get_user_emails_by_ids(self):
+        """return dict of user emails keyed by id"""
+        email_id_map = {}
+        async for user in self.users.find({}):
+            email_id_map[user["id"]] = user["email"]
+        return email_id_map
+
     async def get_superuser(self) -> Optional[User]:
         """return current superuser, if any"""
         user_data = await self.users.find_one({"is_superuser": True})
Utils module (k8s utils):

@@ -1,15 +1,19 @@
 """ k8s utils """

-from typing import Optional
-import os
 import asyncio
-import json
-import sys
-import signal
 import atexit
+import csv
+import io
+import json
+import signal
+import os
+import sys

 from datetime import datetime
+from typing import Optional, Dict, Union, List

+from fastapi import HTTPException
+from fastapi.responses import StreamingResponse
 from slugify import slugify

@@ -97,3 +101,22 @@ def is_bool(stri: Optional[str]) -> bool:
 def slug_from_name(name: str) -> str:
     """Generate slug from name"""
     return slugify(name.replace("'", ""))
+
+
+def stream_dict_list_as_csv(data: List[Dict[str, Union[str, int]]], filename: str):
+    """Stream list of dictionaries as CSV with attachment filename header"""
+    if not data:
+        raise HTTPException(status_code=404, detail="crawls_not_found")
+
+    keys = data[0].keys()
+
+    buffer = io.StringIO()
+    dict_writer = csv.DictWriter(buffer, keys, quoting=csv.QUOTE_NONNUMERIC)
+    dict_writer.writeheader()
+    dict_writer.writerows(data)
+
+    return StreamingResponse(
+        iter([buffer.getvalue()]),
+        media_type="text/csv",
+        headers={"Content-Disposition": f"attachment;filename={filename}"},
+    )
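As a sanity check on the CSV shape, the same csv.DictWriter configuration can be exercised outside the StreamingResponse wrapper; the rows below are invented:

```python
# Illustrative only: same DictWriter settings as stream_dict_list_as_csv,
# without the FastAPI response wrapper.
import csv
import io

rows = [
    {"id": "abc123", "org": "default-org", "pages": 12, "duration": 60},
    {"id": "def456", "org": "default-org", "pages": 0, "duration": 0},
]

buffer = io.StringIO()
writer = csv.DictWriter(buffer, rows[0].keys(), quoting=csv.QUOTE_NONNUMERIC)
writer.writeheader()
writer.writerows(rows)

print(buffer.getvalue())
# "id","org","pages","duration"
# "abc123","default-org",12,60
# "def456","default-org",0,0
```

Note that the helper writes the whole list into a StringIO and streams it as a single chunk via iter([buffer.getvalue()]), so the full CSV is buffered in memory before the response starts.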
Crawl API tests:

@@ -4,6 +4,8 @@ import time
 import io
 import zipfile
 import re
+import csv
+import codecs

 from .conftest import API_PREFIX, HOST_PREFIX
 from .test_collections import UPDATED_NAME as COLLECTION_NAME

@@ -297,6 +299,81 @@ def test_update_crawl(
     assert not data["description"]


+def test_crawl_stats_all_orgs_not_superadmin(crawler_auth_headers):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/all/crawls/stats", headers=crawler_auth_headers
+    )
+    assert r.status_code == 403
+
+
+def test_crawl_stats_all_orgs(admin_auth_headers):
+    with requests.get(
+        f"{API_PREFIX}/orgs/all/crawls/stats", headers=admin_auth_headers, stream=True
+    ) as r:
+        assert r.status_code == 200
+
+        # Wait for stream content
+        if not r.content:
+            while True:
+                if r.content:
+                    break
+                time.sleep(5)
+
+        buffer = r.iter_lines()
+        for row in csv.DictReader(
+            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
+        ):
+            assert row["id"]
+            assert row["oid"]
+            assert row["org"]
+            assert row["cid"]
+            assert row["name"] or row["name"] == ""
+            assert row["state"]
+            assert row["userid"]
+            assert row["user"]
+            assert row["started"]
+            assert row["finished"] or row["finished"] is None
+            assert row["duration"] or row["duration"] == 0
+            assert row["pages"] or row["pages"] == 0
+            assert row["filesize"] or row["filesize"] == 0
+            assert row["avg_page_time"] or row["avg_page_time"] == 0
+
+
+def test_crawl_stats(crawler_auth_headers, default_org_id):
+    with requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/stats",
+        headers=crawler_auth_headers,
+        stream=True,
+    ) as r:
+        assert r.status_code == 200
+
+        # Wait for stream content
+        if not r.content:
+            while True:
+                if r.content:
+                    break
+                time.sleep(5)
+
+        buffer = r.iter_lines()
+        for row in csv.DictReader(
+            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
+        ):
+            assert row["id"]
+            assert row["oid"] == default_org_id
+            assert row["org"]
+            assert row["cid"]
+            assert row["name"] or row["name"] == ""
+            assert row["state"]
+            assert row["userid"]
+            assert row["user"]
+            assert row["started"]
+            assert row["finished"] or row["finished"] is None
+            assert row["duration"] or row["duration"] == 0
+            assert row["pages"] or row["pages"] == 0
+            assert row["filesize"] or row["filesize"] == 0
+            assert row["avg_page_time"] or row["avg_page_time"] == 0
+
+
 def test_delete_crawls_crawler(
     crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
 ):