Add API endpoints for crawl statistics (#1461)

Fixes #1158 

Introduces two new API endpoints that stream crawl statistics as CSV, with a suggested attachment filename set via the `Content-Disposition` header:

- `GET /api/orgs/all/crawls/stats` - crawls from all orgs (superuser
only)
- `GET /api/orgs/{oid}/crawls/stats` - crawls from just one org
(available to org crawler/admin users as well as superusers)

Also includes tests for both endpoints.
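
For reference, a minimal sketch of fetching and parsing the per-org CSV with `requests`, mirroring what the new tests do; the `API_PREFIX`, org ID, and bearer token below are placeholders, not values from this change:

```python
import csv
import io

import requests

# Placeholders: adjust for your deployment and credentials.
API_PREFIX = "https://example.com/api"
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# Stream the per-org crawl statistics CSV and print a few columns per row.
with requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawls/stats", headers=HEADERS, stream=True
) as r:
    r.raise_for_status()
    # The response suggests a download filename, e.g.
    # Content-Disposition: attachment;filename=crawling-stats-<oid>.csv
    print(r.headers.get("Content-Disposition"))
    for row in csv.DictReader(io.StringIO(r.text)):
        print(row["id"], row["state"], row["pages"], row["avg_page_time"])
```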
Tessa Walsh, 2024-01-10 16:30:47 -05:00, committed by GitHub
commit 38a01860b8 (parent 99dd9b4acb)
5 changed files with 202 additions and 9 deletions


@@ -6,7 +6,7 @@ import re
import urllib.parse
from uuid import UUID
from typing import Optional, List
from typing import Optional, List, Dict, Union
from fastapi import Depends, HTTPException
from fastapi.responses import StreamingResponse
@@ -14,7 +14,7 @@ from redis import asyncio as exceptions
import pymongo
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import dt_now, parse_jsonl_error_messages
from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv
from .basecrawls import BaseCrawlOps
from .models import (
UpdateCrawl,
@@ -497,6 +497,74 @@ class CrawlOps(BaseCrawlOps):
        except Exception:
            return [], 0

    async def get_crawl_stats(
        self, org: Optional[Organization] = None
    ) -> List[Dict[str, Union[str, int]]]:
        """Return crawl statistics"""
        # pylint: disable=too-many-locals
        org_slugs = await self.orgs.get_org_slugs_by_ids()
        user_emails = await self.user_manager.get_user_emails_by_ids()

        crawls_data: List[Dict[str, Union[str, int]]] = []

        query: Dict[str, Union[str, UUID]] = {"type": "crawl"}
        if org:
            query["oid"] = org.id

        async for crawl in self.crawls.find(query):
            data: Dict[str, Union[str, int]] = {}
            data["id"] = str(crawl.get("_id"))

            oid = crawl.get("oid")
            data["oid"] = str(oid)
            data["org"] = org_slugs[oid]

            data["cid"] = str(crawl.get("cid"))

            crawl_name = crawl.get("name")
            data["name"] = f'"{crawl_name}"' if crawl_name else ""

            data["state"] = crawl.get("state")

            userid = crawl.get("userid")
            data["userid"] = str(userid)
            data["user"] = user_emails.get(userid)

            started = crawl.get("started")
            finished = crawl.get("finished")

            data["started"] = str(started)
            data["finished"] = str(finished)

            data["duration"] = 0
            if started and finished:
                duration = finished - started
                duration_seconds = int(duration.total_seconds())
                if duration_seconds:
                    data["duration"] = duration_seconds

            done_stats = None
            if crawl.get("stats") and crawl.get("stats").get("done"):
                done_stats = crawl["stats"]["done"]

            data["pages"] = 0
            if done_stats:
                data["pages"] = done_stats

            data["filesize"] = crawl.get("fileSize", 0)

            data["avg_page_time"] = 0
            if (
                done_stats
                and done_stats != 0
                and started
                and finished
                and duration_seconds
            ):
                data["avg_page_time"] = int(duration_seconds / done_stats)

            crawls_data.append(data)

        return crawls_data


# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
@@ -646,6 +714,23 @@ def init_crawls_api(app, user_dep, *args):
    ):
        return await ops.delete_crawls(org, delete_list, "crawl", user)

    @app.get("/orgs/all/crawls/stats", tags=["crawls"])
    async def get_all_orgs_crawl_stats(
        user: User = Depends(user_dep),
    ):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        crawl_stats = await ops.get_crawl_stats()
        return stream_dict_list_as_csv(crawl_stats, "crawling-stats.csv")

    @app.get("/orgs/{oid}/crawls/stats", tags=["crawls"])
    async def get_org_crawl_stats(
        org: Organization = Depends(org_crawl_dep),
    ):
        crawl_stats = await ops.get_crawl_stats(org)
        return stream_dict_list_as_csv(crawl_stats, f"crawling-stats-{org.id}.csv")

    @app.get(
        "/orgs/all/crawls/{crawl_id}/replay.json",
        tags=["crawls"],


@@ -1,6 +1,7 @@
"""
Organization API handling
"""
# pylint: disable=too-many-lines
import math
import os
import time
@@ -653,7 +654,7 @@ class OrgOps:
        slugs = await self.orgs.distinct("slug", {})
        return {"slugs": slugs}

    async def get_all_org_slugs_with_ids(self):
    async def get_org_slugs_by_ids(self):
        """Return dict with {id: slug} for all orgs."""
        slug_id_map = {}
        async for org in self.orgs.find({}):
@@ -933,6 +934,6 @@ def init_orgs_api(app, mdb, user_manager, invites, user_dep):
    async def get_all_org_slugs_with_ids(user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")
        return await ops.get_all_org_slugs_with_ids()
        return await ops.get_org_slugs_by_ids()

    return ops


@@ -213,6 +213,13 @@ class UserManager:
        )
        return await cursor.to_list(length=1000)

    async def get_user_emails_by_ids(self):
        """return dict of user emails keyed by id"""
        email_id_map = {}
        async for user in self.users.find({}):
            email_id_map[user["id"]] = user["email"]
        return email_id_map

    async def get_superuser(self) -> Optional[User]:
        """return current superuser, if any"""
        user_data = await self.users.find_one({"is_superuser": True})


@@ -1,15 +1,19 @@
""" k8s utils """
from typing import Optional
import os
import asyncio
import json
import sys
import signal
import atexit
import csv
import io
import json
import signal
import os
import sys
from datetime import datetime
from typing import Optional, Dict, Union, List
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from slugify import slugify
@@ -97,3 +101,22 @@ def is_bool(stri: Optional[str]) -> bool:
def slug_from_name(name: str) -> str:
    """Generate slug from name"""
    return slugify(name.replace("'", ""))


def stream_dict_list_as_csv(data: List[Dict[str, Union[str, int]]], filename: str):
    """Stream list of dictionaries as CSV with attachment filename header"""
    if not data:
        raise HTTPException(status_code=404, detail="crawls_not_found")

    keys = data[0].keys()

    buffer = io.StringIO()
    dict_writer = csv.DictWriter(buffer, keys, quoting=csv.QUOTE_NONNUMERIC)
    dict_writer.writeheader()
    dict_writer.writerows(data)

    return StreamingResponse(
        iter([buffer.getvalue()]),
        media_type="text/csv",
        headers={"Content-Disposition": f"attachment;filename={filename}"},
    )


@@ -4,6 +4,8 @@ import time
import io
import zipfile
import re
import csv
import codecs
from .conftest import API_PREFIX, HOST_PREFIX
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -297,6 +299,81 @@ def test_update_crawl(
    assert not data["description"]


def test_crawl_stats_all_orgs_not_superadmin(crawler_auth_headers):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/crawls/stats", headers=crawler_auth_headers
    )
    assert r.status_code == 403


def test_crawl_stats_all_orgs(admin_auth_headers):
    with requests.get(
        f"{API_PREFIX}/orgs/all/crawls/stats", headers=admin_auth_headers, stream=True
    ) as r:
        assert r.status_code == 200

        # Wait for stream content
        if not r.content:
            while True:
                if r.content:
                    break
                time.sleep(5)

        buffer = r.iter_lines()
        for row in csv.DictReader(
            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
        ):
            assert row["id"]
            assert row["oid"]
            assert row["org"]
            assert row["cid"]
            assert row["name"] or row["name"] == ""
            assert row["state"]
            assert row["userid"]
            assert row["user"]
            assert row["started"]
            assert row["finished"] or row["finished"] is None
            assert row["duration"] or row["duration"] == 0
            assert row["pages"] or row["pages"] == 0
            assert row["filesize"] or row["filesize"] == 0
            assert row["avg_page_time"] or row["avg_page_time"] == 0


def test_crawl_stats(crawler_auth_headers, default_org_id):
    with requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/stats",
        headers=crawler_auth_headers,
        stream=True,
    ) as r:
        assert r.status_code == 200

        # Wait for stream content
        if not r.content:
            while True:
                if r.content:
                    break
                time.sleep(5)

        buffer = r.iter_lines()
        for row in csv.DictReader(
            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
        ):
            assert row["id"]
            assert row["oid"] == default_org_id
            assert row["org"]
            assert row["cid"]
            assert row["name"] or row["name"] == ""
            assert row["state"]
            assert row["userid"]
            assert row["user"]
            assert row["started"]
            assert row["finished"] or row["finished"] is None
            assert row["duration"] or row["duration"] == 0
            assert row["pages"] or row["pages"] == 0
            assert row["filesize"] or row["filesize"] == 0
            assert row["avg_page_time"] or row["avg_page_time"] == 0


def test_delete_crawls_crawler(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):