Add QA run aggregate stats API endpoint (#1682)

Fixes #1659 

Takes an arbitrary set of thresholds for text and screenshot matches as
a comma-separated list of floats.

Returns, for each match type, a list of buckets giving the lower boundary and
page count for all thresholds passed in.
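
For illustration, a request against the new endpoint might look like the
following sketch (the IDs, headers, and counts are placeholders; API_PREFIX
stands in for the deployment's API root, as in the test suite):

    import requests

    r = requests.get(
        f"{API_PREFIX}/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats"
        "?screenshotThresholds=0.7,0.9&textThresholds=0.7,0.9",
        headers=auth_headers,
    )
    # One bucket per lower boundary; a 0.0 bucket is added even if not
    # requested. Counts here are illustrative only:
    # {
    #     "screenshotMatch": [
    #         {"lowerBoundary": "0.0", "count": 0},
    #         {"lowerBoundary": "0.7", "count": 2},
    #         {"lowerBoundary": "0.9", "count": 5}
    #     ],
    #     "textMatch": [...]
    # }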
Tessa Walsh 2024-04-17 13:24:18 -04:00, committed by GitHub
commit 30ab139ff2 (parent 835014d829)
4 changed files with 191 additions and 0 deletions


@@ -35,6 +35,7 @@ from .models import (
    QARun,
    QARunOut,
    QARunWithResources,
    QARunAggregateStatsOut,
    DeleteQARunList,
    Organization,
    User,
@@ -917,6 +918,23 @@ class CrawlOps(BaseCrawlOps):
        return QARunWithResources(**qa_run_dict)

    async def get_qa_run_aggregate_stats(
        self,
        crawl_id: str,
        qa_run_id: str,
        thresholds: Dict[str, List[float]],
    ) -> QARunAggregateStatsOut:
        """Get aggregate stats for QA run"""
        screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
            crawl_id, qa_run_id, thresholds, key="screenshotMatch"
        )
        text_results = await self.page_ops.get_qa_run_aggregate_counts(
            crawl_id, qa_run_id, thresholds, key="textMatch"
        )
        return QARunAggregateStatsOut(
            screenshotMatch=screenshot_results, textMatch=text_results
        )


# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
@@ -1125,6 +1143,37 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
        tags=["qa"],
        response_model=QARunAggregateStatsOut,
    )
    async def get_qa_run_aggregate_stats(
        crawl_id,
        qa_run_id,
        screenshotThresholds: str,
        textThresholds: str,
        # pylint: disable=unused-argument
        org: Organization = Depends(org_viewer_dep),
    ):
        thresholds: Dict[str, List[float]] = {}
        try:
            thresholds["screenshotMatch"] = [
                float(threshold) for threshold in screenshotThresholds.split(",")
            ]
            thresholds["textMatch"] = [
                float(threshold) for threshold in textThresholds.split(",")
            ]
        # pylint: disable=broad-exception-caught,raise-missing-from
        except Exception:
            raise HTTPException(status_code=400, detail="invalid_thresholds")

        return await ops.get_qa_run_aggregate_stats(
            crawl_id,
            qa_run_id,
            thresholds,
        )

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"])
    async def start_crawl_qa_run(
        crawl_id: str,


@@ -738,6 +738,22 @@ class QARunOut(BaseModel):
    stats: CrawlStats = CrawlStats()


# ============================================================================
class QARunBucketStats(BaseModel):
    """Model for per-bucket aggregate stats results"""

    lowerBoundary: str
    count: int


# ============================================================================
class QARunAggregateStatsOut(BaseModel):
    """QA Run aggregate stats out"""

    screenshotMatch: List[QARunBucketStats]
    textMatch: List[QARunBucketStats]


# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
    """Store State of a Crawl (Finished or Running)"""


@@ -22,6 +22,7 @@ from .models import (
    PageNoteIn,
    PageNoteEdit,
    PageNoteDelete,
    QARunBucketStats,
)
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import from_k8s_date, str_list_to_bools
@@ -514,6 +515,68 @@ class PageOps:
        for crawl_id in crawl_ids:
            await self.re_add_crawl_pages(crawl_id, oid)

    async def get_qa_run_aggregate_counts(
        self,
        crawl_id: str,
        qa_run_id: str,
        thresholds: Dict[str, List[float]],
        key: str = "screenshotMatch",
    ):
        """Get counts for pages in QA run in buckets by score key based on thresholds"""
        boundaries = thresholds.get(key, [])
        if not boundaries:
            raise HTTPException(status_code=400, detail="missing_thresholds")

        boundaries = sorted(boundaries)

        # Make sure boundaries start with 0
        if boundaries[0] != 0:
            boundaries.insert(0, 0.0)

        # Make sure we have upper boundary just over 1 to be inclusive of scores of 1
        if boundaries[-1] <= 1:
            boundaries.append(1.1)

        aggregate = [
            {"$match": {"crawl_id": crawl_id}},
            {
                "$bucket": {
                    "groupBy": f"$qa.{qa_run_id}.{key}",
                    "default": "No data",
                    "boundaries": boundaries,
                    "output": {
                        "count": {"$sum": 1},
                    },
                }
            },
        ]

        cursor = self.pages.aggregate(aggregate)
        results = await cursor.to_list(length=len(boundaries))

        return_data = []

        for result in results:
            return_data.append(
                QARunBucketStats(
                    lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
                )
            )

        # Add missing boundaries to result and re-sort
        for boundary in boundaries:
            if boundary < 1.0:
                matching_return_data = [
                    bucket
                    for bucket in return_data
                    if bucket.lowerBoundary == str(boundary)
                ]
                if not matching_return_data:
                    return_data.append(
                        QARunBucketStats(lowerBoundary=str(boundary), count=0)
                    )

        return sorted(return_data, key=lambda bucket: bucket.lowerBoundary)


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
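
As a sanity check on the boundary handling above, the same bucketing rule can
be sketched in pure Python (an illustration only, not part of this commit; it
mirrors MongoDB's $bucket semantics of inclusive lower and exclusive upper
bounds):

    from bisect import bisect_right

    def bucket_counts(scores: list[float], boundaries: list[float]) -> dict[float, int]:
        """Count scores per bucket, where bucket i spans
        [boundaries[i], boundaries[i+1]), lower bound inclusive."""
        bounds = sorted(boundaries)
        if bounds[0] != 0:
            bounds.insert(0, 0.0)  # same zero-padding as the method above
        if bounds[-1] <= 1:
            bounds.append(1.1)  # catch scores of exactly 1
        counts = {b: 0 for b in bounds[:-1]}
        for score in scores:
            # largest boundary <= score is the bucket's lower bound
            counts[bounds[bisect_right(bounds, score) - 1]] += 1
        return counts

    # bucket_counts([0.95, 1.0, 0.5], [0.7, 0.9]) -> {0.0: 1, 0.7: 0, 0.9: 2}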


@@ -283,6 +283,69 @@ def test_qa_replay(
    assert data["resources"][0]["path"]


def test_qa_stats(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    # We'll want to improve this test by having more pages to test
    # if we can figure out stable page scores to test against
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7,0.9&textThresholds=0.7,0.9",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["screenshotMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]
    assert data["textMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]

    # Test we get expected results with explicit 0 boundary
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0,0.7,0.9&textThresholds=0,0.7,0.9",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["screenshotMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]
    assert data["textMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]

    # Test that missing threshold values result in 422 HTTPException
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 422
    assert r.json()["detail"][0]["msg"] == "field required"

    # Test that invalid threshold values result in 400 HTTPException
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7&textThresholds=null",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_thresholds"


def test_run_qa_not_running(
    crawler_crawl_id,
    crawler_auth_headers,