Include number of Identical Files in QA stats and meter (#1848)

This PR adds Identical Files to the QA Page Match Analysis meter bars.
To do this, the backend calculates the number of non-HTML pages once and
includes it under the key `Files` in each of the `screenshotMatch` and
`textMatch` QA stats return arrays.

The backend additionally removes the file count from "No Data" to
prevent these from being counted twice.

---------

Co-authored-by: emma <hi@emma.cafe>
This commit is contained in:
Tessa Walsh 2024-06-06 13:15:19 -04:00 committed by GitHub
parent e3ee63f9b0
commit a85f9496b0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 57 additions and 31 deletions

View File

@@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
thresholds: Dict[str, List[float]],
) -> QARunAggregateStatsOut:
"""Get aggregate stats for QA run"""
file_count = await self.page_ops.get_crawl_file_count(crawl_id)
screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
crawl_id, qa_run_id, thresholds, key="screenshotMatch"
crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
)
text_results = await self.page_ops.get_qa_run_aggregate_counts(
crawl_id, qa_run_id, thresholds, key="textMatch"
crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
)
return QARunAggregateStatsOut(
screenshotMatch=screenshot_results, textMatch=text_results
screenshotMatch=screenshot_results,
textMatch=text_results,
)

View File

@@ -501,6 +501,34 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def get_crawl_file_count(self, crawl_id: str) -> int:
    """Return the number of fully loaded non-HTML pages (files) in a crawl.

    These pages don't need to be QAed; callers use this count to report an
    "Files" bucket in QA stats and to subtract files from the "No data"
    bucket so they aren't counted twice.

    :param crawl_id: id of the crawl whose pages are counted
    :return: count of matching pages, or 0 when none match or the
        aggregation result is malformed
    """
    aggregate = [
        {
            "$match": {
                "crawl_id": crawl_id,
                # loadState 2 == fully loaded page (per existing page schema)
                "loadState": 2,
                # anything whose mime type is NOT *html, case-insensitive
                "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
            }
        },
        {"$count": "count"},
    ]
    cursor = self.pages.aggregate(aggregate)
    results = await cursor.to_list(length=1)
    # $count emits no document at all when nothing matched
    if not results:
        return 0
    try:
        return int(results[0]["count"])
    # Fixed: dict subscripting raises KeyError (not IndexError, which was
    # unreachable here); int() can raise TypeError as well as ValueError.
    except (KeyError, TypeError, ValueError):
        return 0
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@@ -520,6 +548,7 @@ class PageOps:
crawl_id: str,
qa_run_id: str,
thresholds: Dict[str, List[float]],
file_count: int,
key: str = "screenshotMatch",
):
"""Get counts for pages in QA run in buckets by score key based on thresholds"""
@@ -556,11 +585,17 @@ class PageOps:
return_data = []
for result in results:
return_data.append(
QARunBucketStats(
lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
key = str(result.get("_id"))
if key == "No data":
count = result.get("count", 0) - file_count
return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
else:
return_data.append(
QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
)
)
# Add file count
return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
# Add missing boundaries to result and re-sort
for boundary in boundaries:

View File

@@ -329,11 +329,13 @@ def test_qa_stats(
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
assert data["textMatch"] == [
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
# Test we get expected results with explicit 0 boundary
@@ -348,11 +350,13 @@ def test_qa_stats(
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
assert data["textMatch"] == [
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
# Test that missing threshold values result in 422 HTTPException

View File

@@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
import { pluralOf } from "@/utils/pluralize";
type QAStatsThreshold = {
lowerBoundary: `${number}` | "No data";
lowerBoundary: `${number}` | "No data" | "Files";
count: number;
};
type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
@@ -65,6 +65,11 @@ const qaStatsThresholds = [
cssColor: "var(--sl-color-success-500)",
label: msg("Good Match"),
},
{
lowerBoundary: "Files",
cssColor: "var(--sl-color-neutral-500)",
label: msg("Identical Files"),
},
];
const notApplicable = () =>
@@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
})}
</div>
<div class="flex items-center gap-2 text-neutral-500">
${when(
qaRun.state.startsWith("stop") ||
(qaRun.state === "complete" &&
qaRun.stats.done < qaRun.stats.found),
() =>
html`<sl-tooltip
content=${qaRun.state.startsWith("stop")
? msg("This analysis run was stopped and is not complete.")
: msg(
"Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
)}
class="[--max-width:theme(spacing.56)]"
>
<sl-icon
name="exclamation-triangle-fill"
class="text-warning"
label=${msg("Note about page counts")}
></sl-icon>
</sl-tooltip> `,
)}
${when(
qaRun.stats,
(stats) => html`
@@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
? msg("No Data")
: threshold?.label}
<div class="text-xs opacity-80">
${bar.lowerBoundary !== "No data"
${!["No data", "Files"].includes(bar.lowerBoundary)
? html`${idx === 0
? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
: idx === qaStatsThresholds.length - 1
? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
match <br />`
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
${msg("match")} <br />`
: nothing}
${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
</div>