From a85f9496b0f10c84e8f1dfa5dd030f49778719c8 Mon Sep 17 00:00:00 2001
From: Tessa Walsh <tessa@bitarchivist.net>
Date: Thu, 6 Jun 2024 13:15:19 -0400
Subject: [PATCH] Include number of Identical Files in QA stats and meter
 (#1848)

This PR adds Identical Files to the QA Page Match Analysis meter bars.
To do this, the backend calculates the number of non-HTML pages once and
includes it under the key `Files` in each of the `screenshotMatch` and
`textMatch` QA stats return arrays.

The backend additionally removes the file count from "No Data" to
prevent these from being counted twice.

---------

Co-authored-by: emma <hi@emma.cafe>
---
 backend/btrixcloud/crawls.py                  |  8 ++--
 backend/btrixcloud/pages.py                   | 43 +++++++++++++++++--
 backend/test/test_qa.py                       |  4 ++
 .../pages/org/archived-item-detail/ui/qa.ts   | 33 ++++----------
 4 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 0e106e0e..f13121e0 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
         thresholds: Dict[str, List[float]],
     ) -> QARunAggregateStatsOut:
         """Get aggregate stats for QA run"""
+        file_count = await self.page_ops.get_crawl_file_count(crawl_id)
         screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="screenshotMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
         )
         text_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="textMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
         )
         return QARunAggregateStatsOut(
-            screenshotMatch=screenshot_results, textMatch=text_results
+            screenshotMatch=screenshot_results,
+            textMatch=text_results,
         )
 
 
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 12db701e..9db5fffd 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -501,6 +501,34 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
+    async def get_crawl_file_count(self, crawl_id: str):
+        """Get count of pages in crawl that are files and don't need to be QAed"""
+        aggregate = [
+            {
+                "$match": {
+                    "crawl_id": crawl_id,
+                    "loadState": 2,
+                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
+                }
+            },
+            {"$count": "count"},
+        ]
+
+        cursor = self.pages.aggregate(aggregate)
+        results = await cursor.to_list(length=1)
+
+        if not results:
+            return 0
+
+        result = results[0]
+
+        try:
+            total = int(result["count"])
+        except (IndexError, ValueError):
+            total = 0
+
+        return total
+
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
@@ -520,6 +548,7 @@ class PageOps:
         crawl_id: str,
         qa_run_id: str,
         thresholds: Dict[str, List[float]],
+        file_count: int,
         key: str = "screenshotMatch",
     ):
         """Get counts for pages in QA run in buckets by score key based on thresholds"""
@@ -556,11 +585,17 @@ class PageOps:
         return_data = []
 
         for result in results:
-            return_data.append(
-                QARunBucketStats(
-                    lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
+            key = str(result.get("_id"))
+            if key == "No data":
+                count = result.get("count", 0) - file_count
+                return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
+            else:
+                return_data.append(
+                    QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
                 )
-            )
+
+        # Add file count
+        return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
 
         # Add missing boundaries to result and re-sort
         for boundary in boundaries:
diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py
index 85f09e1d..7c78d0c5 100644
--- a/backend/test/test_qa.py
+++ b/backend/test/test_qa.py
@@ -329,11 +329,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test we get expected results with explicit 0 boundary
@@ -348,11 +350,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test that missing threshold values result in 422 HTTPException
diff --git a/frontend/src/pages/org/archived-item-detail/ui/qa.ts b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
index af0559db..5c5e841c 100644
--- a/frontend/src/pages/org/archived-item-detail/ui/qa.ts
+++ b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
@@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
 import { pluralOf } from "@/utils/pluralize";
 
 type QAStatsThreshold = {
-  lowerBoundary: `${number}` | "No data";
+  lowerBoundary: `${number}` | "No data" | "Files";
   count: number;
 };
 type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
@@ -65,6 +65,11 @@ const qaStatsThresholds = [
     cssColor: "var(--sl-color-success-500)",
     label: msg("Good Match"),
   },
+  {
+    lowerBoundary: "Files",
+    cssColor: "var(--sl-color-neutral-500)",
+    label: msg("Identical Files"),
+  },
 ];
 
 const notApplicable = () =>
@@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
             })}
           </div>
           <div class="flex items-center gap-2 text-neutral-500">
-            ${when(
-              qaRun.state.startsWith("stop") ||
-                (qaRun.state === "complete" &&
-                  qaRun.stats.done < qaRun.stats.found),
-              () =>
-                html`<sl-tooltip
-                  content=${qaRun.state.startsWith("stop")
-                    ? msg("This analysis run was stopped and is not complete.")
-                    : msg(
-                        "Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
-                      )}
-                  class="[--max-width:theme(spacing.56)]"
-                >
-                  <sl-icon
-                    name="exclamation-triangle-fill"
-                    class="text-warning"
-                    label=${msg("Note about page counts")}
-                  ></sl-icon>
-                </sl-tooltip> `,
-            )}
             ${when(
               qaRun.stats,
               (stats) => html`
@@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
                   ? msg("No Data")
                   : threshold?.label}
                 <div class="text-xs opacity-80">
-                  ${bar.lowerBoundary !== "No data"
+                  ${!["No data", "Files"].includes(bar.lowerBoundary)
                     ? html`${idx === 0
                           ? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
                           : idx === qaStatsThresholds.length - 1
                             ? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
-                            : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
-                        match <br />`
+                            : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
+                        ${msg("match")} <br />`
                     : nothing}
                   ${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
                 </div>