Include number of Identical Files in QA stats and meter (#1848)
This PR adds Identical Files to the QA Page Match Analysis meter bars. To do this, the backend calculates the number of non-HTML pages once and includes it under the key `Files` in each of the `screenshotMatch` and `textMatch` QA stats return arrays. The backend also subtracts the file count from the "No Data" bucket so that these pages are not counted twice.

---------

Co-authored-by: emma <hi@emma.cafe>
parent e3ee63f9b0
commit a85f9496b0
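For context, this is roughly the shape of the QA stats payload after the change — a hedged sketch with made-up counts, based on the description above and the test expectations further down. The numeric boundaries depend on the thresholds the caller requests, and the "No data" bucket only appears when the aggregation returns one; bucket ordering here is illustrative.

```python
# Hedged, illustrative example of the QA stats payload after this change; all counts are made up.
expected_stats = {
    "screenshotMatch": [
        {"lowerBoundary": "No data", "count": 2},  # file pages are no longer counted here
        {"lowerBoundary": "0.0", "count": 4},
        {"lowerBoundary": "0.7", "count": 10},
        {"lowerBoundary": "0.9", "count": 30},
        {"lowerBoundary": "Files", "count": 3},    # new bucket: non-HTML pages, counted once
    ],
    "textMatch": [
        {"lowerBoundary": "No data", "count": 2},
        {"lowerBoundary": "0.0", "count": 1},
        {"lowerBoundary": "0.7", "count": 9},
        {"lowerBoundary": "0.9", "count": 34},
        {"lowerBoundary": "Files", "count": 3},
    ],
}
```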
```diff
@@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
         thresholds: Dict[str, List[float]],
     ) -> QARunAggregateStatsOut:
         """Get aggregate stats for QA run"""
+        file_count = await self.page_ops.get_crawl_file_count(crawl_id)
         screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="screenshotMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
         )
         text_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="textMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
         )
         return QARunAggregateStatsOut(
-            screenshotMatch=screenshot_results, textMatch=text_results
+            screenshotMatch=screenshot_results,
+            textMatch=text_results,
         )
 
 
```
```diff
@@ -501,6 +501,34 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
+    async def get_crawl_file_count(self, crawl_id: str):
+        """Get count of pages in crawl that are files and don't need to be QAed"""
+        aggregate = [
+            {
+                "$match": {
+                    "crawl_id": crawl_id,
+                    "loadState": 2,
+                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
+                }
+            },
+            {"$count": "count"},
+        ]
+
+        cursor = self.pages.aggregate(aggregate)
+        results = await cursor.to_list(length=1)
+
+        if not results:
+            return 0
+
+        result = results[0]
+
+        try:
+            total = int(result["count"])
+        except (IndexError, ValueError):
+            total = 0
+
+        return total
+
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
```
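As a rough illustration of what the new pipeline counts: the `$match` stage keeps fully loaded pages (`loadState: 2`) in the given crawl whose MIME type does not contain "html" (case-insensitive). The sketch below mirrors that predicate in plain Python against hypothetical page documents; the document values and the helper name `is_qa_exempt_file` are made up for illustration.

```python
import re

# Hypothetical page documents, for illustration only.
pages = [
    {"crawl_id": "crawl-1", "loadState": 2, "mime": "text/html"},        # HTML page: not counted
    {"crawl_id": "crawl-1", "loadState": 2, "mime": "application/pdf"},  # file: counted
    {"crawl_id": "crawl-1", "loadState": 0, "mime": "image/png"},        # not fully loaded: not counted
    {"crawl_id": "crawl-2", "loadState": 2, "mime": "application/pdf"},  # different crawl: not counted
]

def is_qa_exempt_file(page, crawl_id="crawl-1"):
    """Mirror the aggregation's $match stage: same crawl, fully loaded, non-HTML MIME."""
    return (
        page["crawl_id"] == crawl_id
        and page["loadState"] == 2
        and not re.match(r"^.*html", page["mime"], re.IGNORECASE)
    )

print(sum(is_qa_exempt_file(p) for p in pages))  # -> 1
```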
```diff
@@ -520,6 +548,7 @@ class PageOps:
         crawl_id: str,
         qa_run_id: str,
         thresholds: Dict[str, List[float]],
+        file_count: int,
         key: str = "screenshotMatch",
     ):
         """Get counts for pages in QA run in buckets by score key based on thresholds"""
```
```diff
@@ -556,11 +585,17 @@ class PageOps:
         return_data = []
 
         for result in results:
-            return_data.append(
-                QARunBucketStats(
-                    lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
-                )
-            )
+            key = str(result.get("_id"))
+            if key == "No data":
+                count = result.get("count", 0) - file_count
+                return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
+            else:
+                return_data.append(
+                    QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
+                )
+
+        # Add file count
+        return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
 
         # Add missing boundaries to result and re-sort
         for boundary in boundaries:
```
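To make the double-counting fix concrete: non-HTML file pages have no match score, so they land in the aggregation's "No data" bucket; the new code moves their count into the "Files" bucket instead of reporting it in both places. A small sketch of the same bookkeeping with made-up numbers (the input `results` and `file_count` here are hypothetical):

```python
# Hypothetical aggregation output and file count, for illustration only.
results = [
    {"_id": "No data", "count": 5},  # 2 unscored HTML pages + 3 non-HTML files
    {"_id": 0.9, "count": 12},
]
file_count = 3

buckets = []
for result in results:
    key = str(result["_id"])
    # Subtract files from "No data" so they are only counted in the "Files" bucket.
    count = result["count"] - file_count if key == "No data" else result["count"]
    buckets.append({"lowerBoundary": key, "count": count})
buckets.append({"lowerBoundary": "Files", "count": file_count})

print(buckets)
# [{'lowerBoundary': 'No data', 'count': 2},
#  {'lowerBoundary': '0.9', 'count': 12},
#  {'lowerBoundary': 'Files', 'count': 3}]
```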
```diff
@@ -329,11 +329,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test we get expected results with explicit 0 boundary
```
```diff
@@ -348,11 +350,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test that missing threshold values result in 422 HTTPException
```
```diff
@@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
 import { pluralOf } from "@/utils/pluralize";
 
 type QAStatsThreshold = {
-  lowerBoundary: `${number}` | "No data";
+  lowerBoundary: `${number}` | "No data" | "Files";
   count: number;
 };
 type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
```
```diff
@@ -65,6 +65,11 @@ const qaStatsThresholds = [
     cssColor: "var(--sl-color-success-500)",
     label: msg("Good Match"),
   },
+  {
+    lowerBoundary: "Files",
+    cssColor: "var(--sl-color-neutral-500)",
+    label: msg("Identical Files"),
+  },
 ];
 
 const notApplicable = () =>
```
```diff
@@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
               })}
             </div>
             <div class="flex items-center gap-2 text-neutral-500">
-              ${when(
-                qaRun.state.startsWith("stop") ||
-                  (qaRun.state === "complete" &&
-                    qaRun.stats.done < qaRun.stats.found),
-                () =>
-                  html`<sl-tooltip
-                    content=${qaRun.state.startsWith("stop")
-                      ? msg("This analysis run was stopped and is not complete.")
-                      : msg(
-                          "Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
-                        )}
-                    class="[--max-width:theme(spacing.56)]"
-                  >
-                    <sl-icon
-                      name="exclamation-triangle-fill"
-                      class="text-warning"
-                      label=${msg("Note about page counts")}
-                    ></sl-icon>
-                  </sl-tooltip> `,
-              )}
               ${when(
                 qaRun.stats,
                 (stats) => html`
```
```diff
@@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
                         ? msg("No Data")
                         : threshold?.label}
                       <div class="text-xs opacity-80">
-                        ${bar.lowerBoundary !== "No data"
+                        ${!["No data", "Files"].includes(bar.lowerBoundary)
                           ? html`${idx === 0
                               ? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
                               : idx === qaStatsThresholds.length - 1
                                 ? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
-                                : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
-                            match <br />`
+                                : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
+                            ${msg("match")} <br />`
                           : nothing}
                         ${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
                       </div>
```