From a85f9496b0f10c84e8f1dfa5dd030f49778719c8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 6 Jun 2024 13:15:19 -0400 Subject: [PATCH] Include number of Identical Files in QA stats and meter (#1848) This PR adds Identical Files to the QA Page Match Analysis meter bars. To do this, the backend calculates the number of non-HTML pages once and includes it under the key `Files` in each of the `screenshotMatch` and `textMatch` QA stats return arrays. The backend additionally removes the file count from "No Data" to prevent these from being counted twice. --------- Co-authored-by: emma --- backend/btrixcloud/crawls.py | 8 ++-- backend/btrixcloud/pages.py | 43 +++++++++++++++++-- backend/test/test_qa.py | 4 ++ .../pages/org/archived-item-detail/ui/qa.ts | 33 ++++---------- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 0e106e0e..f13121e0 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps): thresholds: Dict[str, List[float]], ) -> QARunAggregateStatsOut: """Get aggregate stats for QA run""" + file_count = await self.page_ops.get_crawl_file_count(crawl_id) screenshot_results = await self.page_ops.get_qa_run_aggregate_counts( - crawl_id, qa_run_id, thresholds, key="screenshotMatch" + crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch" ) text_results = await self.page_ops.get_qa_run_aggregate_counts( - crawl_id, qa_run_id, thresholds, key="textMatch" + crawl_id, qa_run_id, thresholds, file_count, key="textMatch" ) return QARunAggregateStatsOut( - screenshotMatch=screenshot_results, textMatch=text_results + screenshotMatch=screenshot_results, + textMatch=text_results, ) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 12db701e..9db5fffd 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -501,6 +501,34 @@ class PageOps: return [PageOut.from_dict(data) for data in items], total + async def get_crawl_file_count(self, crawl_id: str): + """Get count of pages in crawl that are files and don't need to be QAed""" + aggregate = [ + { + "$match": { + "crawl_id": crawl_id, + "loadState": 2, + "mime": {"$not": {"$regex": "^.*html", "$options": "i"}}, + } + }, + {"$count": "count"}, + ] + + cursor = self.pages.aggregate(aggregate) + results = await cursor.to_list(length=1) + + if not results: + return 0 + + result = results[0] + + try: + total = int(result["count"]) + except (IndexError, ValueError): + total = 0 + + return total + async def re_add_crawl_pages(self, crawl_id: str, oid: UUID): """Delete existing pages for crawl and re-add from WACZs.""" await self.delete_crawl_pages(crawl_id, oid) @@ -520,6 +548,7 @@ class PageOps: crawl_id: str, qa_run_id: str, thresholds: Dict[str, List[float]], + file_count: int, key: str = "screenshotMatch", ): """Get counts for pages in QA run in buckets by score key based on thresholds""" @@ -556,11 +585,17 @@ class PageOps: return_data = [] for result in results: - return_data.append( - QARunBucketStats( - lowerBoundary=str(result.get("_id")), count=result.get("count", 0) + key = str(result.get("_id")) + if key == "No data": + count = result.get("count", 0) - file_count + return_data.append(QARunBucketStats(lowerBoundary=key, count=count)) + else: + return_data.append( + QARunBucketStats(lowerBoundary=key, count=result.get("count", 0)) ) - ) + + # Add file count + return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count)) # Add missing boundaries to result and re-sort for boundary in boundaries: diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py index 85f09e1d..7c78d0c5 100644 --- a/backend/test/test_qa.py +++ b/backend/test/test_qa.py @@ -329,11 +329,13 @@ def test_qa_stats( {"lowerBoundary": "0.0", "count": 0}, {"lowerBoundary": "0.7", "count": 0}, {"lowerBoundary": "0.9", "count": 1}, + {"lowerBoundary": "Files", "count": 0}, ] assert data["textMatch"] == [ {"lowerBoundary": "0.0", "count": 0}, {"lowerBoundary": "0.7", "count": 0}, {"lowerBoundary": "0.9", "count": 1}, + {"lowerBoundary": "Files", "count": 0}, ] # Test we get expected results with explicit 0 boundary @@ -348,11 +350,13 @@ def test_qa_stats( {"lowerBoundary": "0.0", "count": 0}, {"lowerBoundary": "0.7", "count": 0}, {"lowerBoundary": "0.9", "count": 1}, + {"lowerBoundary": "Files", "count": 0}, ] assert data["textMatch"] == [ {"lowerBoundary": "0.0", "count": 0}, {"lowerBoundary": "0.7", "count": 0}, {"lowerBoundary": "0.9", "count": 1}, + {"lowerBoundary": "Files", "count": 0}, ] # Test that missing threshold values result in 422 HTTPException diff --git a/frontend/src/pages/org/archived-item-detail/ui/qa.ts b/frontend/src/pages/org/archived-item-detail/ui/qa.ts index af0559db..5c5e841c 100644 --- a/frontend/src/pages/org/archived-item-detail/ui/qa.ts +++ b/frontend/src/pages/org/archived-item-detail/ui/qa.ts @@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization"; import { pluralOf } from "@/utils/pluralize"; type QAStatsThreshold = { - lowerBoundary: `${number}` | "No data"; + lowerBoundary: `${number}` | "No data" | "Files"; count: number; }; type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>; @@ -65,6 +65,11 @@ const qaStatsThresholds = [ cssColor: "var(--sl-color-success-500)", label: msg("Good Match"), }, + { + lowerBoundary: "Files", + cssColor: "var(--sl-color-neutral-500)", + label: msg("Identical Files"), + }, ]; const notApplicable = () => @@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement { })}
- ${when( - qaRun.state.startsWith("stop") || - (qaRun.state === "complete" && - qaRun.stats.done < qaRun.stats.found), - () => - html` - - `, - )} ${when( qaRun.stats, (stats) => html` @@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement { ? msg("No Data") : threshold?.label}
- ${bar.lowerBoundary !== "No data" + ${!["No data", "Files"].includes(bar.lowerBoundary) ? html`${idx === 0 ? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%` : idx === qaStatsThresholds.length - 1 ? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%` - : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`} - match
` + : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`} + ${msg("match")}
` : nothing} ${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}