Include number of Identical Files in QA stats and meter (#1848)

This PR adds Identical Files to the QA Page Match Analysis meter bars.
To do this, the backend calculates the number of non-HTML pages once and
includes it under the key `Files` in each of the `screenshotMatch` and
`textMatch` QA stats return arrays.

The backend additionally removes the file count from "No Data" to
prevent these from being counted twice.

---------

Co-authored-by: emma <hi@emma.cafe>
This commit is contained in:
Tessa Walsh 2024-06-06 13:15:19 -04:00 committed by GitHub
parent e3ee63f9b0
commit a85f9496b0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 57 additions and 31 deletions

View File

@@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
thresholds: Dict[str, List[float]],
) -> QARunAggregateStatsOut:
"""Get aggregate stats for QA run"""
file_count = await self.page_ops.get_crawl_file_count(crawl_id)
screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
crawl_id, qa_run_id, thresholds, key="screenshotMatch"
crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
)
text_results = await self.page_ops.get_qa_run_aggregate_counts(
crawl_id, qa_run_id, thresholds, key="textMatch"
crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
)
return QARunAggregateStatsOut(
screenshotMatch=screenshot_results, textMatch=text_results
screenshotMatch=screenshot_results,
textMatch=text_results,
)

View File

@@ -501,6 +501,34 @@ class PageOps:
return [PageOut.from_dict(data) for data in items], total
async def get_crawl_file_count(self, crawl_id: str) -> int:
    """Return the number of fully loaded non-HTML pages (files) in a crawl.

    These pages don't need to be QAed; callers use this count to report an
    "Files" bucket in QA stats and to subtract files from the "No data"
    bucket so they aren't counted twice.

    :param crawl_id: id of the crawl whose pages are counted
    :return: count of matching pages, or 0 when none match or the
        aggregation result is malformed
    """
    aggregate = [
        {
            "$match": {
                "crawl_id": crawl_id,
                # loadState 2 == fully loaded page (per existing page schema)
                "loadState": 2,
                # anything whose mime type is NOT *html, case-insensitive
                "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
            }
        },
        {"$count": "count"},
    ]
    cursor = self.pages.aggregate(aggregate)
    results = await cursor.to_list(length=1)
    # $count emits no document at all when nothing matched
    if not results:
        return 0
    try:
        return int(results[0]["count"])
    # Fixed: dict subscripting raises KeyError (not IndexError, which was
    # unreachable here); int() can raise TypeError as well as ValueError.
    except (KeyError, TypeError, ValueError):
        return 0
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@@ -520,6 +548,7 @@ class PageOps:
crawl_id: str,
qa_run_id: str,
thresholds: Dict[str, List[float]],
file_count: int,
key: str = "screenshotMatch",
):
"""Get counts for pages in QA run in buckets by score key based on thresholds"""
@@ -556,11 +585,17 @@ class PageOps:
return_data = []
for result in results:
return_data.append(
QARunBucketStats(
lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
key = str(result.get("_id"))
if key == "No data":
count = result.get("count", 0) - file_count
return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
else:
return_data.append(
QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
)
)
# Add file count
return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
# Add missing boundaries to result and re-sort
for boundary in boundaries:

View File

@@ -329,11 +329,13 @@ def test_qa_stats(
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
assert data["textMatch"] == [
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
# Test we get expected results with explicit 0 boundary
@@ -348,11 +350,13 @@ def test_qa_stats(
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
assert data["textMatch"] == [
{"lowerBoundary": "0.0", "count": 0},
{"lowerBoundary": "0.7", "count": 0},
{"lowerBoundary": "0.9", "count": 1},
{"lowerBoundary": "Files", "count": 0},
]
# Test that missing threshold values result in 422 HTTPException

View File

@@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
import { pluralOf } from "@/utils/pluralize";
type QAStatsThreshold = {
lowerBoundary: `${number}` | "No data";
lowerBoundary: `${number}` | "No data" | "Files";
count: number;
};
type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
@@ -65,6 +65,11 @@ const qaStatsThresholds = [
cssColor: "var(--sl-color-success-500)",
label: msg("Good Match"),
},
{
lowerBoundary: "Files",
cssColor: "var(--sl-color-neutral-500)",
label: msg("Identical Files"),
},
];
const notApplicable = () =>
@@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
})}
</div>
<div class="flex items-center gap-2 text-neutral-500">
${when(
qaRun.state.startsWith("stop") ||
(qaRun.state === "complete" &&
qaRun.stats.done < qaRun.stats.found),
() =>
html`<sl-tooltip
content=${qaRun.state.startsWith("stop")
? msg("This analysis run was stopped and is not complete.")
: msg(
"Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
)}
class="[--max-width:theme(spacing.56)]"
>
<sl-icon
name="exclamation-triangle-fill"
class="text-warning"
label=${msg("Note about page counts")}
></sl-icon>
</sl-tooltip> `,
)}
${when(
qaRun.stats,
(stats) => html`
@@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
? msg("No Data")
: threshold?.label}
<div class="text-xs opacity-80">
${bar.lowerBoundary !== "No data"
${!["No data", "Files"].includes(bar.lowerBoundary)
? html`${idx === 0
? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
: idx === qaStatsThresholds.length - 1
? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
match <br />`
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
${msg("match")} <br />`
: nothing}
${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
</div>