Include number of Identical Files in QA stats and meter (#1848)
This PR adds Identical Files to the QA Page Match Analysis meter bars. To do this, the backend calculates the number of non-HTML pages once and includes it under the key `Files` in both the `screenshotMatch` and `textMatch` arrays returned with the QA stats. The backend also subtracts the file count from the "No data" bucket so that these pages are not counted twice. --------- Co-authored-by: emma <hi@emma.cafe>
This commit is contained in:
parent
e3ee63f9b0
commit
a85f9496b0
@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
|
||||
thresholds: Dict[str, List[float]],
|
||||
) -> QARunAggregateStatsOut:
|
||||
"""Get aggregate stats for QA run"""
|
||||
file_count = await self.page_ops.get_crawl_file_count(crawl_id)
|
||||
screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
|
||||
crawl_id, qa_run_id, thresholds, key="screenshotMatch"
|
||||
crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
|
||||
)
|
||||
text_results = await self.page_ops.get_qa_run_aggregate_counts(
|
||||
crawl_id, qa_run_id, thresholds, key="textMatch"
|
||||
crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
|
||||
)
|
||||
return QARunAggregateStatsOut(
|
||||
screenshotMatch=screenshot_results, textMatch=text_results
|
||||
screenshotMatch=screenshot_results,
|
||||
textMatch=text_results,
|
||||
)
|
||||
|
||||
|
||||
|
@ -501,6 +501,34 @@ class PageOps:
|
||||
|
||||
return [PageOut.from_dict(data) for data in items], total
|
||||
|
||||
async def get_crawl_file_count(self, crawl_id: str) -> int:
    """Get count of pages in crawl that are files and don't need to be QAed

    Runs an aggregation on ``self.pages`` that matches documents with the
    given ``crawl_id``, a ``loadState`` of 2, and a ``mime`` value that does
    not match the case-insensitive pattern ``^.*html``, then counts them.

    Returns the count as an int, or 0 when the aggregation yields no result
    document or the result carries no usable ``count`` value.
    """
    aggregate = [
        {
            "$match": {
                "crawl_id": crawl_id,
                "loadState": 2,
                # NOTE(review): per MongoDB semantics, $not also selects
                # documents that have no "mime" field at all -- confirm that
                # counting those pages as files is intended.
                "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
            }
        },
        {"$count": "count"},
    ]

    cursor = self.pages.aggregate(aggregate)
    # Only one result document is needed: the single {"count": N} summary.
    results = await cursor.to_list(length=1)

    # An empty result list is treated as "no files".
    if not results:
        return 0

    try:
        return int(results[0]["count"])
    except (KeyError, TypeError, ValueError):
        # A missing, null or non-numeric count falls back to 0 rather than
        # propagating out of a stats endpoint.
        return 0
|
||||
|
||||
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
|
||||
"""Delete existing pages for crawl and re-add from WACZs."""
|
||||
await self.delete_crawl_pages(crawl_id, oid)
|
||||
@ -520,6 +548,7 @@ class PageOps:
|
||||
crawl_id: str,
|
||||
qa_run_id: str,
|
||||
thresholds: Dict[str, List[float]],
|
||||
file_count: int,
|
||||
key: str = "screenshotMatch",
|
||||
):
|
||||
"""Get counts for pages in QA run in buckets by score key based on thresholds"""
|
||||
@ -556,11 +585,17 @@ class PageOps:
|
||||
return_data = []
|
||||
|
||||
for result in results:
|
||||
return_data.append(
|
||||
QARunBucketStats(
|
||||
lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
|
||||
key = str(result.get("_id"))
|
||||
if key == "No data":
|
||||
count = result.get("count", 0) - file_count
|
||||
return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
|
||||
else:
|
||||
return_data.append(
|
||||
QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
|
||||
)
|
||||
)
|
||||
|
||||
# Add file count
|
||||
return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
|
||||
|
||||
# Add missing boundaries to result and re-sort
|
||||
for boundary in boundaries:
|
||||
|
@ -329,11 +329,13 @@ def test_qa_stats(
|
||||
{"lowerBoundary": "0.0", "count": 0},
|
||||
{"lowerBoundary": "0.7", "count": 0},
|
||||
{"lowerBoundary": "0.9", "count": 1},
|
||||
{"lowerBoundary": "Files", "count": 0},
|
||||
]
|
||||
assert data["textMatch"] == [
|
||||
{"lowerBoundary": "0.0", "count": 0},
|
||||
{"lowerBoundary": "0.7", "count": 0},
|
||||
{"lowerBoundary": "0.9", "count": 1},
|
||||
{"lowerBoundary": "Files", "count": 0},
|
||||
]
|
||||
|
||||
# Test we get expected results with explicit 0 boundary
|
||||
@ -348,11 +350,13 @@ def test_qa_stats(
|
||||
{"lowerBoundary": "0.0", "count": 0},
|
||||
{"lowerBoundary": "0.7", "count": 0},
|
||||
{"lowerBoundary": "0.9", "count": 1},
|
||||
{"lowerBoundary": "Files", "count": 0},
|
||||
]
|
||||
assert data["textMatch"] == [
|
||||
{"lowerBoundary": "0.0", "count": 0},
|
||||
{"lowerBoundary": "0.7", "count": 0},
|
||||
{"lowerBoundary": "0.9", "count": 1},
|
||||
{"lowerBoundary": "Files", "count": 0},
|
||||
]
|
||||
|
||||
# Test that missing threshold values result in 422 HTTPException
|
||||
|
@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
|
||||
import { pluralOf } from "@/utils/pluralize";
|
||||
|
||||
type QAStatsThreshold = {
|
||||
lowerBoundary: `${number}` | "No data";
|
||||
lowerBoundary: `${number}` | "No data" | "Files";
|
||||
count: number;
|
||||
};
|
||||
type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
|
||||
@ -65,6 +65,11 @@ const qaStatsThresholds = [
|
||||
cssColor: "var(--sl-color-success-500)",
|
||||
label: msg("Good Match"),
|
||||
},
|
||||
{
|
||||
lowerBoundary: "Files",
|
||||
cssColor: "var(--sl-color-neutral-500)",
|
||||
label: msg("Identical Files"),
|
||||
},
|
||||
];
|
||||
|
||||
const notApplicable = () =>
|
||||
@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
|
||||
})}
|
||||
</div>
|
||||
<div class="flex items-center gap-2 text-neutral-500">
|
||||
${when(
|
||||
qaRun.state.startsWith("stop") ||
|
||||
(qaRun.state === "complete" &&
|
||||
qaRun.stats.done < qaRun.stats.found),
|
||||
() =>
|
||||
html`<sl-tooltip
|
||||
content=${qaRun.state.startsWith("stop")
|
||||
? msg("This analysis run was stopped and is not complete.")
|
||||
: msg(
|
||||
"Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
|
||||
)}
|
||||
class="[--max-width:theme(spacing.56)]"
|
||||
>
|
||||
<sl-icon
|
||||
name="exclamation-triangle-fill"
|
||||
class="text-warning"
|
||||
label=${msg("Note about page counts")}
|
||||
></sl-icon>
|
||||
</sl-tooltip> `,
|
||||
)}
|
||||
${when(
|
||||
qaRun.stats,
|
||||
(stats) => html`
|
||||
@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
|
||||
? msg("No Data")
|
||||
: threshold?.label}
|
||||
<div class="text-xs opacity-80">
|
||||
${bar.lowerBoundary !== "No data"
|
||||
${!["No data", "Files"].includes(bar.lowerBoundary)
|
||||
? html`${idx === 0
|
||||
? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
|
||||
: idx === qaStatsThresholds.length - 1
|
||||
? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
|
||||
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
|
||||
match <br />`
|
||||
: `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
|
||||
${msg("match")} <br />`
|
||||
: nothing}
|
||||
${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
|
||||
</div>
|
||||
|
Loading…
Reference in New Issue
Block a user