Include number of Identical Files in QA stats and meter (#1848)
This PR adds Identical Files to the QA Page Match Analysis meter bars. To do this, the backend calculates the number of non-HTML pages once and includes it under the key `Files` in each of the `screenshotMatch` and `textMatch` QA stats return arrays. The backend also subtracts the file count from the "No Data" bucket so that these pages are not counted twice.

---------

Co-authored-by: emma <hi@emma.cafe>
parent e3ee63f9b0
commit a85f9496b0
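For context, this is roughly the shape of the QA stats payload after the change — a hedged sketch with made-up counts, based on the description above and the test expectations further down. The numeric boundaries depend on the thresholds the caller requests, and the "No data" bucket only appears when the aggregation returns one; bucket ordering here is illustrative.

```python
# Hedged, illustrative example of the QA stats payload after this change; all counts are made up.
expected_stats = {
    "screenshotMatch": [
        {"lowerBoundary": "No data", "count": 2},  # file pages are no longer counted here
        {"lowerBoundary": "0.0", "count": 4},
        {"lowerBoundary": "0.7", "count": 10},
        {"lowerBoundary": "0.9", "count": 30},
        {"lowerBoundary": "Files", "count": 3},    # new bucket: non-HTML pages, counted once
    ],
    "textMatch": [
        {"lowerBoundary": "No data", "count": 2},
        {"lowerBoundary": "0.0", "count": 1},
        {"lowerBoundary": "0.7", "count": 9},
        {"lowerBoundary": "0.9", "count": 34},
        {"lowerBoundary": "Files", "count": 3},
    ],
}
```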
```diff
@@ -957,14 +957,16 @@ class CrawlOps(BaseCrawlOps):
         thresholds: Dict[str, List[float]],
     ) -> QARunAggregateStatsOut:
         """Get aggregate stats for QA run"""
+        file_count = await self.page_ops.get_crawl_file_count(crawl_id)
         screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="screenshotMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="screenshotMatch"
         )
         text_results = await self.page_ops.get_qa_run_aggregate_counts(
-            crawl_id, qa_run_id, thresholds, key="textMatch"
+            crawl_id, qa_run_id, thresholds, file_count, key="textMatch"
         )
         return QARunAggregateStatsOut(
-            screenshotMatch=screenshot_results, textMatch=text_results
+            screenshotMatch=screenshot_results,
+            textMatch=text_results,
         )
 
 
```
```diff
@@ -501,6 +501,34 @@ class PageOps:
 
         return [PageOut.from_dict(data) for data in items], total
 
+    async def get_crawl_file_count(self, crawl_id: str):
+        """Get count of pages in crawl that are files and don't need to be QAed"""
+        aggregate = [
+            {
+                "$match": {
+                    "crawl_id": crawl_id,
+                    "loadState": 2,
+                    "mime": {"$not": {"$regex": "^.*html", "$options": "i"}},
+                }
+            },
+            {"$count": "count"},
+        ]
+
+        cursor = self.pages.aggregate(aggregate)
+        results = await cursor.to_list(length=1)
+
+        if not results:
+            return 0
+
+        result = results[0]
+
+        try:
+            total = int(result["count"])
+        except (IndexError, ValueError):
+            total = 0
+
+        return total
+
     async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
         """Delete existing pages for crawl and re-add from WACZs."""
         await self.delete_crawl_pages(crawl_id, oid)
```
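As a rough illustration of what the new pipeline counts: the `$match` stage keeps fully loaded pages (`loadState: 2`) in the given crawl whose MIME type does not contain "html" (case-insensitive). The sketch below mirrors that predicate in plain Python against hypothetical page documents; the document values and the helper name `is_qa_exempt_file` are made up for illustration.

```python
import re

# Hypothetical page documents, for illustration only.
pages = [
    {"crawl_id": "crawl-1", "loadState": 2, "mime": "text/html"},        # HTML page: not counted
    {"crawl_id": "crawl-1", "loadState": 2, "mime": "application/pdf"},  # file: counted
    {"crawl_id": "crawl-1", "loadState": 0, "mime": "image/png"},        # not fully loaded: not counted
    {"crawl_id": "crawl-2", "loadState": 2, "mime": "application/pdf"},  # different crawl: not counted
]

def is_qa_exempt_file(page, crawl_id="crawl-1"):
    """Mirror the aggregation's $match stage: same crawl, fully loaded, non-HTML MIME."""
    return (
        page["crawl_id"] == crawl_id
        and page["loadState"] == 2
        and not re.match(r"^.*html", page["mime"], re.IGNORECASE)
    )

print(sum(is_qa_exempt_file(p) for p in pages))  # -> 1
```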
```diff
@@ -520,6 +548,7 @@ class PageOps:
         crawl_id: str,
         qa_run_id: str,
         thresholds: Dict[str, List[float]],
+        file_count: int,
         key: str = "screenshotMatch",
     ):
         """Get counts for pages in QA run in buckets by score key based on thresholds"""
```
```diff
@@ -556,11 +585,17 @@ class PageOps:
         return_data = []
 
         for result in results:
-            return_data.append(
-                QARunBucketStats(
-                    lowerBoundary=str(result.get("_id")), count=result.get("count", 0)
-                )
-            )
+            key = str(result.get("_id"))
+            if key == "No data":
+                count = result.get("count", 0) - file_count
+                return_data.append(QARunBucketStats(lowerBoundary=key, count=count))
+            else:
+                return_data.append(
+                    QARunBucketStats(lowerBoundary=key, count=result.get("count", 0))
+                )
+
+        # Add file count
+        return_data.append(QARunBucketStats(lowerBoundary="Files", count=file_count))
 
         # Add missing boundaries to result and re-sort
         for boundary in boundaries:
```
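To make the double-counting fix concrete: non-HTML file pages have no match score, so they land in the aggregation's "No data" bucket; the new code moves their count into the "Files" bucket instead of reporting it in both places. A small sketch of the same bookkeeping with made-up numbers (the input `results` and `file_count` here are hypothetical):

```python
# Hypothetical aggregation output and file count, for illustration only.
results = [
    {"_id": "No data", "count": 5},  # 2 unscored HTML pages + 3 non-HTML files
    {"_id": 0.9, "count": 12},
]
file_count = 3

buckets = []
for result in results:
    key = str(result["_id"])
    # Subtract files from "No data" so they are only counted in the "Files" bucket.
    count = result["count"] - file_count if key == "No data" else result["count"]
    buckets.append({"lowerBoundary": key, "count": count})
buckets.append({"lowerBoundary": "Files", "count": file_count})

print(buckets)
# [{'lowerBoundary': 'No data', 'count': 2},
#  {'lowerBoundary': '0.9', 'count': 12},
#  {'lowerBoundary': 'Files', 'count': 3}]
```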
```diff
@@ -329,11 +329,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test we get expected results with explicit 0 boundary
```
```diff
@@ -348,11 +350,13 @@ def test_qa_stats(
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
     assert data["textMatch"] == [
         {"lowerBoundary": "0.0", "count": 0},
         {"lowerBoundary": "0.7", "count": 0},
         {"lowerBoundary": "0.9", "count": 1},
+        {"lowerBoundary": "Files", "count": 0},
     ]
 
     # Test that missing threshold values result in 422 HTTPException
```
```diff
@@ -44,7 +44,7 @@ import { formatNumber, getLocale } from "@/utils/localization";
 import { pluralOf } from "@/utils/pluralize";
 
 type QAStatsThreshold = {
-  lowerBoundary: `${number}` | "No data";
+  lowerBoundary: `${number}` | "No data" | "Files";
   count: number;
 };
 type QAStats = Record<"screenshotMatch" | "textMatch", QAStatsThreshold[]>;
```
```diff
@@ -65,6 +65,11 @@ const qaStatsThresholds = [
     cssColor: "var(--sl-color-success-500)",
     label: msg("Good Match"),
   },
+  {
+    lowerBoundary: "Files",
+    cssColor: "var(--sl-color-neutral-500)",
+    label: msg("Identical Files"),
+  },
 ];
 
 const notApplicable = () =>
```
```diff
@@ -527,26 +532,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
               })}
             </div>
             <div class="flex items-center gap-2 text-neutral-500">
-              ${when(
-                qaRun.state.startsWith("stop") ||
-                  (qaRun.state === "complete" &&
-                    qaRun.stats.done < qaRun.stats.found),
-                () =>
-                  html`<sl-tooltip
-                    content=${qaRun.state.startsWith("stop")
-                      ? msg("This analysis run was stopped and is not complete.")
-                      : msg(
-                          "Not all pages in this crawl were analyzed. This is likely because some pages are not HTML pages, but other types of documents.",
-                        )}
-                    class="[--max-width:theme(spacing.56)]"
-                  >
-                    <sl-icon
-                      name="exclamation-triangle-fill"
-                      class="text-warning"
-                      label=${msg("Note about page counts")}
-                    ></sl-icon>
-                  </sl-tooltip> `,
-              )}
               ${when(
                 qaRun.stats,
                 (stats) => html`
```
```diff
@@ -653,13 +638,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
                         ? msg("No Data")
                         : threshold?.label}
                       <div class="text-xs opacity-80">
-                        ${bar.lowerBoundary !== "No data"
+                        ${!["No data", "Files"].includes(bar.lowerBoundary)
                           ? html`${idx === 0
                               ? `<${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`
                               : idx === qaStatsThresholds.length - 1
                                 ? `>=${threshold ? +threshold.lowerBoundary * 100 : 0}%`
-                                : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100}%`}
-                            match <br />`
+                                : `${threshold ? +threshold.lowerBoundary * 100 : 0}-${+qaStatsThresholds[idx + 1].lowerBoundary * 100 || 100}%`}
+                            ${msg("match")} <br />`
                           : nothing}
                         ${formatNumber(bar.count)} ${pluralOf("pages", bar.count)}
                       </div>
```