compute top page origins for each collection (#2483)

A quick PR to fix #2482:
- compute topPageHosts as part of the existing collection stats computation
- store the top 10 results on the collection for now
- display in the collection About sidebar (example field shape below)
- fixes #2482
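
The new field is a list of host/count objects sorted by descending count; for example, the updated tests expect:

    data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]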

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2025-05-08 14:22:40 -07:00, committed by GitHub
parent 0691f43be6
commit 1570011ec7
9 changed files with 117 additions and 1 deletion


@@ -705,6 +705,8 @@ class CollectionOps:
        unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)

        await self.collections.find_one_and_update(
            {"_id": collection_id},
            {
@@ -715,6 +717,7 @@ class CollectionOps:
                    "totalSize": total_size,
                    "tags": sorted_tags,
                    "preloadResources": preload_resources,
                    "topPageHosts": top_page_hosts,
                }
            },
        )


@@ -32,7 +32,7 @@ else:
    ) = PageOps = BackgroundJobOps = object

-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"


# ============================================================================


@@ -0,0 +1,44 @@
"""
Migration 0044 - Recalculate collection stats
"""

from btrixcloud.migrations import BaseMigration

MIGRATION_VERSION = "0044"


# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.coll_ops = kwargs.get("coll_ops")

    async def migrate_up(self):
        """Perform migration up.

        Recalculate collection stats to get top host names
        """
        colls_mdb = self.mdb["collections"]

        if self.coll_ops is None:
            print(
                "Unable to set collection stats, missing coll_ops",
                flush=True,
            )
            return

        async for coll in colls_mdb.find({}):
            coll_id = coll["_id"]
            try:
                await self.coll_ops.update_collection_counts_and_tags(coll_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page stats for collection {coll_id}: {err}",
                    flush=True,
                )


@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
    crawlId: str


# ============================================================================
class HostCount(BaseModel):
    """Host Count"""

    host: str
    count: int


# ============================================================================
class Collection(BaseMongoModel):
    """Org collection structure"""

@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
    pagesQueryUrl: str = ""
    downloadUrl: Optional[str] = None
    topPageHosts: List[HostCount] = []


# ============================================================================
class PublicCollOut(BaseMongoModel):

@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
    allowPublicDownload: bool = True

    topPageHosts: List[HostCount] = []


# ============================================================================
class UpdateColl(BaseModel):


@@ -923,6 +923,35 @@ class PageOps:
        res = await cursor.to_list(1)
        return res[0].get("urls") if res else 0

    async def get_top_page_hosts(
        self, crawl_ids: List[str]
    ) -> List[dict[str, str | int]]:
        """Get count of top page hosts across all archived items"""
        cursor = self.pages.aggregate(
            [
                {"$match": {"crawl_id": {"$in": crawl_ids}}},
                {
                    "$addFields": {
                        "host": {
                            "$regexFind": {
                                "input": "$url",
                                "regex": "^https?://([^/]+)",
                            }
                        }
                    }
                },
                {
                    "$group": {
                        "_id": {"$first": "$host.captures"},
                        "count": {"$count": {}},
                    }
                },
                {"$sort": {"count": -1}},
            ]
        )

        res = await cursor.to_list(10)
        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]

    async def set_archived_item_page_counts(self, crawl_id: str):
        """Store archived item page and unique page counts in crawl document"""
        page_count = await self.pages.count_documents({"crawl_id": crawl_id})
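
For illustration, here is a minimal pure-Python sketch of what the aggregation pipeline above computes, using made-up URLs (the real query runs inside MongoDB via $regexFind, $group, and $sort; URLs that don't match the regex are simply skipped in this sketch):

import re
from collections import Counter

# Sample page URLs standing in for the "url" field of page documents.
urls = [
    "https://webrecorder.net/",
    "https://webrecorder.net/about",
    "http://example.com/page",
]

# Mirror $regexFind on "^https?://([^/]+)": capture the host portion.
matches = (re.match(r"^https?://([^/]+)", url) for url in urls)
hosts = (m.group(1) for m in matches if m)

# Mirror $group / $sort / to_list(10): count per host, top 10 by count.
top = Counter(hosts).most_common(10)
print([{"host": host, "count": count} for host, count in top])
# [{'host': 'webrecorder.net', 'count': 2}, {'host': 'example.com', 'count': 1}]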


@@ -94,6 +94,8 @@ def test_create_collection(
    assert data["defaultThumbnailName"] == default_thumbnail_name
    assert data["allowPublicDownload"]
    assert data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]


def test_create_public_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id

@@ -223,6 +225,7 @@ def test_update_collection(
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]
    assert data["topPageHosts"]


def test_rename_collection(

@@ -310,6 +313,7 @@ def test_add_remove_crawl_from_collection(
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"] == [{"count": 7, "host": "webrecorder.net"}]

    # Verify it was added
    r = requests.get(

@@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection(
    assert data.get("tags", []) == []
    assert data.get("dateEarliest") is None
    assert data.get("dateLatest") is None
    assert data["topPageHosts"] == []

    # Verify they were removed
    r = requests.get(

@@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection(
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"]


def test_get_collection(crawler_auth_headers, default_org_id):

@@ -1137,6 +1143,7 @@ def test_list_public_collections(
        assert collection["pageCount"] > 0
        assert collection["uniquePageCount"] > 0
        assert collection["totalSize"] > 0
        assert collection["topPageHosts"]

    # Test non-existing slug - it should return a 404 but not reveal
    # whether or not an org exists with that slug


@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
        label: metadata.totalSize,
        render: (col) => `${localize.bytes(col.totalSize)}`,
      })}
      ${metadataItem({
        label: metadata.topPageHosts,
        render: (col) =>
          html` <table>
            ${col.topPageHosts.map(
              (x) => html`
                <tr>
                  <td>${x.host}</td>
                  <td class="pl-4">${x.count}</td>
                </tr>
              `,
            )}
          </table>`,
      })}
    </btrix-desc-list>
  `;
}


@@ -5,4 +5,5 @@ export const metadata = {
  uniquePageCount: msg("Unique Pages in Collection"),
  pageCount: msg("Total Pages Crawled"),
  totalSize: msg("Collection Size"),
  topPageHosts: msg("Top Page Hostnames"),
};


@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
  crawlCount: z.number(),
  uniquePageCount: z.number(),
  pageCount: z.number(),
  topPageHosts: z.array(
    z.object({
      host: z.string(),
      count: z.number(),
    }),
  ),
  totalSize: z.number(),
  allowPublicDownload: z.boolean(),
  homeUrl: z.string().url().nullable(),