compute top page origins for each collection (#2483)
A quick PR to fix #2482:

- compute topPageHosts as part of the existing collection stats computation
- store the top 10 results in the collection for now
- display in the collection About sidebar

Fixes #2482

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent 0691f43be6
commit 1570011ec7
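
For context, a minimal sketch of what the new field looks like when fetching a collection over the API, modeled on the backend test assertions further down (the deployment URL, IDs, and auth header are placeholders, and the endpoint path is an assumption):

import requests

API_URL = "https://app.example.com"  # placeholder deployment URL
org_id = "<org-id>"                  # placeholder org id
coll_id = "<collection-id>"          # placeholder collection id
auth_headers = {"Authorization": "Bearer <token>"}  # placeholder token

# Fetch the collection and read the new field (endpoint path assumed
# from the collections API exercised in the tests below).
r = requests.get(
    f"{API_URL}/api/orgs/{org_id}/collections/{coll_id}",
    headers=auth_headers,
)
data = r.json()

# Top hosts by page count, most common first, e.g.:
# [{"host": "webrecorder.net", "count": 3}]
print(data["topPageHosts"])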
@@ -705,6 +705,8 @@ class CollectionOps:
         unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
+        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)
+
         await self.collections.find_one_and_update(
             {"_id": collection_id},
             {
@@ -32,7 +32,7 @@ else:
 ) = PageOps = BackgroundJobOps = object


-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"


 # ============================================================================
backend/btrixcloud/migrations/migration_0044_coll_stats.py (new file, 44 lines)
@@ -0,0 +1,44 @@
"""
Migration 0044 - Recalculate collection stats
"""

from btrixcloud.migrations import BaseMigration


MIGRATION_VERSION = "0044"


# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.coll_ops = kwargs.get("coll_ops")

    async def migrate_up(self):
        """Perform migration up.

        Recalculate collection stats to get top host names
        """
        colls_mdb = self.mdb["collections"]

        if self.coll_ops is None:
            print(
                "Unable to set collection stats, missing coll_ops",
                flush=True,
            )
            return

        async for coll in colls_mdb.find({}):
            coll_id = coll["_id"]
            try:
                await self.coll_ops.update_collection_counts_and_tags(coll_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page stats for collection {coll_id}: {err}",
                    flush=True,
                )
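
Not part of the commit, but one quick way to spot-check the migration after it runs is to look for the new field directly in Mongo (the connection URL and database name below are assumptions):

from motor.motor_asyncio import AsyncIOMotorClient

async def check_top_page_hosts(
    mongo_url: str = "mongodb://localhost:27017",  # assumed connection URL
    db_name: str = "browsertrix-cloud",            # assumed database name
):
    colls = AsyncIOMotorClient(mongo_url)[db_name]["collections"]
    # After migration 0044, every collection should carry a topPageHosts list.
    async for coll in colls.find({}, {"name": 1, "topPageHosts": 1}):
        print(coll.get("name"), coll.get("topPageHosts", []))

# e.g. asyncio.run(check_top_page_hosts())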
@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
     crawlId: str


+# ============================================================================
+class HostCount(BaseModel):
+    """Host Count"""
+
+    host: str
+    count: int
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
     pagesQueryUrl: str = ""
     downloadUrl: Optional[str] = None

+    topPageHosts: List[HostCount] = []
+

 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):

     allowPublicDownload: bool = True

+    topPageHosts: List[HostCount] = []
+

 # ============================================================================
 class UpdateColl(BaseModel):
@@ -923,6 +923,35 @@ class PageOps:
         res = await cursor.to_list(1)
         return res[0].get("urls") if res else 0

+    async def get_top_page_hosts(
+        self, crawl_ids: List[str]
+    ) -> List[dict[str, str | int]]:
+        """Get count of top page hosts across all archived items"""
+        cursor = self.pages.aggregate(
+            [
+                {"$match": {"crawl_id": {"$in": crawl_ids}}},
+                {
+                    "$addFields": {
+                        "host": {
+                            "$regexFind": {
+                                "input": "$url",
+                                "regex": "^https?://([^/]+)",
+                            }
+                        }
+                    }
+                },
+                {
+                    "$group": {
+                        "_id": {"$first": "$host.captures"},
+                        "count": {"$count": {}},
+                    }
+                },
+                {"$sort": {"count": -1}},
+            ]
+        )
+        res = await cursor.to_list(10)
+        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]
+
     async def set_archived_item_page_counts(self, crawl_id: str):
         """Store archived item page and unique page counts in crawl document"""
        page_count = await self.pages.count_documents({"crawl_id": crawl_id})
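
For intuition, here is a rough pure-Python equivalent of what that aggregation computes (a sketch with made-up URLs, not code from the PR): extract the host with the same regex, count pages per host, and keep the most common ones.

import re
from collections import Counter

HOST_RE = re.compile(r"^https?://([^/]+)")

def top_page_hosts(urls, limit=10):
    """Mirror $match + $regexFind + $group + $sort: count pages per host."""
    counts = Counter()
    for url in urls:
        match = HOST_RE.match(url)
        if match:
            counts[match.group(1)] += 1
    return [{"host": host, "count": count} for host, count in counts.most_common(limit)]

# Made-up example:
# top_page_hosts(["https://webrecorder.net/", "https://webrecorder.net/blog", "http://example.com/"])
# -> [{'host': 'webrecorder.net', 'count': 2}, {'host': 'example.com', 'count': 1}]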
@@ -94,6 +94,8 @@ def test_create_collection(
     assert data["defaultThumbnailName"] == default_thumbnail_name
     assert data["allowPublicDownload"]

+    assert data["topPageHosts"] == [{'count': 3, 'host': 'webrecorder.net'}]
+

 def test_create_public_collection(
     crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
@@ -223,6 +225,7 @@ def test_update_collection(
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["topPageHosts"]


 def test_rename_collection(
@@ -310,6 +313,7 @@ def test_add_remove_crawl_from_collection(
     assert data["tags"] == ["wr-test-2", "wr-test-1"]
     assert data["dateEarliest"]
     assert data["dateLatest"]
+    assert data["topPageHosts"] == [{'count': 7, 'host': 'webrecorder.net'}]

     # Verify it was added
     r = requests.get(
@@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection(
     assert data.get("tags", []) == []
     assert data.get("dateEarliest") is None
     assert data.get("dateLatest") is None
+    assert data["topPageHosts"] == []

     # Verify they were removed
     r = requests.get(
@@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection(
     assert data["tags"] == ["wr-test-2", "wr-test-1"]
     assert data["dateEarliest"]
     assert data["dateLatest"]
+    assert data["topPageHosts"]


 def test_get_collection(crawler_auth_headers, default_org_id):
@@ -1137,6 +1143,7 @@ def test_list_public_collections(
         assert collection["pageCount"] > 0
         assert collection["uniquePageCount"] > 0
         assert collection["totalSize"] > 0
+        assert collection["topPageHosts"]

     # Test non-existing slug - it should return a 404 but not reveal
     # whether or not an org exists with that slug
@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
         label: metadata.totalSize,
         render: (col) => `${localize.bytes(col.totalSize)}`,
       })}
+      ${metadataItem({
+        label: metadata.topPageHosts,
+        render: (col) =>
+          html` <table>
+            ${col.topPageHosts.map(
+              (x) => html`
+                <tr>
+                  <td>${x.host}</td>
+                  <td class="pl-4">${x.count}</td>
+                </tr>
+              `,
+            )}
+          </table>`,
+      })}
     </btrix-desc-list>
   `;
 }
@@ -5,4 +5,5 @@ export const metadata = {
   uniquePageCount: msg("Unique Pages in Collection"),
   pageCount: msg("Total Pages Crawled"),
   totalSize: msg("Collection Size"),
+  topPageHosts: msg("Top Page Hostnames"),
 };
@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
   crawlCount: z.number(),
   uniquePageCount: z.number(),
   pageCount: z.number(),
+  topPageHosts: z.array(
+    z.object({
+      host: z.string(),
+      count: z.number(),
+    }),
+  ),
   totalSize: z.number(),
   allowPublicDownload: z.boolean(),
   homeUrl: z.string().url().nullable(),