compute top page origins for each collection (#2483)

A quick PR to fix #2482:
- compute topPageHosts as part of the existing collection stats computation
- store the top 10 results on the collection for now
- display in the collection About sidebar (example field shape below)
- fixes #2482
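
The new field is a list of host/count objects sorted by descending count; for example, the updated tests expect:

    data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]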

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2025-05-08 14:22:40 -07:00, committed by GitHub
parent 0691f43be6
commit 1570011ec7
9 changed files with 117 additions and 1 deletion


@@ -705,6 +705,8 @@ class CollectionOps:
        unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
        top_page_hosts = await self.page_ops.get_top_page_hosts(crawl_ids)

        await self.collections.find_one_and_update(
            {"_id": collection_id},
            {
@@ -715,6 +717,7 @@ class CollectionOps:
                    "totalSize": total_size,
                    "tags": sorted_tags,
                    "preloadResources": preload_resources,
                    "topPageHosts": top_page_hosts,
                }
            },
        )


@@ -32,7 +32,7 @@ else:
    ) = PageOps = BackgroundJobOps = object

-CURR_DB_VERSION = "0043"
+CURR_DB_VERSION = "0044"


# ============================================================================


@@ -0,0 +1,44 @@
"""
Migration 0044 - Recalculate collection stats
"""

from btrixcloud.migrations import BaseMigration

MIGRATION_VERSION = "0044"


# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.coll_ops = kwargs.get("coll_ops")

    async def migrate_up(self):
        """Perform migration up.

        Recalculate collection stats to get top host names
        """
        colls_mdb = self.mdb["collections"]

        if self.coll_ops is None:
            print(
                "Unable to set collection stats, missing coll_ops",
                flush=True,
            )
            return

        async for coll in colls_mdb.find({}):
            coll_id = coll["_id"]
            try:
                await self.coll_ops.update_collection_counts_and_tags(coll_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update page stats for collection {coll_id}: {err}",
                    flush=True,
                )


@@ -1417,6 +1417,14 @@ class PreloadResource(BaseModel):
    crawlId: str


# ============================================================================
class HostCount(BaseModel):
    """Host Count"""

    host: str
    count: int


# ============================================================================
class Collection(BaseMongoModel):
    """Org collection structure"""

@@ -1515,6 +1523,8 @@ class CollOut(BaseMongoModel):
    pagesQueryUrl: str = ""
    downloadUrl: Optional[str] = None
    topPageHosts: List[HostCount] = []


# ============================================================================
class PublicCollOut(BaseMongoModel):

@@ -1550,6 +1560,8 @@ class PublicCollOut(BaseMongoModel):
    allowPublicDownload: bool = True

    topPageHosts: List[HostCount] = []


# ============================================================================
class UpdateColl(BaseModel):


@@ -923,6 +923,35 @@ class PageOps:
        res = await cursor.to_list(1)
        return res[0].get("urls") if res else 0

    async def get_top_page_hosts(
        self, crawl_ids: List[str]
    ) -> List[dict[str, str | int]]:
        """Get count of top page hosts across all archived items"""
        cursor = self.pages.aggregate(
            [
                {"$match": {"crawl_id": {"$in": crawl_ids}}},
                {
                    "$addFields": {
                        "host": {
                            "$regexFind": {
                                "input": "$url",
                                "regex": "^https?://([^/]+)",
                            }
                        }
                    }
                },
                {
                    "$group": {
                        "_id": {"$first": "$host.captures"},
                        "count": {"$count": {}},
                    }
                },
                {"$sort": {"count": -1}},
            ]
        )

        res = await cursor.to_list(10)
        return [{"host": x.get("_id"), "count": x.get("count")} for x in res]

    async def set_archived_item_page_counts(self, crawl_id: str):
        """Store archived item page and unique page counts in crawl document"""
        page_count = await self.pages.count_documents({"crawl_id": crawl_id})
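
For illustration, here is a minimal pure-Python sketch of what the aggregation pipeline above computes, using made-up URLs (the real query runs inside MongoDB via $regexFind, $group, and $sort; URLs that don't match the regex are simply skipped in this sketch):

import re
from collections import Counter

# Sample page URLs standing in for the "url" field of page documents.
urls = [
    "https://webrecorder.net/",
    "https://webrecorder.net/about",
    "http://example.com/page",
]

# Mirror $regexFind on "^https?://([^/]+)": capture the host portion.
matches = (re.match(r"^https?://([^/]+)", url) for url in urls)
hosts = (m.group(1) for m in matches if m)

# Mirror $group / $sort / to_list(10): count per host, top 10 by count.
top = Counter(hosts).most_common(10)
print([{"host": host, "count": count} for host, count in top])
# [{'host': 'webrecorder.net', 'count': 2}, {'host': 'example.com', 'count': 1}]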


@@ -94,6 +94,8 @@ def test_create_collection(
    assert data["defaultThumbnailName"] == default_thumbnail_name
    assert data["allowPublicDownload"]
    assert data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]


def test_create_public_collection(
    crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id

@@ -223,6 +225,7 @@ def test_update_collection(
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["defaultThumbnailName"]
    assert data["topPageHosts"]


def test_rename_collection(

@@ -310,6 +313,7 @@ def test_add_remove_crawl_from_collection(
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"] == [{"count": 7, "host": "webrecorder.net"}]

    # Verify it was added
    r = requests.get(

@@ -335,6 +339,7 @@ def test_add_remove_crawl_from_collection(
    assert data.get("tags", []) == []
    assert data.get("dateEarliest") is None
    assert data.get("dateLatest") is None
    assert data["topPageHosts"] == []

    # Verify they were removed
    r = requests.get(

@@ -366,6 +371,7 @@ def test_add_remove_crawl_from_collection(
    assert data["tags"] == ["wr-test-2", "wr-test-1"]
    assert data["dateEarliest"]
    assert data["dateLatest"]
    assert data["topPageHosts"]


def test_get_collection(crawler_auth_headers, default_org_id):

@@ -1137,6 +1143,7 @@ def test_list_public_collections(
        assert collection["pageCount"] > 0
        assert collection["uniquePageCount"] > 0
        assert collection["totalSize"] > 0
        assert collection["topPageHosts"]

    # Test non-existing slug - it should return a 404 but not reveal
    # whether or not an org exists with that slug


@@ -56,6 +56,20 @@ export function metadataColumn(collection?: Collection | PublicCollection) {
        label: metadata.totalSize,
        render: (col) => `${localize.bytes(col.totalSize)}`,
      })}
      ${metadataItem({
        label: metadata.topPageHosts,
        render: (col) =>
          html` <table>
            ${col.topPageHosts.map(
              (x) => html`
                <tr>
                  <td>${x.host}</td>
                  <td class="pl-4">${x.count}</td>
                </tr>
              `,
            )}
          </table>`,
      })}
    </btrix-desc-list>
  `;
}


@@ -5,4 +5,5 @@ export const metadata = {
  uniquePageCount: msg("Unique Pages in Collection"),
  pageCount: msg("Total Pages Crawled"),
  totalSize: msg("Collection Size"),
  topPageHosts: msg("Top Page Hostnames"),
};


@@ -41,6 +41,12 @@ export const publicCollectionSchema = z.object({
  crawlCount: z.number(),
  uniquePageCount: z.number(),
  pageCount: z.number(),
  topPageHosts: z.array(
    z.object({
      host: z.string(),
      count: z.number(),
    }),
  ),
  totalSize: z.number(),
  allowPublicDownload: z.boolean(),
  homeUrl: z.string().url().nullable(),