Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)

Adds `filename` to pages, pointing to the WACZ file each page comes
from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an
idempotent migration to backfill this information for existing pages,
and increases the backend container's startupProbe window to 24 hours
to give it sufficient time to finish the migration.
---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2025-02-05 15:50:04 -05:00 committed by GitHub
parent 8cfa28733a
commit 0e9e70f3a3
8 changed files with 131 additions and 4 deletions
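
The backfill keys off pages whose `filename` is still unset, so its progress can be spot-checked directly against the pages collection. A minimal sketch of such a check, assuming direct access to the deployment's MongoDB (the connection URL and database name below are placeholders, not part of this change):

from pymongo import MongoClient

# Placeholder connection URL and database name -- adjust for the deployment.
client = MongoClient("mongodb://localhost:27017")
pages = client["browsertrixcloud"]["pages"]

# Pages not yet touched by migration 0042 still have no filename set.
remaining = pages.count_documents({"filename": None})
print(f"Pages still missing a WACZ filename: {remaining}")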


@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration

-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"
# ============================================================================


@@ -0,0 +1,50 @@
"""
Migration 0042 - Add filename to pages
"""

from btrixcloud.migrations import BaseMigration

MIGRATION_VERSION = "0042"


class Migration(BaseMigration):
    """Migration class."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration up.

        Add filename to all pages that don't currently have it stored,
        iterating through each archived item and its WACZ files as necessary
        """
        pages_mdb = self.mdb["pages"]

        if self.page_ops is None:
            print(
                "Unable to add filename and other fields to pages, missing page_ops",
                flush=True,
            )
            return

        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})

        crawl_count = len(crawl_ids_to_update)
        current_index = 1

        for crawl_id in crawl_ids_to_update:
            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
            try:
                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
                    flush=True,
                )

            current_index += 1
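
Because only archived items that still have pages without `filename` are selected, re-running the migration after a partial or interrupted run simply resumes with the remaining items, which is what makes the backfill safe to repeat.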


@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
    loadState: Optional[int] = None
    status: Optional[int] = None
    mime: Optional[str] = None
    filename: Optional[str] = None
    depth: Optional[int] = None
    favIconUrl: Optional[AnyHttpUrl] = None
    isSeed: Optional[bool] = False

    # manual review
    userid: Optional[UUID] = None
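
For illustration, a page returned by the pages API now carries these extra fields; the values below are hypothetical:

page = {
    "id": "7b0ae3a0-2e5f-4f7b-9c6d-1f2a3b4c5d6e",  # hypothetical page id
    "url": "https://example.com/",
    "ts": "2025-02-05T20:50:04Z",
    "filename": "crawl-20250205-part-0.wacz",  # basename of the WACZ the page came from
    "depth": 0,
    "favIconUrl": "https://example.com/favicon.ico",
    "isSeed": True,
}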


@@ -1,6 +1,7 @@
"""crawl pages"""

import asyncio
import os
import traceback
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union

@@ -83,6 +84,7 @@ class PageOps:
                if len(pages_buffer) > batch_size:
                    await self._add_pages_to_db(crawl_id, pages_buffer)
                    pages_buffer = []

                pages_buffer.append(
                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)

@@ -100,6 +102,53 @@ class PageOps:
            traceback.print_exc()
            print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)

    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
        try:
            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
            if not crawl.resources:
                return

            for wacz_file in crawl.resources:
                # Strip oid directory from filename
                filename = os.path.basename(wacz_file.name)

                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
                for page_dict in stream:
                    if not page_dict.get("url"):
                        continue

                    page_id = page_dict.get("id")
                    if not page_id:
                        continue

                    if page_id:
                        try:
                            page_id = UUID(page_id)
                        # pylint: disable=broad-exception-caught
                        except Exception:
                            continue

                    await self.pages.find_one_and_update(
                        {"_id": page_id},
                        {
                            "$set": {
                                "filename": filename,
                                "depth": page_dict.get("depth"),
                                "isSeed": page_dict.get("seed", False),
                                "favIconUrl": page_dict.get("favIconUrl"),
                            }
                        },
                    )
        # pylint: disable=broad-exception-caught, raise-missing-from
        except Exception as err:
            traceback.print_exc()
            print(
                f"Error adding filename to pages from item {crawl_id} to db: {err}",
                flush=True,
            )

    def _get_page_from_dict(
        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
    ) -> Page:

@@ -127,6 +176,10 @@ class PageOps:
            loadState=page_dict.get("loadState"),
            status=status,
            mime=page_dict.get("mime", "text/html"),
            filename=page_dict.get("filename"),
            depth=page_dict.get("depth"),
            isSeed=page_dict.get("seed", False),
            favIconUrl=page_dict.get("favIconUrl"),
            ts=(str_to_date(ts) if ts else dt_now()),
        )
        p.compute_page_type()


@@ -619,7 +619,9 @@ class StorageOps:
            line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
            for line in line_iter:
-               yield _parse_json(line.decode("utf-8", errors="ignore"))
+               page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+               page_json["filename"] = os.path.basename(wacz_filename)
+               yield page_json

        page_generators: List[Iterator[Dict[Any, Any]]] = []
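
With this change, every page record streamed out of a WACZ is annotated with the basename of the archive it came from, which is what both the initial page import and the backfill above pick up as `filename`.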


@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
assert page.get("title") or page.get("title") is None
assert page["loadState"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)
@@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
assert page["loadState"]
assert page["status"]
assert page["mime"]
assert page["filename"]
assert page["depth"] is not None
assert page["favIconUrl"]
assert page["isSeed"] in (True, False)
assert page["isError"] in (True, False)
assert page["isFile"] in (True, False)


@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["crawl_id"] == upload_id
assert page["url"]
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None
page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["crawl_id"]
assert page["url"]
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None
assert page["notes"] == []


@@ -123,8 +123,8 @@ spec:
          httpGet:
            path: /healthzStartup
            port: 8000
-         periodSeconds: 5
-         failureThreshold: 60
+         periodSeconds: 10
+         failureThreshold: 8640
          successThreshold: 1
        readinessProbe:
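
With `periodSeconds: 10` and `failureThreshold: 8640`, the startup probe now tolerates up to 86,400 seconds (24 hours) of startup time, giving the page backfill migration room to finish on large deployments.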