Add WACZ filename, depth, favIconUrl, isSeed to pages (#2352)
Adds `filename` to pages, pointing to the WACZ file each page comes from, as well as `depth`, `favIconUrl`, and `isSeed`. Also adds an idempotent migration to backfill this information for existing pages, and increases the backend container's startupProbe window to 24 hours so the migration has sufficient time to finish.

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
commit 0e9e70f3a3 (parent 8cfa28733a)
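Not part of the commit — a minimal sketch, assuming a local MongoDB URI and a database named "browsertrix", of the selection criterion the migration keys on. Only pages still missing `filename` are picked up, which is what makes re-running the migration a no-op once backfilling is done:

# Hypothetical standalone check mirroring the migration's query:
# pages that still lack "filename", grouped by their archived item.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")   # assumed dev URI
pages = client["browsertrix"]["pages"]              # assumed database name

crawl_ids = pages.distinct("crawl_id", {"filename": None})
print(f"{len(crawl_ids)} archived items still need backfilling")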
@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0041"
+CURR_DB_VERSION = "0042"
 
 
 # ============================================================================
@@ -0,0 +1,50 @@
+"""
+Migration 0042 - Add filename to pages
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0042"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.page_ops = kwargs.get("page_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Add filename to all pages that don't currently have it stored,
+        iterating through each archived item and its WACZ files as necessary
+        """
+        pages_mdb = self.mdb["pages"]
+
+        if self.page_ops is None:
+            print(
+                "Unable to add filename and other fields to pages, missing page_ops",
+                flush=True,
+            )
+            return
+
+        crawl_ids_to_update = await pages_mdb.distinct("crawl_id", {"filename": None})
+
+        crawl_count = len(crawl_ids_to_update)
+        current_index = 1
+
+        for crawl_id in crawl_ids_to_update:
+            print(f"Migrating archived item {current_index}/{crawl_count}", flush=True)
+            try:
+                await self.page_ops.add_crawl_wacz_filename_to_pages(crawl_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Error adding filename and other fields to pages in item {crawl_id}: {err}",
+                    flush=True,
+                )
+            current_index += 1
@@ -2493,6 +2493,10 @@ class Page(BaseMongoModel):
     loadState: Optional[int] = None
     status: Optional[int] = None
     mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
 
     # manual review
     userid: Optional[UUID] = None
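Purely illustrative and not from the diff (all values invented): after backfill, a page document carries the four new optional fields alongside the existing ones, roughly like so:

# Hypothetical page document after the migration; values are made up.
page = {
    "url": "https://example.com/",
    "ts": "2025-01-01T00:00:00Z",
    "loadState": 4,
    "status": 200,
    "mime": "text/html",
    "filename": "crawl-example-abc123.wacz",  # WACZ file the page came from
    "depth": 0,
    "favIconUrl": "https://example.com/favicon.ico",
    "isSeed": True,
}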
@@ -1,6 +1,7 @@
 """crawl pages"""
 
 import asyncio
+import os
 import traceback
 from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
@@ -83,6 +84,7 @@ class PageOps:
 
             if len(pages_buffer) > batch_size:
                 await self._add_pages_to_db(crawl_id, pages_buffer)
+                pages_buffer = []
 
             pages_buffer.append(
                 self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
@@ -100,6 +102,53 @@ class PageOps:
             traceback.print_exc()
             print(f"Error adding pages for crawl {crawl_id} to db: {err}", flush=True)
 
+    async def add_crawl_wacz_filename_to_pages(self, crawl_id: str):
+        """Add WACZ filename and additional fields to existing pages in crawl if not already set"""
+        try:
+            crawl = await self.crawl_ops.get_crawl_out(crawl_id)
+            if not crawl.resources:
+                return
+
+            for wacz_file in crawl.resources:
+                # Strip oid directory from filename
+                filename = os.path.basename(wacz_file.name)
+
+                stream = await self.storage_ops.sync_stream_wacz_pages([wacz_file])
+                for page_dict in stream:
+                    if not page_dict.get("url"):
+                        continue
+
+                    page_id = page_dict.get("id")
+
+                    if not page_id:
+                        continue
+
+                    if page_id:
+                        try:
+                            page_id = UUID(page_id)
+                        # pylint: disable=broad-exception-caught
+                        except Exception:
+                            continue
+
+                    await self.pages.find_one_and_update(
+                        {"_id": page_id},
+                        {
+                            "$set": {
+                                "filename": filename,
+                                "depth": page_dict.get("depth"),
+                                "isSeed": page_dict.get("seed", False),
+                                "favIconUrl": page_dict.get("favIconUrl"),
+                            }
+                        },
+                    )
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception as err:
+            traceback.print_exc()
+            print(
+                f"Error adding filename to pages from item {crawl_id} to db: {err}",
+                flush=True,
+            )
+
     def _get_page_from_dict(
         self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
     ) -> Page:
@@ -127,6 +176,10 @@ class PageOps:
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
+            filename=page_dict.get("filename"),
+            depth=page_dict.get("depth"),
+            isSeed=page_dict.get("seed", False),
+            favIconUrl=page_dict.get("favIconUrl"),
             ts=(str_to_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()
@@ -619,7 +619,9 @@ class StorageOps:
 
             line_iter: Iterator[bytes] = self._sync_get_filestream(wacz_url, filename)
             for line in line_iter:
-                yield _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json = _parse_json(line.decode("utf-8", errors="ignore"))
+                page_json["filename"] = os.path.basename(wacz_filename)
+                yield page_json
 
         page_generators: List[Iterator[Dict[Any, Any]]] = []
 
@@ -673,6 +673,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -694,6 +698,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -794,6 +802,10 @@ def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
         assert page.get("title") or page.get("title") is None
         assert page["loadState"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -876,6 +888,10 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_
         assert page["loadState"]
         assert page["status"]
         assert page["mime"]
+        assert page["filename"]
+        assert page["depth"] is not None
+        assert page["favIconUrl"]
+        assert page["isSeed"] in (True, False)
         assert page["isError"] in (True, False)
         assert page["isFile"] in (True, False)
 
@@ -252,6 +252,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
         assert page["crawl_id"] == upload_id
         assert page["url"]
         assert page["ts"]
+        assert page["filename"]
         assert page.get("title") or page.get("title") is None
 
         page_id = pages[0]["id"]
@@ -267,6 +268,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
     assert page["crawl_id"]
     assert page["url"]
     assert page["ts"]
+    assert page["filename"]
     assert page.get("title") or page.get("title") is None
 
     assert page["notes"] == []
@@ -123,8 +123,8 @@ spec:
           httpGet:
             path: /healthzStartup
             port: 8000
-          periodSeconds: 5
-          failureThreshold: 60
+          periodSeconds: 10
+          failureThreshold: 8640
           successThreshold: 1
 
         readinessProbe:
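For context on the probe change (the arithmetic is implied by the diff rather than stated in it): the startup window is periodSeconds × failureThreshold, so the old settings allowed 5 s × 60 = 300 s (5 minutes), while the new settings allow 10 s × 8640 = 86,400 s — the 24 hours mentioned in the commit message for the backfill migration to complete.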