Mark all pages from pages.jsonl as seeds (#2390)

Fixes #2389 

All pages from `pages/pages.jsonl` files now have `isSeed: True` in the
database, in addition to any pages that explicitly have `seed` set to
true in the actual JSONL.

Tests have been added to ensure that all pages from our fixture uploads
have `isSeed: True`.

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2025-02-13 19:54:30 -05:00 committed by GitHub
parent 7b2932c582
commit 7f1af9bb31
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 3 deletions

View File

@@ -92,7 +92,7 @@ class PageOps:
if not page_dict.get("url"):
continue
if not page_dict.get("isSeed"):
if not page_dict.get("isSeed") and not page_dict.get("seed"):
page_dict["isSeed"] = False
if len(pages_buffer) > batch_size:

View File

@@ -607,7 +607,9 @@ class StorageOps:
# pylint: disable=too-many-function-args
def stream_page_lines(
pagefile_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
pagefile_zipinfo: ZipInfo,
wacz_url: str,
wacz_filename: str,
) -> Iterator[Dict[Any, Any]]:
"""Pass lines as json objects"""
filename = pagefile_zipinfo.filename
@@ -621,6 +623,8 @@ class StorageOps:
for line in line_iter:
page_json = _parse_json(line.decode("utf-8", errors="ignore"))
page_json["filename"] = os.path.basename(wacz_filename)
if filename == "pages/pages.jsonl":
page_json["seed"] = True
yield page_json
page_generators: List[Iterator[Dict[Any, Any]]] = []
@@ -637,7 +641,11 @@ class StorageOps:
]
for pagefile_zipinfo in page_files:
page_generators.append(
stream_page_lines(pagefile_zipinfo, wacz_url, wacz_file.name)
stream_page_lines(
pagefile_zipinfo,
wacz_url,
wacz_file.name,
)
)
return chain.from_iterable(page_generators)

View File

@@ -254,6 +254,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None
assert page["isSeed"]
page_id = pages[0]["id"]
r = requests.get(
@@ -270,6 +271,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert page["ts"]
assert page["filename"]
assert page.get("title") or page.get("title") is None
assert page["isSeed"]
assert page["notes"] == []
assert page.get("userid") is None