Mark all pages from pages.jsonl as seeds (#2390)
Fixes #2389. All pages from `pages/pages.jsonl` files now have `isSeed: True` in the database, in addition to any pages that explicitly have `seed` set to true in the actual JSONL. Tests have been added to ensure that all pages from our fixture uploads have `isSeed: True`. --------- Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent
7b2932c582
commit
7f1af9bb31
@ -92,7 +92,7 @@ class PageOps:
|
||||
if not page_dict.get("url"):
|
||||
continue
|
||||
|
||||
if not page_dict.get("isSeed"):
|
||||
if not page_dict.get("isSeed") and not page_dict.get("seed"):
|
||||
page_dict["isSeed"] = False
|
||||
|
||||
if len(pages_buffer) > batch_size:
|
||||
|
||||
@ -607,7 +607,9 @@ class StorageOps:
|
||||
|
||||
# pylint: disable=too-many-function-args
|
||||
def stream_page_lines(
|
||||
pagefile_zipinfo: ZipInfo, wacz_url: str, wacz_filename: str
|
||||
pagefile_zipinfo: ZipInfo,
|
||||
wacz_url: str,
|
||||
wacz_filename: str,
|
||||
) -> Iterator[Dict[Any, Any]]:
|
||||
"""Pass lines as json objects"""
|
||||
filename = pagefile_zipinfo.filename
|
||||
@ -621,6 +623,8 @@ class StorageOps:
|
||||
for line in line_iter:
|
||||
page_json = _parse_json(line.decode("utf-8", errors="ignore"))
|
||||
page_json["filename"] = os.path.basename(wacz_filename)
|
||||
if filename == "pages/pages.jsonl":
|
||||
page_json["seed"] = True
|
||||
yield page_json
|
||||
|
||||
page_generators: List[Iterator[Dict[Any, Any]]] = []
|
||||
@ -637,7 +641,11 @@ class StorageOps:
|
||||
]
|
||||
for pagefile_zipinfo in page_files:
|
||||
page_generators.append(
|
||||
stream_page_lines(pagefile_zipinfo, wacz_url, wacz_file.name)
|
||||
stream_page_lines(
|
||||
pagefile_zipinfo,
|
||||
wacz_url,
|
||||
wacz_file.name,
|
||||
)
|
||||
)
|
||||
|
||||
return chain.from_iterable(page_generators)
|
||||
|
||||
@ -254,6 +254,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
|
||||
assert page["ts"]
|
||||
assert page["filename"]
|
||||
assert page.get("title") or page.get("title") is None
|
||||
assert page["isSeed"]
|
||||
|
||||
page_id = pages[0]["id"]
|
||||
r = requests.get(
|
||||
@ -270,6 +271,7 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
|
||||
assert page["ts"]
|
||||
assert page["filename"]
|
||||
assert page.get("title") or page.get("title") is None
|
||||
assert page["isSeed"]
|
||||
|
||||
assert page["notes"] == []
|
||||
assert page.get("userid") is None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user