Upload Fixes: (#2397)

- ensure upload pages are always added with a new UUID, to avoid any
duplicates with existing uploads, even if the uploaded WACZ is actually a
crawl from a different Browsertrix instance, etc. (a sketch of the page-id
handling follows the pages diff below)
- clean up upload names with slugify, which also replaces spaces; this fixes
uploading WACZ filenames that contain spaces (a sketch follows this list)
- part of fix for #2396
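
A minimal sketch of the renaming behavior, assuming the python-slugify
package; the standalone function mirrors the new prepare_filename logic from
the diff below, while the example filename and the print call are
illustrative only:

import base64
import os

from slugify import slugify


def prepare_filename(filename: str) -> str:
    """Slugify the base name (spaces become hyphens) and append a short
    random suffix so repeated uploads of the same file never collide."""
    name, ext = os.path.splitext(filename)
    name = slugify(name.rsplit("/", 1)[-1])
    randstr = base64.b32encode(os.urandom(5)).lower()
    return name + "-" + randstr.decode("utf-8") + ext


# e.g. "My Upload File.wacz" -> "my-upload-file-xxxxxxxx.wacz"
print(prepare_filename("My Upload File.wacz"))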

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2025-02-17 13:05:33 -08:00 committed by GitHub
parent 44ca293999
commit e112f96614
3 changed files with 26 additions and 17 deletions


@@ -24,7 +24,7 @@ from pydantic import (
     BeforeValidator,
     TypeAdapter,
 )
-from pathvalidate import sanitize_filename
+from slugify import slugify
 
 # from fastapi_users import models as fastapi_users_models
 
@@ -1074,7 +1074,7 @@ class FilePreparer:
     def __init__(self, prefix, filename):
         self.upload_size = 0
         self.upload_hasher = hashlib.sha256()
-        self.upload_name = prefix + self.prepare_filename(filename)
+        self.upload_name = prefix + "-" + self.prepare_filename(filename)
 
     def add_chunk(self, chunk):
         """add chunk for file"""
@@ -1093,11 +1093,10 @@ class FilePreparer:
     def prepare_filename(self, filename):
         """prepare filename by sanitizing and adding extra string
        to avoid duplicates"""
-        name = sanitize_filename(filename.rsplit("/", 1)[-1])
-        parts = name.split(".")
+        name, ext = os.path.splitext(filename)
+        name = slugify(name.rsplit("/", 1)[-1])
         randstr = base64.b32encode(os.urandom(5)).lower()
-        parts[0] += "-" + randstr.decode("utf-8")
-        return ".".join(parts)
+        return name + "-" + randstr.decode("utf-8") + ext
 
 
 # ============================================================================


@@ -88,19 +88,26 @@ class PageOps:
             stream = await self.storage_ops.sync_stream_wacz_pages(
                 crawl.resources or []
             )
 
+            new_uuid = crawl.type == "upload"
+            seed_count = 0
+            non_seed_count = 0
+
             for page_dict in stream:
                 if not page_dict.get("url"):
                     continue
 
-                if not page_dict.get("isSeed") and not page_dict.get("seed"):
-                    page_dict["isSeed"] = False
+                page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed")
+
+                if page_dict.get("isSeed"):
+                    seed_count += 1
+                else:
+                    non_seed_count += 1
 
                 if len(pages_buffer) > batch_size:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
                     pages_buffer = []
 
                 pages_buffer.append(
-                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
+                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid)
                 )
 
             # Add any remaining pages in buffer to db
@@ -109,7 +116,10 @@ class PageOps:
             await self.set_archived_item_page_counts(crawl_id)
 
-            print(f"Added pages for crawl {crawl_id} to db", flush=True)
+            print(
+                f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed",
+                flush=True,
+            )
 
         # pylint: disable=broad-exception-caught, raise-missing-from
         except Exception as err:
             traceback.print_exc()
@@ -163,16 +173,14 @@
         )
 
     def _get_page_from_dict(
-        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool
     ) -> Page:
         """Return Page object from dict"""
-        page_id = page_dict.get("id", "")
-        if not page_id:
-            page_id = uuid4()
+        page_id = page_dict.get("id", "") if not new_uuid else None
 
         try:
             UUID(page_id)
-        except ValueError:
+        except (TypeError, ValueError):
             page_id = uuid4()
 
         status = page_dict.get("status")
@@ -222,7 +230,7 @@
         oid: UUID,
     ):
         """Add page to database"""
-        page = self._get_page_from_dict(page_dict, crawl_id, oid)
+        page = self._get_page_from_dict(page_dict, crawl_id, oid, new_uuid=False)
 
         page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)

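For context, a hedged sketch of the new-UUID handling above, using only the
standard-library uuid module; the standalone helper name resolve_page_id is
made up for illustration (in the change itself this logic lives in
PageOps._get_page_from_dict), and the example ids are arbitrary:

from typing import Any, Dict
from uuid import UUID, uuid4


def resolve_page_id(page_dict: Dict[str, Any], new_uuid: bool) -> UUID:
    """When new_uuid is True (uploads), ignore any page id embedded in the
    WACZ so pages can never collide with an existing archived item; the
    TypeError branch covers the deliberate None."""
    page_id = page_dict.get("id", "") if not new_uuid else None
    try:
        return UUID(page_id)
    except (TypeError, ValueError):
        return uuid4()


# Upload: always a fresh UUID, even if the WACZ carries one
print(resolve_page_id({"id": "7f9d1e2a-0000-0000-0000-000000000000"}, new_uuid=True))
# Crawl: the embedded id is kept when it parses as a valid UUID
print(resolve_page_id({"id": "7f9d1e2a-0000-0000-0000-000000000000"}, new_uuid=False))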

@@ -1120,7 +1120,9 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
             break
 
         if count + 1 == MAX_ATTEMPTS:
-            assert False
+            assert data["storageUsedBytes"] == org_bytes - total_size
+            assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size
+            assert data["storageUsedUploads"] == org_upload_bytes - upload_size
 
         time.sleep(5)
         count += 1