From e112f966140a5975e0681301e1844db792776e38 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 17 Feb 2025 13:05:33 -0800 Subject: [PATCH] Upload Fixes: (#2397) - ensure upload pages are always added with a new uuid, to avoid any duplicates with existing uploads, even if the upload WACZ is actually a crawl from a different Browsertrix instance, etc. - clean up upload names with slugify, which also replaces spaces, fixes uploading wacz filenames with spaces in them - part of fix for #2396 --------- Co-authored-by: Tessa Walsh --- backend/btrixcloud/models.py | 11 +++++------ backend/btrixcloud/pages.py | 28 ++++++++++++++++++---------- backend/test/test_uploads.py | 4 +++- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 1b0ab0b3..52f30ec5 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -24,7 +24,7 @@ from pydantic import ( BeforeValidator, TypeAdapter, ) -from pathvalidate import sanitize_filename +from slugify import slugify # from fastapi_users import models as fastapi_users_models @@ -1074,7 +1074,7 @@ class FilePreparer: def __init__(self, prefix, filename): self.upload_size = 0 self.upload_hasher = hashlib.sha256() - self.upload_name = prefix + self.prepare_filename(filename) + self.upload_name = prefix + "-" + self.prepare_filename(filename) def add_chunk(self, chunk): """add chunk for file""" @@ -1093,11 +1093,10 @@ class FilePreparer: def prepare_filename(self, filename): """prepare filename by sanitizing and adding extra string to avoid duplicates""" - name = sanitize_filename(filename.rsplit("/", 1)[-1]) - parts = name.split(".") + name, ext = os.path.splitext(filename) + name = slugify(name.rsplit("/", 1)[-1]) randstr = base64.b32encode(os.urandom(5)).lower() - parts[0] += "-" + randstr.decode("utf-8") - return ".".join(parts) + return name + "-" + randstr.decode("utf-8") + ext # 
============================================================================ diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index 977aeb0a..8ce69a58 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -88,19 +88,26 @@ class PageOps: stream = await self.storage_ops.sync_stream_wacz_pages( crawl.resources or [] ) + new_uuid = crawl.type == "upload" + seed_count = 0 + non_seed_count = 0 for page_dict in stream: if not page_dict.get("url"): continue - if not page_dict.get("isSeed") and not page_dict.get("seed"): - page_dict["isSeed"] = False + page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed") + + if page_dict.get("isSeed"): + seed_count += 1 + else: + non_seed_count += 1 if len(pages_buffer) > batch_size: await self._add_pages_to_db(crawl_id, pages_buffer) pages_buffer = [] pages_buffer.append( - self._get_page_from_dict(page_dict, crawl_id, crawl.oid) + self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid) ) # Add any remaining pages in buffer to db @@ -109,7 +116,10 @@ class PageOps: await self.set_archived_item_page_counts(crawl_id) - print(f"Added pages for crawl {crawl_id} to db", flush=True) + print( + f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed", + flush=True, + ) # pylint: disable=broad-exception-caught, raise-missing-from except Exception as err: traceback.print_exc() @@ -163,16 +173,14 @@ class PageOps: ) def _get_page_from_dict( - self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID + self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool ) -> Page: """Return Page object from dict""" - page_id = page_dict.get("id", "") - if not page_id: - page_id = uuid4() + page_id = page_dict.get("id", "") if not new_uuid else None try: UUID(page_id) - except ValueError: + except (TypeError, ValueError): page_id = uuid4() status = page_dict.get("status") @@ -222,7 +230,7 @@ class PageOps: oid: UUID, ): """Add page to database""" - 
page = self._get_page_from_dict(page_dict, crawl_id, oid) + page = self._get_page_from_dict(page_dict, crawl_id, oid, new_uuid=False) page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True) diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 6c4b18df..e565a812 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1120,7 +1120,9 @@ def test_delete_form_upload_and_crawls_from_all_crawls( break if count + 1 == MAX_ATTEMPTS: - assert False + assert data["storageUsedBytes"] == org_bytes - total_size + assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size + assert data["storageUsedUploads"] == org_upload_bytes - upload_size time.sleep(5) count += 1