Upload Fixes (#2397)

- Ensure upload pages are always added with a new UUID, to avoid duplicates with existing uploads, even if the uploaded WACZ is actually a crawl from a different Browsertrix instance, etc.
- Clean up upload names with slugify, which also replaces spaces; fixes uploading WACZ filenames that contain spaces.
- Part of fix for #2396

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
parent 44ca293999
commit e112f96614
@@ -24,7 +24,7 @@ from pydantic import (
     BeforeValidator,
     TypeAdapter,
 )
-from pathvalidate import sanitize_filename
+from slugify import slugify
 
 # from fastapi_users import models as fastapi_users_models
 
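For context, this import swap is what enables the space-handling fix: `sanitize_filename` keeps spaces intact, while `slugify` replaces them. A minimal comparison, assuming the `pathvalidate` and `python-slugify` packages are installed:

```python
from pathvalidate import sanitize_filename
from slugify import slugify

name = "My Upload File.wacz"

# Old helper: strips characters that are invalid in filenames,
# but leaves spaces in place.
print(sanitize_filename(name))  # "My Upload File.wacz"

# New helper: lowercases and replaces spaces (and other separators)
# with hyphens. Note the extension is slugified too, which is why the
# real code splits off the extension before calling slugify.
print(slugify(name))  # "my-upload-file-wacz"
```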
@@ -1074,7 +1074,7 @@ class FilePreparer:
     def __init__(self, prefix, filename):
         self.upload_size = 0
         self.upload_hasher = hashlib.sha256()
-        self.upload_name = prefix + self.prepare_filename(filename)
+        self.upload_name = prefix + "-" + self.prepare_filename(filename)
 
     def add_chunk(self, chunk):
         """add chunk for file"""
@@ -1093,11 +1093,10 @@ class FilePreparer:
     def prepare_filename(self, filename):
         """prepare filename by sanitizing and adding extra string
         to avoid duplicates"""
-        name = sanitize_filename(filename.rsplit("/", 1)[-1])
-        parts = name.split(".")
+        name, ext = os.path.splitext(filename)
+        name = slugify(name.rsplit("/", 1)[-1])
         randstr = base64.b32encode(os.urandom(5)).lower()
-        parts[0] += "-" + randstr.decode("utf-8")
-        return ".".join(parts)
+        return name + "-" + randstr.decode("utf-8") + ext
 
 
 # ============================================================================
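Taken out of the class, the new `prepare_filename` logic behaves roughly like the sketch below (same steps, just standalone; the example filename is made up):

```python
import base64
import os

from slugify import slugify


def prepare_filename(filename: str) -> str:
    """Slugify the base name, keep the extension, append a random suffix."""
    name, ext = os.path.splitext(filename)
    # drop any path component, then slugify only the base name
    name = slugify(name.rsplit("/", 1)[-1])
    # 5 random bytes -> 8 lowercase base32 characters, to avoid duplicates
    randstr = base64.b32encode(os.urandom(5)).lower()
    return name + "-" + randstr.decode("utf-8") + ext


# e.g. "archives/My Crawl 2024.wacz" -> "my-crawl-2024-xxxxxxxx.wacz"
print(prepare_filename("archives/My Crawl 2024.wacz"))
```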
@@ -88,19 +88,26 @@ class PageOps:
             stream = await self.storage_ops.sync_stream_wacz_pages(
                 crawl.resources or []
             )
+            new_uuid = crawl.type == "upload"
+            seed_count = 0
+            non_seed_count = 0
             for page_dict in stream:
                 if not page_dict.get("url"):
                     continue
 
-                if not page_dict.get("isSeed") and not page_dict.get("seed"):
-                    page_dict["isSeed"] = False
+                page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed")
+
+                if page_dict.get("isSeed"):
+                    seed_count += 1
+                else:
+                    non_seed_count += 1
 
                 if len(pages_buffer) > batch_size:
                     await self._add_pages_to_db(crawl_id, pages_buffer)
                     pages_buffer = []
 
                 pages_buffer.append(
                     self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
-                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid)
+                    self._get_page_from_dict(page_dict, crawl_id, crawl.oid, new_uuid)
                 )
 
             # Add any remaining pages in buffer to db
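The seed-flag normalization and the tallying above can be seen in isolation below (the page entries are made-up examples; a page list may carry either `isSeed` or `seed`, as the old and new code both acknowledge):

```python
# Made-up page entries; either "isSeed" or "seed" can mark a seed page
pages = [
    {"url": "https://example.com/", "seed": True},
    {"url": "https://example.com/about", "isSeed": False},
    {"url": "https://example.com/news"},
]

seed_count = 0
non_seed_count = 0

for page_dict in pages:
    # either key marks the page as a seed
    page_dict["isSeed"] = page_dict.get("isSeed") or page_dict.get("seed")
    if page_dict.get("isSeed"):
        seed_count += 1
    else:
        non_seed_count += 1

print(seed_count, non_seed_count)  # 1 2
```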
@@ -109,7 +116,10 @@ class PageOps:
 
             await self.set_archived_item_page_counts(crawl_id)
 
-            print(f"Added pages for crawl {crawl_id} to db", flush=True)
+            print(
+                f"Added pages for crawl {crawl_id}: {seed_count} Seed, {non_seed_count} Non-Seed",
+                flush=True,
+            )
         # pylint: disable=broad-exception-caught, raise-missing-from
         except Exception as err:
             traceback.print_exc()
@@ -163,16 +173,14 @@ class PageOps:
         )
 
     def _get_page_from_dict(
-        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID
+        self, page_dict: Dict[str, Any], crawl_id: str, oid: UUID, new_uuid: bool
     ) -> Page:
         """Return Page object from dict"""
-        page_id = page_dict.get("id", "")
-        if not page_id:
-            page_id = uuid4()
+        page_id = page_dict.get("id", "") if not new_uuid else None
 
         try:
             UUID(page_id)
-        except ValueError:
+        except (TypeError, ValueError):
             page_id = uuid4()
 
         status = page_dict.get("status")
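The widened `except` matters because of the new `None` sentinel: `UUID(None)` raises `TypeError`, not `ValueError`. A small sketch of the id selection (the function name is illustrative, not the real method):

```python
from uuid import UUID, uuid4


def resolve_page_id(page_dict: dict, new_uuid: bool) -> UUID:
    """Pick a page id: reuse a valid incoming id, otherwise generate a fresh one."""
    # Uploads pass new_uuid=True, so the incoming id is ignored entirely and
    # pages from a WACZ produced elsewhere can't collide with existing pages.
    page_id = page_dict.get("id", "") if not new_uuid else None
    try:
        return UUID(page_id)
    except (TypeError, ValueError):
        # TypeError: page_id is None; ValueError: empty or malformed string
        return uuid4()


print(resolve_page_id({"id": "not-a-uuid"}, new_uuid=False))  # fresh random UUID
print(resolve_page_id({"id": "0b64ae5f-2b9e-4f10-9d4e-1a2b3c4d5e6f"}, new_uuid=True))  # fresh anyway
```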
@@ -222,7 +230,7 @@ class PageOps:
         oid: UUID,
     ):
         """Add page to database"""
-        page = self._get_page_from_dict(page_dict, crawl_id, oid)
+        page = self._get_page_from_dict(page_dict, crawl_id, oid, new_uuid=False)
 
         page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)
 
@@ -1120,7 +1120,9 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
            break
 
        if count + 1 == MAX_ATTEMPTS:
-            assert False
+            assert data["storageUsedBytes"] == org_bytes - total_size
+            assert data["storageUsedCrawls"] == org_crawl_bytes - combined_crawl_size
+            assert data["storageUsedUploads"] == org_upload_bytes - upload_size
 
        time.sleep(5)
        count += 1
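The test change follows a common polling pattern: on the final attempt, assert the expected values instead of a bare `assert False`, so a failure reports which storage figure is off. A generic sketch of that pattern (helper and variable names are illustrative, not the real fixtures):

```python
import time

MAX_ATTEMPTS = 10  # illustrative


def wait_for_storage(get_org_metrics, expected_bytes):
    """Poll until storage settles; fail with an informative assert if it never does."""
    count = 0
    while count < MAX_ATTEMPTS:
        data = get_org_metrics()
        if data["storageUsedBytes"] == expected_bytes:
            break

        if count + 1 == MAX_ATTEMPTS:
            # a comparison failure shows actual vs expected, unlike `assert False`
            assert data["storageUsedBytes"] == expected_bytes

        time.sleep(5)
        count += 1
```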