remove crc32 from CrawlFile (#1980)

- no longer being used with latest stream-zip
- was not computed correctly in the crawler
- counterpart to webrecorder/browsertrix-crawler#657

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-07-30 11:23:15 -07:00 committed by GitHub
parent 4aca107710
commit 894aa29d4b
5 changed files with 38 additions and 6 deletions

View File

@@ -492,7 +492,6 @@ class BaseCrawlOps:
                 name=file_.filename,
                 path=presigned_url or "",
                 hash=file_.hash,
-                crc32=file_.crc32,
                 size=file_.size,
                 crawlId=crawl_id,
                 numReplicas=len(file_.replicas) if file_.replicas else 0,

View File

@@ -17,7 +17,7 @@ from pymongo.errors import InvalidName

 from .migrations import BaseMigration

-CURR_DB_VERSION = "0033"
+CURR_DB_VERSION = "0034"


 # ============================================================================

View File

@@ -0,0 +1,37 @@
+"""
+Migration 0034 -- remove crc32 from CrawlFile
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0034"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Remove crc32 field from all crawl files
+        """
+        crawls_db = self.mdb["crawls"]
+
+        try:
+            res = await crawls_db.update_many(
+                {"files.crc32": {"$exists": 1}},
+                {"$unset": {"files.$[].crc32": 1}},
+            )
+            updated = res.modified_count
+            print(f"{updated} crawls migrated to remove crc32 from files", flush=True)
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error migrating crawl files to remove crc32: {err}",
+                flush=True,
+            )
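
For reference, the migration above pairs MongoDB's $unset update with the all-positional operator $[], which applies the unset to every element of the files array rather than only the first matching element. Below is a minimal standalone sketch of the same operation using plain pymongo; the connection string, database name, and collection name are illustrative assumptions, not taken from this repo's configuration.

    from pymongo import MongoClient

    # Illustrative connection details only; the real deployment wires up
    # the database through btrixcloud's own configuration.
    client = MongoClient("mongodb://localhost:27017")
    crawls = client["btrix"]["crawls"]

    # Match documents where any file entry still carries a crc32 field,
    # then strip crc32 from every element of the files array.
    res = crawls.update_many(
        {"files.crc32": {"$exists": True}},
        {"$unset": {"files.$[].crc32": ""}},
    )
    print(f"{res.modified_count} crawls updated")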

View File

@@ -625,7 +625,6 @@ class CrawlFile(BaseFile):
     presignedUrl: Optional[str] = None
     expireAt: Optional[datetime] = None

-    crc32: int = 0


 # ============================================================================
@@ -635,7 +634,6 @@ class CrawlFileOut(BaseModel):
     name: str
     path: str
     hash: str
-    crc32: int = 0
     size: int

     crawlId: Optional[str] = None
@@ -930,7 +928,6 @@ class CrawlCompleteIn(BaseModel):
     filename: str
     size: int
     hash: str
-    crc32: int = 0

     completed: Optional[bool] = True
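
One note on these model changes: with Pydantic's default configuration, unknown keys in the input are dropped at validation time, so stored records that still carry a stale crc32 value would parse cleanly even before the migration runs; the migration simply keeps the stored documents consistent with the schema. A small illustrative sketch with a simplified stand-in model (not the full CrawlFileOut definition) shows the default behavior.

    from pydantic import BaseModel

    class FileOut(BaseModel):
        # Simplified stand-in for CrawlFileOut; crc32 is intentionally absent.
        name: str
        path: str
        hash: str
        size: int

    doc = {"name": "data.wacz", "path": "", "hash": "abc", "size": 1024, "crc32": 123}
    print(FileOut(**doc))  # the stale crc32 key is ignored by default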

View File

@@ -1192,7 +1192,6 @@ class CrawlOperator(BaseOperator):
            filename=filename,
            size=filecomplete.size,
            hash=filecomplete.hash,
-           crc32=filecomplete.crc32,
            storage=crawl.storage,
        )