remove crc32 from CrawlFile (#1980)
- no longer being used with latest stream-zip
- was not computed correctly in the crawler
- counterpart to webrecorder/browsertrix-crawler#657

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
4aca107710
commit
894aa29d4b
@ -492,7 +492,6 @@ class BaseCrawlOps:
|
||||
name=file_.filename,
|
||||
path=presigned_url or "",
|
||||
hash=file_.hash,
|
||||
crc32=file_.crc32,
|
||||
size=file_.size,
|
||||
crawlId=crawl_id,
|
||||
numReplicas=len(file_.replicas) if file_.replicas else 0,
|
||||
|
@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
|
||||
from .migrations import BaseMigration
|
||||
|
||||
|
||||
CURR_DB_VERSION = "0033"
|
||||
CURR_DB_VERSION = "0034"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -0,0 +1,37 @@
|
||||
"""
|
||||
Migration 0034 -- remove crc32 from CrawlFile
|
||||
"""
|
||||
|
||||
from btrixcloud.migrations import BaseMigration
|
||||
|
||||
|
||||
# Version identifier for this migration; handed to BaseMigration as
# ``migration_version`` by Migration.__init__.
MIGRATION_VERSION = "0034"
|
||||
|
||||
|
||||
class Migration(BaseMigration):
    """Migration 0034: drop the crc32 field from files stored on crawls."""

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        # Extra keyword arguments are accepted but not used by this migration.
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Selects every document in the ``crawls`` collection where some entry
        of ``files`` has a ``crc32`` key and unsets ``crc32`` on all entries
        of that array. The number of modified documents is printed on
        success; any exception is printed and not re-raised.
        """
        collection = self.mdb["crawls"]

        # Match documents in which at least one file still carries crc32.
        has_crc32 = {"files.crc32": {"$exists": 1}}
        # "$[]" applies the unset to every element of the files array.
        drop_crc32 = {"$unset": {"files.$[].crc32": 1}}

        try:
            result = await collection.update_many(has_crc32, drop_crc32)
            updated = result.modified_count
            print(f"{updated} crawls migrated to remove crc32 from files", flush=True)
        # pylint: disable=broad-exception-caught
        except Exception as err:
            print(
                f"Error migrating crawl files to remove crc32: {err}",
                flush=True,
            )
|
@ -625,7 +625,6 @@ class CrawlFile(BaseFile):
|
||||
|
||||
presignedUrl: Optional[str] = None
|
||||
expireAt: Optional[datetime] = None
|
||||
crc32: int = 0
|
||||
|
||||
|
||||
# ============================================================================
|
||||
@ -635,7 +634,6 @@ class CrawlFileOut(BaseModel):
|
||||
name: str
|
||||
path: str
|
||||
hash: str
|
||||
crc32: int = 0
|
||||
size: int
|
||||
|
||||
crawlId: Optional[str] = None
|
||||
@ -930,7 +928,6 @@ class CrawlCompleteIn(BaseModel):
|
||||
filename: str
|
||||
size: int
|
||||
hash: str
|
||||
crc32: int = 0
|
||||
|
||||
completed: Optional[bool] = True
|
||||
|
||||
|
@ -1192,7 +1192,6 @@ class CrawlOperator(BaseOperator):
|
||||
filename=filename,
|
||||
size=filecomplete.size,
|
||||
hash=filecomplete.hash,
|
||||
crc32=filecomplete.crc32,
|
||||
storage=crawl.storage,
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user