remove crc32 from CrawlFile (#1980)

The crc32 field is no longer used with the latest stream-zip, and it was not computed correctly in the crawler. This is the counterpart to webrecorder/browsertrix-crawler#657.

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
This commit is contained in:
parent
4aca107710
commit
894aa29d4b
@ -492,7 +492,6 @@ class BaseCrawlOps:
|
|||||||
name=file_.filename,
|
name=file_.filename,
|
||||||
path=presigned_url or "",
|
path=presigned_url or "",
|
||||||
hash=file_.hash,
|
hash=file_.hash,
|
||||||
crc32=file_.crc32,
|
|
||||||
size=file_.size,
|
size=file_.size,
|
||||||
crawlId=crawl_id,
|
crawlId=crawl_id,
|
||||||
numReplicas=len(file_.replicas) if file_.replicas else 0,
|
numReplicas=len(file_.replicas) if file_.replicas else 0,
|
||||||
|
@ -17,7 +17,7 @@ from pymongo.errors import InvalidName
|
|||||||
from .migrations import BaseMigration
|
from .migrations import BaseMigration
|
||||||
|
|
||||||
|
|
||||||
CURR_DB_VERSION = "0033"
|
CURR_DB_VERSION = "0034"
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
@ -0,0 +1,37 @@
|
|||||||
|
"""
|
||||||
|
Migration 0034 -- remove crc32 from CrawlFile
|
||||||
|
"""
|
||||||
|
|
||||||
|
from btrixcloud.migrations import BaseMigration
|
||||||
|
|
||||||
|
|
||||||
|
MIGRATION_VERSION = "0034"
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(BaseMigration):
|
||||||
|
"""Migration class."""
|
||||||
|
|
||||||
|
# pylint: disable=unused-argument
|
||||||
|
def __init__(self, mdb, **kwargs):
|
||||||
|
super().__init__(mdb, migration_version=MIGRATION_VERSION)
|
||||||
|
|
||||||
|
async def migrate_up(self):
|
||||||
|
"""Perform migration up.
|
||||||
|
|
||||||
|
Remove crc32 field from all crawl files
|
||||||
|
"""
|
||||||
|
crawls_db = self.mdb["crawls"]
|
||||||
|
|
||||||
|
try:
|
||||||
|
res = await crawls_db.update_many(
|
||||||
|
{"files.crc32": {"$exists": 1}},
|
||||||
|
{"$unset": {"files.$[].crc32": 1}},
|
||||||
|
)
|
||||||
|
updated = res.modified_count
|
||||||
|
print(f"{updated} crawls migrated to remove crc32 from files", flush=True)
|
||||||
|
# pylint: disable=broad-exception-caught
|
||||||
|
except Exception as err:
|
||||||
|
print(
|
||||||
|
f"Error migrating crawl files to remove crc32: {err}",
|
||||||
|
flush=True,
|
||||||
|
)
|
@ -625,7 +625,6 @@ class CrawlFile(BaseFile):
|
|||||||
|
|
||||||
presignedUrl: Optional[str] = None
|
presignedUrl: Optional[str] = None
|
||||||
expireAt: Optional[datetime] = None
|
expireAt: Optional[datetime] = None
|
||||||
crc32: int = 0
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
@ -635,7 +634,6 @@ class CrawlFileOut(BaseModel):
|
|||||||
name: str
|
name: str
|
||||||
path: str
|
path: str
|
||||||
hash: str
|
hash: str
|
||||||
crc32: int = 0
|
|
||||||
size: int
|
size: int
|
||||||
|
|
||||||
crawlId: Optional[str] = None
|
crawlId: Optional[str] = None
|
||||||
@ -930,7 +928,6 @@ class CrawlCompleteIn(BaseModel):
|
|||||||
filename: str
|
filename: str
|
||||||
size: int
|
size: int
|
||||||
hash: str
|
hash: str
|
||||||
crc32: int = 0
|
|
||||||
|
|
||||||
completed: Optional[bool] = True
|
completed: Optional[bool] = True
|
||||||
|
|
||||||
|
@ -1192,7 +1192,6 @@ class CrawlOperator(BaseOperator):
|
|||||||
filename=filename,
|
filename=filename,
|
||||||
size=filecomplete.size,
|
size=filecomplete.size,
|
||||||
hash=filecomplete.hash,
|
hash=filecomplete.hash,
|
||||||
crc32=filecomplete.crc32,
|
|
||||||
storage=crawl.storage,
|
storage=crawl.storage,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user