Issue 2396 readd pages fixes (#2398)

re-add pages fixes:
- add additional memory to the background job
- copy page QA data to a separate temp collection when re-adding pages, then merge it back in (sketched below, after the changed-files summary)
Ilya Kreymer 2025-02-17 13:52:11 -08:00 committed by GitHub
parent e112f96614
commit 5bebb6161a
3 changed files with 75 additions and 6 deletions
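Note: the core of the second fix is to stash user-generated page QA fields in a temporary collection before a crawl's pages are deleted, then merge them back onto the freshly re-added pages. Below is a minimal standalone sketch of that round-trip, assuming a local MongoDB and the motor async driver; the database, collection, and field names are illustrative rather than the exact Browsertrix ones.

# Sketch only: copy selected fields out to a temp collection with $out,
# then merge them back onto the re-added documents by _id with $merge.
import asyncio
from datetime import datetime, timezone

from motor.motor_asyncio import AsyncIOMotorClient


async def re_add_with_qa_preserved(mdb, crawl_id: str) -> None:
    pages = mdb["pages"]
    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    temp_name = f"pages-qa-temp-{crawl_id}-{ts}"

    # 1. Copy only the user-generated fields for this crawl into a temp collection.
    cursor = pages.aggregate(
        [
            {"$match": {"crawl_id": crawl_id, "qa": {"$nin": [None, {}]}}},
            {"$project": {"_id": 1, "qa": 1}},
            {"$out": temp_name},
        ]
    )
    # motor builds the command lazily; consuming the cursor runs the pipeline,
    # which returns no documents ($out only writes to the temp collection)
    await cursor.to_list(length=None)

    # 2. ...delete the crawl's pages and re-add them from WACZ files here...

    # 3. Merge the stashed fields back onto pages with matching _id values.
    cursor = mdb[temp_name].aggregate(
        [{"$merge": {"into": "pages", "on": ["_id"], "whenNotMatched": "fail"}}]
    )
    await cursor.to_list(length=None)
    await mdb[temp_name].drop()


async def main() -> None:
    client = AsyncIOMotorClient("mongodb://localhost:27017")
    await re_add_with_qa_preserved(client["example_db"], "example-crawl-id")


if __name__ == "__main__":
    asyncio.run(main())

This is also why the real code below ends each $out/$merge pipeline with assert await cursor.to_list() == []: the aggregation only executes once the cursor is consumed, and both stages write to a collection rather than returning documents.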


@@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
            return None, None
        return res.get("state"), res.get("finished")

    async def is_upload(self, crawl_id: str):
        """return true if archived item with this id is an upload"""
        res = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
        if not res:
            return False
        return res.get("type") == "upload"

    async def add_crawl_error(
        self,
        crawl_id: str,
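The new is_upload helper fetches only the type field (via the projection) and is used in the next file to skip the QA-preservation step for uploads, presumably because uploaded archived items have no crawl QA data to carry over. A trivial usage sketch, with crawl_ops standing in for an initialized CrawlOps instance:

# Sketch only: crawl_ops and describe_item are hypothetical stand-ins.
async def describe_item(crawl_ops, crawl_id: str) -> str:
    if await crawl_ops.is_upload(crawl_id):
        return f"{crawl_id} is an upload; no crawl QA data to carry over"
    return f"{crawl_id} is a crawl; QA data will be preserved on re-add"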


@@ -66,6 +66,7 @@ class PageOps:
    ):
        self.pages = mdb["pages"]
        self.crawls = mdb["crawls"]
        self.mdb = mdb
        self.crawl_ops = crawl_ops
        self.org_ops = org_ops
        self.storage_ops = storage_ops
@@ -785,9 +786,70 @@ class PageOps:
    async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
        """Delete existing pages for crawl and re-add from WACZs."""
        try:
            is_upload = await self.crawl_ops.is_upload(crawl_id)
            print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")

            if not is_upload:
                ts_now = dt_now().strftime("%Y%m%d%H%M%S")
                qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"
                cursor = self.pages.aggregate(
                    [
                        {
                            "$match": {
                                "crawl_id": crawl_id,
                                "$or": [
                                    {"qa": {"$nin": [None, {}]}},
                                    {"modified": {"$ne": None}},
                                    {"userid": {"$ne": None}},
                                    {"approved": {"$ne": None}},
                                    {"notes": {"$ne": None}},
                                ],
                            }
                        },
                        {
                            "$project": {
                                "_id": 1,
                                "qa": 1,
                                "modified": 1,
                                "userid": 1,
                                "approved": 1,
                                "notes": 1,
                            }
                        },
                        {"$out": qa_temp_db_name},
                    ]
                )
                print(f"Stored QA data in temp db {qa_temp_db_name}")
                assert await cursor.to_list() == []

            await self.delete_crawl_pages(crawl_id, oid)
            print(f"Deleted pages for crawl {crawl_id}", flush=True)

            await self.add_crawl_pages_to_db_from_wacz(crawl_id)

            if not is_upload:
                qa_temp_db = self.mdb[qa_temp_db_name]
                cursor = qa_temp_db.aggregate(
                    [
                        {
                            "$merge": {
                                "into": "pages",
                                "on": ["_id"],
                                "whenNotMatched": "fail",
                            }
                        }
                    ]
                )
                print(f"Merged QA data from temp db {qa_temp_db_name}")
                # async for data in qa_temp_db.find({}):
                #     print("qa data", data)
                assert await cursor.to_list() == []
                await qa_temp_db.drop()
                print(f"Dropped temp db {qa_temp_db_name}")

        # pylint: disable=broad-exception-caught
        except Exception as e:
            print(e)

    async def re_add_all_crawl_pages(
        self, org: Organization, crawl_type: Optional[str] = None
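Note: the merge-back step relies on page _id values being stable across the delete/re-add cycle (they appear to be derived from the WACZ page records), since whenNotMatched: "fail" aborts the $merge if a stashed _id no longer exists in pages. A toy illustration of that failure mode, assuming the motor driver; the helper and collection names are hypothetical:

# Sketch only: shows how a $merge with whenNotMatched: "fail" surfaces as an error.
from pymongo.errors import OperationFailure


async def merge_back(mdb, temp_name: str) -> None:
    cursor = mdb[temp_name].aggregate(
        [{"$merge": {"into": "pages", "on": ["_id"], "whenNotMatched": "fail"}}]
    )
    try:
        # consuming the cursor runs the pipeline
        await cursor.to_list(length=None)
    except OperationFailure as exc:
        # raised by the server if a stashed _id has no matching page document
        print("merge failed:", exc)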


@@ -55,8 +55,8 @@ spec:
         resources:
           limits:
-            memory: "200Mi"
+            memory: "500Mi"
           requests:
-            memory: "200Mi"
-            cpu: "50m"
+            memory: "250Mi"
+            cpu: "200m"