Issue 2396 readd pages fixes (#2398)
Re-add pages fixes: (1) give the background job additional memory; (2) when re-adding pages, copy the page QA data to a separate temporary collection first, then merge it back in after the pages have been re-added.
This commit is contained in:
parent
e112f96614
commit
5bebb6161a
@ -646,6 +646,13 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
return None, None
|
return None, None
|
||||||
return res.get("state"), res.get("finished")
|
return res.get("state"), res.get("finished")
|
||||||
|
|
||||||
|
async def is_upload(self, crawl_id: str):
    """Report whether the archived item with this id is an upload.

    Only the ``type`` field is fetched. A missing (or empty) document
    yields ``False``.
    """
    doc = await self.crawls.find_one({"_id": crawl_id}, projection={"type": 1})
    return bool(doc) and doc.get("type") == "upload"
|
||||||
|
|
||||||
async def add_crawl_error(
|
async def add_crawl_error(
|
||||||
self,
|
self,
|
||||||
crawl_id: str,
|
crawl_id: str,
|
||||||
|
@ -66,6 +66,7 @@ class PageOps:
|
|||||||
):
|
):
|
||||||
self.pages = mdb["pages"]
|
self.pages = mdb["pages"]
|
||||||
self.crawls = mdb["crawls"]
|
self.crawls = mdb["crawls"]
|
||||||
|
self.mdb = mdb
|
||||||
self.crawl_ops = crawl_ops
|
self.crawl_ops = crawl_ops
|
||||||
self.org_ops = org_ops
|
self.org_ops = org_ops
|
||||||
self.storage_ops = storage_ops
|
self.storage_ops = storage_ops
|
||||||
@ -785,9 +786,70 @@ class PageOps:
|
|||||||
|
|
||||||
async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
    """Delete existing pages for crawl and re-add from WACZs.

    For crawls (anything that is not an upload), the per-page review data
    (``qa``, ``modified``, ``userid``, ``approved``, ``notes``) is first
    copied into a temporary collection, and merged back into ``pages`` by
    ``_id`` once the pages have been re-added.

    This is best-effort: any exception is logged and swallowed. If a
    failure happens after the review data was stashed, the temporary
    collection is deliberately left in place, since at that point it may
    hold the only remaining copy of that data.

    :param crawl_id: id of the crawl or upload whose pages are rebuilt
    :param oid: org id, passed through to ``delete_crawl_pages``
    """
    # Set only once a temp collection name has been chosen, so the error
    # handler can report which collection was left behind.
    qa_temp_db_name = None

    try:
        is_upload = await self.crawl_ops.is_upload(crawl_id)
        print(f"Processing {'upload' if is_upload else 'crawl'} {crawl_id}")

        if not is_upload:
            ts_now = dt_now().strftime("%Y%m%d%H%M%S")
            qa_temp_db_name = f"pages-qa-temp-{crawl_id}-{ts_now}"

            stash_pipeline = [
                {
                    # Only pages that carry some review data are worth saving
                    "$match": {
                        "crawl_id": crawl_id,
                        "$or": [
                            {"qa": {"$nin": [None, {}]}},
                            {"modified": {"$ne": None}},
                            {"userid": {"$ne": None}},
                            {"approved": {"$ne": None}},
                            {"notes": {"$ne": None}},
                        ],
                    }
                },
                {
                    "$project": {
                        "_id": 1,
                        "qa": 1,
                        "modified": 1,
                        "userid": 1,
                        "approved": 1,
                        "notes": 1,
                    }
                },
                {"$out": qa_temp_db_name},
            ]

            # Drain the cursor with a plain await instead of wrapping it in
            # an `assert`: assert statements are removed under `python -O`,
            # which would remove this await as well and leave the pipeline
            # unexecuted right before the pages get deleted.
            await self.pages.aggregate(stash_pipeline).to_list()
            print(f"Stored QA data in temp db {qa_temp_db_name}")

        await self.delete_crawl_pages(crawl_id, oid)
        print(f"Deleted pages for crawl {crawl_id}", flush=True)
        await self.add_crawl_pages_to_db_from_wacz(crawl_id)

        if qa_temp_db_name is not None:
            qa_temp_db = self.mdb[qa_temp_db_name]

            restore_pipeline = [
                {
                    "$merge": {
                        "into": "pages",
                        "on": ["_id"],
                        # NOTE(review): "fail" aborts the merge if a stashed
                        # page id is missing after the re-add -- confirm
                        # that page ids are stable across re-adds.
                        "whenNotMatched": "fail",
                    }
                }
            ]

            # Same as above: run the pipeline unconditionally, not via assert
            await qa_temp_db.aggregate(restore_pipeline).to_list()
            print(f"Merged QA data from temp db {qa_temp_db_name}")

            await qa_temp_db.drop()
            print(f"Dropped temp db {qa_temp_db_name}")

    # pylint: disable=broad-exception-caught
    except Exception as e:
        print(f"Error re-adding pages for crawl {crawl_id}: {e}", flush=True)
        if qa_temp_db_name is not None:
            # Do not drop here: the stashed data may be needed for recovery
            print(
                f"Temp db {qa_temp_db_name} was not dropped and may still "
                "hold QA data",
                flush=True,
            )
|
||||||
|
|
||||||
async def re_add_all_crawl_pages(
|
async def re_add_all_crawl_pages(
|
||||||
self, org: Organization, crawl_type: Optional[str] = None
|
self, org: Organization, crawl_type: Optional[str] = None
|
||||||
|
@ -55,8 +55,8 @@ spec:
|
|||||||
|
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
memory: "200Mi"
|
memory: "500Mi"
|
||||||
|
|
||||||
requests:
|
requests:
|
||||||
memory: "200Mi"
|
memory: "250Mi"
|
||||||
cpu: "50m"
|
cpu: "200m"
|
||||||
|
Loading…
Reference in New Issue
Block a user