From f243d34395a66ce6fac3b68bee3f40764609d65d Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 12 Apr 2024 16:04:33 -0700 Subject: [PATCH] Remove pages from QA Configmap (#1671) Fixes #1670 Pages no longer need to be passed to the ConfigMap. The ConfigMap has a size limit and will fail if there are too many pages. With this change, the page list for QA will be read directly from the WACZ files' pages.jsonl / extraPages.jsonl entries. --- backend/btrixcloud/models.py | 8 -------- backend/btrixcloud/operator/crawls.py | 9 +-------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index e0bd3f5e..64567b55 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page): """Page out with single QA entry""" qa: Optional[PageQACompare] = None - - -# ============================================================================ -class PagesAndResources(BaseModel): - """moage for qa configmap data, pages + resources""" - - resources: List[CrawlFileOut] = [] - pages: List[PageOut] = [] diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 5a2e19e4..e5e80021 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -27,7 +27,6 @@ from btrixcloud.models import ( CrawlFile, CrawlCompleteIn, StorageRef, - PagesAndResources, ) from btrixcloud.utils import ( @@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator): if name in children[CMAP]: return [children[CMAP][name]] - pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000) - crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id) - res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages) - params["name"] = name - params["qa_source_replay_json"] = res_and_pages.json() - # params["qa_source_replay_json"] = 
crawl_replay.json(include={"resources"}) - + params["qa_source_replay_json"] = crawl_replay.json(include={"resources"}) return self.load_from_yaml("qa_configmap.yaml", params) def _load_crawler(self, params, i, status, children):