Remove pages from QA Configmap (#1671)

Fixes #1670 

Pages no longer need to be passed through the ConfigMap. A ConfigMap has a size
limit (1 MiB), so the operation fails when a crawl has too many pages.

With this change, the page list for QA will be read directly from the
WACZ files pages.jsonl / extraPages.jsonl entries.
This commit is contained in:
Ilya Kreymer 2024-04-12 16:04:33 -07:00 committed by GitHub
parent ed08b734ba
commit f243d34395
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 1 additions and 16 deletions

View File

@@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page):
"""Page out with single QA entry"""
qa: Optional[PageQACompare] = None
# ============================================================================
class PagesAndResources(BaseModel):
    """Message for QA configmap data: the source crawl's replay resources plus its page list.

    NOTE(review): per commit #1671 this model is being removed — pages are no
    longer passed through the ConfigMap (which has a size limit); the QA page
    list is read directly from the WACZ pages.jsonl / extraPages.jsonl entries.
    """

    # WACZ file outputs of the QA source crawl (replay resources)
    resources: List[CrawlFileOut] = []
    # full page list for the source crawl; empty by default
    pages: List[PageOut] = []

View File

@@ -27,7 +27,6 @@ from btrixcloud.models import (
CrawlFile,
CrawlCompleteIn,
StorageRef,
PagesAndResources,
)
from btrixcloud.utils import (
@@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator):
if name in children[CMAP]:
return [children[CMAP][name]]
pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
params["name"] = name
params["qa_source_replay_json"] = res_and_pages.json()
# params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
return self.load_from_yaml("qa_configmap.yaml", params)
def _load_crawler(self, params, i, status, children):