From f243d34395a66ce6fac3b68bee3f40764609d65d Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@users.noreply.github.com>
Date: Fri, 12 Apr 2024 16:04:33 -0700
Subject: [PATCH] Remove pages from QA Configmap (#1671)

Fixes #1670

It is no longer necessary to pass pages to the ConfigMap. The ConfigMap
has a size limit and will fail if there are too many pages.

With this change, the page list for QA will be read directly from the
pages.jsonl / extraPages.jsonl entries in the WACZ files.
---
 backend/btrixcloud/models.py          | 8 --------
 backend/btrixcloud/operator/crawls.py | 9 +--------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e0bd3f5e..64567b55 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page):
     """Page out with single QA entry"""
 
     qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PagesAndResources(BaseModel):
-    """moage for qa configmap data, pages + resources"""
-
-    resources: List[CrawlFileOut] = []
-    pages: List[PageOut] = []
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 5a2e19e4..e5e80021 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -27,7 +27,6 @@ from btrixcloud.models import (
     CrawlFile,
     CrawlCompleteIn,
     StorageRef,
-    PagesAndResources,
 )
 
 from btrixcloud.utils import (
@@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator):
         if name in children[CMAP]:
             return [children[CMAP][name]]
 
-        pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
-
         crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
 
-        res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
-
         params["name"] = name
-        params["qa_source_replay_json"] = res_and_pages.json()
-        # params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
-
+        params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
         return self.load_from_yaml("qa_configmap.yaml", params)
 
     def _load_crawler(self, params, i, status, children):