Remove pages from QA Configmap (#1671)

Fixes #1670 

Pages no longer need to be passed through the ConfigMap. A ConfigMap has a size
limit (1 MiB), so the operation fails when a crawl has too many pages.

With this change, the page list for QA will be read directly from the
WACZ files pages.jsonl / extraPages.jsonl entries.
This commit is contained in:
Ilya Kreymer 2024-04-12 16:04:33 -07:00 committed by GitHub
parent ed08b734ba
commit f243d34395
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 1 additions and 16 deletions

View File

@@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page):
"""Page out with single QA entry"""
qa: Optional[PageQACompare] = None
# ============================================================================
class PagesAndResources(BaseModel):
    """Message for QA configmap data: the source crawl's replay resources plus its page list.

    NOTE(review): per commit #1671 this model is being removed — pages are no
    longer passed through the ConfigMap (which has a size limit); the QA page
    list is read directly from the WACZ pages.jsonl / extraPages.jsonl entries.
    """

    # WACZ file outputs of the QA source crawl (replay resources)
    resources: List[CrawlFileOut] = []
    # full page list for the source crawl; empty by default
    pages: List[PageOut] = []

View File

@@ -27,7 +27,6 @@ from btrixcloud.models import (
CrawlFile,
CrawlCompleteIn,
StorageRef,
PagesAndResources,
)
from btrixcloud.utils import (
@@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator):
if name in children[CMAP]:
return [children[CMAP][name]]
pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
params["name"] = name
params["qa_source_replay_json"] = res_and_pages.json()
# params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
return self.load_from_yaml("qa_configmap.yaml", params)
def _load_crawler(self, params, i, status, children):