Remove pages from QA Configmap (#1671)
Fixes #1670. There is no longer a need to pass the page list to the ConfigMap: Kubernetes caps ConfigMap data at 1 MiB, so ConfigMap creation fails when a crawl has too many pages. With this change, the page list for QA is read directly from the WACZ files' pages.jsonl / extraPages.jsonl entries (see the sketch after the commit metadata below).
Parent: ed08b734ba
Commit: f243d34395
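For context on the new source of truth: a WACZ file is a ZIP archive, and per the WACZ spec the page list lives at pages/pages.jsonl (plus pages/extraPages.jsonl for pages beyond the seed list), one JSON object per line with a format header as the first record. The helper below is an illustrative sketch only, not code from this commit; the function name and the "url"-key filtering are assumptions.

import json
import zipfile
from typing import Iterator

def iter_wacz_pages(wacz_path: str) -> Iterator[dict]:
    """Yield page entries from pages.jsonl / extraPages.jsonl inside a WACZ.

    Hypothetical helper: the actual page reading happens in the QA crawler,
    not in this backend.
    """
    with zipfile.ZipFile(wacz_path) as zf:
        for name in ("pages/pages.jsonl", "pages/extraPages.jsonl"):
            if name not in zf.namelist():
                continue
            with zf.open(name) as fh:
                for line in fh:
                    if not line.strip():
                        continue
                    entry = json.loads(line)
                    # Skip the JSONL header record, e.g. {"format": "json-pages-1.0", ...}
                    if "url" in entry:
                        yield entry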
@@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page):
     """Page out with single QA entry"""
 
     qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PagesAndResources(BaseModel):
-    """model for qa configmap data, pages + resources"""
-
-    resources: List[CrawlFileOut] = []
-    pages: List[PageOut] = []
@@ -27,7 +27,6 @@ from btrixcloud.models import (
     CrawlFile,
     CrawlCompleteIn,
     StorageRef,
-    PagesAndResources,
 )
 
 from btrixcloud.utils import (
@@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator):
         if name in children[CMAP]:
             return [children[CMAP][name]]
 
-        pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
-
         crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
 
-        res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
-
         params["name"] = name
-        params["qa_source_replay_json"] = res_and_pages.json()
-        # params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
-
+        params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
         return self.load_from_yaml("qa_configmap.yaml", params)
 
     def _load_crawler(self, params, i, status, children):
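Why crawl_replay.json(include={"resources"}) keeps the ConfigMap small: pydantic v1's .json(include=...) serializes only the named fields, so the QA ConfigMap now carries just the WACZ resource list and no page entries. A minimal sketch with stand-in models (the stub classes below are simplified assumptions, not the real btrixcloud models):

from typing import List
from pydantic import BaseModel  # pydantic v1 API

class CrawlFileOut(BaseModel):  # stand-in: the real model has more fields
    name: str
    path: str

class CrawlReplayStub(BaseModel):  # stand-in for the internal crawl-out model
    id: str
    resources: List[CrawlFileOut] = []

crawl_replay = CrawlReplayStub(
    id="qa-source-crawl",
    resources=[CrawlFileOut(name="data.wacz", path="https://example.com/data.wacz")],
)

# Only the "resources" field is emitted; "id" (and any page data) is excluded.
print(crawl_replay.json(include={"resources"}))
# {"resources": [{"name": "data.wacz", "path": "https://example.com/data.wacz"}]}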