Remove pages from QA Configmap (#1671)
Fixes #1670. Pages no longer need to be passed to the ConfigMap. The ConfigMap has a size limit and will fail if there are too many pages. With this change, the page list for QA is read directly from the pages.jsonl / extraPages.jsonl entries in the WACZ files.
This commit is contained in:
parent
ed08b734ba
commit
f243d34395
@ -1557,11 +1557,3 @@ class PageOutWithSingleQA(Page):
|
|||||||
"""Page out with single QA entry"""
|
"""Page out with single QA entry"""
|
||||||
|
|
||||||
qa: Optional[PageQACompare] = None
|
qa: Optional[PageQACompare] = None
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
class PagesAndResources(BaseModel):
|
|
||||||
"""moage for qa configmap data, pages + resources"""
|
|
||||||
|
|
||||||
resources: List[CrawlFileOut] = []
|
|
||||||
pages: List[PageOut] = []
|
|
||||||
|
|||||||
@ -27,7 +27,6 @@ from btrixcloud.models import (
|
|||||||
CrawlFile,
|
CrawlFile,
|
||||||
CrawlCompleteIn,
|
CrawlCompleteIn,
|
||||||
StorageRef,
|
StorageRef,
|
||||||
PagesAndResources,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from btrixcloud.utils import (
|
from btrixcloud.utils import (
|
||||||
@ -326,16 +325,10 @@ class CrawlOperator(BaseOperator):
|
|||||||
if name in children[CMAP]:
|
if name in children[CMAP]:
|
||||||
return [children[CMAP][name]]
|
return [children[CMAP][name]]
|
||||||
|
|
||||||
pages, _ = await self.page_ops.list_pages(qa_source_crawl_id, page_size=1000)
|
|
||||||
|
|
||||||
crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
|
crawl_replay = await self.crawl_ops.get_internal_crawl_out(qa_source_crawl_id)
|
||||||
|
|
||||||
res_and_pages = PagesAndResources(resources=crawl_replay.resources, pages=pages)
|
|
||||||
|
|
||||||
params["name"] = name
|
params["name"] = name
|
||||||
params["qa_source_replay_json"] = res_and_pages.json()
|
params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
|
||||||
# params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
|
|
||||||
|
|
||||||
return self.load_from_yaml("qa_configmap.yaml", params)
|
return self.load_from_yaml("qa_configmap.yaml", params)
|
||||||
|
|
||||||
def _load_crawler(self, params, i, status, children):
|
def _load_crawler(self, params, i, status, children):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user