Optimize single-page crawl workflows (#2656)
For single-page crawls:

- Always force 1 browser window to be used, ignoring the browser windows / scale setting.
- Don't use custom PVC volumes in the crawler / redis pods, just use emptyDir; for a single page there is no chance of the crawler being interrupted and restarted on a different machine.

Adds an 'is_single_page' check to CrawlConfigOps, which treats a config as single-page when it has exactly one seed and either a page limit of 1, or a "page" scope type with no extra hops (see the sketch below).

Fixes #2655
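For illustration, here is a minimal standalone sketch of that detection rule, using plain dicts in place of the RawCrawlConfig / seed models shown in the diff below. The helper and the example configs are hypothetical, not part of the commit:

```python
# Illustrative only: mirrors the is_single_page check added in this commit,
# but operates on plain dicts instead of RawCrawlConfig / Seed models.
def is_single_page(config: dict) -> bool:
    seeds = config.get("seeds") or []
    if len(seeds) != 1:
        return False

    if config.get("limit") == 1:
        return True

    seed = seeds[0]
    extra_hops = seed.get("extraHops") or config.get("extraHops")
    scope_type = seed.get("scopeType") or config.get("scopeType")
    return extra_hops == 0 and scope_type == "page"


# Single page: explicit page limit of 1
assert is_single_page({"seeds": [{"url": "https://example.com/"}], "limit": 1})

# Single page: page scope with no extra hops
assert is_single_page(
    {"seeds": [{"url": "https://example.com/"}], "scopeType": "page", "extraHops": 0}
)

# Not single page: extra hops may pull in more pages (as in the updated test fixture)
assert not is_single_page({"seeds": [{"url": "https://example.com/"}], "extraHops": 1})
```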
parent 86c4d326e9
commit 8ea16393c5
@@ -45,6 +45,7 @@ from .models import (
    CrawlerProxy,
    CrawlerProxies,
    ValidateCustomBehavior,
    RawCrawlConfig,
)
from .utils import (
    dt_now,
@@ -223,15 +224,18 @@ class CrawlConfigOps:
    ) -> CrawlConfigAddedResponse:
        """Add new crawl config"""

        # ensure crawlChannel is valid
        if not self.get_channel_crawler_image(config_in.crawlerChannel):
            raise HTTPException(status_code=404, detail="crawler_not_found")

        # Overrides scale if set
        if config_in.browserWindows is None:
            config_in.browserWindows = browser_windows_from_scale(
                cast(int, config_in.scale)
            )

        # ensure crawlChannel is valid
        if not self.get_channel_crawler_image(config_in.crawlerChannel):
            raise HTTPException(status_code=404, detail="crawler_not_found")
        if self.is_single_page(config_in.config):
            config_in.browserWindows = 1

        profileid = None
        if isinstance(config_in.profileid, UUID):
@@ -321,6 +325,19 @@ class CrawlConfigOps:
            execMinutesQuotaReached=exec_mins_quota_reached,
        )

    def is_single_page(self, config: RawCrawlConfig):
        """return true if this config represents a single page crawl"""
        if not config.seeds or len(config.seeds) != 1:
            return False

        if config.limit == 1:
            return True

        extra_hops = config.seeds[0].extraHops or config.extraHops
        scope_type = config.seeds[0].scopeType or config.scopeType

        return extra_hops == 0 and scope_type == "page"

    def _validate_link_selectors(self, link_selectors: List[str]):
        """Validate link selectors

@@ -435,6 +452,10 @@ class CrawlConfigOps:
        if update.config and update.config.lang:
            validate_language_code(update.config.lang)

        if update.config or update.browserWindows:
            if self.is_single_page(update.config or orig_crawl_config.config):
                update.browserWindows = 1

        # indicates if any k8s crawl config settings changed
        changed = False
        changed = changed or (
@@ -1021,6 +1042,7 @@ class CrawlConfigOps:
                warc_prefix=self.get_warc_prefix(org, crawlconfig),
                storage_filename=storage_filename,
                profile_filename=profile_filename or "",
                is_single_page=self.is_single_page(crawlconfig.config),
            )
            await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
            return crawl_id

@@ -220,6 +220,7 @@ class CrawlManager(K8sAPI):
        warc_prefix: str,
        storage_filename: str,
        profile_filename: str,
        is_single_page: bool,
    ) -> str:
        """create new crawl job from config"""
        cid = str(crawlconfig.id)
@@ -244,6 +245,7 @@
            storage_filename=storage_filename,
            profile_filename=profile_filename,
            proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
            is_single_page=is_single_page,
        )

    async def reload_running_crawl_config(self, crawl_id: str):

@@ -95,6 +95,7 @@ class K8sAPI:
        profile_filename: str = "",
        qa_source: str = "",
        proxy_id: str = "",
        is_single_page: bool = False,
    ):
        """load job template from yaml"""
        if not crawl_id:
@@ -119,6 +120,7 @@
            "profile_filename": profile_filename,
            "qa_source": qa_source,
            "proxy_id": proxy_id,
            "is_single_page": "1" if is_single_page else "0",
        }

        data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -142,6 +144,7 @@
        profile_filename: str = "",
        qa_source: str = "",
        proxy_id: str = "",
        is_single_page: bool = False,
    ) -> str:
        """load and init crawl job via k8s api"""
        crawl_id, data = self.new_crawl_job_yaml(
@@ -161,6 +164,7 @@
            profile_filename=profile_filename,
            qa_source=qa_source,
            proxy_id=proxy_id,
            is_single_page=is_single_page,
        )

        # create job directly

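The flag crosses the API/operator boundary as the string "1" or "0" in the rendered CrawlJob spec rather than as a boolean. A rough round-trip sketch of that convention, using an inline Jinja template as a stand-in for the real crawl_job.yaml chart template:

```python
# Sketch of the "1"/"0" string convention for the CrawlJob spec value.
# The inline template is a stand-in for the real crawl_job.yaml chart template.
import yaml
from jinja2 import Template

job_template = Template(
    """
spec:
  isSinglePage: "{{ is_single_page }}"
"""
)


def render_spec(is_single_page: bool) -> dict:
    rendered = job_template.render(is_single_page="1" if is_single_page else "0")
    return yaml.safe_load(rendered)["spec"]


# The operator side reads the value back with a string comparison, matching
# spec.get("isSinglePage") == "1" in the CrawlOperator diff below.
assert (render_spec(True).get("isSinglePage") == "1") is True
assert (render_spec(False).get("isSinglePage") == "1") is False
```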
@@ -101,6 +101,8 @@ class CrawlOperator(BaseOperator):

    paused_expires_delta: timedelta

    num_browsers_per_pod: int

    def __init__(self, *args):
        super().__init__(*args)

@@ -125,6 +127,8 @@ class CrawlOperator(BaseOperator):

        self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)

        self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))

    def init_routes(self, app):
        """init routes for this operator"""

@@ -181,6 +185,7 @@
            max_crawl_size=int(spec.get("maxCrawlSize") or 0),
            scheduled=spec.get("manual") != "1",
            qa_source_crawl_id=spec.get("qaSourceCrawlId"),
            is_single_page=spec.get("isSinglePage") == "1",
        )

        if crawl.qa_source_crawl_id:
@@ -301,7 +306,7 @@
                status.stopReason = stop_reason
                await self.mark_finished(crawl, status, state)

        children = self._load_redis(params, status, data.children)
        children = self._load_redis(params, status, crawl, data.children)

        storage_path = crawl.storage.get_storage_extra_path(oid)
        storage_secret = crawl.storage.get_storage_secret_name(oid)
@@ -368,10 +373,8 @@
        # crawl_scale is the number of pods to create
        crawler_scale = scale_from_browser_windows(crawl.browser_windows)

        browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))

        for i in range(0, crawler_scale):
            if status.pagesFound < i * browsers_per_pod:
            if status.pagesFound < i * self.num_browsers_per_pod:
                break

            children.extend(
@@ -392,7 +395,7 @@
            "resyncAfterSeconds": status.resync_after,
        }

    def _load_redis(self, params, status: CrawlStatus, children):
    def _load_redis(self, params, status: CrawlStatus, crawl: CrawlSpec, children):
        name = f"redis-{params['id']}"
        has_pod = name in children[POD]

@@ -400,6 +403,8 @@
        params["name"] = name
        params["cpu"] = pod_info.newCpu or params.get("redis_cpu")
        params["memory"] = pod_info.newMemory or params.get("redis_memory")
        params["no_pvc"] = crawl.is_single_page

        restart_reason = None
        if has_pod:
            restart_reason = pod_info.should_restart_pod()
@@ -870,7 +875,7 @@
        if redis_pod in pods:
            # if has other pods, keep redis pod until they are removed
            if len(pods) > 1:
                new_children = self._load_redis(params, status, children)
                new_children = self._load_redis(params, status, crawl, children)
                await self.increment_pod_exec_time(pods, crawl, status)

        # keep pvs until pods are removed

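For context on the pod loop above: crawler pod i is only created once enough pages have been found to keep pods 0..i-1 busy, so a single-page crawl (forced to one browser window) never scales past one pod. A standalone sketch of that gating, not operator code:

```python
# Standalone sketch of the gating in the loop above: pod i is only created once
# the pages found so far would fill all browsers in pods 0..i-1.
def pods_to_create(crawler_scale: int, pages_found: int, browsers_per_pod: int) -> int:
    pods = 0
    for i in range(crawler_scale):
        if pages_found < i * browsers_per_pod:
            break
        pods += 1
    return pods


# With 2 browsers per pod and a target scale of 3 pods:
assert pods_to_create(3, 1, 2) == 1    # one page found: only the first pod runs
assert pods_to_create(3, 2, 2) == 2    # enough pages to occupy a second pod
assert pods_to_create(3, 100, 2) == 3  # plenty of pages: full scale

# A single-page crawl is forced to browserWindows=1 and pagesFound stays at 1,
# so it never needs more than the first pod; the same CrawlSpec flag also sets
# no_pvc so redis and the crawler use emptyDir instead of a PVC.
assert pods_to_create(1, 1, 2) == 1
```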
@@ -140,6 +140,7 @@ class CronJobOperator(BaseOperator):
            storage_filename=self.crawl_config_ops.default_filename_template,
            profile_filename=profile_filename or "",
            proxy_id=crawlconfig.proxyId or "",
            is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config),
        )

        return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob)))

@@ -86,6 +86,7 @@ class CrawlSpec(BaseModel):
    max_crawl_size: int = 0
    qa_source_crawl_id: Optional[str] = ""
    proxy_id: Optional[str] = None
    is_single_page: bool = False

    @property
    def db_crawl_id(self) -> str:

@@ -331,7 +331,7 @@ def sample_crawl_data():
    return {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://example.com/"}]},
        "config": {"seeds": [{"url": "https://example.com/"}], "extraHops": 1},
        "tags": ["tag1", "tag2"],
    }

@@ -6,6 +6,7 @@ from .conftest import API_PREFIX


cid = None
cid_single_page = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]
@@ -67,6 +68,37 @@ def test_verify_default_browser_windows(
    assert data["browserWindows"] == 2


def test_add_crawl_config_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Create crawl config
    sample_crawl_data["config"]["limit"] = 1
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=sample_crawl_data,
    )
    assert r.status_code == 200

    data = r.json()
    global cid_single_page
    cid_single_page = data["id"]


def test_verify_default_browser_windows_single_page(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data.get("scale") is None
    assert data["browserWindows"] == 1


def test_custom_browser_windows(
    crawler_auth_headers, default_org_id, sample_crawl_data
):

@@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}",
        headers=crawler_auth_headers,
    )
    assert len(r.json()["items"]) == 7
    assert r.json()["total"] == 7
    assert len(r.json()["items"]) == 8
    assert r.json()["total"] == 8


def test_get_config_by_modified_by(
@@ -23,8 +23,8 @@ def test_get_config_by_modified_by(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
        headers=crawler_auth_headers,
    )
    assert len(r.json()["items"]) == 7
    assert r.json()["total"] == 7
    assert len(r.json()["items"]) == 8
    assert r.json()["total"] == 8


def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["total"] == 13
    assert data["total"] == 14
    items = data["items"]
    assert len(items) == 13
    assert len(items) == 14

    last_created = None
    for config in items:

@@ -39,3 +39,5 @@ spec:

  pausedAt: "{{ pausedAt }}"

  isSinglePage: "{{ is_single_page }}"

@@ -1,3 +1,4 @@
{% if not no_pvc %}
# -------
# PVC
# -------
@@ -23,7 +24,7 @@ spec:
  storageClassName: {{ volume_storage_class }}
  {% endif %}


{% endif %}

# -------
# CRAWLER
# -------
@@ -67,8 +68,12 @@ spec:
        name: qa-replay-{{ qa_source_crawl_id }}
    {% endif %}
    - name: crawl-data
    {% if not no_pvc %}
      persistentVolumeClaim:
        claimName: {{ name }}
    {% else %}
      emptyDir: {}
    {% endif %}
    {% if proxy_id %}
    - name: proxies
      secret:

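A quick sketch of what the no_pvc switch produces for the crawl-data volume, with an inline template standing in for the crawler.yaml excerpt above; the claim name is illustrative:

```python
# Sketch: the no_pvc flag swaps the crawl-data volume between a PVC and emptyDir.
# The inline snippet stands in for the crawler.yaml excerpt above; the claim
# name is illustrative only.
import yaml
from jinja2 import Template

volume_tmpl = Template(
    """
volumes:
  - name: crawl-data
{% if not no_pvc %}
    persistentVolumeClaim:
      claimName: {{ name }}
{% else %}
    emptyDir: {}
{% endif %}
"""
)


def render_volumes(no_pvc: bool) -> list:
    rendered = volume_tmpl.render(no_pvc=no_pvc, name="crawl-abc123")
    return yaml.safe_load(rendered)["volumes"]


# Single-page crawl: ephemeral storage only, nothing to reattach on restart.
assert render_volumes(True)[0] == {"name": "crawl-data", "emptyDir": {}}
# Multi-page crawl: durable PVC so a restarted pod can resume from disk.
assert render_volumes(False)[0]["persistentVolumeClaim"]["claimName"] == "crawl-abc123"
```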
@@ -1,3 +1,4 @@
{% if not no_pvc %}
# -------
# PVC
# -------
@@ -22,6 +23,7 @@ spec:
  {% if volume_storage_class %}
  storageClassName: {{ volume_storage_class }}
  {% endif %}
{% endif %}

# --------
# REDIS
@@ -51,8 +53,12 @@ spec:
            path: redis.conf

    - name: redis-data
      {% if not no_pvc %}
      persistentVolumeClaim:
        claimName: {{ name }}
      {% else %}
      emptyDir: {}
      {% endif %}

  affinity:
    nodeAffinity:

@@ -277,11 +277,11 @@ crawler_memory_base: 1024Mi
# number of browser workers per crawler instances
crawler_browser_instances: 2

# number of browser workers per crawler instances for QA runs
# number of browser workers per QA pod to run for QA runs
# defaults to 'crawler_browser_instances' if not set
# qa_browser_instances: 2
qa_browser_instances: 1

# fixed scale (number of crawler pods) for QA runs
# fixed scale (number of QA pods) to run
qa_scale: 1

# this value is added to crawler_cpu_base, for each additional browser