crawlconfig: fix default filename template, make configurable (#835)

- make the default crawl filename template configurable via the 'default_crawl_filename_template' value in values.yaml
- set to '@ts-@hostsuffix.wacz' by default
- allow updating via 'crawlFilenameTemplate' in the crawlconfig patch, which updates the configmap
- tests: add test for custom 'default_crawl_filename_template'
commit 70319594c2 (parent fd7e81b8b7)
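In short, the filename template is now resolved in two steps: the chart-level default reaches the backend through the DEFAULT_CRAWL_FILENAME_TEMPLATE configmap variable, and a per-config crawlFilenameTemplate overrides it. A minimal sketch of that resolution order, using only names visible in the diff below (the standalone function is illustrative):

```python
import os

# chart-level default, injected via the backend configmap
# (see the configmap hunk further down); the fallback literal
# here is only illustrative
DEFAULT_TEMPLATE = os.environ.get(
    "DEFAULT_CRAWL_FILENAME_TEMPLATE", "@ts-@hostsuffix.wacz"
)

def resolve_filename_template(data: dict) -> str:
    # `or` (rather than an `is None` check) means a missing key,
    # None, or an empty string all fall back to the default,
    # matching the `data.get(...) or ...` expression in the diff
    return data.get("crawlFilenameTemplate") or DEFAULT_TEMPLATE
```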
@@ -7,6 +7,7 @@ from enum import Enum
 import uuid
 import asyncio
 import re
+import os
 from datetime import datetime
 import urllib.parse
@@ -116,6 +117,8 @@ class CrawlConfigIn(BaseModel):
     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
 
+    crawlFilenameTemplate: Optional[str]
+
 
 # ============================================================================
 class ConfigRevision(BaseMongoModel):
@@ -233,6 +236,7 @@ class UpdateCrawlConfig(BaseModel):
     profileid: Optional[str]
     crawlTimeout: Optional[int]
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
+    crawlFilenameTemplate: Optional[str]
     config: Optional[RawCrawlConfig]
 
@@ -252,6 +256,7 @@ class CrawlConfigOps:
         self.profiles = profiles
         self.profiles.set_crawlconfigs(self)
         self.crawl_ops = None
+        self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
 
         self.router = APIRouter(
             prefix="/crawlconfigs",
@@ -331,10 +336,9 @@ class CrawlConfigOps:
 
         crawlconfig = CrawlConfig.from_dict(data)
 
-        suffix = f"{self.sanitize(str(crawlconfig.id))}-{self.sanitize(user.name)}"
-
-        # pylint: disable=line-too-long
-        out_filename = f"data/{self.sanitize(str(crawlconfig.id))}-@id/{suffix}-@ts-@hostsuffix.wacz"
+        out_filename = (
+            data.get("crawlFilenameTemplate") or self.default_filename_template
+        )
 
         crawl_id = await self.crawl_manager.add_crawl_config(
             crawlconfig=crawlconfig,
@@ -382,6 +386,9 @@
         changed = changed or (
             self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
         )
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
+        )
         changed = changed or (
             self.check_attr_changed(orig_crawl_config, update, "schedule")
         )
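Because crawlFilenameTemplate is now part of UpdateCrawlConfig and of the changed-attribute checks, the template can be changed after a config is created. A hypothetical client call; the endpoint shape, host, and auth header are assumptions for illustration, not verified against this revision of the API:

```python
import requests

API = "https://btrix.example.com/api"  # hypothetical deployment
headers = {"Authorization": "Bearer <access-token>"}

# patch a single crawlconfig; omitted fields are left unchanged
resp = requests.patch(
    f"{API}/orgs/<org-id>/crawlconfigs/<config-id>",
    headers=headers,
    json={"crawlFilenameTemplate": "@id-@ts.wacz"},
)
resp.raise_for_status()
```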
@@ -97,12 +97,12 @@ class CrawlManager(K8sAPI):
         # Create Config Map
         await self._create_config_map(
             crawlconfig,
-            STORE_PATH=storage_path,
-            STORE_FILENAME=out_filename,
-            STORAGE_NAME=storage_name,
             USER_ID=str(crawlconfig.modifiedBy),
             ORG_ID=str(crawlconfig.oid),
             CRAWL_CONFIG_ID=str(crawlconfig.id),
+            STORE_PATH=storage_path,
+            STORE_FILENAME=out_filename,
+            STORAGE_NAME=storage_name,
             PROFILE_FILENAME=profile_filename,
             INITIAL_SCALE=str(crawlconfig.scale),
             CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout)
@@ -147,9 +147,8 @@ class CrawlManager(K8sAPI):
         ):
             await self._update_config_map(
                 crawlconfig,
-                update.scale,
+                update,
                 profile_filename,
-                update.crawlTimeout,
                 has_config_update,
             )
 
@@ -397,22 +396,24 @@
     async def _update_config_map(
         self,
         crawlconfig,
-        scale=None,
+        update,
         profile_filename=None,
-        crawl_timeout=None,
         update_config=False,
     ):
        config_map = await self.get_configmap(crawlconfig.id)
 
-        if scale is not None:
-            config_map.data["INITIAL_SCALE"] = str(scale)
+        if update.scale is not None:
+            config_map.data["INITIAL_SCALE"] = str(update.scale)
+
+        if update.crawlTimeout is not None:
+            config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
+
+        if update.crawlFilenameTemplate is not None:
+            config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate
 
         if profile_filename is not None:
             config_map.data["PROFILE_FILENAME"] = profile_filename
 
-        if crawl_timeout is not None:
-            config_map.data["CRAWL_TIMEOUT"] = str(crawl_timeout)
-
         if update_config:
             config_map.data["crawl-config.json"] = json.dumps(
                 crawlconfig.get_raw_config()
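Passing the whole update object, rather than one positional argument per field, is what keeps the _update_config_map signature stable as new optional fields appear. A self-contained sketch of the pattern, with the model reduced to the three fields this diff touches:

```python
from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class Update:  # stand-in for UpdateCrawlConfig
    scale: Optional[int] = None
    crawlTimeout: Optional[int] = None
    crawlFilenameTemplate: Optional[str] = None

def apply_update(data: Dict[str, str], update: Update) -> Dict[str, str]:
    # None means "field not supplied in this PATCH": leave the key alone
    if update.scale is not None:
        data["INITIAL_SCALE"] = str(update.scale)
    if update.crawlTimeout is not None:
        data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
    if update.crawlFilenameTemplate is not None:
        data["STORE_FILENAME"] = update.crawlFilenameTemplate
    return data

print(apply_update({}, Update(crawlFilenameTemplate="@ts.wacz")))
# {'STORE_FILENAME': '@ts.wacz'}
```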
@@ -124,19 +124,19 @@ spec:
 
           env:
             - name: CRAWL_ID
-              value: {{ id }}
+              value: "{{ id }}"
 
             - name: WEBHOOK_URL
-              value: {{ redis_url }}/crawls-done
+              value: "{{ redis_url }}/crawls-done"
 
             - name: STORE_PATH
-              value: {{ store_path }}
+              value: "{{ store_path }}"
 
             - name: STORE_FILENAME
-              value: {{ store_filename }}
+              value: "{{ store_filename }}"
 
             - name: STORE_USER
-              value: {{ userid }}
+              value: "{{ userid }}"
 
           resources:
             limits:
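Quoting these templated values is not cosmetic: '@' is a reserved indicator in YAML and cannot start a plain scalar, so an unquoted STORE_FILENAME of '@ts-@hostsuffix.wacz' would make the rendered manifest unparseable. A quick PyYAML demonstration:

```python
import yaml

# unquoted: "@" cannot start a plain scalar, so parsing fails
try:
    yaml.safe_load("value: @ts-@hostsuffix.wacz")
except yaml.YAMLError as exc:
    print("unquoted fails:", type(exc).__name__)  # ScannerError

# quoted: parses as an ordinary string
print(yaml.safe_load('value: "@ts-@hostsuffix.wacz"'))
# {'value': '@ts-@hostsuffix.wacz'}
```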
@@ -3,6 +3,7 @@ import hashlib
 import time
 import io
 import zipfile
+import re
 
 from .conftest import API_PREFIX, HOST_PREFIX
 from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -58,6 +59,10 @@ def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
 
+    # ensure filename matches the pattern set in
+    # default_crawl_filename_template
+    assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", data["resources"][0]["path"])
+
     assert data["tags"] == ["wr-test-1", "wr-test-2"]
 
     global wacz_path
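The assertion checks the shape produced by the test template '@ts-testing-@hostsuffix.wacz'. A hypothetical rendered path, assuming '@ts' expands to a digits-only timestamp and '@hostsuffix' to the tail of the crawler pod hostname:

```python
import re

# hypothetical path; the real one also carries storage prefix segments
path = "data/20221130120000000-testing-crawl-instance-0.wacz"
assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", path)
```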
@@ -54,6 +54,8 @@ data:
 
   DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"
 
+  DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
+
   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
 
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
@@ -4,6 +4,8 @@
 backend_pull_policy: "Never"
 frontend_pull_policy: "Never"
 
+default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
+
 operator_resync_seconds: 5
 
 mongo_auth:
@@ -34,15 +34,20 @@ max_pages_per_crawl: 0
 # if set to "1", allow inviting same user to same org multiple times
 allow_dupe_invites: "0"
 
-# number of workers for backend api
-backend_workers: 4
-
 # number of seconds before pending invites expire - default is 7 days
 invite_expire_seconds: 604800
 
 # base url for replayweb.page
 rwp_base_url: "https://replayweb.page/"
 
+# default template for generated wacz files
+# supports the following interpolated vars:
+# @ts - current timestamp
+# @hostname - full hostname
+# @hostsuffix - last 14 characters of hostname
+# @id - full crawl id
+default_crawl_filename_template: "@ts-@hostsuffix.wacz"
+
 superuser:
   # set this to enable a superuser admin
   email: admin@example.com
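The interpolation itself is performed by the crawler, not by this repo; purely as an illustration, here is what the default template could expand to under assumed values:

```python
# naive expansion, for illustration only
template = "@ts-@hostsuffix.wacz"
substitutions = {
    "@ts": "20221130120000000",   # assumed digits-only timestamp
    "@hostsuffix": "instance-0",  # assumed tail of the pod hostname
}
filename = template
for var, value in substitutions.items():
    filename = filename.replace(var, value)
print(filename)  # 20221130120000000-instance-0.wacz
```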
@@ -62,8 +67,12 @@ backend_pull_policy: "Always"
 
 backend_password_secret: "c9085f33ecce4347aa1d69339e16c499"
 
+# number of backend pods
 backend_num_replicas: 1
 
+# number of workers per pod
+backend_workers: 2
+
 backend_requests_cpu: "10m"
 backend_limits_cpu: "768m"
 