crawlconfig: fix default filename template, make configurable (#835)
* crawlconfig: fix default filename template, make configurable
  - make the default crawl filename template configurable via the 'default_crawl_filename_template' value in values.yaml
  - set to '@ts-@hostsuffix.wacz' by default
  - allow updating via 'crawlFilenameTemplate' in a crawlconfig patch, which updates the configmap
  - tests: add test for custom 'default_crawl_filename_template'
This commit is contained in:
parent
fd7e81b8b7
commit
70319594c2
@ -7,6 +7,7 @@ from enum import Enum
|
||||
import uuid
|
||||
import asyncio
|
||||
import re
|
||||
import os
|
||||
from datetime import datetime
|
||||
import urllib.parse
|
||||
|
||||
@ -116,6 +117,8 @@ class CrawlConfigIn(BaseModel):
|
||||
crawlTimeout: Optional[int] = 0
|
||||
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
|
||||
|
||||
crawlFilenameTemplate: Optional[str]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class ConfigRevision(BaseMongoModel):
|
||||
@ -233,6 +236,7 @@ class UpdateCrawlConfig(BaseModel):
|
||||
profileid: Optional[str]
|
||||
crawlTimeout: Optional[int]
|
||||
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
|
||||
crawlFilenameTemplate: Optional[str]
|
||||
config: Optional[RawCrawlConfig]
|
||||
|
||||
|
||||
@ -252,6 +256,7 @@ class CrawlConfigOps:
|
||||
self.profiles = profiles
|
||||
self.profiles.set_crawlconfigs(self)
|
||||
self.crawl_ops = None
|
||||
self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
|
||||
|
||||
self.router = APIRouter(
|
||||
prefix="/crawlconfigs",
|
||||
@ -331,10 +336,9 @@ class CrawlConfigOps:
|
||||
|
||||
crawlconfig = CrawlConfig.from_dict(data)
|
||||
|
||||
suffix = f"{self.sanitize(str(crawlconfig.id))}-{self.sanitize(user.name)}"
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
out_filename = f"data/{self.sanitize(str(crawlconfig.id))}-@id/{suffix}-@ts-@hostsuffix.wacz"
|
||||
out_filename = (
|
||||
data.get("crawlFilenameTemplate") or self.default_filename_template
|
||||
)
|
||||
|
||||
crawl_id = await self.crawl_manager.add_crawl_config(
|
||||
crawlconfig=crawlconfig,
|
||||
@ -382,6 +386,9 @@ class CrawlConfigOps:
|
||||
changed = changed or (
|
||||
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
|
||||
)
|
||||
changed = changed or (
|
||||
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
|
||||
)
|
||||
changed = changed or (
|
||||
self.check_attr_changed(orig_crawl_config, update, "schedule")
|
||||
)
|
||||
|
@ -97,12 +97,12 @@ class CrawlManager(K8sAPI):
|
||||
# Create Config Map
|
||||
await self._create_config_map(
|
||||
crawlconfig,
|
||||
STORE_PATH=storage_path,
|
||||
STORE_FILENAME=out_filename,
|
||||
STORAGE_NAME=storage_name,
|
||||
USER_ID=str(crawlconfig.modifiedBy),
|
||||
ORG_ID=str(crawlconfig.oid),
|
||||
CRAWL_CONFIG_ID=str(crawlconfig.id),
|
||||
STORE_PATH=storage_path,
|
||||
STORE_FILENAME=out_filename,
|
||||
STORAGE_NAME=storage_name,
|
||||
PROFILE_FILENAME=profile_filename,
|
||||
INITIAL_SCALE=str(crawlconfig.scale),
|
||||
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout)
|
||||
@ -147,9 +147,8 @@ class CrawlManager(K8sAPI):
|
||||
):
|
||||
await self._update_config_map(
|
||||
crawlconfig,
|
||||
update.scale,
|
||||
update,
|
||||
profile_filename,
|
||||
update.crawlTimeout,
|
||||
has_config_update,
|
||||
)
|
||||
|
||||
@ -397,22 +396,24 @@ class CrawlManager(K8sAPI):
|
||||
async def _update_config_map(
|
||||
self,
|
||||
crawlconfig,
|
||||
scale=None,
|
||||
update,
|
||||
profile_filename=None,
|
||||
crawl_timeout=None,
|
||||
update_config=False,
|
||||
):
|
||||
config_map = await self.get_configmap(crawlconfig.id)
|
||||
|
||||
if scale is not None:
|
||||
config_map.data["INITIAL_SCALE"] = str(scale)
|
||||
if update.scale is not None:
|
||||
config_map.data["INITIAL_SCALE"] = str(update.scale)
|
||||
|
||||
if update.crawlTimeout is not None:
|
||||
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
|
||||
|
||||
if update.crawlFilenameTemplate is not None:
|
||||
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate
|
||||
|
||||
if profile_filename is not None:
|
||||
config_map.data["PROFILE_FILENAME"] = profile_filename
|
||||
|
||||
if crawl_timeout is not None:
|
||||
config_map.data["CRAWL_TIMEOUT"] = str(crawl_timeout)
|
||||
|
||||
if update_config:
|
||||
config_map.data["crawl-config.json"] = json.dumps(
|
||||
crawlconfig.get_raw_config()
|
||||
|
@ -124,19 +124,19 @@ spec:
|
||||
|
||||
env:
|
||||
- name: CRAWL_ID
|
||||
value: {{ id }}
|
||||
value: "{{ id }}"
|
||||
|
||||
- name: WEBHOOK_URL
|
||||
value: {{ redis_url }}/crawls-done
|
||||
value: "{{ redis_url }}/crawls-done"
|
||||
|
||||
- name: STORE_PATH
|
||||
value: {{ store_path }}
|
||||
value: "{{ store_path }}"
|
||||
|
||||
- name: STORE_FILENAME
|
||||
value: {{ store_filename }}
|
||||
value: "{{ store_filename }}"
|
||||
|
||||
- name: STORE_USER
|
||||
value: {{ userid }}
|
||||
value: "{{ userid }}"
|
||||
|
||||
resources:
|
||||
limits:
|
||||
|
@ -3,6 +3,7 @@ import hashlib
|
||||
import time
|
||||
import io
|
||||
import zipfile
|
||||
import re
|
||||
|
||||
from .conftest import API_PREFIX, HOST_PREFIX
|
||||
from .test_collections import UPDATED_NAME as COLLECTION_NAME
|
||||
@ -58,6 +59,10 @@ def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
|
||||
assert len(data["resources"]) == 1
|
||||
assert data["resources"][0]["path"]
|
||||
|
||||
# ensure filename matches specified pattern
|
||||
# set in default_crawl_filename_template
|
||||
assert re.search('/[\\d]+-testing-[\\w-]+\.wacz', data["resources"][0]["path"])
|
||||
|
||||
assert data["tags"] == ["wr-test-1", "wr-test-2"]
|
||||
|
||||
global wacz_path
|
||||
|
@ -54,6 +54,8 @@ data:
|
||||
|
||||
DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"
|
||||
|
||||
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
|
||||
|
||||
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
|
||||
|
||||
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
|
||||
|
@ -4,6 +4,8 @@
|
||||
backend_pull_policy: "Never"
|
||||
frontend_pull_policy: "Never"
|
||||
|
||||
default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
|
||||
|
||||
operator_resync_seconds: 5
|
||||
|
||||
mongo_auth:
|
||||
|
@ -34,15 +34,20 @@ max_pages_per_crawl: 0
|
||||
# if set to "1", allow inviting same user to same org multiple times
|
||||
allow_dupe_invites: "0"
|
||||
|
||||
# number of workers for backend api
|
||||
backend_workers: 4
|
||||
|
||||
# number of seconds before pending invites expire - default is 7 days
|
||||
invite_expire_seconds: 604800
|
||||
|
||||
# base url for replayweb.page
|
||||
rwp_base_url: "https://replayweb.page/"
|
||||
|
||||
# default filename template for generated WACZ files
|
||||
# supports the following interpolated vars:
|
||||
# @ts - current timestamp
|
||||
# @hostname - full hostname
|
||||
# @hostsuffix - last 14 characters of hostname
|
||||
# @id - full crawl id
|
||||
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
|
||||
|
||||
superuser:
|
||||
# set this to enable a superuser admin
|
||||
email: admin@example.com
|
||||
@ -62,8 +67,12 @@ backend_pull_policy: "Always"
|
||||
|
||||
backend_password_secret: "c9085f33ecce4347aa1d69339e16c499"
|
||||
|
||||
# number of backend pods
|
||||
backend_num_replicas: 1
|
||||
|
||||
# number of workers per pod
|
||||
backend_workers: 2
|
||||
|
||||
backend_requests_cpu: "10m"
|
||||
backend_limits_cpu: "768m"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user