crawlconfig: fix default filename template, make configurable (#835)

* crawlconfig: fix default filename template, make configurable
- make the default crawl filename template configurable via the 'default_crawl_filename_template' value in values.yaml
- set to '@ts-@hostsuffix.wacz' by default
- allow updating it per crawl config via 'crawlFilenameTemplate' in the crawlconfig patch endpoint, which also updates the crawl's configmap
- tests: add a test for a custom 'default_crawl_filename_template'
Ilya Kreymer 2023-05-08 14:03:27 -07:00 committed by GitHub
parent fd7e81b8b7
commit 70319594c2
7 changed files with 50 additions and 24 deletions
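To make the new field concrete, here is a minimal sketch of setting a crawl config's filename template through the API. The deployment URL, ids, token, and the org-scoped path are placeholders and assumptions for illustration, not values taken from this commit:

import requests

API_BASE = "https://btrix.example.com/api"   # hypothetical deployment URL
ORG_ID = "<org-uuid>"                        # placeholder
CID = "<crawlconfig-uuid>"                   # placeholder
TOKEN = "<access-token>"                     # placeholder

# PATCH only crawlFilenameTemplate; per this commit, the backend detects the
# change and rewrites STORE_FILENAME in the crawl's configmap.
resp = requests.patch(
    f"{API_BASE}/orgs/{ORG_ID}/crawlconfigs/{CID}",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={"crawlFilenameTemplate": "@ts-@hostsuffix.wacz"},
)
resp.raise_for_status()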


@@ -7,6 +7,7 @@ from enum import Enum
import uuid
import asyncio
import re
import os
from datetime import datetime
import urllib.parse
@@ -116,6 +117,8 @@ class CrawlConfigIn(BaseModel):
crawlTimeout: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
crawlFilenameTemplate: Optional[str]
# ============================================================================
class ConfigRevision(BaseMongoModel):
@@ -233,6 +236,7 @@ class UpdateCrawlConfig(BaseModel):
profileid: Optional[str]
crawlTimeout: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
crawlFilenameTemplate: Optional[str]
config: Optional[RawCrawlConfig]
@@ -252,6 +256,7 @@ class CrawlConfigOps:
self.profiles = profiles
self.profiles.set_crawlconfigs(self)
self.crawl_ops = None
self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
self.router = APIRouter(
prefix="/crawlconfigs",
@@ -331,10 +336,9 @@ class CrawlConfigOps:
crawlconfig = CrawlConfig.from_dict(data)
suffix = f"{self.sanitize(str(crawlconfig.id))}-{self.sanitize(user.name)}"
# pylint: disable=line-too-long
out_filename = f"data/{self.sanitize(str(crawlconfig.id))}-@id/{suffix}-@ts-@hostsuffix.wacz"
out_filename = (
data.get("crawlFilenameTemplate") or self.default_filename_template
)
crawl_id = await self.crawl_manager.add_crawl_config(
crawlconfig=crawlconfig,
@@ -382,6 +386,9 @@ class CrawlConfigOps:
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "schedule")
)


@@ -97,12 +97,12 @@ class CrawlManager(K8sAPI):
# Create Config Map
await self._create_config_map(
crawlconfig,
STORE_PATH=storage_path,
STORE_FILENAME=out_filename,
STORAGE_NAME=storage_name,
USER_ID=str(crawlconfig.modifiedBy),
ORG_ID=str(crawlconfig.oid),
CRAWL_CONFIG_ID=str(crawlconfig.id),
STORE_PATH=storage_path,
STORE_FILENAME=out_filename,
STORAGE_NAME=storage_name,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout)
@@ -147,9 +147,8 @@
):
await self._update_config_map(
crawlconfig,
update.scale,
update,
profile_filename,
update.crawlTimeout,
has_config_update,
)
@@ -397,22 +396,24 @@
async def _update_config_map(
self,
crawlconfig,
scale=None,
update,
profile_filename=None,
crawl_timeout=None,
update_config=False,
):
config_map = await self.get_configmap(crawlconfig.id)
if scale is not None:
config_map.data["INITIAL_SCALE"] = str(scale)
if update.scale is not None:
config_map.data["INITIAL_SCALE"] = str(update.scale)
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate
if profile_filename is not None:
config_map.data["PROFILE_FILENAME"] = profile_filename
if crawl_timeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(crawl_timeout)
if update_config:
config_map.data["crawl-config.json"] = json.dumps(
crawlconfig.get_raw_config()
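Propagating a changed template to an existing crawl config ultimately amounts to patching one key in the crawl's ConfigMap; a minimal standalone sketch with the official kubernetes Python client (the namespace and ConfigMap naming scheme are assumptions for illustration, not the project's actual helpers):

from kubernetes import client, config

def patch_store_filename(cid: str, template: str, namespace: str = "crawlers") -> None:
    """Patch only the STORE_FILENAME key on the crawl's ConfigMap."""
    config.load_incluster_config()  # use load_kube_config() when running outside a cluster
    core = client.CoreV1Api()
    core.patch_namespaced_config_map(
        name=f"crawl-config-{cid}",   # hypothetical naming scheme
        namespace=namespace,
        body={"data": {"STORE_FILENAME": template}},
    )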


@@ -124,19 +124,19 @@ spec:
env:
- name: CRAWL_ID
value: {{ id }}
value: "{{ id }}"
- name: WEBHOOK_URL
value: {{ redis_url }}/crawls-done
value: "{{ redis_url }}/crawls-done"
- name: STORE_PATH
value: {{ store_path }}
value: "{{ store_path }}"
- name: STORE_FILENAME
value: {{ store_filename }}
value: "{{ store_filename }}"
- name: STORE_USER
value: {{ userid }}
value: "{{ userid }}"
resources:
limits:
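The quoting added to the env values above is presumably about keeping substituted values as YAML strings: a value that happens to look numeric would otherwise be re-parsed as an int, which Kubernetes rejects for container env values. A tiny PyYAML illustration (the sample value is made up):

import yaml

# Unquoted, a numeric-looking substitution becomes an int after parsing;
# quoted, it stays a string, as Kubernetes env values require.
unquoted = yaml.safe_load("value: 20230508140327")
quoted = yaml.safe_load('value: "20230508140327"')

print(type(unquoted["value"]))  # <class 'int'>
print(type(quoted["value"]))    # <class 'str'>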


@@ -3,6 +3,7 @@ import hashlib
import time
import io
import zipfile
import re
from .conftest import API_PREFIX, HOST_PREFIX
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -58,6 +59,10 @@ def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
assert len(data["resources"]) == 1
assert data["resources"][0]["path"]
# ensure filename matches specified pattern
# set in default_crawl_filename_template
assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", data["resources"][0]["path"])
assert data["tags"] == ["wr-test-1", "wr-test-2"]
global wacz_path
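To make the assertion above concrete, this is the kind of resolved path the pattern is meant to accept when the test chart sets '@ts-testing-@hostsuffix.wacz' as the default template (the sample path is invented for illustration):

import re

# hypothetical resolved WACZ path for the template "@ts-testing-@hostsuffix.wacz"
sample = "data/manual-20230508-abc123/20230508140327-testing-crawl-abc123-0.wacz"
assert re.search(r"/[\d]+-testing-[\w-]+\.wacz", sample)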


@@ -54,6 +54,8 @@ data:
DEFAULT_PAGE_LOAD_TIME_SECONDS: "{{ .Values.default_page_load_time_seconds }}"
DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"


@@ -4,6 +4,8 @@
backend_pull_policy: "Never"
frontend_pull_policy: "Never"
default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
operator_resync_seconds: 5
mongo_auth:


@@ -34,15 +34,20 @@ max_pages_per_crawl: 0
# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"
# number of workers for backend api
backend_workers: 4
# number of seconds before pending invites expire - default is 7 days
invite_expire_seconds: 604800
# base url for replayweb.page
rwp_base_url: "https://replayweb.page/"
# default template for generated wacz files
# supports the following interpolated vars:
# @ts - current timestamp
# @hostname - full hostname
# @hostsuffix - last 14 characters of hostname
# @id - full crawl id
default_crawl_filename_template: "@ts-@hostsuffix.wacz"
superuser:
# set this to enable a superuser admin
email: admin@example.com
@@ -62,8 +67,12 @@ backend_pull_policy: "Always"
backend_password_secret: "c9085f33ecce4347aa1d69339e16c499"
# number of backend pods
backend_num_replicas: 1
# number of workers per pod
backend_workers: 2
backend_requests_cpu: "10m"
backend_limits_cpu: "768m"
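For reference, the placeholder semantics documented in values.yaml above can be shown with a naive expansion. The real substitution happens inside the crawler, not in this repository, so this is only a sketch of what the documented vars mean:

from datetime import datetime, timezone

def fill_template(template: str, crawl_id: str, hostname: str) -> str:
    """Illustrative expansion of @ts, @hostname, @hostsuffix and @id."""
    return (
        template.replace("@ts", datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S"))
        .replace("@hostsuffix", hostname[-14:])  # last 14 characters of the hostname
        .replace("@hostname", hostname)
        .replace("@id", crawl_id)
    )

# e.g. "20230508140327-crawl-abc123-0.wacz" for the default "@ts-@hostsuffix.wacz"
print(fill_template("@ts-@hostsuffix.wacz", "manual-20230508", "crawl-abc123-0"))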