Fixes #1502

- Adds pages to database as they get added to Redis during crawl
- Adds migration to add pages to database for older crawls from pages.jsonl and extraPages.jsonl files in WACZ
- Adds GET, list GET, and PATCH update endpoints for pages
- Adds POST (add), PATCH, and POST (delete) endpoints for page notes, each with their own id, timestamp, and user info in addition to text
- Adds page_ops methods for (1) adding resources/urls to a page, and (2) adding automated heuristics and supplemental info (mime, type, etc.) to a page (for use in the crawl QA job)
- Modifies `Migration` class to accept kwargs so that we can pass in ops classes as needed for migrations
- Deletes WACZ files and pages from database for failed crawls during the crawl_finished process
- Deletes crawl pages when a crawl is deleted

Note: Requires crawler version 1.0.0 beta3 or later, with support for `--writePagesToRedis` to populate pages at crawl completion. Beta 4 is configured in the test chart, which should be upgraded to stable 1.0.0 when it is released.

Connected to https://github.com/webrecorder/browsertrix-crawler/pull/464

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
52 lines | 1.1 KiB | YAML
# test overrides
# --------------

# use local images built to :latest tag
backend_image: docker.io/webrecorder/browsertrix-backend:latest
frontend_image: docker.io/webrecorder/browsertrix-frontend:latest

# "Never" so the cluster only uses the locally built :latest images
backend_pull_policy: "Never"
frontend_pull_policy: "Never"

default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"

# short resync interval to speed up operator-driven tests
operator_resync_seconds: 3

# for testing only
crawler_extra_cpu_per_browser: 300m

crawler_extra_memory_per_browser: 256Mi

crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"

  - id: test
    image: "docker.io/webrecorder/browsertrix-crawler:1.0.0-beta.4"

mongo_auth:
  # specify either username + password (for local mongo)
  username: root
  password: PASSWORD@

superuser:
  # set this to enable a superuser admin
  email: admin@example.com

  # optional: if not set, automatically generated
  # change or remove this
  password: PASSW0RD!

# test max pages per crawl global limit
max_pages_per_crawl: 4

registration_enabled: "0"

# log failed crawl pods to operator backend
log_failed_crawl_lines: 200

# disable for tests
disk_utilization_threshold: 0