browsertrix/btrix
Tessa Walsh f7ba712646 Add seed file support to Browsertrix backend (#2710)
Fixes #2673 

Changes in this PR:

- Adds a new `file_uploads.py` module and corresponding `/files` API
prefix with methods/endpoints for uploading, retrieving, and deleting seed
files (can be extended to other types of files moving forward)
- Seed files are supported via `CrawlConfig.config.seedFileId` on POST
and PATCH endpoints. This seedFileId is replaced by a presigned url when
passed to the crawler by the operator
- Seed files are read when first uploaded to calculate `firstSeed` and
`seedCount` and store them in the database, and this is copied into the
workflow and crawl documents when they are created.
- Logic is added to store `firstSeed` and `seedCount` for other
workflows as well, and a migration added to backfill data, to maintain
consistency and fix some of the pymongo aggregations that previously
assumed all workflows would have at least one `Seed` object in
`CrawlConfig.seeds`
- Seed file and thumbnail storage stats are added to org stats
- Seed file and thumbnail uploads first check that the org's storage
quota has not been exceeded and return a 400 if so
- A cron background job (run weekly each Sunday at midnight by default,
but configurable) is added to look for seed files at least x minutes old
(1440 minutes, or 1 day, by default, but configurable) that are not in
use in any workflows, and to delete them when they are found. The
backend pods will ensure this k8s batch job exists when starting up and
create it if it does not already exist. A database entry for each run of
the job is created in the operator on job completion so that it'll
appear in the `/jobs` API endpoints, but retrying of this type of
regularly scheduled background job is not supported as we don't want to
accidentally create multiple competing scheduled jobs.
- Adds a `min_seed_file_crawler_image` value to the Helm chart that is
checked before creating a crawl from a workflow if set. If a workflow
cannot be run, return the detail of the exception in
`CrawlConfigAddedResponse.errorDetail` so that we can display the reason
in the frontend
- Add SeedFile model from base UserFile (formerly ImageFile), and ensure all APIs
returning uploaded files return an absolute pre-signed URL (either with external origin or internal service origin)

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-07-22 19:11:02 -07:00

216 lines
5.2 KiB
Bash
Executable File

#!/usr/bin/env bash
# ./btrix: Browsertrix dev environment utility
#
# Test commands require installing pytest first, e.g.:
# python3 -m pip install pytest
#
# Usage:
#
# $ ./btrix setup
# Copy local config to expected location
# Local config must exist for other commands to work
#
# $ ./btrix bootstrap
# Build frontend and backend and upgrade
# Optional args:
# -microk8s: Preface kubectl/helm commands with microk8s
# -wait: Wait until pods are ready
#
# $ ./btrix reset
# Uninstall, delete data, then bootstrap
# Optional args:
# -microk8s: Preface kubectl/helm commands with microk8s
# -wait: Wait until pods are ready
#
# $ ./btrix down
# Uninstall and delete data
# Optional args:
# -microk8s: Preface kubectl/helm commands with microk8s
#
# $ ./btrix test
# Run backend tests
#
# $ ./btrix nightly
# Run nightly backend tests
bootstrap(){
  # Build the backend and the frontend with the scripts under ./scripts, then
  # install or upgrade the "btrix" Helm release from ./chart, layering
  # ./chart/local.yaml on top of ./chart/values.yaml.
  #
  # Returns: the exit status of the first build script that fails (helm is
  # then not run, so a broken build is never followed by a deploy);
  # otherwise the exit status of helm.
  echo "Building backend..."
  ./scripts/build-backend.sh || return
  echo "Building frontend..."
  ./scripts/build-frontend.sh || return
  echo "Installing..."
  helm upgrade --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart
}
bootstrapMicrok8s(){
  # microk8s variant of bootstrap: build the backend and the frontend with
  # the scripts under ./scripts, then install or upgrade the "btrix" release
  # from ./chart through "microk8s helm3", layering ./chart/local.yaml on top
  # of ./chart/values.yaml.
  #
  # Returns: the exit status of the first build script that fails (the helm
  # upgrade is then skipped, so a broken build is never followed by a
  # deploy); otherwise the exit status of the microk8s helm3 command.
  echo "Building backend..."
  ./scripts/build-backend.sh || return
  echo "Building frontend..."
  ./scripts/build-frontend.sh || return
  echo "Installing..."
  microk8s helm3 upgrade --install -f ./chart/values.yaml -f ./chart/local.yaml btrix ./chart
}
waitUntilReady(){
  # Announce the wait, then let kubectl block until all pods report the
  # "ready" condition or its 300s timeout elapses.
  # Returns: the exit status of kubectl.
  local -a wait_args=(wait --for=condition=ready pod --all --timeout=300s)
  printf '%s\n' "Waiting until ready..."
  kubectl "${wait_args[@]}"
}
waitUntilReadyMicrok8s(){
  # microk8s variant of waitUntilReady: announce the wait, then run
  # "microk8s kubectl wait" until all pods report the "ready" condition or
  # the 300s timeout elapses.
  # Returns: the exit status of the microk8s command.
  local -a wait_args=(wait --for=condition=ready pod --all --timeout=300s)
  printf '%s\n' "Waiting until ready..."
  microk8s kubectl "${wait_args[@]}"
}
reset(){
  # Tear the deployment down with plain kubectl/helm:
  #   1. delete every cjs and pjs resource in the "crawlers" namespace
  #      (announced as stopping crawls and profile browsers),
  #   2. uninstall the "btrix" Helm release,
  #   3. delete all PVCs, all cronjobs (in "crawlers" and in the default
  #      namespace of the current context) and the configmaps labelled
  #      btrix.crawlconfig in "crawlers".
  # No step is checked; each one runs regardless of earlier failures.
  # Returns: the exit status of the final configmap deletion.
  local kind
  printf '%s\n' "Stopping all crawls & profile browsers"
  for kind in cjs pjs; do
    kubectl delete "$kind" -n crawlers --all
  done
  printf '%s\n' "Uninstalling..."
  helm uninstall btrix
  printf '%s\n' "Deleting data..."
  kubectl delete pvc --all
  kubectl delete cronjob -n crawlers --all
  kubectl delete cronjob --all
  kubectl delete configmap -n crawlers -l btrix.crawlconfig
}
resetMicrok8s(){
  # microk8s variant of reset. Every command is issued through the microk8s
  # wrapper:
  #   1. delete every cjs and pjs resource in the "crawlers" namespace
  #      (announced as stopping crawls and profile browsers),
  #   2. uninstall the "btrix" Helm release,
  #   3. delete all PVCs, all cronjobs (in "crawlers" and without a namespace
  #      flag) and the configmaps labelled btrix.crawlconfig in "crawlers".
  # No step is checked; each one runs regardless of earlier failures.
  # Returns: the exit status of the final configmap deletion.
  local -a kube=(microk8s kubectl)
  local kind
  printf '%s\n' "Stopping all crawls & profile browsers"
  for kind in cjs pjs; do
    "${kube[@]}" delete "$kind" -n crawlers --all
  done
  printf '%s\n' "Uninstalling..."
  # NOTE(review): this calls "microk8s helm" while the install path uses
  # "microk8s helm3" - confirm both address the same Helm installation.
  microk8s helm uninstall btrix
  printf '%s\n' "Deleting data..."
  "${kube[@]}" delete pvc --all
  "${kube[@]}" delete cronjob -n crawlers --all
  "${kube[@]}" delete cronjob --all
  "${kube[@]}" delete configmap -n crawlers -l btrix.crawlconfig
}
runTests() {
  # Run pytest (via "python3 -m pytest") over every file matching
  # backend/test/test_*.py, relative to the current directory.
  # Returns: the exit status of the pytest invocation.
  local -a test_files=(backend/test/test_*.py)
  printf '%s\n' "Running backend tests..."
  python3 -m pytest "${test_files[@]}"
}
runNightlyTests() {
  # Run pytest (via "python3 -m pytest") over every file matching
  # backend/test_nightly/test_*.py, relative to the current directory.
  # Returns: the exit status of the pytest invocation.
  local -a nightly_files=(backend/test_nightly/test_*.py)
  printf '%s\n' "Running nightly backend tests..."
  python3 -m pytest "${nightly_files[@]}"
}
setupLocalConfig() {
  # Create ./chart/local.yaml from the example config shipped in
  # ./chart/examples/local-config.yaml.
  #
  # This function always terminates the script:
  #   exit 1 - the local config already exists (it is never overwritten),
  #            or the copy failed (reported on stderr),
  #   exit 0 - the file was created.
  local example=./chart/examples/local-config.yaml
  local target=./chart/local.yaml

  if [ -f "$target" ]; then
    echo "Local config file already exists at $target"
    exit 1
  fi

  echo "Copying local example config to $target"
  # Do not claim success (or exit 0) when the copy itself failed.
  if ! cp "$example" "$target"; then
    echo "Failed to copy $example to $target" >&2
    exit 1
  fi

  echo "Local config file created. Edit $target to set local overrides"
  exit 0
}
# ---------------------------------------------------------------------------
# Command dispatch
#
# $1 selects the command (setup, bootstrap, reset, down, test, nightly).
# The optional -microk8s / -wait flags are accepted as $2 or $3, in either
# order.
# ---------------------------------------------------------------------------

# Prompt on the terminal for a yes/no answer.
# Returns 0 only when the reply starts with "y" or "Y"; an empty reply or
# end of input counts as "no".
confirm() {
  local reply
  # -r keeps backslashes literal. Reading into a named variable avoids the
  # word splitting and glob expansion an unquoted "echo $REPLY" would apply
  # to the answer.
  read -r -e -p '[y/N] > ' reply
  [[ "$reply" == [Yy]* ]]
}

# Context recorded in ~/.kube/config. Empty when the file is missing or has
# no current-context entry.
CONTEXT=$(grep "current-context:" ~/.kube/config | sed "s/current-context: //")
MICROK8S="-microk8s"
WAIT="-wait"

USE_MICROK8S=false
if [[ $2 = "$MICROK8S" || $3 = "$MICROK8S" ]]; then
  USE_MICROK8S=true
fi
USE_WAIT=false
if [[ $2 = "$WAIT" || $3 = "$WAIT" ]]; then
  USE_WAIT=true
fi

# Safety guard: only the docker-desktop context may be touched.
if [[ -z "$CONTEXT" ]]; then
  # The context could not be read. An unquoted test used to error out here
  # and let every command through. Fail closed for the commands that drive
  # the cluster through plain kubectl/helm; commands routed through the
  # microk8s wrapper, and setup/test/nightly, continue as before.
  if [[ "$USE_MICROK8S" != true ]]; then
    case "$1" in
      bootstrap | reset | down)
        echo "Could not determine the current context from ~/.kube/config. Quitting."
        exit 1
        ;;
    esac
  fi
elif [[ "$CONTEXT" != "docker-desktop" ]]; then
  echo "Attempting to modify context other than docker-desktop not supported. Quitting."
  exit 1
fi

# setup: create the local config. setupLocalConfig always exits the script.
if [[ $1 = "setup" ]]; then
  setupLocalConfig
fi

# Every other command needs the local config to exist.
if [ ! -f ./chart/local.yaml ]; then
  echo "Local config file not found. Run './btrix setup' to configure"
  exit 1
fi

# bootstrap: build frontend and backend, upgrade and wait until ready
if [[ $1 = "bootstrap" ]]; then
  echo "Current context: $CONTEXT"
  echo "Are you sure you want to update this context?"
  if confirm; then
    echo Continuing
  else
    echo Stopping
    exit 1
  fi
  # Stop with the failing status instead of waiting on pods and reporting
  # "Done" after a failed build or install.
  if [[ "$USE_MICROK8S" = true ]]; then
    bootstrapMicrok8s || exit
  else
    bootstrap || exit
  fi
  if [[ "$USE_WAIT" = true ]]; then
    if [[ "$USE_MICROK8S" = true ]]; then
      waitUntilReadyMicrok8s
    else
      waitUntilReady
    fi
  fi
fi

# reset: uninstall, delete data, then bootstrap
if [[ $1 = "reset" ]]; then
  echo "Current context: $CONTEXT"
  echo "Resetting k8s cluster will delete the database. All running crawls will first be canceled. Are you sure you want to do this?"
  if confirm; then
    echo Continuing
  else
    echo Stopping
    exit 1
  fi
  # The teardown is best-effort (resources may already be gone), so only the
  # bootstrap result is checked.
  if [[ "$USE_MICROK8S" = true ]]; then
    resetMicrok8s
    bootstrapMicrok8s || exit
  else
    reset
    bootstrap || exit
  fi
  if [[ "$USE_WAIT" = true ]]; then
    if [[ "$USE_MICROK8S" = true ]]; then
      waitUntilReadyMicrok8s
    else
      waitUntilReady
    fi
  fi
fi

# test: run backend tests; a failing run becomes the script's exit status
if [[ $1 = "test" ]]; then
  runTests || exit
fi

# nightly: run nightly backend tests; a failing run becomes the exit status
if [[ $1 = "nightly" ]]; then
  runNightlyTests || exit
fi

# down: stop and uninstall browsertrix
if [[ $1 = "down" ]]; then
  if [[ "$USE_MICROK8S" = true ]]; then
    resetMicrok8s
  else
    reset
  fi
fi

echo "Done"