Add backend support for custom behaviors + validation endpoint (#2505)

Backend support for #2151 

Adds support for specifying custom behaviors via a list of strings, where each entry is either a direct URL to a behavior file or a `git+` URL referencing a Git repository of behaviors.

When workflows are added or modified, minimal backend validation ensures
that each custom behavior entry is a valid URL (after removing the `git+`
prefix and custom query arguments).
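
For example, a minimal sketch of creating a workflow with both supported forms, following the conventions of this PR's backend tests (`API_PREFIX`, `org_id`, and `auth_headers` are assumed to come from the surrounding test/client setup, and the behavior file URL is illustrative):

```python
import requests

# hypothetical workflow payload; name and seeds are placeholders
payload = {
    "runNow": False,
    "name": "Custom behaviors example",
    "config": {
        "seeds": [{"url": "https://example.com/"}],
        "customBehaviors": [
            # direct URL to a behavior file
            "https://example.com/behaviors/my-behavior.js",
            # Git repository, with optional branch/path query arguments
            "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors",
        ],
    },
}

r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
    headers=auth_headers,
    json=payload,
)
# a malformed entry (e.g. "http") is rejected with 400 invalid_custom_behavior
assert r.status_code == 200
```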

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is
also added, which can be used to validate a single custom behavior URL. It
performs the same syntax check as above and then (see the sketch after this list):
- For URLs that point directly to a behavior file, ensures the URL resolves and
returns a 2xx/3xx status code
- For Git repositories, uses `git ls-remote` to ensure the repository exists (and
that the branch exists, if one is specified)
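
A sketch of calling the endpoint, under the same assumptions as the example above:

```python
import requests

r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/validate/custom-behavior",
    headers=auth_headers,
    json={"customBehavior": "git+https://github.com/webrecorder/custom-behaviors?branch=main"},
)

# 200 {"success": true} if valid and reachable;
# 400 invalid_custom_behavior for a malformed URL;
# 404 custom_behavior_not_found / custom_behavior_branch_not_found otherwise
print(r.status_code, r.json())
```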

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
8 changed files with 336 additions and 11 deletions

View File

@@ -1,5 +1,10 @@
 FROM docker.io/library/python:3.12-slim
 
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && apt-get purge -y --auto-remove \
+    && rm -rf /var/lib/apt/lists/*
+
 WORKDIR /app
 
 ADD requirements.txt /app

View File

@@ -4,7 +4,7 @@ Crawl Config API handling
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, TYPE_CHECKING, cast
+from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
 
 import asyncio
 import json
@@ -15,8 +15,9 @@ from datetime import datetime
 from uuid import UUID, uuid4
 import urllib.parse
 
-import pymongo
+import aiohttp
 from fastapi import APIRouter, Depends, HTTPException, Query
+import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .models import (
@@ -43,8 +44,9 @@ from .models import (
     CrawlConfigDeletedResponse,
     CrawlerProxy,
     CrawlerProxies,
+    ValidateCustomBehavior,
 )
-from .utils import dt_now, slug_from_name, validate_regexes
+from .utils import dt_now, slug_from_name, validate_regexes, is_url
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -231,6 +233,10 @@ class CrawlConfigOps:
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if config_in.config.customBehaviors:
+            for url in config_in.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -291,6 +297,23 @@ class CrawlConfigOps:
             execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
+    def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
+        """Validate custom behaviors are valid URLs after removing custom git syntax"""
+        git_prefix = "git+"
+        is_git_repo = False
+
+        if url.startswith(git_prefix):
+            is_git_repo = True
+            url = url[len(git_prefix) :]
+
+        parts = url.split("?")
+        url = parts[0]
+
+        if not is_url(url):
+            raise HTTPException(status_code=400, detail="invalid_custom_behavior")
+
+        return is_git_repo, parts
+
     def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
         """ensure page limit is set to no greater than quota page limit, if any"""
         if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
@@ -356,6 +379,10 @@ class CrawlConfigOps:
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if update.config and update.config.customBehaviors:
+            for url in update.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -1070,6 +1097,75 @@ class CrawlConfigOps:
             flush=True,
         )
 
+    async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""):
+        """Validate git repository and branch, if specified, exist and are reachable"""
+        cmd = f"git ls-remote {repo_url} HEAD"
+        proc = await asyncio.create_subprocess_shell(cmd)
+        if await proc.wait() > 0:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+        if branch:
+            await asyncio.sleep(0.5)
+            git_remote_cmd = (
+                f"git ls-remote --exit-code --heads {repo_url} refs/heads/{branch}"
+            )
+            proc = await asyncio.create_subprocess_shell(git_remote_cmd)
+            if await proc.wait() > 0:
+                raise HTTPException(
+                    status_code=404,
+                    detail="custom_behavior_branch_not_found",
+                )
+
+    async def _validate_behavior_url(self, url: str):
+        """Validate behavior file exists at url"""
+        try:
+            timeout = aiohttp.ClientTimeout(total=10)
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url) as resp:
+                    if resp.status >= 400:
+                        raise HTTPException(
+                            status_code=404,
+                            detail="custom_behavior_not_found",
+                        )
+        # pylint: disable=raise-missing-from
+        except aiohttp.ClientError:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+    async def validate_custom_behavior(self, url: str) -> Dict[str, bool]:
+        """Validate custom behavior URL
+
+        Implemented:
+        - Ensure URL is valid (after removing custom git prefix and syntax)
+        - Ensure URL returns status code < 400
+        - Ensure git repository can be reached by git ls-remote and that branch
+          exists, if provided
+        """
+        git_branch = ""
+
+        is_git_repo, url_parts = self._validate_custom_behavior_url_syntax(url)
+        url = url_parts[0]
+
+        if is_git_repo and len(url_parts) > 1:
+            query_str = url_parts[1]
+            try:
+                git_branch = urllib.parse.parse_qs(query_str)["branch"][0]
+            # pylint: disable=broad-exception-caught
+            except (KeyError, IndexError):
+                pass
+
+        if is_git_repo:
+            await self._validate_behavior_git_repo(url, branch=git_branch)
+        else:
+            await self._validate_behavior_url(url)
+
+        return {"success": True}
+
 
 # ============================================================================
 # pylint: disable=too-many-locals
@@ -1316,6 +1412,14 @@ def init_crawl_config_api(
         asyncio.create_task(ops.re_add_all_scheduled_cron_jobs())
         return {"success": True}
 
+    @router.post("/validate/custom-behavior", response_model=SuccessResponse)
+    async def validate_custom_behavior(
+        behavior: ValidateCustomBehavior,
+        # pylint: disable=unused-argument
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await ops.validate_custom_behavior(behavior.customBehavior)
+
     org_ops.router.include_router(router)
 
     return ops

View File

@@ -333,6 +333,7 @@ class RawCrawlConfig(BaseModel):
 
     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
+    customBehaviors: List[str] = []
 
     userAgent: Optional[str] = None
@@ -541,6 +542,8 @@ class CrawlConfigDefaults(BaseModel):
     exclude: Optional[List[str]] = None
 
+    customBehaviors: List[str] = []
+
 
 # ============================================================================
 class CrawlConfigAddedResponse(BaseModel):
@@ -592,6 +595,13 @@ class CrawlConfigDeletedResponse(BaseModel):
     status: str
 
 
+# ============================================================================
+class ValidateCustomBehavior(BaseModel):
+    """Input model for validating custom behavior URL/Git reference"""
+
+    customBehavior: str
+
+
 # ============================================================================
 ### CRAWLER VERSIONS ###

View File

@@ -11,6 +11,7 @@ import re
 from datetime import datetime, timezone
 from typing import Optional, Dict, Union, List, Any
+from urllib.parse import urlparse
 from uuid import UUID
 
 from fastapi import HTTPException
@@ -100,6 +101,15 @@ def is_falsy_bool(stri: Optional[str]) -> bool:
     return False
 
 
+def is_url(url: str) -> bool:
+    """Check if string is a valid URL"""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+
 def str_list_to_bools(str_list: List[str], allow_none=True) -> List[Union[bool, None]]:
     """Return version of input string list cast to bool or None, ignoring other values"""
     output: List[Union[bool, None]] = []

View File

@@ -326,7 +326,7 @@ def crawler_config_id_only(_crawler_create_config_only):
     return _crawler_config_id
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def sample_crawl_data():
     return {
         "runNow": False,

View File

@@ -543,3 +543,192 @@ def test_add_crawl_config_invalid_exclude_regex(
     )
     assert r.status_code == 400
     assert r.json()["detail"] == "invalid_regex"
+
+
+def test_add_crawl_config_custom_behaviors_invalid_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["customBehaviors"] = ["http"]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+
+def test_add_crawl_config_custom_behaviors_valid_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
+    sample_crawl_data["config"]["customBehaviors"] = [url]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    config_id = data["id"]
+    assert config_id
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [url]
+
+
+def test_add_update_crawl_config_custom_behaviors_git_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+
+    # Create workflow and validate it looks like we expect
+    sample_crawl_data["config"]["customBehaviors"] = [git_url]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    config_id = data["id"]
+    assert config_id
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Try to update custom behaviors with invalid url, validate unchanged
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url, "not-a-url"],
+            }
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Update custom behaviors with valid url, validate changed
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url_with_params],
+            }
+        },
+    )
+    assert r.status_code == 200
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url_with_params]
+
+
+def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
+    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
+    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
+    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
+    malformed_url = "http"
+
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
+    private_git_url = "git+https://github.com/webrecorder/website"
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+    git_url_invalid_branch = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
+    )
+
+    # Success
+    for url in (valid_url, git_url, git_url_with_params):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 200
+        assert r.json()["success"]
+
+    # Behavior 404s
+    for url in (invalid_url_404, doesnt_resolve_url):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 404
+        assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Malformed url
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": malformed_url},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    # Git repo doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": invalid_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git repo isn't public
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": private_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git branch doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": git_url_invalid_branch},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_branch_not_found"

View File

@@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_config_by_modified_by(
@@ -23,8 +23,8 @@ def test_get_config_by_modified_by(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
        headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
         headers=crawler_auth_headers,
     )
     data = r.json()
-    assert data["total"] == 9
+    assert data["total"] == 11
     items = data["items"]
-    assert len(items) == 9
+    assert len(items) == 11
 
     last_created = None
     for config in items:

View File

@@ -60,7 +60,11 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
         headers=admin_auth_headers,
-        json={"maxCrawlSize": 200000, "lang": "fr"},
+        json={
+            "maxCrawlSize": 200000,
+            "lang": "fr",
+            "customBehaviors": ["git+https://github.com/webrecorder/custom-behaviors"],
+        },
     )
     assert r.status_code == 200
@@ -72,6 +76,9 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     assert data["crawlingDefaults"]
     assert data["crawlingDefaults"]["maxCrawlSize"] == 200000
     assert data["crawlingDefaults"]["lang"] == "fr"
+    assert data["crawlingDefaults"]["customBehaviors"] == [
+        "git+https://github.com/webrecorder/custom-behaviors"
+    ]
 
 
 def test_rename_org(admin_auth_headers, default_org_id):