Add backend support for custom behaviors + validation endpoint (#2505)
Backend support for #2151.

Adds support for specifying custom behaviors via a list of strings. When workflows are added or modified, minimal backend validation ensures that each custom behavior URL is a valid URL (after removing the git prefix and custom query arguments).

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is also added, which can be used to validate a custom behavior URL. It performs the same syntax check as above and then:

- For a URL pointing directly to a behavior file, ensures the URL resolves and returns a 2xx/3xx status code
- For a Git repository, uses `git ls-remote` to ensure the repository exists (and that the branch exists, if one is specified)

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
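For quick reference, a minimal sketch of exercising the new endpoint from Python, mirroring the tests in this diff. The deployment URL, org UUID, and auth token below are placeholder assumptions; the route, request body, and error details come from the changes themselves.

import requests

# Placeholder values for illustration -- substitute a real deployment,
# org UUID, and access token
API_PREFIX = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# Validate a Git-hosted behavior pinned to a branch; the backend strips
# the "git+" prefix and "?branch=..." query before the URL syntax check
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/validate/custom-behavior",
    headers=HEADERS,
    json={
        "customBehavior": "git+https://github.com/webrecorder/custom-behaviors?branch=main"
    },
)

# Expected: 200 {"success": true} on success; 400 "invalid_custom_behavior"
# for a malformed URL; 404 "custom_behavior_not_found" or
# "custom_behavior_branch_not_found" when the file, repo, or branch is unreachable
print(r.status_code, r.json())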
commit cd7b695520 (parent c067a0fe7c)
@@ -1,5 +1,10 @@
 FROM docker.io/library/python:3.12-slim
 
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && apt-get purge -y --auto-remove \
+    && rm -rf /var/lib/apt/lists/*
+
 WORKDIR /app
 
 ADD requirements.txt /app
@@ -4,7 +4,7 @@ Crawl Config API handling
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, TYPE_CHECKING, cast
+from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
 
 import asyncio
 import json
@@ -15,8 +15,9 @@ from datetime import datetime
 from uuid import UUID, uuid4
 import urllib.parse
 
-import pymongo
+import aiohttp
 from fastapi import APIRouter, Depends, HTTPException, Query
+import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .models import (
@@ -43,8 +44,9 @@ from .models import (
     CrawlConfigDeletedResponse,
     CrawlerProxy,
     CrawlerProxies,
+    ValidateCustomBehavior,
 )
-from .utils import dt_now, slug_from_name, validate_regexes
+from .utils import dt_now, slug_from_name, validate_regexes, is_url
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -231,6 +233,10 @@ class CrawlConfigOps:
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if config_in.config.customBehaviors:
+            for url in config_in.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -291,6 +297,23 @@ class CrawlConfigOps:
             execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
+    def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
+        """Validate custom behaviors are valid URLs after removing custom git syntax"""
+        git_prefix = "git+"
+        is_git_repo = False
+
+        if url.startswith(git_prefix):
+            is_git_repo = True
+            url = url[len(git_prefix) :]
+
+        parts = url.split("?")
+        url = parts[0]
+
+        if not is_url(url):
+            raise HTTPException(status_code=400, detail="invalid_custom_behavior")
+
+        return is_git_repo, parts
+
     def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
         """ensure page limit is set to no greater than quota page limit, if any"""
         if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
@@ -356,6 +379,10 @@ class CrawlConfigOps:
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if update.config and update.config.customBehaviors:
+            for url in update.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -1070,6 +1097,75 @@ class CrawlConfigOps:
             flush=True,
         )
 
+    async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""):
+        """Validate git repository and branch, if specified, exist and are reachable"""
+        cmd = f"git ls-remote {repo_url} HEAD"
+        proc = await asyncio.create_subprocess_shell(cmd)
+        if await proc.wait() > 0:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+        if branch:
+            await asyncio.sleep(0.5)
+            git_remote_cmd = (
+                f"git ls-remote --exit-code --heads {repo_url} refs/heads/{branch}"
+            )
+            proc = await asyncio.create_subprocess_shell(git_remote_cmd)
+            if await proc.wait() > 0:
+                raise HTTPException(
+                    status_code=404,
+                    detail="custom_behavior_branch_not_found",
+                )
+
+    async def _validate_behavior_url(self, url: str):
+        """Validate behavior file exists at url"""
+        try:
+            timeout = aiohttp.ClientTimeout(total=10)
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url) as resp:
+                    if resp.status >= 400:
+                        raise HTTPException(
+                            status_code=404,
+                            detail="custom_behavior_not_found",
+                        )
+        # pylint: disable=raise-missing-from
+        except aiohttp.ClientError:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+    async def validate_custom_behavior(self, url: str) -> Dict[str, bool]:
+        """Validate custom behavior URL
+
+        Implemented:
+        - Ensure URL is valid (after removing custom git prefix and syntax)
+        - Ensure URL returns status code < 400
+        - Ensure git repository can be reached by git ls-remote and that branch
+          exists, if provided
+        """
+        git_branch = ""
+
+        is_git_repo, url_parts = self._validate_custom_behavior_url_syntax(url)
+        url = url_parts[0]
+
+        if is_git_repo and len(url_parts) > 1:
+            query_str = url_parts[1]
+            try:
+                git_branch = urllib.parse.parse_qs(query_str)["branch"][0]
+            # pylint: disable=broad-exception-caught
+            except (KeyError, IndexError):
+                pass
+
+        if is_git_repo:
+            await self._validate_behavior_git_repo(url, branch=git_branch)
+        else:
+            await self._validate_behavior_url(url)
+
+        return {"success": True}
+
 
 # ============================================================================
 # pylint: disable=too-many-locals
@@ -1316,6 +1412,14 @@ def init_crawl_config_api(
         asyncio.create_task(ops.re_add_all_scheduled_cron_jobs())
         return {"success": True}
 
+    @router.post("/validate/custom-behavior", response_model=SuccessResponse)
+    async def validate_custom_behavior(
+        behavior: ValidateCustomBehavior,
+        # pylint: disable=unused-argument
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await ops.validate_custom_behavior(behavior.customBehavior)
+
     org_ops.router.include_router(router)
 
     return ops
@@ -333,6 +333,7 @@ class RawCrawlConfig(BaseModel):
 
     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
+    customBehaviors: List[str] = []
 
     userAgent: Optional[str] = None
 
@@ -541,6 +542,8 @@ class CrawlConfigDefaults(BaseModel):
 
     exclude: Optional[List[str]] = None
 
+    customBehaviors: List[str] = []
+
 
 # ============================================================================
 class CrawlConfigAddedResponse(BaseModel):
@@ -592,6 +595,13 @@ class CrawlConfigDeletedResponse(BaseModel):
     status: str
 
 
+# ============================================================================
+class ValidateCustomBehavior(BaseModel):
+    """Input model for validating custom behavior URL/Git reference"""
+
+    customBehavior: str
+
+
 # ============================================================================
 
 ### CRAWLER VERSIONS ###
@@ -11,6 +11,7 @@ import re
 
 from datetime import datetime, timezone
 from typing import Optional, Dict, Union, List, Any
+from urllib.parse import urlparse
 from uuid import UUID
 
 from fastapi import HTTPException
@@ -100,6 +101,15 @@ def is_falsy_bool(stri: Optional[str]) -> bool:
     return False
 
 
+def is_url(url: str) -> bool:
+    """Check if string is a valid URL"""
+    try:
+        result = urlparse(url)
+        return all([result.scheme, result.netloc])
+    except ValueError:
+        return False
+
+
 def str_list_to_bools(str_list: List[str], allow_none=True) -> List[Union[bool, None]]:
     """Return version of input string list cast to bool or None, ignoring other values"""
     output: List[Union[bool, None]] = []
@@ -326,7 +326,7 @@ def crawler_config_id_only(_crawler_create_config_only):
     return _crawler_config_id
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="function")
 def sample_crawl_data():
     return {
         "runNow": False,
@@ -543,3 +543,192 @@ def test_add_crawl_config_invalid_exclude_regex(
     )
     assert r.status_code == 400
     assert r.json()["detail"] == "invalid_regex"
+
+
+def test_add_crawl_config_custom_behaviors_invalid_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["customBehaviors"] = ["http"]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+
+def test_add_crawl_config_custom_behaviors_valid_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
+    sample_crawl_data["config"]["customBehaviors"] = [url]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    config_id = data["id"]
+    assert config_id
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [url]
+
+
+def test_add_update_crawl_config_custom_behaviors_git_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+
+    # Create workflow and validate it looks like we expect
+    sample_crawl_data["config"]["customBehaviors"] = [git_url]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    config_id = data["id"]
+    assert config_id
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Try to update custom behaviors with invalid url, validate unchanged
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url, "not-a-url"],
+            }
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Update custom behaviors with valid url, validate changed
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url_with_params],
+            }
+        },
+    )
+    assert r.status_code == 200
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url_with_params]
+
+
+def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
+    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
+    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
+    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
+    malformed_url = "http"
+
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
+    private_git_url = "git+https://github.com/webrecorder/website"
+
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+    git_url_invalid_branch = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
+    )
+
+    # Success
+    for url in (valid_url, git_url, git_url_with_params):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 200
+        assert r.json()["success"]
+
+    # Behavior 404s
+    for url in (invalid_url_404, doesnt_resolve_url):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 404
+        assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Malformed url
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": malformed_url},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    # Git repo doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": invalid_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git repo isn't public
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": private_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git branch doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": git_url_invalid_branch},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_branch_not_found"
@@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_config_by_modified_by(
@@ -23,8 +23,8 @@ def test_get_config_by_modified_by(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
        headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
         headers=crawler_auth_headers,
     )
     data = r.json()
-    assert data["total"] == 9
+    assert data["total"] == 11
     items = data["items"]
-    assert len(items) == 9
+    assert len(items) == 11
 
     last_created = None
     for config in items:
@@ -60,7 +60,11 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
         headers=admin_auth_headers,
-        json={"maxCrawlSize": 200000, "lang": "fr"},
+        json={
+            "maxCrawlSize": 200000,
+            "lang": "fr",
+            "customBehaviors": ["git+https://github.com/webrecorder/custom-behaviors"],
+        },
     )
 
     assert r.status_code == 200
@@ -72,6 +76,9 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     assert data["crawlingDefaults"]
     assert data["crawlingDefaults"]["maxCrawlSize"] == 200000
     assert data["crawlingDefaults"]["lang"] == "fr"
+    assert data["crawlingDefaults"]["customBehaviors"] == [
+        "git+https://github.com/webrecorder/custom-behaviors"
+    ]
 
 
 def test_rename_org(admin_auth_headers, default_org_id):