From cd7b695520046a533c6506c5482daa49ca38cb35 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 2 Apr 2025 19:20:51 -0400
Subject: [PATCH] Add backend support for custom behaviors + validation
 endpoint (#2505)

Backend support for #2151

Adds support for specifying custom behaviors via a list of strings.

When workflows are added or modified, the backend performs minimal
validation to ensure that each custom behavior URL is a valid URL
(after removing the `git+` prefix and custom query arguments).

A separate `POST /crawlconfigs/validate/custom-behavior` endpoint is
also added, which can be used to validate a single custom behavior URL.
It performs the same syntax check as above and then:

- For URLs that point directly to a behavior file, ensures the URL
  resolves and returns a 2xx/3xx status code
- For Git repositories, uses `git ls-remote` to ensure the repository
  exists (and that the branch exists, if one is specified)

---------

Co-authored-by: Ilya Kreymer
---
 backend/Dockerfile                        |   5 +
 backend/btrixcloud/crawlconfigs.py        | 110 ++++++++++++-
 backend/btrixcloud/models.py              |  10 ++
 backend/btrixcloud/utils.py               |  10 ++
 backend/test/conftest.py                  |   2 +-
 backend/test/test_crawlconfigs.py         | 189 +++++++++++++++++++++++
 backend/test/test_filter_sort_results.py  |  12 +-
 backend/test/test_org.py                  |   9 +-
 8 files changed, 336 insertions(+), 11 deletions(-)

diff --git a/backend/Dockerfile b/backend/Dockerfile
index 9ae13260..413a5800 100644
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -1,5 +1,10 @@
 FROM docker.io/library/python:3.12-slim
 
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && apt-get purge -y --auto-remove \
+    && rm -rf /var/lib/apt/lists/*
+
 WORKDIR /app
 
 ADD requirements.txt /app
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 0f66f03e..aa2fd2bd 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -4,7 +4,7 @@ Crawl Config API handling
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, TYPE_CHECKING, cast
+from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple
 
 import asyncio
 import json
@@ -15,8 +15,9 @@ from datetime import datetime
 from uuid import UUID, uuid4
 import urllib.parse
 
-import pymongo
+import aiohttp
 from fastapi import APIRouter, Depends, HTTPException, Query
+import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .models import (
@@ -43,8 +44,9 @@ from .models import (
     CrawlConfigDeletedResponse,
     CrawlerProxy,
     CrawlerProxies,
+    ValidateCustomBehavior,
 )
-from .utils import dt_now, slug_from_name, validate_regexes
+from .utils import dt_now, slug_from_name, validate_regexes, is_url
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -231,6 +233,10 @@ class CrawlConfigOps:
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if config_in.config.customBehaviors:
+            for url in config_in.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -291,6 +297,23 @@
             execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
+    def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
+        """Validate custom behaviors are valid URLs after removing custom git syntax"""
+        git_prefix = "git+"
+        is_git_repo = False
+
+        if url.startswith(git_prefix):
+            is_git_repo = True
+            url = url[len(git_prefix) :]
+
+        parts = url.split("?")
+        url = parts[0]
+
+        if not is_url(url):
+            raise HTTPException(status_code=400, detail="invalid_custom_behavior")
+
+        return is_git_repo, parts
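+    # Worked example of the helper above (the repo URL is illustrative only):
+    # "git+https://github.com/org/repo?branch=main" yields
+    # (True, ["https://github.com/org/repo", "branch=main"]), while a direct
+    # behavior file URL with no query string yields (False, [url]).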
+
     def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
         """ensure page limit is set to no greater than quota page limit, if any"""
         if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
@@ -356,6 +379,10 @@
                 exclude = [exclude]
             validate_regexes(exclude)
 
+        if update.config and update.config.customBehaviors:
+            for url in update.config.customBehaviors:
+                self._validate_custom_behavior_url_syntax(url)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -1070,6 +1097,75 @@
                 flush=True,
             )
 
+    async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""):
+        """Validate git repository and branch, if specified, exist and are reachable"""
+        cmd = f"git ls-remote {repo_url} HEAD"
+        proc = await asyncio.create_subprocess_shell(cmd)
+        if await proc.wait() > 0:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+        if branch:
+            await asyncio.sleep(0.5)
+            git_remote_cmd = (
+                f"git ls-remote --exit-code --heads {repo_url} refs/heads/{branch}"
+            )
+            proc = await asyncio.create_subprocess_shell(git_remote_cmd)
+            if await proc.wait() > 0:
+                raise HTTPException(
+                    status_code=404,
+                    detail="custom_behavior_branch_not_found",
+                )
+
+    async def _validate_behavior_url(self, url: str):
+        """Validate behavior file exists at url"""
+        try:
+            timeout = aiohttp.ClientTimeout(total=10)
+            async with aiohttp.ClientSession(timeout=timeout) as session:
+                async with session.get(url) as resp:
+                    if resp.status >= 400:
+                        raise HTTPException(
+                            status_code=404,
+                            detail="custom_behavior_not_found",
+                        )
+        # pylint: disable=raise-missing-from
+        except aiohttp.ClientError:
+            raise HTTPException(
+                status_code=404,
+                detail="custom_behavior_not_found",
+            )
+
+    async def validate_custom_behavior(self, url: str) -> Dict[str, bool]:
+        """Validate custom behavior URL
+
+        Implemented:
+        - Ensure URL is valid (after removing custom git prefix and syntax)
+        - Ensure URL returns status code < 400
+        - Ensure git repository can be reached by git ls-remote and that branch
+          exists, if provided
+        """
+        git_branch = ""
+
+        is_git_repo, url_parts = self._validate_custom_behavior_url_syntax(url)
+        url = url_parts[0]
+
+        if is_git_repo and len(url_parts) > 1:
+            query_str = url_parts[1]
+            try:
+                git_branch = urllib.parse.parse_qs(query_str)["branch"][0]
+            # pylint: disable=broad-exception-caught
+            except (KeyError, IndexError):
+                pass
+
+        if is_git_repo:
+            await self._validate_behavior_git_repo(url, branch=git_branch)
+        else:
+            await self._validate_behavior_url(url)
+
+        return {"success": True}
+
 
 # ============================================================================
 # pylint: disable=too-many-locals
@@ -1316,6 +1412,14 @@ def init_crawl_config_api(
         asyncio.create_task(ops.re_add_all_scheduled_cron_jobs())
         return {"success": True}
 
+    @router.post("/validate/custom-behavior", response_model=SuccessResponse)
+    async def validate_custom_behavior(
+        behavior: ValidateCustomBehavior,
+        # pylint: disable=unused-argument
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        return await ops.validate_custom_behavior(behavior.customBehavior)
+
     org_ops.router.include_router(router)
 
     return ops
"autoscroll,autoplay,autofetch,siteSpecific" + customBehaviors: List[str] = [] userAgent: Optional[str] = None @@ -541,6 +542,8 @@ class CrawlConfigDefaults(BaseModel): exclude: Optional[List[str]] = None + customBehaviors: List[str] = [] + # ============================================================================ class CrawlConfigAddedResponse(BaseModel): @@ -592,6 +595,13 @@ class CrawlConfigDeletedResponse(BaseModel): status: str +# ============================================================================ +class ValidateCustomBehavior(BaseModel): + """Input model for validating custom behavior URL/Git reference""" + + customBehavior: str + + # ============================================================================ ### CRAWLER VERSIONS ### diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 8e4b44fd..5b889655 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -11,6 +11,7 @@ import re from datetime import datetime, timezone from typing import Optional, Dict, Union, List, Any +from urllib.parse import urlparse from uuid import UUID from fastapi import HTTPException @@ -100,6 +101,15 @@ def is_falsy_bool(stri: Optional[str]) -> bool: return False +def is_url(url: str) -> bool: + """Check if string is a valid URL""" + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + def str_list_to_bools(str_list: List[str], allow_none=True) -> List[Union[bool, None]]: """Return version of input string list cast to bool or None, ignoring other values""" output: List[Union[bool, None]] = [] diff --git a/backend/test/conftest.py b/backend/test/conftest.py index b9a6e717..16617c79 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -326,7 +326,7 @@ def crawler_config_id_only(_crawler_create_config_only): return _crawler_config_id -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def sample_crawl_data(): return { "runNow": False, diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 14bdf6c8..b007eec6 100644 --- a/backend/test/test_crawlconfigs.py +++ b/backend/test/test_crawlconfigs.py @@ -543,3 +543,192 @@ def test_add_crawl_config_invalid_exclude_regex( ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" + + +def test_add_crawl_config_custom_behaviors_invalid_url( + crawler_auth_headers, default_org_id, sample_crawl_data +): + sample_crawl_data["config"]["customBehaviors"] = ["http"] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_custom_behavior" + + +def test_add_crawl_config_custom_behaviors_valid_url( + crawler_auth_headers, default_org_id, sample_crawl_data +): + url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js" + sample_crawl_data["config"]["customBehaviors"] = [url] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + data = r.json() + config_id = data["id"] + assert config_id + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == config_id + assert data["config"]["customBehaviors"] == [url] + + +def 
+def test_add_update_crawl_config_custom_behaviors_git_url(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+
+    # Create workflow and validate it looks like we expect
+    sample_crawl_data["config"]["customBehaviors"] = [git_url]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    config_id = data["id"]
+    assert config_id
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Try to update custom behaviors with invalid url, validate unchanged
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url, "not-a-url"],
+            }
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url]
+
+    # Update custom behaviors with valid url, validate changed
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "customBehaviors": [git_url_with_params],
+            }
+        },
+    )
+    assert r.status_code == 200
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == config_id
+    assert data["config"]["customBehaviors"] == [git_url_with_params]
+
+
+def test_validate_custom_behavior(crawler_auth_headers, default_org_id):
+    valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js"
+    invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js"
+    doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com"
+    malformed_url = "http"
+
+    git_url = "git+https://github.com/webrecorder/custom-behaviors"
+    invalid_git_url = "git+https://github.com/webrecorder/doesntexist"
+    private_git_url = "git+https://github.com/webrecorder/website"
+
+    git_url_with_params = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors"
+    )
+    git_url_invalid_branch = (
+        "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist"
+    )
+
+    # Success
+    for url in (valid_url, git_url, git_url_with_params):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 200
+        assert r.json()["success"]
+
+    # Behavior 404s
+    for url in (invalid_url_404, doesnt_resolve_url):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+            headers=crawler_auth_headers,
+            json={"customBehavior": url},
+        )
+        assert r.status_code == 404
+        assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Malformed url
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": malformed_url},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_custom_behavior"
+
+    # Git repo doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": invalid_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git repo isn't public
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": private_git_url},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_not_found"
+
+    # Git branch doesn't exist
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior",
+        headers=crawler_auth_headers,
+        json={"customBehavior": git_url_invalid_branch},
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "custom_behavior_branch_not_found"
diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py
index bc40486a..fc104197 100644
--- a/backend/test/test_filter_sort_results.py
+++ b/backend/test/test_filter_sort_results.py
@@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_config_by_modified_by(
@@ -23,8 +23,8 @@
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 3
-    assert r.json()["total"] == 3
+    assert len(r.json()["items"]) == 5
+    assert r.json()["total"] == 5
 
 
 def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
         headers=crawler_auth_headers,
     )
     data = r.json()
-    assert data["total"] == 9
+    assert data["total"] == 11
     items = data["items"]
-    assert len(items) == 9
+    assert len(items) == 11
 
     last_created = None
     for config in items:
diff --git a/backend/test/test_org.py b/backend/test/test_org.py
index 57c0b8fc..755e3711 100644
--- a/backend/test/test_org.py
+++ b/backend/test/test_org.py
@@ -60,7 +60,11 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
         headers=admin_auth_headers,
-        json={"maxCrawlSize": 200000, "lang": "fr"},
+        json={
+            "maxCrawlSize": 200000,
+            "lang": "fr",
+            "customBehaviors": ["git+https://github.com/webrecorder/custom-behaviors"],
+        },
     )
     assert r.status_code == 200
 
@@ -72,6 +76,9 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     assert data["crawlingDefaults"]
     assert data["crawlingDefaults"]["maxCrawlSize"] == 200000
     assert data["crawlingDefaults"]["lang"] == "fr"
+    assert data["crawlingDefaults"]["customBehaviors"] == [
+        "git+https://github.com/webrecorder/custom-behaviors"
+    ]
 
 
 def test_rename_org(admin_auth_headers, default_org_id):
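
A minimal usage sketch of the new validation endpoint from a Python client;
the API_PREFIX, org_id, and auth_headers names here are illustrative, not
part of the patch:

    import requests

    # Ask the backend to validate a git-based custom behavior reference
    # before saving it to a workflow
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/validate/custom-behavior",
        headers=auth_headers,
        json={
            "customBehavior": "git+https://github.com/webrecorder/custom-behaviors?branch=main"
        },
    )
    # 200 -> {"success": true}
    # 400 -> {"detail": "invalid_custom_behavior"} (malformed URL syntax)
    # 404 -> {"detail": "custom_behavior_not_found"} or
    #        {"detail": "custom_behavior_branch_not_found"}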