diff --git a/backend/Dockerfile b/backend/Dockerfile index 9ae13260..413a5800 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,5 +1,10 @@ FROM docker.io/library/python:3.12-slim +RUN apt-get update \ + && apt-get install -y --no-install-recommends git \ + && apt-get purge -y --auto-remove \ + && rm -rf /var/lib/apt/lists/* + WORKDIR /app ADD requirements.txt /app diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 0f66f03e..aa2fd2bd 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -4,7 +4,7 @@ Crawl Config API handling # pylint: disable=too-many-lines -from typing import List, Union, Optional, TYPE_CHECKING, cast +from typing import List, Union, Optional, TYPE_CHECKING, cast, Dict, Tuple import asyncio import json @@ -15,8 +15,9 @@ from datetime import datetime from uuid import UUID, uuid4 import urllib.parse -import pymongo +import aiohttp from fastapi import APIRouter, Depends, HTTPException, Query +import pymongo from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( @@ -43,8 +44,9 @@ from .models import ( CrawlConfigDeletedResponse, CrawlerProxy, CrawlerProxies, + ValidateCustomBehavior, ) -from .utils import dt_now, slug_from_name, validate_regexes +from .utils import dt_now, slug_from_name, validate_regexes, is_url if TYPE_CHECKING: from .orgs import OrgOps @@ -231,6 +233,10 @@ class CrawlConfigOps: exclude = [exclude] validate_regexes(exclude) + if config_in.config.customBehaviors: + for url in config_in.config.customBehaviors: + self._validate_custom_behavior_url_syntax(url) + now = dt_now() crawlconfig = CrawlConfig( id=uuid4(), @@ -291,6 +297,23 @@ class CrawlConfigOps: execMinutesQuotaReached=exec_mins_quota_reached, ) + def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]: + """Validate custom behaviors are valid URLs after removing custom git syntax""" + git_prefix = "git+" + is_git_repo = False + + if 
url.startswith(git_prefix): + is_git_repo = True + url = url[len(git_prefix) :] + + parts = url.split("?") + url = parts[0] + + if not is_url(url): + raise HTTPException(status_code=400, detail="invalid_custom_behavior") + + return is_git_repo, parts + def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization): """ensure page limit is set to no greater than quota page limit, if any""" if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0: @@ -356,6 +379,10 @@ class CrawlConfigOps: exclude = [exclude] validate_regexes(exclude) + if update.config and update.config.customBehaviors: + for url in update.config.customBehaviors: + self._validate_custom_behavior_url_syntax(url) + # indicates if any k8s crawl config settings changed changed = False changed = changed or ( @@ -1070,6 +1097,75 @@ class CrawlConfigOps: flush=True, ) + async def _validate_behavior_git_repo(self, repo_url: str, branch: str = ""): + """Validate git repository and branch, if specified, exist and are reachable""" + # exec (not shell): repo_url is user-supplied, shell would allow command injection + proc = await asyncio.create_subprocess_exec("git", "ls-remote", repo_url, "HEAD") + if await proc.wait() > 0: + raise HTTPException( + status_code=404, + detail="custom_behavior_not_found", + ) + + if branch: + await asyncio.sleep(0.5) + proc = await asyncio.create_subprocess_exec( + "git", "ls-remote", "--exit-code", "--heads", + repo_url, f"refs/heads/{branch}", + ) + if await proc.wait() > 0: + raise HTTPException( + status_code=404, + detail="custom_behavior_branch_not_found", + ) + + async def _validate_behavior_url(self, url: str): + """Validate behavior file exists at url""" + try: + timeout = aiohttp.ClientTimeout(total=10) + async with aiohttp.ClientSession(timeout=timeout) as session: + async with session.get(url) as resp: + if resp.status >= 400: + raise HTTPException( + status_code=404, + detail="custom_behavior_not_found", + ) + # pylint: disable=raise-missing-from + except (aiohttp.ClientError, asyncio.TimeoutError): + raise
HTTPException( + status_code=404, + detail="custom_behavior_not_found", + ) + + async def validate_custom_behavior(self, url: str) -> Dict[str, bool]: + """Validate custom behavior URL + + Implemented: + - Ensure URL is valid (after removing custom git prefix and syntax) + - Ensure URL returns status code < 400 + - Ensure git repository can be reached by git ls-remote and that branch + exists, if provided + """ + git_branch = "" + + is_git_repo, url_parts = self._validate_custom_behavior_url_syntax(url) + url = url_parts[0] + + if is_git_repo and len(url_parts) > 1: + query_str = url_parts[1] + try: + git_branch = urllib.parse.parse_qs(query_str)["branch"][0] + # pylint: disable=broad-exception-caught + except (KeyError, IndexError): + pass + + if is_git_repo: + await self._validate_behavior_git_repo(url, branch=git_branch) + else: + await self._validate_behavior_url(url) + + return {"success": True} + # ============================================================================ # pylint: disable=too-many-locals @@ -1316,6 +1412,14 @@ def init_crawl_config_api( asyncio.create_task(ops.re_add_all_scheduled_cron_jobs()) return {"success": True} + @router.post("/validate/custom-behavior", response_model=SuccessResponse) + async def validate_custom_behavior( + behavior: ValidateCustomBehavior, + # pylint: disable=unused-argument + org: Organization = Depends(org_crawl_dep), + ): + return await ops.validate_custom_behavior(behavior.customBehavior) + org_ops.router.include_router(router) return ops diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 25014e2a..a3e1f2e7 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -333,6 +333,7 @@ class RawCrawlConfig(BaseModel): logging: Optional[str] = None behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific" + customBehaviors: List[str] = [] userAgent: Optional[str] = None @@ -541,6 +542,8 @@ class CrawlConfigDefaults(BaseModel): exclude: 
Optional[List[str]] = None + customBehaviors: List[str] = [] + # ============================================================================ class CrawlConfigAddedResponse(BaseModel): @@ -592,6 +595,13 @@ class CrawlConfigDeletedResponse(BaseModel): status: str +# ============================================================================ +class ValidateCustomBehavior(BaseModel): + """Input model for validating custom behavior URL/Git reference""" + + customBehavior: str + + # ============================================================================ ### CRAWLER VERSIONS ### diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 8e4b44fd..5b889655 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -11,6 +11,7 @@ import re from datetime import datetime, timezone from typing import Optional, Dict, Union, List, Any +from urllib.parse import urlparse from uuid import UUID from fastapi import HTTPException @@ -100,6 +101,15 @@ def is_falsy_bool(stri: Optional[str]) -> bool: return False +def is_url(url: str) -> bool: + """Check if string is a valid URL""" + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except ValueError: + return False + + def str_list_to_bools(str_list: List[str], allow_none=True) -> List[Union[bool, None]]: """Return version of input string list cast to bool or None, ignoring other values""" output: List[Union[bool, None]] = [] diff --git a/backend/test/conftest.py b/backend/test/conftest.py index b9a6e717..16617c79 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -326,7 +326,7 @@ def crawler_config_id_only(_crawler_create_config_only): return _crawler_config_id -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") def sample_crawl_data(): return { "runNow": False, diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py index 14bdf6c8..b007eec6 100644 --- a/backend/test/test_crawlconfigs.py +++ 
b/backend/test/test_crawlconfigs.py @@ -543,3 +543,192 @@ def test_add_crawl_config_invalid_exclude_regex( ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_regex" + + +def test_add_crawl_config_custom_behaviors_invalid_url( + crawler_auth_headers, default_org_id, sample_crawl_data +): + sample_crawl_data["config"]["customBehaviors"] = ["http"] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_custom_behavior" + + +def test_add_crawl_config_custom_behaviors_valid_url( + crawler_auth_headers, default_org_id, sample_crawl_data +): + url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js" + sample_crawl_data["config"]["customBehaviors"] = [url] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + data = r.json() + config_id = data["id"] + assert config_id + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == config_id + assert data["config"]["customBehaviors"] == [url] + + +def test_add_update_crawl_config_custom_behaviors_git_url( + crawler_auth_headers, default_org_id, sample_crawl_data +): + git_url = "git+https://github.com/webrecorder/custom-behaviors" + git_url_with_params = ( + "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors" + ) + + # Create workflow and validate it looks like we expect + sample_crawl_data["config"]["customBehaviors"] = [git_url] + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=crawler_auth_headers, + json=sample_crawl_data, + ) + assert r.status_code == 200 + data = r.json() + config_id = 
data["id"] + assert config_id + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == config_id + assert data["config"]["customBehaviors"] == [git_url] + + # Try to update custom behaviors with invalid url, validate unchanged + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/", + headers=crawler_auth_headers, + json={ + "config": { + "customBehaviors": [git_url, "not-a-url"], + } + }, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_custom_behavior" + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == config_id + assert data["config"]["customBehaviors"] == [git_url] + + # Update custom behaviors with valid url, validate changed + r = requests.patch( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}/", + headers=crawler_auth_headers, + json={ + "config": { + "customBehaviors": [git_url_with_params], + } + }, + ) + assert r.status_code == 200 + + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + data = r.json() + + assert data["id"] == config_id + assert data["config"]["customBehaviors"] == [git_url_with_params] + + +def test_validate_custom_behavior(crawler_auth_headers, default_org_id): + valid_url = "https://raw.githubusercontent.com/webrecorder/custom-behaviors/refs/heads/main/behaviors/fulcrum.js" + invalid_url_404 = "https://webrecorder.net/nonexistent/behavior.js" + doesnt_resolve_url = "https://nonexistenturl-for-testing-browsertrix.com" + malformed_url = "http" + + git_url = "git+https://github.com/webrecorder/custom-behaviors" + invalid_git_url = "git+https://github.com/webrecorder/doesntexist" + 
private_git_url = "git+https://github.com/webrecorder/website" + + git_url_with_params = ( + "git+https://github.com/webrecorder/custom-behaviors?branch=main&path=behaviors" + ) + git_url_invalid_branch = ( + "git+https://github.com/webrecorder/custom-behaviors?branch=doesntexist" + ) + + # Success + for url in (valid_url, git_url, git_url_with_params): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": url}, + ) + assert r.status_code == 200 + assert r.json()["success"] + + # Behavior 404s + for url in (invalid_url_404, doesnt_resolve_url): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": url}, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "custom_behavior_not_found" + + # Malformed url + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": malformed_url}, + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_custom_behavior" + + # Git repo doesn't exist + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": invalid_git_url}, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "custom_behavior_not_found" + + # Git repo isn't public + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": private_git_url}, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "custom_behavior_not_found" + + # Git branch doesn't exist + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/validate/custom-behavior", + headers=crawler_auth_headers, + json={"customBehavior": 
git_url_invalid_branch}, + ) + assert r.status_code == 404 + assert r.json()["detail"] == "custom_behavior_branch_not_found" diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py index bc40486a..fc104197 100644 --- a/backend/test/test_filter_sort_results.py +++ b/backend/test/test_filter_sort_results.py @@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 3 - assert r.json()["total"] == 3 + assert len(r.json()["items"]) == 5 + assert r.json()["total"] == 5 def test_get_config_by_modified_by( @@ -23,8 +23,8 @@ def test_get_config_by_modified_by( f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}", headers=crawler_auth_headers, ) - assert len(r.json()["items"]) == 3 - assert r.json()["total"] == 3 + assert len(r.json()["items"]) == 5 + assert r.json()["total"] == 5 def test_get_configs_by_first_seed( @@ -362,9 +362,9 @@ def test_sort_crawl_configs( headers=crawler_auth_headers, ) data = r.json() - assert data["total"] == 9 + assert data["total"] == 11 items = data["items"] - assert len(items) == 9 + assert len(items) == 11 last_created = None for config in items: diff --git a/backend/test/test_org.py b/backend/test/test_org.py index 57c0b8fc..755e3711 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -60,7 +60,11 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id): r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling", headers=admin_auth_headers, - json={"maxCrawlSize": 200000, "lang": "fr"}, + json={ + "maxCrawlSize": 200000, + "lang": "fr", + "customBehaviors": ["git+https://github.com/webrecorder/custom-behaviors"], + }, ) assert r.status_code == 200 @@ -72,6 +76,9 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id): 
assert data["crawlingDefaults"] assert data["crawlingDefaults"]["maxCrawlSize"] == 200000 assert data["crawlingDefaults"]["lang"] == "fr" + assert data["crawlingDefaults"]["customBehaviors"] == [ + "git+https://github.com/webrecorder/custom-behaviors" + ] def test_rename_org(admin_auth_headers, default_org_id):