diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml
index b5b60619..b9116005 100644
--- a/.github/workflows/deploy-dev.yaml
+++ b/.github/workflows/deploy-dev.yaml
@@ -46,7 +46,7 @@ jobs:

       - uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.12"
           cache: "poetry"

       - name: Install vault decryption dependencies
diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index e9b70d4e..48ed8940 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -156,6 +156,7 @@ class BaseCrawlOps:
         files = res.pop("files", None)
         res.pop("errors", None)
+        res.pop("behaviorLogs", None)

         if not skip_resources:
             coll_ids = res.get("collectionIds")
@@ -568,7 +569,7 @@ class BaseCrawlOps:
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index b36f2309..3718a34b 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -22,7 +22,7 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import (
     dt_now,
     date_to_str,
-    parse_jsonl_error_messages,
+    parse_jsonl_log_messages,
     stream_dict_list_as_csv,
     validate_regexes,
 )
@@ -49,7 +49,7 @@ from .models import (
     Seed,
     PaginatedCrawlOutResponse,
     PaginatedSeedResponse,
-    PaginatedCrawlErrorResponse,
+    PaginatedCrawlLogResponse,
     RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     NON_RUNNING_STATES,
@@ -186,7 +186,7 @@ class CrawlOps(BaseCrawlOps):
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {
@@ -670,6 +670,17 @@ class CrawlOps(BaseCrawlOps):
         )
         return res is not None

+    async def add_crawl_behavior_log(
+        self,
+        crawl_id: str,
+        log_line: str,
+    ) -> bool:
+        """add crawl behavior log from redis to mongodb behaviorLogs field"""
+        res = await self.crawls.find_one_and_update(
+            {"_id": crawl_id}, {"$push": {"behaviorLogs": log_line}}
+        )
+        return res is not None
+
     async def add_crawl_file(
         self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
     ) -> bool:
@@ -1595,7 +1606,7 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/errors",
         tags=["crawls"],
-        response_model=PaginatedCrawlErrorResponse,
+        response_model=PaginatedCrawlLogResponse,
     )
     async def get_crawl_errors(
         crawl_id: str,
@@ -1609,7 +1620,31 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
         upper_bound = skip + pageSize

         errors = crawl.errors[skip:upper_bound] if crawl.errors else []
-        parsed_errors = parse_jsonl_error_messages(errors)
+        parsed_errors = parse_jsonl_log_messages(errors)
         return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)

+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/behaviorLogs",
+        tags=["crawls"],
+        response_model=PaginatedCrawlLogResponse,
+    )
+    async def get_crawl_behavior_logs(
+        crawl_id: str,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        org: Organization = Depends(org_viewer_dep),
+    ):
+        crawl = await ops.get_crawl(crawl_id, org)
+
+        skip = (page - 1) * pageSize
+        upper_bound = skip + pageSize
+
+        behavior_logs = (
+            crawl.behaviorLogs[skip:upper_bound] if crawl.behaviorLogs else []
+        )
+        parsed_logs = parse_jsonl_log_messages(behavior_logs)
+        return paginated_format(
+            parsed_logs, len(crawl.behaviorLogs or []), page, pageSize
+        )
+
     return ops
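
The new `/orgs/{oid}/crawls/{crawl_id}/behaviorLogs` endpoint mirrors the existing `/errors` endpoint: the stored JSONL lines are sliced by `page`/`pageSize`, parsed with `parse_jsonl_log_messages`, and returned as a `PaginatedCrawlLogResponse`. A minimal client-side sketch of paging through every behavior log entry (the `API_PREFIX`, IDs, and auth header are placeholders; only the `items` and `total` fields of the paginated response are relied on):

```python
import requests

API_PREFIX = "https://app.example.com/api"  # placeholder deployment URL
ORG_ID = "<org-uuid>"                       # placeholder org id
CRAWL_ID = "<crawl-id>"                     # placeholder crawl id
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth header


def fetch_all_behavior_logs(page_size: int = 100) -> list[dict]:
    """Page through /behaviorLogs until every parsed log entry is collected."""
    logs: list[dict] = []
    page = 1
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{CRAWL_ID}/behaviorLogs",
            headers=HEADERS,
            params={"page": page, "pageSize": page_size},
        )
        r.raise_for_status()
        data = r.json()
        logs.extend(data["items"])
        # stop once all stored lines have been fetched
        if not data["items"] or len(logs) >= data["total"]:
            break
        page += 1
    return logs
```
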
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index a3e1f2e7..7a3ad883 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -774,6 +774,7 @@ class CoreCrawlable(BaseModel):
     fileCount: int = 0

     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []


 # ============================================================================
@@ -845,6 +846,7 @@ class CrawlOut(BaseMongoModel):
     tags: Optional[List[str]] = []

     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []

     collectionIds: Optional[List[UUID]] = []
@@ -1042,8 +1044,8 @@ class CrawlScaleResponse(BaseModel):


 # ============================================================================
-class CrawlError(BaseModel):
-    """Crawl error"""
+class CrawlLogMessage(BaseModel):
+    """Crawl log message"""

     timestamp: str
     logLevel: str
@@ -2892,10 +2894,10 @@ class PaginatedWebhookNotificationResponse(PaginatedResponse):


 # ============================================================================
-class PaginatedCrawlErrorResponse(PaginatedResponse):
-    """Response model for crawl errors"""
+class PaginatedCrawlLogResponse(PaginatedResponse):
+    """Response model for crawl logs"""

-    items: List[CrawlError]
+    items: List[CrawlLogMessage]


 # ============================================================================
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index aa8e152e..aaf61df5 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -86,6 +86,7 @@ class CrawlOperator(BaseOperator):
     done_key: str
     pages_key: str
     errors_key: str
+    behavior_logs_key: str

     fast_retry_secs: int
     log_failed_crawl_lines: int
@@ -98,6 +99,7 @@ class CrawlOperator(BaseOperator):
         self.done_key = "crawls-done"
         self.pages_key = "pages"
         self.errors_key = "e"
+        self.behavior_logs_key = "b"

         self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)
@@ -904,6 +906,13 @@ class CrawlOperator(BaseOperator):
             )
             crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")

+        behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+        while behavior_log:
+            await self.crawl_ops.add_crawl_behavior_log(
+                crawl.db_crawl_id, behavior_log
+            )
+            behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+
         # ensure filesAdded and filesAddedSize always set
         status.filesAdded = int(await redis.get("filesAdded") or 0)
         status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 5b889655..ad72a2ad 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -70,21 +70,21 @@ def register_exit_handler() -> None:
     loop.add_signal_handler(signal.SIGTERM, exit_handler)


-def parse_jsonl_error_messages(errors: list[str]) -> list[dict]:
+def parse_jsonl_log_messages(log_lines: list[str]) -> list[dict]:
     """parse json-l error strings from redis/db into json"""
-    parsed_errors = []
-    for error_line in errors:
-        if not error_line:
+    parsed_log_lines = []
+    for log_line in log_lines:
+        if not log_line:
             continue
         try:
-            result = json.loads(error_line)
-            parsed_errors.append(result)
+            result = json.loads(log_line)
+            parsed_log_lines.append(result)
         except json.JSONDecodeError as err:
             print(
-                f"Error decoding json-l error line: {error_line}. Error: {err}",
+                f"Error decoding json-l log line: {log_line}. Error: {err}",
                 flush=True,
             )
-    return parsed_errors
+    return parsed_log_lines


 def is_bool(stri: Optional[str]) -> bool:
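
Error lines and behavior log lines share one storage convention: each Redis/Mongo list entry is a raw JSONL string that is only parsed into a dict when read back through the API. A small illustration of `parse_jsonl_log_messages` on a well-formed line and two lines it skips (the sample fields follow `CrawlLogMessage` and the shapes asserted in the tests; an actual crawler payload may carry additional fields):

```python
from btrixcloud.utils import parse_jsonl_log_messages

lines = [
    # a well-formed behavior log line, shaped like CrawlLogMessage
    '{"timestamp": "2025-01-01T00:00:00.000Z", "logLevel": "info", '
    '"context": "behaviorScriptCustom", "message": "test-stat", '
    '"details": {"behavior": "TestBehavior", "page": "https://specs.webrecorder.net/"}}',
    "",                 # empty entries are skipped
    "not valid json",   # malformed entries are printed and skipped
]

parsed = parse_jsonl_log_messages(lines)
assert len(parsed) == 1
assert parsed[0]["context"] == "behaviorScriptCustom"
```
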
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index 16617c79..31dd92dd 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -511,6 +511,42 @@ def all_crawls_delete_config_id(admin_crawl_id):
     return _all_crawls_delete_config_id


+@pytest.fixture(scope="session")
+def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Custom Behavior Logs",
+        "config": {
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
+            "customBehaviors": [
+                "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom.js"
+            ],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in FINISHED_STATES:
+            break
+        time.sleep(5)
+
+    return crawl_id
+
+
 @pytest.fixture(scope="session")
 def url_list_config_id(crawler_auth_headers, default_org_id):
     # Start crawl.
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index be5b84c8..667acec3 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -1283,3 +1283,67 @@ def test_delete_crawls_org_owner(
         headers=admin_auth_headers,
     )
     assert r.status_code == 404
+
+
+def test_custom_behavior_logs(
+    custom_behaviors_crawl_id, crawler_auth_headers, default_org_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/behaviorLogs",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    custom_log_line_count = 0
+
+    assert data["total"] > 0
+    for log in data["items"]:
+        assert log["timestamp"]
+        assert log["context"] in ("behavior", "behaviorScript", "behaviorScriptCustom")
+
+        if log["context"] == "behaviorScriptCustom":
+            assert log["message"] in (
+                "test-stat",
+                "done!",
+                "Using Site-Specific Behavior: TestBehavior",
+            )
+            if log["message"] in ("test-stat", "done!"):
+                assert log["details"]["behavior"] == "TestBehavior"
+                assert log["details"]["page"] == "https://specs.webrecorder.net/"
+
+            custom_log_line_count += 1
+
+    assert custom_log_line_count == 3
+
+
+def test_crawls_exclude_behavior_logs(
+    custom_behaviors_crawl_id, admin_auth_headers, default_org_id
+):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert crawl.get("behaviorLogs") == []
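
The `custom_behaviors_crawl_id` fixture above waits in an unbounded `while True` loop, so a crawl stuck outside `FINISHED_STATES` would hang the whole test session. A bounded-wait helper along these lines could cap that (a sketch only; `max_wait` and `poll_interval` are hypothetical parameters, while `API_PREFIX` and `FINISHED_STATES` are the names already used in `conftest.py`):

```python
import time

import requests


def wait_for_crawl_finished(
    crawl_id: str,
    org_id: str,
    headers: dict,
    max_wait: int = 600,
    poll_interval: int = 5,
) -> str:
    """Poll replay.json until the crawl reaches a finished state or max_wait elapses."""
    # API_PREFIX and FINISHED_STATES as defined/imported in conftest.py
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        state = r.json()["state"]
        if state in FINISHED_STATES:
            return state
        time.sleep(poll_interval)
    raise TimeoutError(f"crawl {crawl_id} did not finish within {max_wait}s")
```
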
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index ff35cad9..18a78800 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 5
+    assert data["total"] == 6

     for item in data["items"]:
         assert item["type"] == "crawl"
@@ -823,9 +823,10 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()

-    assert len(data["names"]) == 7
+    assert len(data["names"]) == 8
     expected_names = [
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
         "My Upload Updated",
         "test2.wacz",
         "All Crawls Test Crawl",
@@ -837,6 +838,7 @@ def test_all_crawls_search_values(
     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]
@@ -848,12 +850,13 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()

-    assert len(data["names"]) == 4
+    assert len(data["names"]) == 5
     expected_names = [
         "Admin Test Crawl",
         "All Crawls Test Crawl",
         "Crawler User Crawl for Testing QA",
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
     ]
     for expected_name in expected_names:
         assert expected_name in data["names"]
@@ -861,6 +864,7 @@ def test_all_crawls_search_values(
     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 6e8cb405..185684a0 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -101,7 +101,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}

 data:
-  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}
+  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --logBehaviorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}

   CRAWL_ARGS: {{- include "btrix.crawler_args" . }}
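
With `--logBehaviorsToRedis` added to the crawler args, browsertrix-crawler writes behavior log lines into a per-crawl Redis list keyed with the `b` suffix, and the operator drains that list into the crawl's `behaviorLogs` field via `add_crawl_behavior_log`. A minimal end-to-end sketch of that handoff, assuming the producer appends with `rpush` (the crawler's actual push call is not part of this diff) and using a hypothetical local Redis URL:

```python
import asyncio
import json

from redis import asyncio as aioredis  # redis-py async client

REDIS_URL = "redis://localhost:6379/0"          # hypothetical local Redis
CRAWL_ID = "manual-20250101000000-abc123"       # placeholder crawl id
BEHAVIOR_LOGS_KEY = "b"                         # matches self.behavior_logs_key in the operator


async def main() -> None:
    redis = aioredis.from_url(REDIS_URL, decode_responses=True)

    # producer side: one JSON object per behavior log line
    line = json.dumps(
        {
            "timestamp": "2025-01-01T00:00:00.000Z",
            "logLevel": "info",
            "context": "behaviorScriptCustom",
            "message": "test-stat",
            "details": {"behavior": "TestBehavior", "page": "https://specs.webrecorder.net/"},
        }
    )
    await redis.rpush(f"{CRAWL_ID}:{BEHAVIOR_LOGS_KEY}", line)

    # consumer side: pop until the list is empty, as the operator does,
    # storing each raw line via add_crawl_behavior_log()
    behavior_log = await redis.lpop(f"{CRAWL_ID}:{BEHAVIOR_LOGS_KEY}")
    while behavior_log:
        print("would store:", json.loads(behavior_log)["message"])
        behavior_log = await redis.lpop(f"{CRAWL_ID}:{BEHAVIOR_LOGS_KEY}")

    await redis.aclose()


asyncio.run(main())
```
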