From a51f7c635ee4eff1e8d4120dd587444014569b82 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 7 Apr 2025 20:16:10 -0400
Subject: [PATCH] Add behavior logs from Redis to database and add endpoint to
 serve (#2526)

Backend work for #2524

This PR adds a second dedicated endpoint similar to `/errors`, as a combined
log endpoint would give a false impression of being the complete crawl logs
(which is far from what we're serving in Browsertrix at this point).

Eventually when we have support for streaming live crawl logs in `crawls//logs`
I'd ideally like to deprecate these two dedicated endpoints in favor of using
that, but for now this seems like the best solution.

---------

Co-authored-by: Ilya Kreymer
---
 .github/workflows/deploy-dev.yaml     |  2 +-
 backend/btrixcloud/basecrawls.py      |  3 +-
 backend/btrixcloud/crawls.py          | 45 ++++++++++++++++---
 backend/btrixcloud/models.py          | 12 ++---
 backend/btrixcloud/operator/crawls.py |  9 ++++
 backend/btrixcloud/utils.py           | 16 +++---
 backend/test/conftest.py              | 36 +++++++++++++++
 backend/test/test_run_crawl.py        | 64 +++++++++++++++++++++++++++
 backend/test/test_uploads.py          | 10 +++--
 chart/templates/configmap.yaml        |  2 +-
 10 files changed, 175 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/deploy-dev.yaml b/.github/workflows/deploy-dev.yaml
index b5b60619..b9116005 100644
--- a/.github/workflows/deploy-dev.yaml
+++ b/.github/workflows/deploy-dev.yaml
@@ -46,7 +46,7 @@ jobs:
 
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.x"
+          python-version: "3.12"
           cache: "poetry"
 
       - name: Install vault decryption dependencies
diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index e9b70d4e..48ed8940 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -156,6 +156,7 @@ class BaseCrawlOps:
 
         files = res.pop("files", None)
         res.pop("errors", None)
+        res.pop("behaviorLogs", None)
 
         if not skip_resources:
             coll_ids = res.get("collectionIds")
@@ -568,7 +569,7 @@ class BaseCrawlOps:
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index b36f2309..3718a34b 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -22,7 +22,7 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import (
     dt_now,
     date_to_str,
-    parse_jsonl_error_messages,
+    parse_jsonl_log_messages,
     stream_dict_list_as_csv,
     validate_regexes,
 )
@@ -49,7 +49,7 @@ from .models import (
     Seed,
     PaginatedCrawlOutResponse,
     PaginatedSeedResponse,
-    PaginatedCrawlErrorResponse,
+    PaginatedCrawlLogResponse,
     RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     NON_RUNNING_STATES,
@@ -186,7 +186,7 @@ class CrawlOps(BaseCrawlOps):
             {"$match": query},
             {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
             {"$set": {"firstSeed": "$firstSeedObject.url"}},
-            {"$unset": ["firstSeedObject", "errors", "config"]},
+            {"$unset": ["firstSeedObject", "errors", "behaviorLogs", "config"]},
             {"$set": {"activeQAStats": "$qa.stats"}},
             {
                 "$set": {
@@ -670,6 +670,17 @@ class CrawlOps(BaseCrawlOps):
         )
         return res is not None
 
+    async def add_crawl_behavior_log(
+        self,
+        crawl_id: str,
+        log_line: str,
+    ) -> bool:
+        """add crawl behavior log from redis to mongodb behaviorLogs field"""
+        res = await self.crawls.find_one_and_update(
+            {"_id": crawl_id}, {"$push": {"behaviorLogs": log_line}}
+        )
+        return res is not None
+
     async def add_crawl_file(
         self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
     ) -> bool:
@@ -1595,7 +1606,7 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/errors",
         tags=["crawls"],
-        response_model=PaginatedCrawlErrorResponse,
+        response_model=PaginatedCrawlLogResponse,
     )
     async def get_crawl_errors(
         crawl_id: str,
@@ -1609,7 +1620,31 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
         upper_bound = skip + pageSize
 
         errors = crawl.errors[skip:upper_bound] if crawl.errors else []
-        parsed_errors = parse_jsonl_error_messages(errors)
+        parsed_errors = parse_jsonl_log_messages(errors)
         return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/behaviorLogs",
+        tags=["crawls"],
+        response_model=PaginatedCrawlLogResponse,
+    )
+    async def get_crawl_behavior_logs(
+        crawl_id: str,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        org: Organization = Depends(org_viewer_dep),
+    ):
+        crawl = await ops.get_crawl(crawl_id, org)
+
+        skip = (page - 1) * pageSize
+        upper_bound = skip + pageSize
+
+        behavior_logs = (
+            crawl.behaviorLogs[skip:upper_bound] if crawl.behaviorLogs else []
+        )
+        parsed_logs = parse_jsonl_log_messages(behavior_logs)
+        return paginated_format(
+            parsed_logs, len(crawl.behaviorLogs or []), page, pageSize
+        )
+
     return ops
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index a3e1f2e7..7a3ad883 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -774,6 +774,7 @@ class CoreCrawlable(BaseModel):
     fileCount: int = 0
 
     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []
 
 
 # ============================================================================
@@ -845,6 +846,7 @@ class CrawlOut(BaseMongoModel):
     tags: Optional[List[str]] = []
 
     errors: Optional[List[str]] = []
+    behaviorLogs: Optional[List[str]] = []
 
     collectionIds: Optional[List[UUID]] = []
 
@@ -1042,8 +1044,8 @@ class CrawlScaleResponse(BaseModel):
 
 
 # ============================================================================
-class CrawlError(BaseModel):
-    """Crawl error"""
+class CrawlLogMessage(BaseModel):
+    """Crawl log message"""
 
     timestamp: str
     logLevel: str
@@ -2892,10 +2894,10 @@ class PaginatedWebhookNotificationResponse(PaginatedResponse):
 
 
 # ============================================================================
-class PaginatedCrawlErrorResponse(PaginatedResponse):
-    """Response model for crawl errors"""
+class PaginatedCrawlLogResponse(PaginatedResponse):
+    """Response model for crawl logs"""
 
-    items: List[CrawlError]
+    items: List[CrawlLogMessage]
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index aa8e152e..aaf61df5 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -86,6 +86,7 @@ class CrawlOperator(BaseOperator):
     done_key: str
     pages_key: str
     errors_key: str
+    behavior_logs_key: str
 
     fast_retry_secs: int
     log_failed_crawl_lines: int
@@ -98,6 +99,7 @@
         self.done_key = "crawls-done"
         self.pages_key = "pages"
         self.errors_key = "e"
+        self.behavior_logs_key = "b"
 
         self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)
 
@@ -904,6 +906,13 @@ class CrawlOperator(BaseOperator):
             )
             crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")
 
+        behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+        while behavior_log:
+            await self.crawl_ops.add_crawl_behavior_log(
+                crawl.db_crawl_id, behavior_log
+            )
+            behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}")
+
         # ensure filesAdded and filesAddedSize always set
         status.filesAdded = int(await redis.get("filesAdded") or 0)
         status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 5b889655..ad72a2ad 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -70,21 +70,21 @@ def register_exit_handler() -> None:
     loop.add_signal_handler(signal.SIGTERM, exit_handler)
 
 
-def parse_jsonl_error_messages(errors: list[str]) -> list[dict]:
+def parse_jsonl_log_messages(log_lines: list[str]) -> list[dict]:
     """parse json-l error strings from redis/db into json"""
-    parsed_errors = []
-    for error_line in errors:
-        if not error_line:
+    parsed_log_lines = []
+    for log_line in log_lines:
+        if not log_line:
             continue
         try:
-            result = json.loads(error_line)
-            parsed_errors.append(result)
+            result = json.loads(log_line)
+            parsed_log_lines.append(result)
         except json.JSONDecodeError as err:
             print(
-                f"Error decoding json-l error line: {error_line}. Error: {err}",
+                f"Error decoding json-l log line: {log_line}. Error: {err}",
                 flush=True,
             )
-    return parsed_errors
+    return parsed_log_lines
 
 
 def is_bool(stri: Optional[str]) -> bool:
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index 16617c79..31dd92dd 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -511,6 +511,42 @@ def all_crawls_delete_config_id(admin_crawl_id):
     return _all_crawls_delete_config_id
 
 
+@pytest.fixture(scope="session")
+def custom_behaviors_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Custom Behavior Logs",
+        "config": {
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
+            "customBehaviors": [
+                "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/custom-behaviors/custom.js"
+            ],
+            "limit": 1,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in FINISHED_STATES:
+            break
+        time.sleep(5)
+
+    return crawl_id
+
+
 @pytest.fixture(scope="session")
 def url_list_config_id(crawler_auth_headers, default_org_id):
     # Start crawl.
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index be5b84c8..667acec3 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -1283,3 +1283,67 @@ def test_delete_crawls_org_owner(
         headers=admin_auth_headers,
     )
     assert r.status_code == 404
+
+
+def test_custom_behavior_logs(
+    custom_behaviors_crawl_id, crawler_auth_headers, default_org_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/behaviorLogs",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    custom_log_line_count = 0
+
+    assert data["total"] > 0
+    for log in data["items"]:
+        assert log["timestamp"]
+        assert log["context"] in ("behavior", "behaviorScript", "behaviorScriptCustom")
+
+        if log["context"] == "behaviorScriptCustom":
+            assert log["message"] in (
+                "test-stat",
+                "done!",
+                "Using Site-Specific Behavior: TestBehavior",
+            )
+            if log["message"] in ("test-stat", "done!"):
+                assert log["details"]["behavior"] == "TestBehavior"
+                assert log["details"]["page"] == "https://specs.webrecorder.net/"
+
+            custom_log_line_count += 1
+
+    assert custom_log_line_count == 3
+
+
+def test_crawls_exclude_behavior_logs(
+    custom_behaviors_crawl_id, admin_auth_headers, default_org_id
+):
+    # Get endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # replay.json endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{custom_behaviors_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data.get("behaviorLogs") == []
+
+    # List endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    crawls = r.json()["items"]
+    for crawl in crawls:
+        assert crawl.get("behaviorLogs") == []
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index ff35cad9..18a78800 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -607,7 +607,7 @@ def test_get_all_crawls_by_type(
     )
     assert r.status_code == 200
     data = r.json()
-    assert data["total"] == 5
+    assert data["total"] == 6
 
     for item in data["items"]:
         assert item["type"] == "crawl"
@@ -823,9 +823,10 @@ def test_all_crawls_search_values(
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 7
+    assert len(data["names"]) == 8
     expected_names = [
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
         "My Upload Updated",
         "test2.wacz",
         "All Crawls Test Crawl",
@@ -837,6 +838,7 @@
     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]
 
@@ -848,12 +850,13 @@
     assert r.status_code == 200
     data = r.json()
 
-    assert len(data["names"]) == 4
+    assert len(data["names"]) == 5
    expected_names = [
         "Admin Test Crawl",
         "All Crawls Test Crawl",
         "Crawler User Crawl for Testing QA",
         "Crawler User Test Crawl",
+        "Custom Behavior Logs",
     ]
     for expected_name in expected_names:
         assert expected_name in data["names"]
@@ -861,6 +864,7 @@
     assert sorted(data["descriptions"]) == ["Lorem ipsum"]
     assert sorted(data["firstSeeds"]) == [
         "https://old.webrecorder.net/",
+        "https://specs.webrecorder.net/",
         "https://webrecorder.net/",
     ]
 
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 6e8cb405..185684a0 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -101,7 +101,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}
 
 data:
-  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}
+  {{- define "btrix.crawler_args" }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --logBehaviorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }} {{- end }}
 
   CRAWL_ARGS: {{- include "btrix.crawler_args" . }}