From 6b19f72a89456f995954f8644d8c4863b5bb2553 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Mon, 17 Apr 2023 12:59:25 -0400
Subject: [PATCH] Add crawl errors endpoint (#757)

* Add crawl errors endpoint

If this endpoint is called while the crawl is running, errors are pulled
directly from redis. If this endpoint is called when the crawl is finished,
errors are pulled from mongodb, where they're written when crawls complete.

* Add nightly backend test for errors endpoint

* Add errors for failed and cancelled crawls to mongo

Co-authored-by: Ilya Kreymer
---
 backend/btrixcloud/crawl_job.py           | 24 ++++++++++++
 backend/btrixcloud/crawls.py              | 45 +++++++++++++++++++++++
 backend/test_nightly/conftest.py          | 31 +++++++++++++++-
 backend/test_nightly/test_crawl_errors.py | 14 +++++++
 chart/values.yaml                         |  2 +-
 5 files changed, 114 insertions(+), 2 deletions(-)
 create mode 100644 backend/test_nightly/test_crawl_errors.py

diff --git a/backend/btrixcloud/crawl_job.py b/backend/btrixcloud/crawl_job.py
index b6dc6a37..091e7b69 100644
--- a/backend/btrixcloud/crawl_job.py
+++ b/backend/btrixcloud/crawl_job.py
@@ -215,6 +215,8 @@ class CrawlJob(ABC):
 
         await self.update_crawl(state="failed", finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
     async def finish_crawl(self):
         """finish crawl"""
         if self.finished:
@@ -235,9 +237,29 @@ class CrawlJob(ABC):
 
         await self.update_crawl(state=state, finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
         if completed:
             await self.inc_crawl_complete_stats()
 
+    async def add_crawl_errors_to_mongo(self, inc=100):
+        """Pull crawl errors from redis and write to mongo"""
+        index = 0
+        while True:
+            skip = index * inc
+            upper_bound = skip + inc - 1
+            errors = await self.redis.lrange(f"{self.job_id}:e", skip, upper_bound)
+            if not errors:
+                break
+            await self.crawls.find_one_and_update(
+                {"_id": self.job_id}, {"$push": {"errors": {"$each": errors}}}
+            )
+            if len(errors) < inc:
+                # If we have fewer than inc errors, we can assume this is the
+                # last page of data to add.
+                break
+            index += 1
+
     async def inc_crawl_complete_stats(self):
         """Increment Crawl Stats"""
 
@@ -341,6 +363,8 @@ class CrawlJob(ABC):
         self.finished = dt_now()
         await self.update_crawl(state="canceled", finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
         await self.delete_crawl()
 
         return {"success": True}
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index a64861a8..342cc4da 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -101,6 +101,8 @@ class Crawl(CrawlConfigCore):
 
     notes: Optional[str]
 
+    errors: Optional[List[str]] = []
+
 
 # ============================================================================
 class CrawlOut(Crawl):
@@ -113,6 +115,7 @@ class CrawlOut(Crawl):
 
     resources: Optional[List[CrawlFileOut]] = []
     firstSeed: Optional[str]
    seedCount: Optional[int] = 0
+    errors: Optional[List[str]]
 
     collections: Optional[List[str]] = []
@@ -149,6 +152,7 @@ class ListCrawlOut(BaseMongoModel):
 
     firstSeed: Optional[str]
     seedCount: Optional[int] = 0
+    errors: Optional[List[str]]
 
 
 # ============================================================================
@@ -761,6 +765,24 @@ class CrawlOps:
 
         return num_removed
 
+    async def get_errors_from_redis(
+        self, crawl_id: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1
+    ):
+        """Get crawl errors from Redis for a running crawl."""
+        # Zero-index page for query
+        page = page - 1
+        skip = page * page_size
+
+        try:
+            redis = await self.get_redis(crawl_id)
+            # LRANGE takes an inclusive stop index, not a count
+            errors = await redis.lrange(f"{crawl_id}:e", skip, skip + page_size - 1)
+            total = await redis.llen(f"{crawl_id}:e")
+        except exceptions.ConnectionError:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=503, detail="redis_connection_error")
+
+        return errors, total
+
     async def get_redis(self, crawl_id):
         """get redis url for crawl id"""
         # pylint: disable=line-too-long
@@ -1136,6 +1158,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
 
         raise HTTPException(status_code=400, detail="crawl_not_finished")
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/errors",
+        tags=["crawls"],
+    )
+    async def get_crawl_errors(
+        crawl_id: str,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+
+        if crawl.finished:
+            skip = (page - 1) * pageSize
+            upper_bound = skip + pageSize
+            errors = crawl.errors[skip:upper_bound]
+            total = len(crawl.errors)
+            return paginated_format(errors, total, page, pageSize)
+
+        errors, total = await ops.get_errors_from_redis(crawl_id, pageSize, page)
+        return paginated_format(errors, total, page, pageSize)
+
     return ops
diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index 7041299c..e0001e28 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -175,7 +175,6 @@ def large_crawl_id(admin_auth_headers, default_org_id):
 
     crawl_id = data["run_now_job"]
 
-    # Wait for crawl to start running
     while True:
         r = requests.get(
             f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
             headers=admin_auth_headers,
         )
         data = r.json()
@@ -225,3 +224,33 @@ def timeout_crawl(admin_auth_headers, default_org_id):
     )
     data = r.json()
     return data["run_now_job"]
+
+
+@pytest.fixture(scope="session")
+def error_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Youtube crawl with errors",
+        "config": {
+            "seeds": [{"url": "https://www.youtube.com/watch?v=Sh-x3QmbRZc"}],
+            "limit": 10,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
diff --git a/backend/test_nightly/test_crawl_errors.py b/backend/test_nightly/test_crawl_errors.py
new file mode 100644
index 00000000..fa149e1a
--- /dev/null
+++ b/backend/test_nightly/test_crawl_errors.py
@@ -0,0 +1,14 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_get_crawl_errors(admin_auth_headers, default_org_id, error_crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{error_crawl_id}/errors",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
diff --git a/chart/values.yaml b/chart/values.yaml
index 17b4eea9..0906eed8 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -151,7 +151,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1000
 
 # browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
+crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
 
 crawler_browser_instances: 2
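
Usage sketch: paging through the new errors endpoint with the Python requests
library, in the same style as the nightly test. API_PREFIX, AUTH_HEADERS,
ORG_ID, and CRAWL_ID are assumed placeholders for a reachable deployment, a
valid access token, and an existing crawl; they are not defined by this patch.

    # Sketch: page through GET /orgs/{oid}/crawls/{crawl_id}/errors.
    # All constants below are placeholders, not values from this change.
    import requests

    API_PREFIX = "https://example.com/api"              # placeholder API prefix
    AUTH_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder credentials
    ORG_ID = "<org-id>"
    CRAWL_ID = "<crawl-id>"

    page = 1
    page_size = 25
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{CRAWL_ID}/errors",
            params={"page": page, "pageSize": page_size},
            headers=AUTH_HEADERS,
        )
        r.raise_for_status()
        data = r.json()
        # paginated_format returns "items" and "total", as asserted by the nightly test
        for error in data["items"]:
            print(error)
        if page * page_size >= data["total"]:
            break
        page += 1

While the crawl is still running, these requests are served from Redis via
get_errors_from_redis(); once the crawl finishes, they are served from the
errors array stored in mongodb, matching the behavior described in the commit
message.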