diff --git a/backend/btrixcloud/crawl_job.py b/backend/btrixcloud/crawl_job.py
index b6dc6a37..091e7b69 100644
--- a/backend/btrixcloud/crawl_job.py
+++ b/backend/btrixcloud/crawl_job.py
@@ -215,6 +215,8 @@ class CrawlJob(ABC):
 
         await self.update_crawl(state="failed", finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
     async def finish_crawl(self):
         """finish crawl"""
         if self.finished:
@@ -235,9 +237,29 @@ class CrawlJob(ABC):
 
         await self.update_crawl(state=state, finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
         if completed:
             await self.inc_crawl_complete_stats()
 
+    async def add_crawl_errors_to_mongo(self, inc=100):
+        """Pull crawl errors from redis and write to mongo"""
+        index = 0
+        while True:
+            skip = index * inc
+            upper_bound = skip + inc - 1
+            errors = await self.redis.lrange(f"{self.job_id}:e", skip, upper_bound)
+            if not errors:
+                break
+            await self.crawls.find_one_and_update(
+                {"_id": self.job_id}, {"$push": {"errors": {"$each": errors}}}
+            )
+            if len(errors) < inc:
+                # If we have fewer than inc errors, we can assume this is the
+                # last page of data to add.
+                break
+            index += 1
+
     async def inc_crawl_complete_stats(self):
         """Increment Crawl Stats"""
 
@@ -341,6 +363,8 @@ class CrawlJob(ABC):
         self.finished = dt_now()
         await self.update_crawl(state="canceled", finished=self.finished)
 
+        await self.add_crawl_errors_to_mongo()
+
         await self.delete_crawl()
 
         return {"success": True}
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index a64861a8..342cc4da 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -101,6 +101,8 @@ class Crawl(CrawlConfigCore):
 
     notes: Optional[str]
 
+    errors: Optional[List[str]] = []
+
 
 # ============================================================================
 class CrawlOut(Crawl):
@@ -113,6 +115,7 @@ class CrawlOut(Crawl):
     resources: Optional[List[CrawlFileOut]] = []
     firstSeed: Optional[str]
    seedCount: Optional[int] = 0
+    errors: Optional[List[str]]
     collections: Optional[List[str]] = []
 
 
@@ -149,6 +152,7 @@ class ListCrawlOut(BaseMongoModel):
 
     firstSeed: Optional[str]
     seedCount: Optional[int] = 0
+    errors: Optional[List[str]]
 
 
 # ============================================================================
@@ -761,6 +765,24 @@ class CrawlOps:
 
         return num_removed
 
+    async def get_errors_from_redis(
+        self, crawl_id: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1
+    ):
+        """Get crawl errors from Redis."""
+        # Zero-index page for query
+        page = page - 1
+        skip = page * page_size
+
+        try:
+            redis = await self.get_redis(crawl_id)
+            errors = await redis.lrange(f"{crawl_id}:e", skip, skip + page_size - 1)
+            total = len(errors)
+        except exceptions.ConnectionError:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=503, detail="redis_connection_error")
+
+        return errors, total
+
     async def get_redis(self, crawl_id):
         """get redis url for crawl id"""
         # pylint: disable=line-too-long
@@ -1136,6 +1158,29 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
 
         raise HTTPException(status_code=400, detail="crawl_not_finished")
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/errors",
+        tags=["crawls"],
+    )
+    async def get_crawl_errors(
+        crawl_id: str,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        crawl_raw = await ops.get_crawl_raw(crawl_id, org)
+        crawl = Crawl.from_dict(crawl_raw)
+
+        if crawl.finished:
+            skip = (page - 1) * pageSize
+            upper_bound = skip + pageSize
+            errors = crawl.errors[skip:upper_bound]
+            total = len(errors)
+            return paginated_format(errors, total, page, pageSize)
+
+        errors, total = await ops.get_errors_from_redis(crawl_id, pageSize, page)
+        return paginated_format(errors, total, page, pageSize)
+
     return ops
 
 
diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index 7041299c..e0001e28 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -175,7 +175,6 @@ def large_crawl_id(admin_auth_headers, default_org_id):
 
     crawl_id = data["run_now_job"]
 
-    # Wait for crawl to start running
     while True:
         r = requests.get(
             f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
@@ -225,3 +224,33 @@ def timeout_crawl(admin_auth_headers, default_org_id):
     )
     data = r.json()
     return data["run_now_job"]
+
+
+@pytest.fixture(scope="session")
+def error_crawl_id(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": True,
+        "name": "Youtube crawl with errors",
+        "config": {
+            "seeds": [{"url": "https://www.youtube.com/watch?v=Sh-x3QmbRZc"}],
+            "limit": 10,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
diff --git a/backend/test_nightly/test_crawl_errors.py b/backend/test_nightly/test_crawl_errors.py
new file mode 100644
index 00000000..fa149e1a
--- /dev/null
+++ b/backend/test_nightly/test_crawl_errors.py
@@ -0,0 +1,14 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_get_crawl_errors(admin_auth_headers, default_org_id, error_crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{error_crawl_id}/errors",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
diff --git a/chart/values.yaml b/chart/values.yaml
index 17b4eea9..0906eed8 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -151,7 +151,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1000
 
 # browsertrix-crawler args:
-crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
+crawler_args: "--logging stats,behaviors,debug --generateWACZ --text --collection thecrawl --screencastPort 9037 --logErrorsToRedis --diskUtilization {{ .Values.disk_utilization_threshold | default 90 }} --waitOnDone"
 
 crawler_browser_instances: 2
 
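The add_crawl_errors_to_mongo helper above drains the crawler's error list (stored under the Redis key "{crawl_id}:e") in fixed-size LRANGE batches, relying on LRANGE's inclusive stop index. A minimal standalone sketch of the same chunked read, assuming the redis-py asyncio client and a locally reachable Redis; the connection URL and crawl id below are placeholders, and the job itself uses whatever client self.redis already holds:

import asyncio

import redis.asyncio as aioredis


async def read_errors_in_batches(redis_url, crawl_id, batch_size=100):
    """Read the crawler's error list ({crawl_id}:e) in fixed-size batches."""
    client = aioredis.from_url(redis_url, decode_responses=True)
    key = f"{crawl_id}:e"
    index = 0
    batches = []
    try:
        while True:
            skip = index * batch_size
            # LRANGE's stop index is inclusive, hence the "- 1".
            errors = await client.lrange(key, skip, skip + batch_size - 1)
            if not errors:
                break
            batches.append(errors)
            if len(errors) < batch_size:
                # A short batch means the list is exhausted.
                break
            index += 1
    finally:
        await client.close()
    return batches


if __name__ == "__main__":
    # Hypothetical local Redis and crawl id, for illustration only.
    print(asyncio.run(read_errors_in_batches("redis://localhost:6379/0", "example-crawl")))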
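On the API side, the new GET /orgs/{oid}/crawls/{crawl_id}/errors endpoint returns the paginated shape the nightly test checks ("items" and "total"), reading from mongo for finished crawls and straight from Redis for running ones. A rough client sketch, assuming a deployment URL and bearer token; API_PREFIX, AUTH_HEADERS, and iter_crawl_errors are placeholders for illustration, not part of the change:

import requests

# Placeholders: point these at a real deployment and a valid login token.
API_PREFIX = "http://localhost:30870/api"
AUTH_HEADERS = {"Authorization": "Bearer <access-token>"}


def iter_crawl_errors(org_id, crawl_id, page_size=50):
    """Yield every recorded error for a crawl, one page at a time."""
    page = 1
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/errors",
            headers=AUTH_HEADERS,
            params={"page": page, "pageSize": page_size},
        )
        r.raise_for_status()
        data = r.json()
        yield from data["items"]
        # Stop once a short or empty page comes back.
        if len(data["items"]) < page_size:
            break
        page += 1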