diff --git a/.github/workflows/k3d-nightly-ci.yaml b/.github/workflows/k3d-nightly-ci.yaml
index dbe7fbf3..dcc48a88 100644
--- a/.github/workflows/k3d-nightly-ci.yaml
+++ b/.github/workflows/k3d-nightly-ci.yaml
@@ -62,7 +62,7 @@ jobs:
 
       - name: Start Cluster with Helm
         run: |
-          helm upgrade --install -f ./chart/values.yaml -f ./chart/test/test.yaml --set invite_expire_seconds=10 btrix ./chart/
+          helm upgrade --install -f ./chart/values.yaml -f ./chart/test/test.yaml --set invite_expire_seconds=10 --set max_pages_per_crawl=10 btrix ./chart/
 
       - name: Install Python
         uses: actions/setup-python@v3
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index ee1ce874..6565cac8 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -99,7 +99,7 @@ class Crawl(CrawlConfigCore):
 
     state: str
 
-    stats: Optional[Dict[str, str]]
+    stats: Optional[Dict[str, int]]
 
     files: Optional[List[CrawlFile]] = []
 
@@ -146,7 +146,7 @@ class ListCrawlOut(BaseMongoModel):
 
     state: str
 
-    stats: Optional[Dict[str, str]]
+    stats: Optional[Dict[str, int]]
 
     fileSize: int = 0
     fileCount: int = 0
diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator.py
index 7695d82b..1f997308 100644
--- a/backend/btrixcloud/operator.py
+++ b/backend/btrixcloud/operator.py
@@ -409,6 +409,7 @@ class BtrixOperator(K8sAPI):
 
         return True
 
+    # pylint: disable=too-many-branches
     async def update_crawl_state(self, redis, crawl, status, pods):
         """update crawl state and check if crawl is now done"""
         results = await redis.hvals(f"{crawl.id}:status")
@@ -478,8 +479,14 @@ class BtrixOperator(K8sAPI):
 
         # check if all crawlers failed
         if failed >= crawl.scale:
+            # if stopping, and no pages finished, mark as canceled
+            if crawl.stopping and not status.pagesDone:
+                state = "canceled"
+            else:
+                state = "failed"
+
             status = await self.mark_finished(
-                redis, crawl.id, crawl.cid, status, state="failed"
+                redis, crawl.id, crawl.cid, status, state=state
             )
 
             return status
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index b8b1e6fb..a4b6d379 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -183,7 +183,7 @@ def _crawler_create_config_only(crawler_auth_headers, default_org_id):
         "description": "crawler test crawl",
         "config": {
             "seeds": [{"url": "https://webrecorder.net/"}],
-            "pageExtraDelay": 5,
+            "pageExtraDelay": 20,
             "limit": 4,
         },
     }
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 49cd8a5c..9cf8db54 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -145,8 +145,8 @@ def test_verify_wacz():
     pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
     assert '"https://webrecorder.net/blog"' in pages
 
-    # 1 other page + header line
-    assert len(pages.strip().split("\n")) == 2
+    # 3 other pages + header line
+    assert len(pages.strip().split("\n")) == 4
 
 
 def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
diff --git a/backend/test/test_settings.py b/backend/test/test_settings.py
index 1aa143a5..1f4515ac 100644
--- a/backend/test/test_settings.py
+++ b/backend/test/test_settings.py
@@ -13,6 +13,6 @@ def test_settings():
         "registrationEnabled": False,
         "jwtTokenLifetime": 86400,
         "defaultBehaviorTimeSeconds": 300,
-        "maxPagesPerCrawl": 2,
+        "maxPagesPerCrawl": 4,
         "defaultPageLoadTimeSeconds": 120,
     }
diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py
index a6a42f58..27d9cd4f 100644
--- a/backend/test/test_stop_cancel_crawl.py
+++ b/backend/test/test_stop_cancel_crawl.py
@@ -48,7 +48,7 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):
 
     data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
 
-    while data["state"] == "running":
+    while data["state"] in ("running", "waiting"):
         data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
 
     assert data["state"] == "canceled"
@@ -57,8 +57,44 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):
     assert len(data["resources"]) == 0
 
 
-@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
-def test_start_crawl_to_stop(
+def test_start_crawl_and_stop_immediately(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    crawl_id = data["started"]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
+        headers=crawler_auth_headers,
+    )
+    data = r.json()
+    assert data["success"] == True
+
+    # test crawl
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+    assert data["stopping"] == True
+
+    # test workflow
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
+        headers=crawler_auth_headers,
+    )
+    assert r.json()["currCrawlStopping"] == True
+
+    while data["state"] in ("starting", "running", "waiting"):
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    assert data["state"] in ("canceled", "partial_complete")
+    assert data["stopping"] == True
+
+
+def test_start_crawl_to_stop_partial(
     default_org_id, crawler_config_id_only, crawler_auth_headers
 ):
     r = requests.post(
@@ -74,12 +110,15 @@ def test_start_crawl_to_stop(
     crawl_id = data["started"]
 
 
-@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
-def test_stop_crawl(default_org_id, crawler_config_id_only, crawler_auth_headers):
+def test_stop_crawl_partial(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
     data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
-    while data["state"] == "starting":
-        time.sleep(5)
+    done = False
+    while not done:
+        time.sleep(2)
         data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+        done = data.get("stats") and data.get("stats").get("done") > 0
 
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
diff --git a/backend/test_nightly/conftest.py b/backend/test_nightly/conftest.py
index e0001e28..cbf84e4c 100644
--- a/backend/test_nightly/conftest.py
+++ b/backend/test_nightly/conftest.py
@@ -232,8 +232,11 @@ def error_crawl_id(admin_auth_headers, default_org_id):
         "runNow": True,
         "name": "Youtube crawl with errors",
         "config": {
-            "seeds": [{"url": "https://www.youtube.com/watch?v=Sh-x3QmbRZc"}],
-            "limit": 10,
+            "seeds": [
+                {"url": "https://invalid.webrecorder.net/"},
+                {"url": "https://invalid-2.webrecorder.net/"},
+            ],
+            "limit": 1,
         },
     }
     r = requests.post(
diff --git a/chart/test/test.yaml b/chart/test/test.yaml
index 0f1ceec7..dc605d5e 100644
--- a/chart/test/test.yaml
+++ b/chart/test/test.yaml
@@ -6,7 +6,7 @@ frontend_pull_policy: "Never"
 
 default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
 
-operator_resync_seconds: 5
+operator_resync_seconds: 3
 
 mongo_auth:
   # specify either username + password (for local mongo)
@@ -26,7 +26,7 @@ superuser:
 
 local_service_port: 30870
 
 # test max pages per crawl global limit
-max_pages_per_crawl: 2
+max_pages_per_crawl: 4
 
 registration_enabled: "0"
diff --git a/chart/values.yaml b/chart/values.yaml
index 8d202d55..d45f8eae 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -94,10 +94,10 @@ frontend_image: "docker.io/webrecorder/browsertrix-frontend:latest"
 frontend_pull_policy: "Always"
 
 frontend_requests_cpu: "3m"
-frontend_limits_cpu: "10m"
+frontend_limits_cpu: "30m"
 
 frontend_requests_memory: "12Mi"
-frontend_limits_memory: "20Mi"
+frontend_limits_memory: "40Mi"
 
 # if set, maps nginx to a fixed port on host machine
 # must be between 30000 - 32767
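
The operator.py hunk above carries the main behavioral change: once every crawler pod has failed, a crawl that was already stopping and had not finished any pages is recorded as "canceled" rather than "failed". Restated as a small standalone function, the decision looks like the sketch below; the helper name resolve_final_state and its plain scalar arguments are illustrative assumptions, not code from the repository.

from typing import Optional


def resolve_final_state(failed: int, scale: int, stopping: bool, pages_done: int) -> Optional[str]:
    """Return the terminal crawl state once all crawler pods have failed, else None."""
    if failed < scale:
        # not every crawler has failed yet, so no terminal state is decided here
        return None
    if stopping and not pages_done:
        # a stop request with zero finished pages reads as a cancellation, not a failure
        return "canceled"
    return "failed"


# quick check of the three branches shown in the hunk
assert resolve_final_state(failed=1, scale=1, stopping=True, pages_done=0) == "canceled"
assert resolve_final_state(failed=1, scale=1, stopping=True, pages_done=3) == "failed"
assert resolve_final_state(failed=0, scale=1, stopping=True, pages_done=0) is None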
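
The reworked test_stop_crawl_partial polls get_crawl() until stats["done"] is greater than zero before issuing the stop request, so the crawl has at least one finished page to keep; the numeric comparison works because Crawl.stats is now Dict[str, int]. Below is a sketch of that polling pattern with a timeout added so a stuck crawl cannot hang the suite; the wait_for_first_page helper, its timeout, and the fetch_crawl callable are assumptions for illustration, while the tests themselves use a bare loop.

import time
from typing import Any, Callable, Dict


def wait_for_first_page(
    fetch_crawl: Callable[[], Dict[str, Any]],
    timeout: float = 120.0,
    interval: float = 2.0,
) -> Dict[str, Any]:
    """Poll until the crawl reports stats["done"] > 0, then return the crawl data."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        data = fetch_crawl()
        stats = data.get("stats") or {}
        if stats.get("done", 0) > 0:
            # at least one page is finished; a stop issued now should leave a partial crawl
            return data
        time.sleep(interval)
    raise TimeoutError("crawl finished no pages before the timeout")


# usage inside a test, reusing the existing get_crawl helper:
#     data = wait_for_first_page(
#         lambda: get_crawl(default_org_id, crawler_auth_headers, crawl_id)
#     )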