tests: fixes for crawl cancel + crawl stopped (#864)

* tests:
  - fix the cancel-crawl test by ensuring the state is neither running nor waiting
  - fix the stop-crawl test by only initiating the stop after at least one page has been crawled; otherwise the result may be failed, since no crawl data has been written yet (separate fix in the crawler to avoid looping if stopped before any data is written: webrecorder/browsertrix-crawler#314)
  - bump the page limit to 4 for tests so the crawl is partially complete, not fully complete, when stopped
  - allow canceled or partial_complete due to a race condition

* chart: bump frontend limits in the default values, not just for tests (addresses #780)

* crawl stop before starting:
  - if a crawl is stopped before it started, mark it as canceled
  - add a test for stopping immediately, which should result in a 'canceled' crawl
  - shorten the operator resync interval to catch immediate failure sooner
  - nightly tests: increase the page limit to test timeout

* backend:
  - detect a stopped-before-start crawl as 'failed' instead of 'done'
  - stats: return stats counters as int instead of string
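For context, the stop-crawl test fix boils down to a polling pattern: poll the crawl until its stats report at least one finished page, only then request a graceful stop, and finally wait for a terminal state. A minimal sketch of that pattern follows, assuming the test suite's existing get_crawl helper and API_PREFIX constant; the wait_and_stop_crawl wrapper name is made up for illustration.

    # Sketch only: mirrors the polling pattern in the updated test further below.
    # get_crawl is the test suite's existing helper (passed in here to keep the
    # sketch self-contained); wait_and_stop_crawl is a hypothetical name.
    import time
    import requests


    def wait_and_stop_crawl(api_prefix, org_id, crawl_id, headers, get_crawl):
        # wait until at least one page has been crawled; stats counters
        # are plain ints after this change, so compare directly
        done = False
        while not done:
            time.sleep(2)
            data = get_crawl(org_id, headers, crawl_id)
            done = data.get("stats") and data["stats"].get("done", 0) > 0

        # a stop issued mid-crawl should now succeed
        r = requests.post(
            f"{api_prefix}/orgs/{org_id}/crawls/{crawl_id}/stop",
            headers=headers,
        )
        assert r.json()["success"]

        # the crawl should end as partial_complete, or canceled if the
        # race condition noted above is hit
        while data["state"] in ("starting", "running", "waiting"):
            data = get_crawl(org_id, headers, crawl_id)
        assert data["state"] in ("canceled", "partial_complete")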
parent 28f1c815d0
commit 12f7db3ae2

.github/workflows/k3d-nightly-ci.yaml
@@ -62,7 +62,7 @@ jobs:

       - name: Start Cluster with Helm
         run: |
-          helm upgrade --install -f ./chart/values.yaml -f ./chart/test/test.yaml --set invite_expire_seconds=10 btrix ./chart/
+          helm upgrade --install -f ./chart/values.yaml -f ./chart/test/test.yaml --set invite_expire_seconds=10 --set max_pages_per_crawl=10 btrix ./chart/

       - name: Install Python
         uses: actions/setup-python@v3
@@ -99,7 +99,7 @@ class Crawl(CrawlConfigCore):

     state: str

-    stats: Optional[Dict[str, str]]
+    stats: Optional[Dict[str, int]]

     files: Optional[List[CrawlFile]] = []

@@ -146,7 +146,7 @@ class ListCrawlOut(BaseMongoModel):

     state: str

-    stats: Optional[Dict[str, str]]
+    stats: Optional[Dict[str, int]]

     fileSize: int = 0
     fileCount: int = 0
@@ -409,6 +409,7 @@ class BtrixOperator(K8sAPI):

         return True

+    # pylint: disable=too-many-branches
     async def update_crawl_state(self, redis, crawl, status, pods):
         """update crawl state and check if crawl is now done"""
         results = await redis.hvals(f"{crawl.id}:status")
@@ -478,8 +479,14 @@ class BtrixOperator(K8sAPI):

         # check if all crawlers failed
         if failed >= crawl.scale:
+            # if stopping, and no pages finished, mark as canceled
+            if crawl.stopping and not status.pagesDone:
+                state = "canceled"
+            else:
+                state = "failed"
+
             status = await self.mark_finished(
-                redis, crawl.id, crawl.cid, status, state="failed"
+                redis, crawl.id, crawl.cid, status, state=state
             )

             return status
@@ -183,7 +183,7 @@ def _crawler_create_config_only(crawler_auth_headers, default_org_id):
         "description": "crawler test crawl",
         "config": {
             "seeds": [{"url": "https://webrecorder.net/"}],
-            "pageExtraDelay": 5,
+            "pageExtraDelay": 20,
             "limit": 4,
         },
     }
@@ -145,8 +145,8 @@ def test_verify_wacz():
         pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
         assert '"https://webrecorder.net/blog"' in pages

-        # 1 other page + header line
-        assert len(pages.strip().split("\n")) == 2
+        # 3 other page + header line
+        assert len(pages.strip().split("\n")) == 4


 def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
@@ -13,6 +13,6 @@ def test_settings():
         "registrationEnabled": False,
         "jwtTokenLifetime": 86400,
         "defaultBehaviorTimeSeconds": 300,
-        "maxPagesPerCrawl": 2,
+        "maxPagesPerCrawl": 4,
         "defaultPageLoadTimeSeconds": 120,
     }
@@ -48,7 +48,7 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):

     data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

-    while data["state"] == "running":
+    while data["state"] in ("running", "waiting"):
         data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

     assert data["state"] == "canceled"
@@ -57,8 +57,44 @@ def test_cancel_crawl(default_org_id, crawler_auth_headers):
     assert len(data["resources"]) == 0


-@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
-def test_start_crawl_to_stop(
+def test_start_crawl_and_stop_immediately(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    crawl_id = data["started"]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
+        headers=crawler_auth_headers,
+    )
+    data = r.json()
+    assert data["success"] == True
+
+    # test crawl
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+    assert data["stopping"] == True
+
+    # test workflow
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
+        headers=crawler_auth_headers,
+    )
+    assert r.json()["currCrawlStopping"] == True
+
+    while data["state"] in ("starting", "running", "waiting"):
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    assert data["state"] in ("canceled", "partial_complete")
+    assert data["stopping"] == True
+
+
+def test_start_crawl_to_stop_partial(
     default_org_id, crawler_config_id_only, crawler_auth_headers
 ):
     r = requests.post(
@@ -74,12 +110,15 @@ def test_start_crawl_to_stop(
     crawl_id = data["started"]


-@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
-def test_stop_crawl(default_org_id, crawler_config_id_only, crawler_auth_headers):
+def test_stop_crawl_partial(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
     data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
-    while data["state"] == "starting":
-        time.sleep(5)
+    done = False
+    while not done:
+        time.sleep(2)
         data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+        done = data.get("stats") and data.get("stats").get("done") > 0

     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
@@ -232,8 +232,11 @@ def error_crawl_id(admin_auth_headers, default_org_id):
         "runNow": True,
         "name": "Youtube crawl with errors",
         "config": {
-            "seeds": [{"url": "https://www.youtube.com/watch?v=Sh-x3QmbRZc"}],
-            "limit": 10,
+            "seeds": [
+                {"url": "https://invalid.webrecorder.net/"},
+                {"url": "https://invalid-2.webrecorder.net/"},
+            ],
+            "limit": 1,
         },
     }
     r = requests.post(
@@ -6,7 +6,7 @@ frontend_pull_policy: "Never"

 default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"

-operator_resync_seconds: 5
+operator_resync_seconds: 3

 mongo_auth:
   # specify either username + password (for local mongo)
@@ -26,7 +26,7 @@ superuser:
   local_service_port: 30870

 # test max pages per crawl global limit
-max_pages_per_crawl: 2
+max_pages_per_crawl: 4

 registration_enabled: "0"

@@ -94,10 +94,10 @@ frontend_image: "docker.io/webrecorder/browsertrix-frontend:latest"
 frontend_pull_policy: "Always"

 frontend_requests_cpu: "3m"
-frontend_limits_cpu: "10m"
+frontend_limits_cpu: "30m"

 frontend_requests_memory: "12Mi"
-frontend_limits_memory: "20Mi"
+frontend_limits_memory: "40Mi"

 # if set, maps nginx to a fixed port on host machine
 # must be between 30000 - 32767