stopping fix: backend fixes for #836

- set 'stopping' field on crawl when a crawl is being stopped (both in the db and on the k8s object)
- k8s: show 'stopping' as part of the crawljob object, update subchart
- set 'currCrawlStopping' on the workflow
- support both old and new browsertrix-crawler stopping keys
- tests: add tests for the new stopping state, also test canceling a crawl (disable the test for stopping a crawl, currently failing)
- catch redis errors when getting stats

operator: additional optimizations:

- run PVC removal as a background task
- catch any exceptions in the finalizer stage (e.g. if the db is down), return false until the finalizer completes
commit fd7e81b8b7
parent 064cd7e08a
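As a rough usage sketch of the new behavior (not part of the diff; API_PREFIX, the auth header, and the ids below are placeholders): a graceful stop is requested through the existing stop endpoint, after which the crawl reports stopping=true and its workflow reports currCrawlStopping=true while the crawler finishes up.

import requests

API_PREFIX = "http://localhost/api"         # placeholder base URL
AUTH = {"Authorization": "Bearer <token>"}  # placeholder auth header

def stop_and_check(org_id, crawl_id, config_id):
    # request a graceful stop of a running crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/stop", headers=AUTH
    )
    assert r.json().get("success")

    # the crawl itself now carries the new 'stopping' flag
    crawl = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json", headers=AUTH
    ).json()
    assert crawl["stopping"] is True

    # and the owning workflow mirrors it as 'currCrawlStopping'
    workflow = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{config_id}", headers=AUTH
    ).json()
    assert workflow["currCrawlStopping"] is True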
@@ -201,6 +201,7 @@ class CrawlConfigOut(CrawlConfig):
     currCrawlStartTime: Optional[datetime]
     currCrawlState: Optional[str]
     currCrawlSize: Optional[int] = 0
+    currCrawlStopping: Optional[bool] = False

     profileName: Optional[str]
@@ -637,6 +638,7 @@ class CrawlConfigOps:
         crawlconfig.currCrawlStartTime = crawl.started
         crawlconfig.currCrawlState = crawl.state
         crawlconfig.currCrawlSize = crawl.stats.get("size", 0) if crawl.stats else 0
+        crawlconfig.currCrawlStopping = crawl.stopping

     async def get_crawl_config_out(self, cid: uuid.UUID, org: Organization):
         """Return CrawlConfigOut, including state of currently running crawl, if active
@@ -27,16 +27,13 @@ from .users import User
 from .utils import dt_now, ts_now, get_redis_crawl_stats, parse_jsonl_error_messages


-CRAWL_STATES = (
-    "starting",
-    "running",
-    "stopping",
-    "complete",
-    "canceled",
-    "partial_complete",
-    "timed_out",
-    "failed",
-)
+RUNNING_STATES = ("running", "pending-wait", "generate-wacz", "uploading-wacz")
+
+RUNNING_AND_STARTING_STATES = ("starting", "waiting", *RUNNING_STATES)
+
+NON_RUNNING_STATES = ("complete", "canceled", "partial_complete", "timed_out", "failed")
+
+ALL_CRAWL_STATES = (*NON_RUNNING_STATES, *RUNNING_AND_STARTING_STATES)


 # ============================================================================
@@ -104,6 +101,8 @@ class Crawl(CrawlConfigCore):

     errors: Optional[List[str]] = []

+    stopping: Optional[bool] = False
+

 # ============================================================================
 class CrawlOut(Crawl):
@@ -155,6 +154,8 @@ class ListCrawlOut(BaseMongoModel):
     seedCount: Optional[int] = 0
     errors: Optional[List[str]]

+    stopping: Optional[bool] = False
+

 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -236,11 +237,11 @@ class CrawlOps:
             query["userid"] = userid

         if running_only:
-            query["state"] = {"$in": ["running", "starting", "stopping"]}
+            query["state"] = {"$in": list(RUNNING_AND_STARTING_STATES)}

         # Override running_only if state list is explicitly passed
         if state:
-            validated_states = [value for value in state if value in CRAWL_STATES]
+            validated_states = [value for value in state if value in ALL_CRAWL_STATES]
             query["state"] = {"$in": validated_states}

         if crawl_id:
@@ -430,9 +431,13 @@ class CrawlOps:

         # if running, get stats directly from redis
         # more responsive, saves db update in operator
-        if crawl.state == "running":
+        if crawl.state in RUNNING_STATES:
+            try:
                 redis = await self.get_redis(crawl.id)
                 crawl.stats = await get_redis_crawl_stats(redis, crawl.id)
+            # redis not available, ignore
+            except exceptions.ConnectionError:
+                pass

         return crawl
@@ -596,11 +601,12 @@ class CrawlOps:
            )

            if result.get("success"):
-                # for canceletion, just set to canceled immediately if succeeded
-                await self.update_crawl_state(
-                    crawl_id, "stopping" if graceful else "canceled"
-                )
-                return {"success": True}
+                if graceful:
+                    await self.crawls.find_one_and_update(
+                        {"_id": crawl_id, "oid": org.id},
+                        {"$set": {"stopping": True}},
+                    )
+                return result

        except Exception as exc:
            # pylint: disable=raise-missing-from
@@ -77,7 +77,7 @@ class CrawlSpec(BaseModel):
 class CrawlStatus(BaseModel):
     """status from k8s CrawlJob object"""

-    state: str = "waiting"
+    state: str = "new"
     pagesFound: int = 0
     pagesDone: int = 0
     size: str = ""
@@ -149,10 +149,15 @@ class BtrixOperator(K8sAPI):
         # if finalizing, crawl is being deleted
         if data.finalizing:
             # if not yet finished, assume it was canceled, mark as such
+            print(f"Finalizing crawl {crawl_id}, finished {status.finished}")
             if not status.finished:
-                await self.cancel_crawl(redis_url, crawl_id, cid, status, "canceled")
+                finalize = await self.cancel_crawl(
+                    redis_url, crawl_id, cid, status, "canceled"
+                )
+            else:
+                finalize = True

-            return await self.finalize_crawl(crawl_id, status, data.related)
+            return await self.finalize_crawl(crawl_id, status, data.related, finalize)

         if status.finished:
             return await self.handle_finished_delete_if_needed(crawl_id, status, spec)
@@ -184,7 +189,7 @@ class BtrixOperator(K8sAPI):
         has_crawl_children = STS in data.children and crawl_sts in data.children[STS]
         if has_crawl_children:
             status = await self.sync_crawl_state(redis_url, crawl, status)
-        else:
+        elif not status.finished:
             status.state = "starting"

         if status.finished:
@@ -278,9 +283,15 @@ class BtrixOperator(K8sAPI):

     # pylint: disable=too-many-arguments
     async def cancel_crawl(self, redis_url, crawl_id, cid, status, state):
-        """immediately cancel crawl with specified state"""
-        redis = await self._get_redis(redis_url)
-        await self.mark_finished(redis, crawl_id, cid, status, state)
+        """immediately cancel crawl with specified state
+        return true if db mark_finished update succeeds"""
+        try:
+            redis = await self._get_redis(redis_url)
+            await self.mark_finished(redis, crawl_id, cid, status, state)
+            return True
+        # pylint: disable=bare-except
+        except:
+            return False

     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""
@@ -290,17 +301,15 @@ class BtrixOperator(K8sAPI):
             "finalized": finalized,
         }

-    async def finalize_crawl(self, crawl_id, status, related):
+    async def finalize_crawl(self, crawl_id, status, related, finalized=True):
         """ensure crawl id ready for deletion
         return with finalized state"""

         pvcs = list(related[PVC].keys())
         if pvcs:
             print("Deleting PVCs", pvcs)
-            await self.delete_pvc(crawl_id)
+            asyncio.create_task(self.delete_pvc(crawl_id))
             finalized = False
-        else:
-            finalized = True

         return self._done_response(status, finalized)
@@ -391,6 +400,9 @@ class BtrixOperator(K8sAPI):
         )

         if crawl.stopping:
+            print("Graceful Stop")
+            await redis.set(f"{crawl.id}:stopping", "1")
+            # backwards compatibility with older crawler
             await redis.set("crawl-stop", "1")

         # optimization: don't update db once crawl is already running
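For context on the two redis keys set above: newer browsertrix-crawler versions watch the per-crawl "{crawl_id}:stopping" key, while older versions watch the shared "crawl-stop" key, so the operator sets both. A minimal illustration of honoring both keys (a sketch only; the real crawler is a separate codebase and is not implemented like this):

from redis import asyncio as aioredis  # redis-py >= 4.2

async def should_stop(redis_url: str, crawl_id: str) -> bool:
    # True if either the new per-crawl key or the legacy key is set
    redis = aioredis.from_url(redis_url, decode_responses=True)
    try:
        new_key = await redis.get(f"{crawl_id}:stopping")  # newer crawlers
        old_key = await redis.get("crawl-stop")            # older crawlers
        return new_key == "1" or old_key == "1"
    finally:
        await redis.close()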
@@ -47,7 +47,7 @@ async def get_redis_crawl_stats(redis, crawl_id):
     pages_done = await redis.llen(f"{crawl_id}:d")

     pages_found = await redis.scard(f"{crawl_id}:s")
-    archive_size = await redis.hvals("crawl-size")
+    archive_size = await redis.hvals(f"{crawl_id}:size")
     archive_size = sum(int(x) for x in archive_size)

     stats = {"found": pages_found, "done": pages_done, "size": archive_size}
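The stats helper above reads three per-crawl redis structures: the "{crawl_id}:d" list of completed pages, the "{crawl_id}:s" set of found pages, and now a per-crawl "{crawl_id}:size" hash instead of the shared "crawl-size" hash. A small standalone sketch of that layout (assumes a local redis; the hash field "crawler-0" is purely illustrative, not something the crawler is known to use):

import asyncio
from redis import asyncio as aioredis  # redis-py >= 4.2

async def demo(crawl_id: str = "crawl-abc") -> None:
    redis = aioredis.from_url("redis://localhost:6379", decode_responses=True)
    # seed the structures the stats helper reads
    await redis.rpush(f"{crawl_id}:d", '{"url": "https://example.com/"}')  # pages done (list)
    await redis.sadd(f"{crawl_id}:s", "https://example.com/")              # pages found (set)
    await redis.hset(f"{crawl_id}:size", "crawler-0", 1024)                # size values (hash)

    # mirror the reads in get_redis_crawl_stats
    done = await redis.llen(f"{crawl_id}:d")
    found = await redis.scard(f"{crawl_id}:s")
    size = sum(int(x) for x in await redis.hvals(f"{crawl_id}:size"))
    print({"found": found, "done": done, "size": size})
    await redis.close()

asyncio.run(demo())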
@@ -174,6 +174,30 @@ def crawler_userid(crawler_auth_headers):
     return r.json()["id"]


+@pytest.fixture(scope="session")
+def _crawler_create_config_only(crawler_auth_headers, default_org_id):
+    # Start crawl.
+    crawl_data = {
+        "runNow": False,
+        "name": "Crawler User Test Crawl",
+        "description": "crawler test crawl",
+        "config": {
+            "seeds": [{"url": "https://webrecorder.net/"}],
+            "pageExtraDelay": 5,
+            "limit": 4,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global _crawler_config_id
+    _crawler_config_id = data["added"]
+
+
 @pytest.fixture(scope="session")
 def crawler_crawl_id(crawler_auth_headers, default_org_id):
     # Start crawl.
@@ -239,6 +263,11 @@ def crawler_config_id(crawler_crawl_id):
     return _crawler_config_id


+@pytest.fixture(scope="session")
+def crawler_config_id_only(_crawler_create_config_only):
+    return _crawler_config_id
+
+
 @pytest.fixture(scope="session")
 def sample_crawl_data():
     return {
backend/test/test_stop_cancel_crawl.py (new file, 108 lines)
@@ -0,0 +1,108 @@
+import requests
+import time
+import os
+import pytest
+
+from .conftest import API_PREFIX
+
+crawl_id = None
+
+
+def get_crawl(org_id, auth_headers, crawl_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
+        headers=auth_headers,
+    )
+    assert r.status_code == 200
+    return r.json()
+
+
+def test_start_crawl_to_cancel(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data.get("started")
+
+    global crawl_id
+    crawl_id = data["started"]
+
+
+def test_cancel_crawl(default_org_id, crawler_auth_headers):
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+    while data["state"] == "starting":
+        time.sleep(5)
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
+        headers=crawler_auth_headers,
+    )
+    data = r.json()
+    assert data["success"] == True
+
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    while data["state"] == "running":
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    assert data["state"] == "canceled"
+    assert data["stopping"] == False
+
+    assert len(data["resources"]) == 0
+
+
+@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
+def test_start_crawl_to_stop(
+    default_org_id, crawler_config_id_only, crawler_auth_headers
+):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data.get("started")
+
+    global crawl_id
+    crawl_id = data["started"]
+
+
+@pytest.mark.skipif(os.environ.get("CI") is not None, reason="Skip Test on CI")
+def test_stop_crawl(default_org_id, crawler_config_id_only, crawler_auth_headers):
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+    while data["state"] == "starting":
+        time.sleep(5)
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
+        headers=crawler_auth_headers,
+    )
+    data = r.json()
+    assert data["success"] == True
+
+    # test crawl
+    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+    assert data["stopping"] == True
+
+    # test workflow
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
+        headers=crawler_auth_headers,
+    )
+    assert r.json()["currCrawlStopping"] == True
+
+    while data["state"] == "running":
+        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
+
+    assert data["state"] == "partial_complete"
+    assert data["stopping"] == True
+
+    assert len(data["resources"]) == 1
@@ -66,6 +66,11 @@ spec:
       jsonPath: .status.finished
       description: "if set, time crawl has finished"

+    - name: Stopping
+      type: boolean
+      jsonPath: .spec.stopping
+      description: "if set, crawl is being stopped"
+
     - name: Files Added
       type: integer
       jsonPath: .status.filesAdded
Binary file not shown.
@@ -4,6 +4,8 @@
 backend_pull_policy: "Never"
 frontend_pull_policy: "Never"

+operator_resync_seconds: 5
+
 mongo_auth:
   # specify either username + password (for local mongo)
   username: root