Execution time tracking tweaks (#1994)
Tweaks to how execution time is tracked, for better accuracy and to exclude waiting states:
- don't update if the crawl state is in a 'waiting state' (waiting for capacity or waiting for the org limit)
- rename start states -> waiting states for clarity
- reset lastUpdatedTime after two consecutive updates in a non-running state, so time in non-running states doesn't count while still tolerating occasional hiccups -- if only one update detects a non-running state, don't reset
- webhooks: move the start webhook to the point where the crawl actually starts for the first time (db lastUpdatedTime is not yet set + crawl is running)
- don't set lastUpdatedTime until pods are actually running
- set the crawljob update interval to every 10 seconds for more accurate execution time tracking
- frontend: show seconds in the 'Execution Time' display
This commit is contained in:
parent ec29928b28
commit 7fa2b61b29
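The core accounting rule in the list above is easiest to see in isolation. Below is a minimal, self-contained sketch -- illustrative only, not the operator's actual code; `WAITING_STATES` mirrors the renamed constant in the diff, everything else is assumed for the example. Execution time is credited as wall-clock time since the last update, and an interval observed in a waiting state is credited nothing.

from datetime import datetime, timedelta, timezone

WAITING_STATES = ("starting", "waiting_capacity", "waiting_org_limit")

def credit_exec_seconds(state: str, last_updated: datetime, now: datetime) -> int:
    # waiting states contribute nothing to execution time
    if state in WAITING_STATES:
        return 0
    # otherwise credit the wall-clock time elapsed since the last update
    return int((now - last_updated).total_seconds())

now = datetime.now(timezone.utc)
print(credit_exec_seconds("running", now - timedelta(seconds=10), now))           # 10
print(credit_exec_seconds("waiting_capacity", now - timedelta(seconds=10), now))  # 0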
@@ -23,7 +23,7 @@ from .models import (
     PaginatedCrawlOutResponse,
     User,
     StorageRef,
-    RUNNING_AND_STARTING_STATES,
+    RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     QARun,
     UpdatedResponse,
@@ -272,7 +272,7 @@ class BaseCrawlOps:
             {
                 "_id": crawl_id,
                 "type": "crawl",
-                "state": {"$in": RUNNING_AND_STARTING_STATES},
+                "state": {"$in": RUNNING_AND_WAITING_STATES},
             },
             {"$set": data},
         )
@@ -44,7 +44,7 @@ from .models import (
     PaginatedCrawlOutResponse,
     PaginatedSeedResponse,
     PaginatedCrawlErrorResponse,
-    RUNNING_AND_STARTING_STATES,
+    RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     NON_RUNNING_STATES,
     ALL_CRAWL_STATES,
@@ -165,7 +165,7 @@ class CrawlOps(BaseCrawlOps):
             query["userid"] = userid

         if running_only:
-            query["state"] = {"$in": RUNNING_AND_STARTING_STATES}
+            query["state"] = {"$in": RUNNING_AND_WAITING_STATES}

         # Override running_only if state list is explicitly passed
         if state:
@@ -425,7 +425,7 @@ class CrawlOps(BaseCrawlOps):

         state, _ = await self.get_crawl_state(crawl_id, False)

-        if state not in RUNNING_AND_STARTING_STATES:
+        if state not in RUNNING_AND_WAITING_STATES:
             raise HTTPException(status_code=400, detail="crawl_not_running")

         total = 0
@@ -463,7 +463,7 @@ class CrawlOps(BaseCrawlOps):
         limit <= next_offset < limit + step"""
         state, _ = await self.get_crawl_state(crawl_id, False)

-        if state not in RUNNING_AND_STARTING_STATES:
+        if state not in RUNNING_AND_WAITING_STATES:
             raise HTTPException(status_code=400, detail="crawl_not_running")

         total = 0
@@ -513,7 +513,7 @@ class CrawlOps(BaseCrawlOps):

         crawl = await self.get_crawl(crawl_id, org)

-        if crawl.state not in RUNNING_AND_STARTING_STATES:
+        if crawl.state not in RUNNING_AND_WAITING_STATES:
             raise HTTPException(status_code=400, detail="crawl_not_running")

         cid = crawl.cid
@@ -591,30 +591,36 @@ class CrawlOps(BaseCrawlOps):
                 "qaCrawlExecSeconds": exec_time,
                 "qa.crawlExecSeconds": exec_time,
             }
+            field = "qa._lut"
         else:
             inc_update = {"crawlExecSeconds": exec_time}
+            field = "_lut"

         res = await self.crawls.find_one_and_update(
             {
                 "_id": crawl_id,
                 "type": "crawl",
-                "_lut": {"$ne": last_updated_time},
+                field: {"$ne": last_updated_time},
             },
             {
                 "$inc": inc_update,
-                "$set": {"_lut": last_updated_time},
+                "$set": {field: last_updated_time},
             },
         )
         return res is not None

     async def get_crawl_exec_last_update_time(
-        self, crawl_id: str
+        self, crawl_id: str, is_qa: bool
     ) -> Optional[datetime]:
         """get crawl last updated time"""
+        field = "_lut" if not is_qa else "qa._lut"
         res = await self.crawls.find_one(
-            {"_id": crawl_id, "type": "crawl"}, projection=["_lut"]
+            {"_id": crawl_id, "type": "crawl"}, projection=[field]
         )
-        return res and res.get("_lut")
+        if not res:
+            return None
+
+        return res.get("qa", {}).get("_lut") if is_qa else res.get("_lut")

     async def get_crawl_state(
         self, crawl_id: str, is_qa: bool
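The `field: {"$ne": last_updated_time}` filter above makes the increment idempotent: the `$inc` only applies when the stored last-update marker differs from the incoming one, so a retried or concurrent resync cannot double-count the same interval. Here is a toy in-memory stand-in for that conditional update (the dict and function are hypothetical illustrations, not the real Mongo call):

def inc_exec_time_once(doc: dict, field: str, exec_time: int, last_updated_time: str) -> bool:
    if doc.get(field) == last_updated_time:  # mirrors field: {"$ne": ...}
        return False                         # already applied for this timestamp; skip
    doc["crawlExecSeconds"] = doc.get("crawlExecSeconds", 0) + exec_time  # $inc
    doc[field] = last_updated_time                                        # $set
    return True

doc: dict = {}
assert inc_exec_time_once(doc, "_lut", 10, "2024-01-01T00:00:10Z") is True
assert inc_exec_time_once(doc, "_lut", 10, "2024-01-01T00:00:10Z") is False  # replay is a no-op
assert doc["crawlExecSeconds"] == 10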
@@ -208,8 +208,8 @@ TYPE_RUNNING_STATES = Literal[
 ]
 RUNNING_STATES = get_args(TYPE_RUNNING_STATES)

-TYPE_STARTING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
-STARTING_STATES = get_args(TYPE_STARTING_STATES)
+TYPE_WAITING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
+WAITING_STATES = get_args(TYPE_WAITING_STATES)

 TYPE_FAILED_STATES = Literal[
     "canceled",
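The rename keeps the existing pattern of defining each state group once as a typing.Literal and deriving the runtime tuple from it with get_args, so type checkers and runtime membership checks stay in sync:

from typing import Literal, get_args

TYPE_WAITING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
WAITING_STATES = get_args(TYPE_WAITING_STATES)

print(WAITING_STATES)                        # ('starting', 'waiting_capacity', 'waiting_org_limit')
print("waiting_capacity" in WAITING_STATES)  # True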
@@ -228,8 +228,8 @@ TYPE_SUCCESSFUL_STATES = Literal[
 ]
 SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)

-TYPE_RUNNING_AND_STARTING_STATES = Literal[TYPE_STARTING_STATES, TYPE_RUNNING_STATES]
-RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES]
+TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES]
+RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES]

 RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES]

@@ -237,9 +237,9 @@ TYPE_NON_RUNNING_STATES = Literal[TYPE_FAILED_STATES, TYPE_SUCCESSFUL_STATES]
 NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]

 TYPE_ALL_CRAWL_STATES = Literal[
-    TYPE_RUNNING_AND_STARTING_STATES, TYPE_NON_RUNNING_STATES
+    TYPE_RUNNING_AND_WAITING_STATES, TYPE_NON_RUNNING_STATES
 ]
-ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES]
+ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]


 # ============================================================================
@@ -19,8 +19,9 @@ from btrixcloud.models import (
     TYPE_ALL_CRAWL_STATES,
     NON_RUNNING_STATES,
     RUNNING_STATES,
+    WAITING_STATES,
     RUNNING_AND_STARTING_ONLY,
-    RUNNING_AND_STARTING_STATES,
+    RUNNING_AND_WAITING_STATES,
     SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlStats,
@@ -119,6 +120,7 @@ class CrawlOperator(BaseOperator):
         """sync crawls"""

         status = CrawlStatus(**data.parent.get("status", {}))
+        status.last_state = status.state

         spec = data.parent.get("spec", {})
         crawl_id = spec["id"]
@@ -250,11 +252,6 @@ class CrawlOperator(BaseOperator):

             else:
                 status.scale = 1
-                now = dt_now()
-                await self.crawl_ops.inc_crawl_exec_time(
-                    crawl.db_crawl_id, crawl.is_qa, 0, now
-                )
-                status.lastUpdatedTime = to_k8s_date(now)

             children = self._load_redis(params, status, data.children)

@@ -807,25 +804,13 @@ class CrawlOperator(BaseOperator):
                 status.resync_after = self.fast_retry_secs
                 return status

-        # if true (state is set), also run webhook
-        if await self.set_state(
+        # ensure running state is set
+        await self.set_state(
             "running",
             status,
             crawl,
             allowed_from=["starting", "waiting_capacity"],
-        ):
-            if not crawl.qa_source_crawl_id:
-                self.run_task(
-                    self.event_webhook_ops.create_crawl_started_notification(
-                        crawl.id, crawl.oid, scheduled=crawl.scheduled
-                    )
-                )
-            else:
-                self.run_task(
-                    self.event_webhook_ops.create_qa_analysis_started_notification(
-                        crawl.id, crawl.oid, crawl.qa_source_crawl_id
-                    )
-                )
+        )

         # update lastActiveTime if crawler is running
         if crawler_running:
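After this hunk, set_state's return value no longer gates the start webhook (that moves into the exec-time tracking path below); what remains is the allowed_from transition guard. A hedged sketch of that pattern, with an assumed simplified signature (the real set_state is async and persists the state to the db):

from dataclasses import dataclass

@dataclass
class Status:
    state: str = "starting"

def set_state(status: Status, new_state: str, allowed_from: list[str]) -> bool:
    """Apply the transition only if the current state is in the allowed set."""
    if status.state not in allowed_from:
        return False
    status.state = new_state
    return True

s = Status()
assert set_state(s, "running", ["starting", "waiting_capacity"])      # starting -> running
assert not set_state(s, "running", ["starting", "waiting_capacity"])  # already running: no-op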
@@ -967,11 +952,33 @@ class CrawlOperator(BaseOperator):
         """inc exec time tracking"""
         now = dt_now()

+        # don't count time crawl is not running
+        if status.state in WAITING_STATES:
+            # reset lastUpdatedTime if at least 2 consecutive updates of non-running state
+            if status.last_state in WAITING_STATES:
+                status.lastUpdatedTime = to_k8s_date(now)
+            return

         update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time(
-            crawl.db_crawl_id
+            crawl.db_crawl_id, crawl.is_qa
         )

         if not update_start_time:
+            print("Crawl first started, webhooks called", now, crawl.id)
+            # call initial running webhook
+            if not crawl.qa_source_crawl_id:
+                self.run_task(
+                    self.event_webhook_ops.create_crawl_started_notification(
+                        crawl.id, crawl.oid, scheduled=crawl.scheduled
+                    )
+                )
+            else:
+                self.run_task(
+                    self.event_webhook_ops.create_qa_analysis_started_notification(
+                        crawl.id, crawl.oid, crawl.qa_source_crawl_id
+                    )
+                )

             await self.crawl_ops.inc_crawl_exec_time(
                 crawl.db_crawl_id, crawl.is_qa, 0, now
             )
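The last_state check above is the "two consecutive updates" debounce from the commit message. A self-contained simulation of the idea (discrete ticks and the track function are illustrative stand-ins, not the operator's code): a single waiting-state observation sandwiched between running observations does not reset the timer, so that time is still credited when the crawl is next seen running; only a second consecutive waiting observation pushes lastUpdatedTime forward and stops the wait from counting.

WAITING_STATES = ("starting", "waiting_capacity", "waiting_org_limit")

def track(states_per_tick: list[str]) -> int:
    last_state = "starting"
    last_updated = 0  # tick at which tracking last (re)started
    exec_seconds = 0
    for tick, state in enumerate(states_per_tick, start=1):
        if state in WAITING_STATES:
            if last_state in WAITING_STATES:  # 2nd consecutive waiting observation
                last_updated = tick           # reset: this wait won't be credited
        else:
            exec_seconds += tick - last_updated
            last_updated = tick
        last_state = state
    return exec_seconds

print(track(["running", "waiting_capacity", "running"]))                      # 3: hiccup forgiven
print(track(["running", "waiting_capacity", "waiting_capacity", "running"]))  # 2: sustained wait excluded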
@@ -1414,7 +1421,7 @@ class CrawlOperator(BaseOperator):

         finished = dt_now()

-        allowed_from = RUNNING_AND_STARTING_STATES
+        allowed_from = RUNNING_AND_WAITING_STATES

         # if set_state returns false, already set to same status, return
         if not await self.set_state(
@@ -223,3 +223,6 @@ class CrawlStatus(BaseModel):

     # don't include in status, use by metacontroller
     resync_after: Optional[int] = Field(default=None, exclude=True)
+
+    # last state
+    last_state: TYPE_ALL_CRAWL_STATES = Field(default="starting", exclude=True)
@@ -27,7 +27,7 @@ from aiostream import stream
 from .models import (
     SUCCESSFUL_STATES,
     RUNNING_STATES,
-    STARTING_STATES,
+    WAITING_STATES,
     BaseCrawl,
     Organization,
     StorageRef,
@@ -890,7 +890,7 @@ class OrgOps:
             {"oid": org.id, "state": {"$in": RUNNING_STATES}}
         )
         workflows_queued_count = await self.crawls_db.count_documents(
-            {"oid": org.id, "state": {"$in": STARTING_STATES}}
+            {"oid": org.id, "state": {"$in": WAITING_STATES}}
         )
         collections_count = await self.colls_db.count_documents({"oid": org.id})
         public_collections_count = await self.colls_db.count_documents(
@@ -5,7 +5,7 @@ metadata:
   name: crawljobs-operator
 spec:
   generateSelector: false
-  resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 30 }}
+  resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 10 }}
   parentResource:
     apiVersion: btrix.cloud/v1
     resource: crawljobs
@@ -802,6 +802,7 @@ export class ArchivedItemDetail extends TailwindElement {
               ? html`<span
                   >${humanizeExecutionSeconds(
                     this.crawl!.crawlExecSeconds,
+                    { displaySeconds: true },
                   )}</span
                 >`
               : html`<span class="text-0-400">${msg("Pending")}</span>`}