Execution time tracking tweaks (#1994)
Tweaks to how execution time is tracked for more accuracy + excluding waiting states: - don't update if crawl state is in a 'waiting state' (waiting for capacity or waiting for org limit) - rename start states -> waiting states for clarity - reset lastUpdatedTime if two consecutive updates of non-running state, to ensure non-running states don't count, but also account for occasional hiccups -- if only one update detects non-running state, don't reset - webhooks: move start webhook to when crawl actually starts for first time (db lastUpdatedTime is not yet + crawl is running) - don't set lastUpdatedTime until pods actually running - set crawljob update interval to every 10 seconds for more accurate execution time tracking - frontend: show seconds in 'Execution Time' display
This commit is contained in:
		
							parent
							
								
									ec29928b28
								
							
						
					
					
						commit
						7fa2b61b29
					
				| @ -23,7 +23,7 @@ from .models import ( | |||||||
|     PaginatedCrawlOutResponse, |     PaginatedCrawlOutResponse, | ||||||
|     User, |     User, | ||||||
|     StorageRef, |     StorageRef, | ||||||
|     RUNNING_AND_STARTING_STATES, |     RUNNING_AND_WAITING_STATES, | ||||||
|     SUCCESSFUL_STATES, |     SUCCESSFUL_STATES, | ||||||
|     QARun, |     QARun, | ||||||
|     UpdatedResponse, |     UpdatedResponse, | ||||||
| @ -272,7 +272,7 @@ class BaseCrawlOps: | |||||||
|             { |             { | ||||||
|                 "_id": crawl_id, |                 "_id": crawl_id, | ||||||
|                 "type": "crawl", |                 "type": "crawl", | ||||||
|                 "state": {"$in": RUNNING_AND_STARTING_STATES}, |                 "state": {"$in": RUNNING_AND_WAITING_STATES}, | ||||||
|             }, |             }, | ||||||
|             {"$set": data}, |             {"$set": data}, | ||||||
|         ) |         ) | ||||||
|  | |||||||
| @ -44,7 +44,7 @@ from .models import ( | |||||||
|     PaginatedCrawlOutResponse, |     PaginatedCrawlOutResponse, | ||||||
|     PaginatedSeedResponse, |     PaginatedSeedResponse, | ||||||
|     PaginatedCrawlErrorResponse, |     PaginatedCrawlErrorResponse, | ||||||
|     RUNNING_AND_STARTING_STATES, |     RUNNING_AND_WAITING_STATES, | ||||||
|     SUCCESSFUL_STATES, |     SUCCESSFUL_STATES, | ||||||
|     NON_RUNNING_STATES, |     NON_RUNNING_STATES, | ||||||
|     ALL_CRAWL_STATES, |     ALL_CRAWL_STATES, | ||||||
| @ -165,7 +165,7 @@ class CrawlOps(BaseCrawlOps): | |||||||
|             query["userid"] = userid |             query["userid"] = userid | ||||||
| 
 | 
 | ||||||
|         if running_only: |         if running_only: | ||||||
|             query["state"] = {"$in": RUNNING_AND_STARTING_STATES} |             query["state"] = {"$in": RUNNING_AND_WAITING_STATES} | ||||||
| 
 | 
 | ||||||
|         # Override running_only if state list is explicitly passed |         # Override running_only if state list is explicitly passed | ||||||
|         if state: |         if state: | ||||||
| @ -425,7 +425,7 @@ class CrawlOps(BaseCrawlOps): | |||||||
| 
 | 
 | ||||||
|         state, _ = await self.get_crawl_state(crawl_id, False) |         state, _ = await self.get_crawl_state(crawl_id, False) | ||||||
| 
 | 
 | ||||||
|         if state not in RUNNING_AND_STARTING_STATES: |         if state not in RUNNING_AND_WAITING_STATES: | ||||||
|             raise HTTPException(status_code=400, detail="crawl_not_running") |             raise HTTPException(status_code=400, detail="crawl_not_running") | ||||||
| 
 | 
 | ||||||
|         total = 0 |         total = 0 | ||||||
| @ -463,7 +463,7 @@ class CrawlOps(BaseCrawlOps): | |||||||
|         limit <= next_offset < limit + step""" |         limit <= next_offset < limit + step""" | ||||||
|         state, _ = await self.get_crawl_state(crawl_id, False) |         state, _ = await self.get_crawl_state(crawl_id, False) | ||||||
| 
 | 
 | ||||||
|         if state not in RUNNING_AND_STARTING_STATES: |         if state not in RUNNING_AND_WAITING_STATES: | ||||||
|             raise HTTPException(status_code=400, detail="crawl_not_running") |             raise HTTPException(status_code=400, detail="crawl_not_running") | ||||||
| 
 | 
 | ||||||
|         total = 0 |         total = 0 | ||||||
| @ -513,7 +513,7 @@ class CrawlOps(BaseCrawlOps): | |||||||
| 
 | 
 | ||||||
|         crawl = await self.get_crawl(crawl_id, org) |         crawl = await self.get_crawl(crawl_id, org) | ||||||
| 
 | 
 | ||||||
|         if crawl.state not in RUNNING_AND_STARTING_STATES: |         if crawl.state not in RUNNING_AND_WAITING_STATES: | ||||||
|             raise HTTPException(status_code=400, detail="crawl_not_running") |             raise HTTPException(status_code=400, detail="crawl_not_running") | ||||||
| 
 | 
 | ||||||
|         cid = crawl.cid |         cid = crawl.cid | ||||||
| @ -591,30 +591,36 @@ class CrawlOps(BaseCrawlOps): | |||||||
|                 "qaCrawlExecSeconds": exec_time, |                 "qaCrawlExecSeconds": exec_time, | ||||||
|                 "qa.crawlExecSeconds": exec_time, |                 "qa.crawlExecSeconds": exec_time, | ||||||
|             } |             } | ||||||
|  |             field = "qa._lut" | ||||||
|         else: |         else: | ||||||
|             inc_update = {"crawlExecSeconds": exec_time} |             inc_update = {"crawlExecSeconds": exec_time} | ||||||
|  |             field = "_lut" | ||||||
| 
 | 
 | ||||||
|         res = await self.crawls.find_one_and_update( |         res = await self.crawls.find_one_and_update( | ||||||
|             { |             { | ||||||
|                 "_id": crawl_id, |                 "_id": crawl_id, | ||||||
|                 "type": "crawl", |                 "type": "crawl", | ||||||
|                 "_lut": {"$ne": last_updated_time}, |                 field: {"$ne": last_updated_time}, | ||||||
|             }, |             }, | ||||||
|             { |             { | ||||||
|                 "$inc": inc_update, |                 "$inc": inc_update, | ||||||
|                 "$set": {"_lut": last_updated_time}, |                 "$set": {field: last_updated_time}, | ||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|         return res is not None |         return res is not None | ||||||
| 
 | 
 | ||||||
|     async def get_crawl_exec_last_update_time( |     async def get_crawl_exec_last_update_time( | ||||||
|         self, crawl_id: str |         self, crawl_id: str, is_qa: bool | ||||||
|     ) -> Optional[datetime]: |     ) -> Optional[datetime]: | ||||||
|         """get crawl last updated time""" |         """get crawl last updated time""" | ||||||
|  |         field = "_lut" if not is_qa else "qa._lut" | ||||||
|         res = await self.crawls.find_one( |         res = await self.crawls.find_one( | ||||||
|             {"_id": crawl_id, "type": "crawl"}, projection=["_lut"] |             {"_id": crawl_id, "type": "crawl"}, projection=[field] | ||||||
|         ) |         ) | ||||||
|         return res and res.get("_lut") |         if not res: | ||||||
|  |             return None | ||||||
|  | 
 | ||||||
|  |         return res.get("qa", {}).get("_lut") if is_qa else res.get("_lut") | ||||||
| 
 | 
 | ||||||
|     async def get_crawl_state( |     async def get_crawl_state( | ||||||
|         self, crawl_id: str, is_qa: bool |         self, crawl_id: str, is_qa: bool | ||||||
|  | |||||||
| @ -208,8 +208,8 @@ TYPE_RUNNING_STATES = Literal[ | |||||||
| ] | ] | ||||||
| RUNNING_STATES = get_args(TYPE_RUNNING_STATES) | RUNNING_STATES = get_args(TYPE_RUNNING_STATES) | ||||||
| 
 | 
 | ||||||
| TYPE_STARTING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"] | TYPE_WAITING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"] | ||||||
| STARTING_STATES = get_args(TYPE_STARTING_STATES) | WAITING_STATES = get_args(TYPE_WAITING_STATES) | ||||||
| 
 | 
 | ||||||
| TYPE_FAILED_STATES = Literal[ | TYPE_FAILED_STATES = Literal[ | ||||||
|     "canceled", |     "canceled", | ||||||
| @ -228,8 +228,8 @@ TYPE_SUCCESSFUL_STATES = Literal[ | |||||||
| ] | ] | ||||||
| SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES) | SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES) | ||||||
| 
 | 
 | ||||||
| TYPE_RUNNING_AND_STARTING_STATES = Literal[TYPE_STARTING_STATES, TYPE_RUNNING_STATES] | TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES] | ||||||
| RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES] | RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES] | ||||||
| 
 | 
 | ||||||
| RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES] | RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES] | ||||||
| 
 | 
 | ||||||
| @ -237,9 +237,9 @@ TYPE_NON_RUNNING_STATES = Literal[TYPE_FAILED_STATES, TYPE_SUCCESSFUL_STATES] | |||||||
| NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES] | NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES] | ||||||
| 
 | 
 | ||||||
| TYPE_ALL_CRAWL_STATES = Literal[ | TYPE_ALL_CRAWL_STATES = Literal[ | ||||||
|     TYPE_RUNNING_AND_STARTING_STATES, TYPE_NON_RUNNING_STATES |     TYPE_RUNNING_AND_WAITING_STATES, TYPE_NON_RUNNING_STATES | ||||||
| ] | ] | ||||||
| ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES] | ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| # ============================================================================ | # ============================================================================ | ||||||
|  | |||||||
| @ -19,8 +19,9 @@ from btrixcloud.models import ( | |||||||
|     TYPE_ALL_CRAWL_STATES, |     TYPE_ALL_CRAWL_STATES, | ||||||
|     NON_RUNNING_STATES, |     NON_RUNNING_STATES, | ||||||
|     RUNNING_STATES, |     RUNNING_STATES, | ||||||
|  |     WAITING_STATES, | ||||||
|     RUNNING_AND_STARTING_ONLY, |     RUNNING_AND_STARTING_ONLY, | ||||||
|     RUNNING_AND_STARTING_STATES, |     RUNNING_AND_WAITING_STATES, | ||||||
|     SUCCESSFUL_STATES, |     SUCCESSFUL_STATES, | ||||||
|     FAILED_STATES, |     FAILED_STATES, | ||||||
|     CrawlStats, |     CrawlStats, | ||||||
| @ -119,6 +120,7 @@ class CrawlOperator(BaseOperator): | |||||||
|         """sync crawls""" |         """sync crawls""" | ||||||
| 
 | 
 | ||||||
|         status = CrawlStatus(**data.parent.get("status", {})) |         status = CrawlStatus(**data.parent.get("status", {})) | ||||||
|  |         status.last_state = status.state | ||||||
| 
 | 
 | ||||||
|         spec = data.parent.get("spec", {}) |         spec = data.parent.get("spec", {}) | ||||||
|         crawl_id = spec["id"] |         crawl_id = spec["id"] | ||||||
| @ -250,11 +252,6 @@ class CrawlOperator(BaseOperator): | |||||||
| 
 | 
 | ||||||
|         else: |         else: | ||||||
|             status.scale = 1 |             status.scale = 1 | ||||||
|             now = dt_now() |  | ||||||
|             await self.crawl_ops.inc_crawl_exec_time( |  | ||||||
|                 crawl.db_crawl_id, crawl.is_qa, 0, now |  | ||||||
|             ) |  | ||||||
|             status.lastUpdatedTime = to_k8s_date(now) |  | ||||||
| 
 | 
 | ||||||
|         children = self._load_redis(params, status, data.children) |         children = self._load_redis(params, status, data.children) | ||||||
| 
 | 
 | ||||||
| @ -807,25 +804,13 @@ class CrawlOperator(BaseOperator): | |||||||
|                 status.resync_after = self.fast_retry_secs |                 status.resync_after = self.fast_retry_secs | ||||||
|                 return status |                 return status | ||||||
| 
 | 
 | ||||||
|             # if true (state is set), also run webhook |             # ensure running state is set | ||||||
|             if await self.set_state( |             await self.set_state( | ||||||
|                 "running", |                 "running", | ||||||
|                 status, |                 status, | ||||||
|                 crawl, |                 crawl, | ||||||
|                 allowed_from=["starting", "waiting_capacity"], |                 allowed_from=["starting", "waiting_capacity"], | ||||||
|             ): |             ) | ||||||
|                 if not crawl.qa_source_crawl_id: |  | ||||||
|                     self.run_task( |  | ||||||
|                         self.event_webhook_ops.create_crawl_started_notification( |  | ||||||
|                             crawl.id, crawl.oid, scheduled=crawl.scheduled |  | ||||||
|                         ) |  | ||||||
|                     ) |  | ||||||
|                 else: |  | ||||||
|                     self.run_task( |  | ||||||
|                         self.event_webhook_ops.create_qa_analysis_started_notification( |  | ||||||
|                             crawl.id, crawl.oid, crawl.qa_source_crawl_id |  | ||||||
|                         ) |  | ||||||
|                     ) |  | ||||||
| 
 | 
 | ||||||
|             # update lastActiveTime if crawler is running |             # update lastActiveTime if crawler is running | ||||||
|             if crawler_running: |             if crawler_running: | ||||||
| @ -967,11 +952,33 @@ class CrawlOperator(BaseOperator): | |||||||
|         """inc exec time tracking""" |         """inc exec time tracking""" | ||||||
|         now = dt_now() |         now = dt_now() | ||||||
| 
 | 
 | ||||||
|  |         # don't count time crawl is not running | ||||||
|  |         if status.state in WAITING_STATES: | ||||||
|  |             # reset lastUpdatedTime if at least 2 consecutive updates of non-running state | ||||||
|  |             if status.last_state in WAITING_STATES: | ||||||
|  |                 status.lastUpdatedTime = to_k8s_date(now) | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|         update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time( |         update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time( | ||||||
|             crawl.db_crawl_id |             crawl.db_crawl_id, crawl.is_qa | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         if not update_start_time: |         if not update_start_time: | ||||||
|  |             print("Crawl first started, webhooks called", now, crawl.id) | ||||||
|  |             # call initial running webhook | ||||||
|  |             if not crawl.qa_source_crawl_id: | ||||||
|  |                 self.run_task( | ||||||
|  |                     self.event_webhook_ops.create_crawl_started_notification( | ||||||
|  |                         crawl.id, crawl.oid, scheduled=crawl.scheduled | ||||||
|  |                     ) | ||||||
|  |                 ) | ||||||
|  |             else: | ||||||
|  |                 self.run_task( | ||||||
|  |                     self.event_webhook_ops.create_qa_analysis_started_notification( | ||||||
|  |                         crawl.id, crawl.oid, crawl.qa_source_crawl_id | ||||||
|  |                     ) | ||||||
|  |                 ) | ||||||
|  | 
 | ||||||
|             await self.crawl_ops.inc_crawl_exec_time( |             await self.crawl_ops.inc_crawl_exec_time( | ||||||
|                 crawl.db_crawl_id, crawl.is_qa, 0, now |                 crawl.db_crawl_id, crawl.is_qa, 0, now | ||||||
|             ) |             ) | ||||||
| @ -1414,7 +1421,7 @@ class CrawlOperator(BaseOperator): | |||||||
| 
 | 
 | ||||||
|         finished = dt_now() |         finished = dt_now() | ||||||
| 
 | 
 | ||||||
|         allowed_from = RUNNING_AND_STARTING_STATES |         allowed_from = RUNNING_AND_WAITING_STATES | ||||||
| 
 | 
 | ||||||
|         # if set_state returns false, already set to same status, return |         # if set_state returns false, already set to same status, return | ||||||
|         if not await self.set_state( |         if not await self.set_state( | ||||||
|  | |||||||
| @ -223,3 +223,6 @@ class CrawlStatus(BaseModel): | |||||||
| 
 | 
 | ||||||
|     # don't include in status, use by metacontroller |     # don't include in status, use by metacontroller | ||||||
|     resync_after: Optional[int] = Field(default=None, exclude=True) |     resync_after: Optional[int] = Field(default=None, exclude=True) | ||||||
|  | 
 | ||||||
|  |     # last state | ||||||
|  |     last_state: TYPE_ALL_CRAWL_STATES = Field(default="starting", exclude=True) | ||||||
|  | |||||||
| @ -27,7 +27,7 @@ from aiostream import stream | |||||||
| from .models import ( | from .models import ( | ||||||
|     SUCCESSFUL_STATES, |     SUCCESSFUL_STATES, | ||||||
|     RUNNING_STATES, |     RUNNING_STATES, | ||||||
|     STARTING_STATES, |     WAITING_STATES, | ||||||
|     BaseCrawl, |     BaseCrawl, | ||||||
|     Organization, |     Organization, | ||||||
|     StorageRef, |     StorageRef, | ||||||
| @ -890,7 +890,7 @@ class OrgOps: | |||||||
|             {"oid": org.id, "state": {"$in": RUNNING_STATES}} |             {"oid": org.id, "state": {"$in": RUNNING_STATES}} | ||||||
|         ) |         ) | ||||||
|         workflows_queued_count = await self.crawls_db.count_documents( |         workflows_queued_count = await self.crawls_db.count_documents( | ||||||
|             {"oid": org.id, "state": {"$in": STARTING_STATES}} |             {"oid": org.id, "state": {"$in": WAITING_STATES}} | ||||||
|         ) |         ) | ||||||
|         collections_count = await self.colls_db.count_documents({"oid": org.id}) |         collections_count = await self.colls_db.count_documents({"oid": org.id}) | ||||||
|         public_collections_count = await self.colls_db.count_documents( |         public_collections_count = await self.colls_db.count_documents( | ||||||
|  | |||||||
| @ -5,7 +5,7 @@ metadata: | |||||||
|   name: crawljobs-operator |   name: crawljobs-operator | ||||||
| spec: | spec: | ||||||
|   generateSelector: false |   generateSelector: false | ||||||
|   resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 30 }} |   resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 10 }} | ||||||
|   parentResource: |   parentResource: | ||||||
|     apiVersion: btrix.cloud/v1 |     apiVersion: btrix.cloud/v1 | ||||||
|     resource: crawljobs |     resource: crawljobs | ||||||
|  | |||||||
| @ -802,6 +802,7 @@ export class ArchivedItemDetail extends TailwindElement { | |||||||
|                     ? html`<span
 |                     ? html`<span
 | ||||||
|                         >${humanizeExecutionSeconds( |                         >${humanizeExecutionSeconds( | ||||||
|                           this.crawl!.crawlExecSeconds, |                           this.crawl!.crawlExecSeconds, | ||||||
|  |                           { displaySeconds: true }, | ||||||
|                         )}</span |                         )}</span | ||||||
|                       >` |                       >` | ||||||
|                     : html`<span class="text-0-400">${msg("Pending")}</span>`} |                     : html`<span class="text-0-400">${msg("Pending")}</span>`} | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user