Compute crawl execution time in operator (#1256)
* store execution time in operator:
  - rename isNewCrash -> isNewExit, crashTime -> exitTime
  - keep track of exitCode
  - add execTime counter, incremented when a pod's terminated state has both 'startedAt' and 'finishedAt' (sketched below)
  - ensure pods are complete before deleting
  - store 'crawlExecSeconds' on crawl and org levels; add to Crawl, CrawlOut, and Organization models

* support for fast cancel:
  - set redis ':canceled' key to immediately cancel crawl
  - delete crawl pods to ensure pods exit immediately
  - in finalizer, don't wait for pods to complete when canceling (but still check if terminated)
  - use currentTime together with pod.status.running.startedAt to credit exec time for pods still running at cancel
  - logging: log exec time, warn on missing finishedAt
  - logging: don't log exit code 11 (interrupt due to time/size limits) as a crash

* don't wait for pods to complete on failure with an existing browsertrix-crawler image

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
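The distinction driving this change: a crawl's execution time is the summed run time of all its crawler containers, not the crawl's wall-clock duration, so a crawl at scale 2 accrues roughly twice its elapsed time. A minimal standalone sketch of that accounting (`parse_k8s_date` here is a stand-in for the operator's `from_k8s_date` helper):

```python
from datetime import datetime

def parse_k8s_date(value: str) -> datetime:
    """Parse an RFC3339 timestamp as emitted by Kubernetes."""
    return datetime.fromisoformat(value.replace("Z", "+00:00"))

def total_exec_seconds(containers: list) -> int:
    """Sum run time across all crawler containers; with scale > 1,
    exec time exceeds the crawl's wall-clock duration."""
    total = 0
    for state in containers:
        term = state.get("terminated")
        if term and term.get("startedAt") and term.get("finishedAt"):
            delta = parse_k8s_date(term["finishedAt"]) - parse_k8s_date(term["startedAt"])
            total += int(delta.total_seconds())
    return total

# two pods each running 30 minutes -> 3600 exec seconds for a ~30 minute crawl
pods = [
    {"terminated": {"startedAt": "2023-10-05T12:00:00Z", "finishedAt": "2023-10-05T12:30:00Z"}},
    {"terminated": {"startedAt": "2023-10-05T12:00:10Z", "finishedAt": "2023-10-05T12:30:10Z"}},
]
print(total_exec_seconds(pods))  # 3600
```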
parent 748c86700d
commit 5cad9acee9
@@ -521,6 +521,13 @@ class CrawlOps(BaseCrawlOps):
         query = {"_id": crawl_id, "type": "crawl", "state": "running"}
         return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})
 
+    async def store_exec_time(self, crawl_id, exec_time):
+        """set exec time, only if not already set"""
+        query = {"_id": crawl_id, "type": "crawl", "execTime": {"$in": [0, None]}}
+        return await self.crawls.find_one_and_update(
+            query, {"$set": {"execTime": exec_time}}
+        )
+
     async def get_crawl_state(self, crawl_id):
         """return current crawl state of a crawl"""
         res = await self.crawls.find_one(
@@ -374,6 +374,8 @@ class CrawlOut(BaseMongoModel):
 
     collectionIds: Optional[List[UUID4]] = []
 
+    crawlExecSeconds: int = 0
+
     # automated crawl fields
     config: Optional[RawCrawlConfig]
     cid: Optional[UUID4]
@@ -441,6 +443,8 @@ class Crawl(BaseCrawl, CrawlConfigCore):
 
     stopping: Optional[bool] = False
 
+    crawlExecSeconds: int = 0
+
 
 # ============================================================================
 class CrawlCompleteIn(BaseModel):
@@ -666,6 +670,7 @@ class Organization(BaseMongoModel):
     storage: Union[S3Storage, DefaultStorage]
 
     usage: Dict[str, int] = {}
+    crawlExecSeconds: Dict[str, int] = {}
 
     bytesStored: int = 0
     bytesStoredCrawls: int = 0
@@ -713,6 +718,7 @@ class Organization(BaseMongoModel):
 
         if not self.is_crawler(user):
             exclude.add("usage")
+            exclude.add("crawlExecSeconds")
 
         result = self.to_dict(
             exclude_unset=True,
@@ -747,6 +753,7 @@ class OrgOut(BaseMongoModel):
     name: str
     users: Optional[Dict[str, Any]]
     usage: Optional[Dict[str, int]]
+    crawlExecSeconds: Optional[Dict[str, int]]
     default: bool = False
     bytesStored: int
     bytesStoredCrawls: int
@@ -132,8 +132,9 @@ class PodResources(BaseModel):
 class PodInfo(BaseModel):
     """Aggregate pod status info held in CrawlJob"""
 
-    crashTime: Optional[str] = None
-    isNewCrash: Optional[bool] = Field(default=None, exclude=True)
+    exitTime: Optional[str] = None
+    exitCode: Optional[int] = None
+    isNewExit: Optional[bool] = Field(default=None, exclude=True)
     reason: Optional[str] = None
 
     allocated: PodResources = PodResources()
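`isNewExit` is declared with `exclude=True` so the transient per-resync flag never leaks into the serialized CrawlJob status; only `exitTime`, `exitCode`, and `reason` persist. A minimal sketch of that behavior with the pydantic v1 API used here:

```python
from typing import Optional
from pydantic import BaseModel, Field

class PodInfoSketch(BaseModel):
    """Trimmed-down PodInfo showing the effect of exclude=True"""
    exitTime: Optional[str] = None
    exitCode: Optional[int] = None
    # transient per-resync flag, never serialized into the CrawlJob status
    isNewExit: Optional[bool] = Field(default=None, exclude=True)

info = PodInfoSketch(exitTime="2023-10-05T12:30:00Z", exitCode=0, isNewExit=True)
print(info.dict())  # {'exitTime': '2023-10-05T12:30:00Z', 'exitCode': 0}
```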
@@ -211,13 +212,15 @@ class CrawlStatus(BaseModel):
         lambda: PodInfo()  # pylint: disable=unnecessary-lambda
     )
     restartTime: Optional[str]
+    execTime: int = 0
+    canceled: bool = False
 
     # don't include in status, use by metacontroller
     resync_after: Optional[int] = None
 
 
 # ============================================================================
-# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches
+# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches, too-many-nested-blocks
 # pylint: disable=too-many-instance-attributes, too-many-locals, too-many-lines, too-many-arguments
 class BtrixOperator(K8sAPI):
     """BtrixOperator Handler"""
@@ -331,15 +334,17 @@ class BtrixOperator(K8sAPI):
         if data.finalizing:
             if not status.finished:
                 # if can't cancel, already finished
-                if not await self.mark_finished(
-                    crawl_id, uuid.UUID(cid), uuid.UUID(oid), status, "canceled"
+                if not await self.cancel_crawl(
+                    crawl_id, uuid.UUID(cid), uuid.UUID(oid), status, data.children[POD]
                 ):
                     # instead of fetching the state (that was already set)
                     # return exception to ignore this request, keep previous
                     # finished state
                     raise HTTPException(status_code=400, detail="out_of_sync_status")
 
-            return self.finalize_response(crawl_id, status, spec, data.children, params)
+            return await self.finalize_response(
+                crawl_id, uuid.UUID(oid), status, spec, data.children, params
+            )
 
         # just in case, finished but not deleted, can only get here if
         # do_crawl_finished_tasks() doesn't reach the end or taking too long
@@ -348,7 +353,9 @@ class BtrixOperator(K8sAPI):
                 f"warn crawl {crawl_id} finished but not deleted, post-finish taking too long?"
             )
             asyncio.create_task(self.delete_crawl_job(crawl_id))
-            return self.finalize_response(crawl_id, status, spec, data.children, params)
+            return await self.finalize_response(
+                crawl_id, uuid.UUID(oid), status, spec, data.children, params
+            )
 
         try:
             configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
@@ -407,8 +414,8 @@ class BtrixOperator(K8sAPI):
             self.handle_auto_size(crawl.id, status.podStatus)
 
             if status.finished:
-                return self.finalize_response(
-                    crawl_id, status, spec, data.children, params
+                return await self.finalize_response(
+                    crawl_id, uuid.UUID(oid), status, spec, data.children, params
                 )
         else:
             status.scale = crawl.scale
@@ -654,6 +661,43 @@ class BtrixOperator(K8sAPI):
         )
         return False
 
+    async def cancel_crawl(self, crawl_id, cid, oid, status, pods):
+        """Mark crawl as canceled"""
+        if not await self.mark_finished(crawl_id, cid, oid, status, "canceled"):
+            return False
+
+        await self.mark_for_cancelation(crawl_id)
+
+        if not status.canceled:
+            cancel_time = datetime.utcnow()
+
+            for name, pod in pods.items():
+                pstatus = pod["status"]
+                role = pod["metadata"]["labels"]["role"]
+
+                if role != "crawler":
+                    continue
+
+                if "containerStatuses" not in pstatus:
+                    continue
+
+                cstatus = pstatus["containerStatuses"][0]
+
+                running = cstatus["state"].get("running")
+
+                if running:
+                    self.inc_exec_time(
+                        name, status, cancel_time, running.get("startedAt")
+                    )
+
+                self.handle_terminated_pod(
+                    name, role, status, cstatus["state"].get("terminated")
+                )
+
+            status.canceled = True
+
+        return status.canceled
+
     async def fail_crawl(self, crawl_id, cid, status, pods, stats=None):
         """Mark crawl as failed, log crawl state and print crawl logs, if possible"""
         prev_state = status.state
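On cancel, terminated containers already carry their own `finishedAt`, but still-running ones do not, so running pods are credited up to the cancelation timestamp instead. A standalone sketch of that accounting (timestamp parsing inlined; the operator itself uses its `from_k8s_date` helper):

```python
from datetime import datetime, timezone

def exec_seconds_at_cancel(container_state: dict, cancel_time: datetime) -> int:
    """Terminated containers report their own finishedAt; containers still
    running when the crawl is canceled get credited up to cancel_time."""
    terminated = container_state.get("terminated")
    if terminated:
        started = terminated["startedAt"]
        end = datetime.fromisoformat(terminated["finishedAt"].replace("Z", "+00:00"))
    else:
        started = container_state["running"]["startedAt"]
        end = cancel_time
    start = datetime.fromisoformat(started.replace("Z", "+00:00"))
    return int((end - start).total_seconds())

now = datetime(2023, 10, 5, 12, 45, tzinfo=timezone.utc)
print(exec_seconds_at_cancel({"running": {"startedAt": "2023-10-05T12:00:00Z"}}, now))  # 2700
```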
@@ -683,16 +727,29 @@ class BtrixOperator(K8sAPI):
             "children": [],
         }
 
-    def finalize_response(self, crawl_id, status, spec, children, params):
+    async def finalize_response(
+        self,
+        crawl_id: str,
+        oid: uuid.UUID,
+        status: CrawlStatus,
+        spec: dict,
+        children: dict,
+        params: dict,
+    ):
         """ensure crawl id ready for deletion"""
 
         redis_pod = f"redis-{crawl_id}"
         new_children = []
 
         finalized = False
 
-        if redis_pod in children[POD]:
+        exec_updated = False
+
+        pods = children[POD]
+
+        if redis_pod in pods:
             # if has other pods, keep redis pod until they are removed
-            if len(children[POD]) > 1:
+            if len(pods) > 1:
                 new_children = self._load_redis(params, status, children)
 
         # keep pvs until pods are removed
@@ -700,6 +757,9 @@ class BtrixOperator(K8sAPI):
             new_children.extend(list(children[PVC].values()))
 
         if not children[POD] and not children[PVC]:
+            # ensure exec time was successfully updated
+            exec_updated = await self.store_exec_time(crawl_id, oid, status.execTime)
+
             # keep parent until ttl expired, if any
             if status.finished:
                 ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL)
@@ -713,7 +773,7 @@ class BtrixOperator(K8sAPI):
         return {
             "status": status.dict(exclude_none=True, exclude={"resync_after": True}),
             "children": new_children,
-            "finalized": finalized,
+            "finalized": finalized and exec_updated,
         }
 
     async def _get_redis(self, redis_url):
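Tying `finalized` to `exec_updated` leans on the metacontroller finalize contract: the hook is re-invoked until the response reports `finalized: true`, so a failed exec-time write is retried automatically on the next sync. A sketch of the response shape, assuming that contract:

```python
def finalize_payload(status: dict, children: list, finalized: bool, exec_updated: bool) -> dict:
    """Sketch of a metacontroller finalize response: anything but
    finalized=True causes the hook to be called again on the next sync."""
    return {
        "status": status,
        "children": children,
        "finalized": finalized and exec_updated,
    }

# exec-time write failed -> not finalized -> metacontroller retries the hook
print(finalize_payload({}, [], finalized=True, exec_updated=False)["finalized"])  # False
```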
@@ -735,7 +795,7 @@ class BtrixOperator(K8sAPI):
     async def sync_crawl_state(self, redis_url, crawl, status, pods, metrics):
         """sync crawl state for running crawl"""
         # check if at least one crawler pod started running
-        crawler_running, redis_running = self.sync_pod_status(pods, status)
+        crawler_running, redis_running, done = self.sync_pod_status(pods, status)
         redis = None
 
         try:
@@ -809,7 +869,7 @@ class BtrixOperator(K8sAPI):
             status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
 
             # update stats and get status
-            return await self.update_crawl_state(redis, crawl, status, pods)
+            return await self.update_crawl_state(redis, crawl, status, pods, done)
 
         # pylint: disable=broad-except
         except Exception as exc:
@@ -825,6 +885,7 @@ class BtrixOperator(K8sAPI):
         """check status of pods"""
         crawler_running = False
         redis_running = False
+        done = True
         try:
             for name, pod in pods.items():
                 running = False
@@ -848,30 +909,50 @@ class BtrixOperator(K8sAPI):
                     ):
                         running = True
 
-                    terminated = cstatus["state"].get("terminated")
-                    exit_code = terminated and terminated.get("exitCode")
-                    if terminated and exit_code:
-                        crash_time = terminated.get("finishedAt")
-                        pod_status = status.podStatus[name]
-                        pod_status.isNewCrash = pod_status.crashTime != crash_time
-                        pod_status.crashTime = crash_time
-
-                        # detect reason
-                        if terminated.get("reason") == "OOMKilled" or exit_code == 137:
-                            pod_status.reason = "oom"
-                        else:
-                            pod_status.reason = "interrupt: " + str(exit_code)
+                    self.handle_terminated_pod(
+                        name, role, status, cstatus["state"].get("terminated")
+                    )
 
                 if role == "crawler":
                     crawler_running = crawler_running or running
+                    done = done and phase == "Succeeded"
                 elif role == "redis":
                     redis_running = redis_running or running
 
         # pylint: disable=broad-except
         except Exception as exc:
+            done = False
             print(exc)
 
-        return crawler_running, redis_running
+        return crawler_running, redis_running, done
+
+    def handle_terminated_pod(self, name, role, status, terminated):
+        """handle terminated pod state"""
+        if not terminated:
+            return
+
+        exit_time = terminated.get("finishedAt")
+        if not exit_time:
+            print("warn: terminated pod missing finishedAt", flush=True)
+            return
+
+        pod_status = status.podStatus[name]
+
+        pod_status.isNewExit = pod_status.exitTime != exit_time
+        if pod_status.isNewExit and role == "crawler":
+            self.inc_exec_time(name, status, exit_time, terminated.get("startedAt"))
+            pod_status.exitTime = exit_time
+
+        # detect reason and record exit code (read later by log_crashes)
+        exit_code = terminated.get("exitCode")
+        pod_status.exitCode = exit_code
+
+        if exit_code == 0:
+            pod_status.reason = "done"
+        elif terminated.get("reason") == "OOMKilled" or exit_code == 137:
+            pod_status.reason = "oom"
+        else:
+            pod_status.reason = "interrupt: " + str(exit_code)
 
     def should_mark_waiting(self, state, started):
         """Should the crawl be marked as waiting for capacity?"""
@@ -912,7 +993,7 @@ class BtrixOperator(K8sAPI):
         """auto scale pods here, experimental"""
         for name, pod in pod_status.items():
             # if pod crashed due to OOM, increase mem
-            # if pod.isNewCrash and pod.reason == "oom":
+            # if pod.isNewExit and pod.reason == "oom":
             #    pod.newMemory = int(float(pod.allocated.memory) * 1.2)
             #    print(f"Resizing pod {name} -> mem {pod.newMemory} - OOM Detected")
@@ -924,7 +1005,10 @@ class BtrixOperator(K8sAPI):
     async def log_crashes(self, crawl_id, pod_status, redis):
         """report/log any pod crashes here"""
         for name, pod in pod_status.items():
-            if not pod.isNewCrash:
+            # log only unexpected exits as crashes
+            # - 0 is success / intended shutdown
+            # - 11 is interrupt / intended restart
+            if not pod.isNewExit or pod.exitCode in (0, 11):
                 continue
 
             log = self.get_log_line(
@@ -991,7 +1075,7 @@ class BtrixOperator(K8sAPI):
 
         return False
 
-    async def update_crawl_state(self, redis, crawl, status, pods):
+    async def update_crawl_state(self, redis, crawl, status, pods, done):
         """update crawl state and check if crawl is now done"""
         results = await redis.hgetall(f"{crawl.id}:status")
         stats, sizes = await get_redis_crawl_stats(redis, crawl.id)
@@ -1034,7 +1118,7 @@ class BtrixOperator(K8sAPI):
                 status_count[res] = status_count.get(res, 0) + 1
 
         # check if all crawlers are done
-        if status_count.get("done", 0) >= crawl.scale:
+        if done and status_count.get("done", 0) >= crawl.scale:
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
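Requiring `done` (every crawler pod in phase `Succeeded`) alongside the redis-side counts ensures a crawl isn't marked finished while a pod is still flushing data, matching the "ensure pods are complete before deleting" item in the commit message. A minimal sketch of the combined check:

```python
def crawl_is_done(status_counts: dict, scale: int, all_pods_succeeded: bool) -> bool:
    """Both conditions must hold: every crawler instance reported 'done' in
    redis, and every crawler pod reached phase Succeeded (fully exited)."""
    return all_pods_succeeded and status_counts.get("done", 0) >= scale

assert not crawl_is_done({"done": 2}, 2, all_pods_succeeded=False)  # pods still terminating
assert crawl_is_done({"done": 2}, 2, all_pods_succeeded=True)
```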
@@ -1138,6 +1222,32 @@ class BtrixOperator(K8sAPI):
         # finally, delete job
         await self.delete_crawl_job(crawl_id)
 
+    def inc_exec_time(self, name, status, finished_at, started_at):
+        """increment execTime on pod status"""
+        end_time = (
+            from_k8s_date(finished_at)
+            if not isinstance(finished_at, datetime)
+            else finished_at
+        )
+        start_time = from_k8s_date(started_at)
+        exec_time = int((end_time - start_time).total_seconds())
+        status.execTime += exec_time
+        print(f"{name} exec time: {exec_time}")
+        return exec_time
+
+    async def store_exec_time(self, crawl_id, oid, exec_time):
+        """store execTime in crawl (if not already set), and increment org counter"""
+        try:
+            if await self.crawl_ops.store_exec_time(crawl_id, exec_time):
+                print(f"Exec Time: {exec_time}", flush=True)
+                await self.org_ops.inc_org_time_stats(oid, exec_time, True)
+
+            return True
+        # pylint: disable=broad-except
+        except Exception as exc:
+            print(exc, flush=True)
+            return False
+
     async def inc_crawl_complete_stats(self, crawl, finished):
         """Increment Crawl Stats"""
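Together these two methods form a write-once-then-increment pattern: the crawl-level write only matches when `execTime` is unset, so a retried finalize pass can't double-increment the org counter. A runnable sketch with stubbed Mongo layers (`FakeCrawlOps`/`FakeOrgOps` are illustrative stand-ins, not the real classes):

```python
import asyncio

class FakeCrawlOps:
    """Stub for CrawlOps: first call records execTime, later calls are no-ops."""
    def __init__(self):
        self.exec_time = None
    async def store_exec_time(self, crawl_id, exec_time):
        if self.exec_time in (0, None):
            self.exec_time = exec_time
            return {"_id": crawl_id}   # matched -> truthy
        return None                    # already set -> no match

class FakeOrgOps:
    def __init__(self):
        self.total = 0
    async def inc_org_time_stats(self, oid, duration, is_exec_time=False):
        self.total += duration

async def main():
    crawls, orgs = FakeCrawlOps(), FakeOrgOps()
    for _ in range(3):  # the operator may retry finalize several times
        if await crawls.store_exec_time("crawl1", 3600):
            await orgs.inc_org_time_stats("org1", 3600, True)
    print(orgs.total)  # 3600, not 10800 -- retries don't double count

asyncio.run(main())
```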
@@ -1147,7 +1257,21 @@ class BtrixOperator(K8sAPI):
 
         print(f"Duration: {duration}", flush=True)
 
-        await self.org_ops.inc_org_stats(crawl.oid, duration)
+        await self.org_ops.inc_org_time_stats(crawl.oid, duration)
+
+    async def mark_for_cancelation(self, crawl_id):
+        """mark crawl as canceled in redis"""
+        try:
+            redis_url = self.get_redis_url(crawl_id)
+            redis = await self._get_redis(redis_url)
+            if not redis:
+                return False
+
+            await redis.set(f"{crawl_id}:canceled", "1")
+            return True
+        finally:
+            if redis:
+                await redis.close()
 
     async def add_crawl_errors_to_db(self, crawl_id, inc=100):
         """Pull crawl errors from redis and write to mongo db"""
@@ -320,12 +320,13 @@ class OrgOps:
             {"_id": org.id}, {"$set": {"origin": origin}}
         )
 
-    async def inc_org_stats(self, oid, duration):
+    async def inc_org_time_stats(self, oid, duration, is_exec_time=False):
         """inc crawl duration stats for org oid"""
         # init org crawl stats
+        key = "crawlExecSeconds" if is_exec_time else "usage"
         yymm = datetime.utcnow().strftime("%Y-%m")
         await self.orgs.find_one_and_update(
-            {"_id": oid}, {"$inc": {f"usage.{yymm}": duration}}
+            {"_id": oid}, {"$inc": {f"{key}.{yymm}": duration}}
         )
 
     async def get_max_concurrent_crawls(self, oid):