- increase time for going to waiting_capacity from starting to 150 seconds - relax requirement for state transitions, allow complete from waiting - additional type safety for different states, ensure mark_finished() only called with non-running states, add `Literal` types for all the state types.
This commit is contained in:
		
							parent
							
								
									c1817cbe04
								
							
						
					
					
						commit
						ffc4b5b58f
					
				| @ -9,7 +9,7 @@ import urllib.parse | ||||
| from datetime import datetime | ||||
| from uuid import UUID | ||||
| 
 | ||||
| from typing import Optional, List, Dict, Union, Any | ||||
| from typing import Optional, List, Dict, Union, Any, Sequence | ||||
| 
 | ||||
| from fastapi import Depends, HTTPException | ||||
| from fastapi.responses import StreamingResponse | ||||
| @ -41,6 +41,7 @@ from .models import ( | ||||
|     RUNNING_AND_STARTING_STATES, | ||||
|     SUCCESSFUL_STATES, | ||||
|     ALL_CRAWL_STATES, | ||||
|     TYPE_ALL_CRAWL_STATES, | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| @ -442,8 +443,8 @@ class CrawlOps(BaseCrawlOps): | ||||
|         self, | ||||
|         crawl_id: str, | ||||
|         is_qa: bool, | ||||
|         state: str, | ||||
|         allowed_from: List[str], | ||||
|         state: TYPE_ALL_CRAWL_STATES, | ||||
|         allowed_from: Sequence[TYPE_ALL_CRAWL_STATES], | ||||
|         finished: Optional[datetime] = None, | ||||
|         stats: Optional[CrawlStats] = None, | ||||
|     ): | ||||
|  | ||||
| @ -7,7 +7,7 @@ from enum import Enum, IntEnum | ||||
| from uuid import UUID | ||||
| import os | ||||
| 
 | ||||
| from typing import Optional, List, Dict, Union, Literal, Any | ||||
| from typing import Optional, List, Dict, Union, Literal, Any, get_args | ||||
| from pydantic import ( | ||||
|     BaseModel, | ||||
|     conint, | ||||
| @ -153,20 +153,31 @@ class UserOut(BaseModel): | ||||
| ### CRAWL STATES | ||||
| 
 | ||||
| # ============================================================================ | ||||
| RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"] | ||||
| TYPE_RUNNING_STATES = Literal[ | ||||
|     "running", "pending-wait", "generate-wacz", "uploading-wacz" | ||||
| ] | ||||
| RUNNING_STATES = get_args(TYPE_RUNNING_STATES) | ||||
| 
 | ||||
| STARTING_STATES = ["starting", "waiting_capacity", "waiting_org_limit"] | ||||
| TYPE_STARTING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"] | ||||
| STARTING_STATES = get_args(TYPE_STARTING_STATES) | ||||
| 
 | ||||
| FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"] | ||||
| TYPE_FAILED_STATES = Literal["canceled", "failed", "skipped_quota_reached"] | ||||
| FAILED_STATES = get_args(TYPE_FAILED_STATES) | ||||
| 
 | ||||
| SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"] | ||||
| TYPE_SUCCESSFUL_STATES = Literal["complete", "stopped_by_user", "stopped_quota_reached"] | ||||
| SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES) | ||||
| 
 | ||||
| TYPE_RUNNING_AND_STARTING_STATES = Literal[TYPE_STARTING_STATES, TYPE_RUNNING_STATES] | ||||
| RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES] | ||||
| 
 | ||||
| RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES] | ||||
| 
 | ||||
| TYPE_NON_RUNNING_STATES = Literal[TYPE_FAILED_STATES, TYPE_SUCCESSFUL_STATES] | ||||
| NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES] | ||||
| 
 | ||||
| TYPE_ALL_CRAWL_STATES = Literal[ | ||||
|     TYPE_RUNNING_AND_STARTING_STATES, TYPE_NON_RUNNING_STATES | ||||
| ] | ||||
| ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES] | ||||
| 
 | ||||
| 
 | ||||
|  | ||||
| @ -3,7 +3,7 @@ | ||||
| import traceback | ||||
| import os | ||||
| from pprint import pprint | ||||
| from typing import Optional, Any | ||||
| from typing import Optional, Any, Sequence | ||||
| from datetime import datetime | ||||
| 
 | ||||
| import json | ||||
| @ -14,6 +14,9 @@ from kubernetes.utils import parse_quantity | ||||
| from redis import asyncio as exceptions | ||||
| 
 | ||||
| from btrixcloud.models import ( | ||||
|     TYPE_NON_RUNNING_STATES, | ||||
|     TYPE_RUNNING_STATES, | ||||
|     TYPE_ALL_CRAWL_STATES, | ||||
|     NON_RUNNING_STATES, | ||||
|     RUNNING_STATES, | ||||
|     RUNNING_AND_STARTING_ONLY, | ||||
| @ -37,6 +40,7 @@ from .baseoperator import BaseOperator, Redis | ||||
| from .models import ( | ||||
|     CrawlSpec, | ||||
|     CrawlStatus, | ||||
|     StopReason, | ||||
|     MCBaseRequest, | ||||
|     MCSyncData, | ||||
|     PodInfo, | ||||
| @ -56,7 +60,7 @@ DEFAULT_TTL = 30 | ||||
| REDIS_TTL = 60 | ||||
| 
 | ||||
| # time in seconds before a crawl is deemed 'waiting' instead of 'starting' | ||||
| STARTING_TIME_SECS = 60 | ||||
| STARTING_TIME_SECS = 150 | ||||
| 
 | ||||
| # how often to update execution time seconds | ||||
| EXEC_TIME_UPDATE_SECS = 60 | ||||
| @ -428,10 +432,10 @@ class CrawlOperator(BaseOperator): | ||||
| 
 | ||||
|     async def set_state( | ||||
|         self, | ||||
|         state: str, | ||||
|         state: TYPE_ALL_CRAWL_STATES, | ||||
|         status: CrawlStatus, | ||||
|         crawl: CrawlSpec, | ||||
|         allowed_from: list[str], | ||||
|         allowed_from: Sequence[TYPE_ALL_CRAWL_STATES], | ||||
|         finished: Optional[datetime] = None, | ||||
|         stats: Optional[CrawlStats] = None, | ||||
|     ): | ||||
| @ -1132,7 +1136,7 @@ class CrawlOperator(BaseOperator): | ||||
| 
 | ||||
|     async def is_crawl_stopping( | ||||
|         self, crawl: CrawlSpec, status: CrawlStatus | ||||
|     ) -> Optional[str]: | ||||
|     ) -> Optional[StopReason]: | ||||
|         """check if crawl is stopping and set reason""" | ||||
|         # if user requested stop, then enter stopping phase | ||||
|         if crawl.stopping: | ||||
| @ -1242,8 +1246,11 @@ class CrawlOperator(BaseOperator): | ||||
|                 await self.fail_crawl(crawl, status, pods, stats) | ||||
|                 return status | ||||
| 
 | ||||
|             if status.stopReason in ("stopped_by_user", "stopped_quota_reached"): | ||||
|                 state = status.stopReason | ||||
|             state: TYPE_NON_RUNNING_STATES | ||||
|             if status.stopReason == "stopped_by_user": | ||||
|                 state = "stopped_by_user" | ||||
|             elif status.stopReason == "stopped_quota_reached": | ||||
|                 state = "stopped_quota_reached" | ||||
|             else: | ||||
|                 state = "complete" | ||||
| 
 | ||||
| @ -1259,7 +1266,7 @@ class CrawlOperator(BaseOperator): | ||||
| 
 | ||||
|         # check for other statuses | ||||
|         else: | ||||
|             new_status = None | ||||
|             new_status: TYPE_RUNNING_STATES | ||||
|             if status_count.get("running"): | ||||
|                 if status.state in ("generate-wacz", "uploading-wacz", "pending-wacz"): | ||||
|                     new_status = "running" | ||||
| @ -1282,17 +1289,14 @@ class CrawlOperator(BaseOperator): | ||||
|         self, | ||||
|         crawl: CrawlSpec, | ||||
|         status: CrawlStatus, | ||||
|         state: str, | ||||
|         state: TYPE_NON_RUNNING_STATES, | ||||
|         stats: Optional[CrawlStats] = None, | ||||
|     ) -> bool: | ||||
|         """mark crawl as finished, set finished timestamp and final state""" | ||||
| 
 | ||||
|         finished = dt_now() | ||||
| 
 | ||||
|         if state in SUCCESSFUL_STATES: | ||||
|             allowed_from = RUNNING_STATES | ||||
|         else: | ||||
|             allowed_from = RUNNING_AND_STARTING_STATES | ||||
|         allowed_from = RUNNING_AND_STARTING_STATES | ||||
| 
 | ||||
|         # if set_state returns false, already set to same status, return | ||||
|         if not await self.set_state( | ||||
| @ -1329,7 +1333,7 @@ class CrawlOperator(BaseOperator): | ||||
|         self, | ||||
|         crawl: CrawlSpec, | ||||
|         status: CrawlStatus, | ||||
|         state: str, | ||||
|         state: TYPE_NON_RUNNING_STATES, | ||||
|     ) -> None: | ||||
|         """Run tasks after crawl completes in asyncio.task coroutine.""" | ||||
|         await self.crawl_config_ops.stats_recompute_last( | ||||
| @ -1357,7 +1361,7 @@ class CrawlOperator(BaseOperator): | ||||
|     async def do_qa_run_finished_tasks( | ||||
|         self, | ||||
|         crawl: CrawlSpec, | ||||
|         state: str, | ||||
|         state: TYPE_NON_RUNNING_STATES, | ||||
|     ) -> None: | ||||
|         """Run tasks after qa run completes in asyncio.task coroutine.""" | ||||
| 
 | ||||
|  | ||||
| @ -2,10 +2,10 @@ | ||||
| 
 | ||||
| from collections import defaultdict | ||||
| from uuid import UUID | ||||
| from typing import Optional, DefaultDict | ||||
| from typing import Optional, DefaultDict, Literal | ||||
| from pydantic import BaseModel, Field | ||||
| from kubernetes.utils import parse_quantity | ||||
| from btrixcloud.models import StorageRef | ||||
| from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES | ||||
| 
 | ||||
| 
 | ||||
| BTRIX_API = "btrix.cloud/v1" | ||||
| @ -15,6 +15,10 @@ PVC = "PersistentVolumeClaim.v1" | ||||
| POD = "Pod.v1" | ||||
| CJS = f"CrawlJob.{BTRIX_API}" | ||||
| 
 | ||||
| StopReason = Literal[ | ||||
|     "stopped_by_user", "time-limit", "size-limit", "stopped_quota_reached" | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| # ============================================================================ | ||||
| class MCBaseRequest(BaseModel): | ||||
| @ -166,7 +170,7 @@ class PodInfo(BaseModel): | ||||
| class CrawlStatus(BaseModel): | ||||
|     """status from k8s CrawlJob object""" | ||||
| 
 | ||||
|     state: str = "starting" | ||||
|     state: TYPE_ALL_CRAWL_STATES = "starting" | ||||
|     pagesFound: int = 0 | ||||
|     pagesDone: int = 0 | ||||
|     size: int = 0 | ||||
| @ -177,7 +181,7 @@ class CrawlStatus(BaseModel): | ||||
|     filesAddedSize: int = 0 | ||||
|     finished: Optional[str] = None | ||||
|     stopping: bool = False | ||||
|     stopReason: Optional[str] = None | ||||
|     stopReason: Optional[StopReason] = None | ||||
|     initRedis: bool = False | ||||
|     crawlerImage: Optional[str] = None | ||||
|     lastActiveTime: str = "" | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user