diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 90eaa4ad..c96fcd12 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -263,7 +263,7 @@ class BaseCrawlOps:

     async def update_crawl_state(self, crawl_id: str, state: str):
         """called only when job container is being stopped/canceled"""
-        data = {"state": state}
+        data: dict[str, Any] = {"state": state}
         # if cancelation, set the finish time here
         if state == "canceled":
             data["finished"] = dt_now()
@@ -462,7 +462,11 @@ class BaseCrawlOps:
             presigned_url = file_.presignedUrl
             now = dt_now()

-            if update_presigned_url or not presigned_url or now >= file_.expireAt:
+            if (
+                update_presigned_url
+                or not presigned_url
+                or (file_.expireAt and now >= file_.expireAt)
+            ):
                 exp = now + delta
                 presigned_url = await self.storage_ops.get_presigned_url(
                     org, file_, self.presign_duration_seconds
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 354857f6..be157531 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -342,7 +342,7 @@ class CrawlOps(BaseCrawlOps):
         crawl_id: str,
         crawlconfig: CrawlConfig,
         userid: UUID,
-        started: str,
+        started: datetime,
         manual: bool,
         username: str = "",
     ) -> None:
@@ -582,7 +582,7 @@ class CrawlOps(BaseCrawlOps):
         crawl_id: str,
         is_qa: bool,
         exec_time: int,
-        last_updated_time: str,
+        last_updated_time: datetime,
     ) -> bool:
         """increment exec time"""
         # update both crawl-shared qa exec seconds and per-qa run exec seconds
diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py
index fad3deea..2552ebaf 100644
--- a/backend/btrixcloud/operator/bgjobs.py
+++ b/backend/btrixcloud/operator/bgjobs.py
@@ -43,7 +43,11 @@ class BgJobOperator(BaseOperator):

         finalized = True

-        finished = from_k8s_date(completion_time) if completion_time else dt_now()
+        finished = None
+        if completion_time:
+            finished = from_k8s_date(completion_time)
+        if not finished:
+            finished = dt_now()

         try:
             await self.background_job_ops.job_finished(
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 27f91f84..d970a563 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -713,7 +713,7 @@ class CrawlOperator(BaseOperator):
         if status.finished:
             ttl = spec.get("ttlSecondsAfterFinished", DEFAULT_TTL)
             finished = from_k8s_date(status.finished)
-            if (dt_now() - finished).total_seconds() > ttl >= 0:
+            if finished and (dt_now() - finished).total_seconds() > ttl >= 0:
                 print("CrawlJob expired, deleting: " + crawl.id)
                 finalized = True
             else:
@@ -789,11 +789,9 @@ class CrawlOperator(BaseOperator):
                 # but not right away in case crawler pod is just restarting.
                 # avoids keeping redis pods around while no crawler pods are up
                 # (eg. due to resource constraints)
-                if status.lastActiveTime and (
-                    (
-                        dt_now() - from_k8s_date(status.lastActiveTime)
-                    ).total_seconds()
-                    > REDIS_TTL
+                last_active_time = from_k8s_date(status.lastActiveTime)
+                if last_active_time and (
+                    (dt_now() - last_active_time).total_seconds() > REDIS_TTL
                 ):
                     print(
                         f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
@@ -1233,10 +1231,9 @@ class CrawlOperator(BaseOperator):
         # check timeout if timeout time exceeds elapsed time
         if crawl.timeout:
             elapsed = status.elapsedCrawlTime
-            if status.lastUpdatedTime:
-                elapsed += (
-                    dt_now() - from_k8s_date(status.lastUpdatedTime)
-                ).total_seconds()
+            last_updated_time = from_k8s_date(status.lastUpdatedTime)
+            if last_updated_time:
+                elapsed += int((dt_now() - last_updated_time).total_seconds())

             if elapsed > crawl.timeout:
                 return "time-limit"
diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py
index 713252d7..03b6c585 100644
--- a/backend/btrixcloud/operator/profiles.py
+++ b/backend/btrixcloud/operator/profiles.py
@@ -1,9 +1,6 @@
 """ Operator handler for ProfileJobs """

-from btrixcloud.utils import (
-    from_k8s_date,
-    dt_now,
-)
+from btrixcloud.utils import from_k8s_date, dt_now

 from btrixcloud.models import StorageRef

@@ -29,7 +26,7 @@ class ProfileOperator(BaseOperator):
         expire_time = from_k8s_date(spec.get("expireTime"))
         browserid = spec.get("id")

-        if dt_now() >= expire_time:
+        if expire_time and dt_now() >= expire_time:
             self.run_task(self.k8s.delete_profile_browser(browserid))
             return {"status": {}, "children": []}

diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 79bb141d..539b1e0e 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -38,7 +38,7 @@ class JSONSerializer(json.JSONEncoder):
         return super().default(o)


-def get_templates_dir():
+def get_templates_dir() -> str:
     """return directory containing templates for loading"""
     return os.path.join(os.path.dirname(__file__), "templates")

@@ -53,17 +53,17 @@ def to_k8s_date(dt_val: datetime) -> str:
     return dt_val.isoformat("T") + "Z"


-def dt_now():
+def dt_now() -> datetime:
     """get current ts"""
     return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None)


-def ts_now():
+def ts_now() -> str:
     """get current ts"""
     return str(dt_now())


-def run_once_lock(name):
+def run_once_lock(name) -> bool:
     """run once lock via temp directory
     - if dir doesn't exist, return true
     - if exists, return false"""
@@ -83,7 +83,7 @@ def run_once_lock(name):
     return True


-def register_exit_handler():
+def register_exit_handler() -> None:
     """register exit handler to exit on SIGTERM"""
     loop = asyncio.get_running_loop()

@@ -95,7 +95,7 @@ def register_exit_handler():
     loop.add_signal_handler(signal.SIGTERM, exit_handler)


-def parse_jsonl_error_messages(errors):
+def parse_jsonl_error_messages(errors: list[str]) -> list[dict]:
     """parse json-l error strings from redis/db into json"""
     parsed_errors = []
     for error_line in errors:
@@ -153,7 +153,9 @@ def validate_slug(slug: str) -> None:
         raise HTTPException(status_code=400, detail="invalid_slug")


-def stream_dict_list_as_csv(data: List[Dict[str, Union[str, int]]], filename: str):
+def stream_dict_list_as_csv(
+    data: List[Dict[str, Union[str, int]]], filename: str
+) -> StreamingResponse:
     """Stream list of dictionaries as CSV with attachment filename header"""
     if not data:
         raise HTTPException(status_code=404, detail="crawls_not_found")
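Note on the operator hunks: from_k8s_date can evidently return None, so each
call site now binds the result to a local and checks it before doing datetime
arithmetic, rather than subtracting a possibly-None value. A minimal sketch of
that guard pattern follows; the from_k8s_date body is an assumption for
illustration only (its real definition is in utils.py but not shown in this
diff), while dt_now is copied from the utils.py hunk above.

    from datetime import datetime, timezone
    from typing import Optional

    def from_k8s_date(string: Optional[str]) -> Optional[datetime]:
        # hypothetical sketch: parse an ISO-8601 k8s timestamp, or None if absent
        if not string:
            return None
        return datetime.fromisoformat(string.rstrip("Z"))

    def dt_now() -> datetime:
        # as in utils.py above: naive UTC, second precision
        return datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None)

    # guarded call site, mirroring the CrawlOperator changes
    last_active_time = from_k8s_date("2024-01-01T00:00:00Z")
    if last_active_time and (dt_now() - last_active_time).total_seconds() > 60:
        print("redis idle past TTL, would pause")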