From 335700e6830ff09a61e4e8b515795698eabf75d5 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Jul 2024 10:49:22 -0700 Subject: [PATCH] Additional typing cleanup (#1938) Misc typing fixes, including in profiles and time functions --------- Co-authored-by: Tessa Walsh --- backend/btrixcloud/background_jobs.py | 1 + backend/btrixcloud/basecrawls.py | 12 +-- backend/btrixcloud/crawlconfigs.py | 17 +--- backend/btrixcloud/emailsender.py | 6 +- backend/btrixcloud/operator/baseoperator.py | 14 +-- backend/btrixcloud/operator/crawls.py | 3 + backend/btrixcloud/pages.py | 7 +- backend/btrixcloud/profiles.py | 94 ++++++++++++++------- backend/btrixcloud/storages.py | 3 +- backend/btrixcloud/utils.py | 4 +- 10 files changed, 95 insertions(+), 66 deletions(-) diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py index 1f7224b1..9848e8c3 100644 --- a/backend/btrixcloud/background_jobs.py +++ b/backend/btrixcloud/background_jobs.py @@ -406,6 +406,7 @@ class BackgroundJobOps: try: if job.object_type == "profile": profile = await self.profile_ops.get_profile(UUID(job.object_id), org) + assert profile.resource return BaseFile(**profile.resource.dict()) item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 1ee8b1ea..2def7865 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -783,11 +783,13 @@ class BaseCrawlOps: for cid in cids: if not cid: continue - config = await self.crawl_configs.get_crawl_config(cid, org.id) - if not config: - continue - first_seed = config.config.seeds[0] - first_seeds.add(first_seed.url) + try: + config = await self.crawl_configs.get_crawl_config(cid, org.id) + first_seed = config.config.seeds[0] + first_seeds.add(first_seed.url) + # pylint: disable=bare-except + except: + pass return { "names": names, diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index 36c997cb..338a749d 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -115,7 +115,7 @@ class CrawlConfigOps: self.crawler_images_map = {} channels = [] with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh: - crawler_list: list[dict] = json.loads(fh.read()) + crawler_list = json.loads(fh.read()) for channel_data in crawler_list: channel = CrawlerChannel(**channel_data) channels.append(channel) @@ -297,8 +297,6 @@ class CrawlConfigOps: """Update name, scale, schedule, and/or tags for an existing crawl config""" orig_crawl_config = await self.get_crawl_config(cid, org.id) - if not orig_crawl_config: - raise HTTPException(status_code=400, detail="config_not_found") # indicates if any k8s crawl config settings changed changed = False @@ -437,7 +435,7 @@ class CrawlConfigOps: schedule: Optional[bool] = None, sort_by: str = "lastRun", sort_direction: int = -1, - ): + ) -> tuple[list[CrawlConfigOut], int]: """Get all crawl configs for an organization is a member of""" # pylint: disable=too-many-locals,too-many-branches # Zero-index page for query @@ -535,7 +533,7 @@ class CrawlConfigOps: async def get_crawl_config_info_for_profile( self, profileid: UUID, org: Organization - ): + ) -> list[CrawlConfigProfileOut]: """Return all crawl configs that are associated with a given profileid""" query = {"profileid": profileid, "inactive": {"$ne": True}} if org: @@ -633,10 +631,6 @@ class CrawlConfigOps: crawlconfig = await self.get_crawl_config( cid, org.id, 
active_only=False, config_cls=CrawlConfigOut ) - if not crawlconfig: - raise HTTPException( - status_code=404, detail=f"Crawl Config '{cid}' not found" - ) if not crawlconfig.inactive: self._add_curr_crawl_stats( @@ -1136,11 +1130,6 @@ def init_crawl_config_api( async def make_inactive(cid: UUID, org: Organization = Depends(org_crawl_dep)): crawlconfig = await ops.get_crawl_config(cid, org.id) - if not crawlconfig: - raise HTTPException( - status_code=404, detail=f"Crawl Config '{cid}' not found" - ) - return await ops.do_make_inactive(crawlconfig) org_ops.router.include_router(router) diff --git a/backend/btrixcloud/emailsender.py b/backend/btrixcloud/emailsender.py index 963ffeeb..50b1d2dc 100644 --- a/backend/btrixcloud/emailsender.py +++ b/backend/btrixcloud/emailsender.py @@ -33,6 +33,8 @@ class EmailSender: log_sent_emails: bool + default_origin: str + def __init__(self): self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin" self.password = os.environ.get("EMAIL_PASSWORD") or "" @@ -44,7 +46,7 @@ class EmailSender: self.log_sent_emails = is_bool(os.environ.get("LOG_SENT_EMAILS")) - self.default_origin = os.environ.get("APP_ORIGIN") + self.default_origin = os.environ.get("APP_ORIGIN", "") self.templates = Jinja2Templates( directory=os.path.join(os.path.dirname(__file__), "email-templates") @@ -99,7 +101,7 @@ class EmailSender: server.send_message(msg) # server.sendmail(self.sender, receiver, message) - def get_origin(self, headers): + def get_origin(self, headers) -> str: """Return origin of the received request""" if not headers: return self.default_origin diff --git a/backend/btrixcloud/operator/baseoperator.py b/backend/btrixcloud/operator/baseoperator.py index 47307224..d0845498 100644 --- a/backend/btrixcloud/operator/baseoperator.py +++ b/backend/btrixcloud/operator/baseoperator.py @@ -2,7 +2,7 @@ import asyncio import os -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from kubernetes.utils import parse_quantity import yaml @@ -44,7 +44,7 @@ class K8sOpAPI(K8sAPI): self.compute_crawler_resources() self.compute_profile_resources() - def compute_crawler_resources(self): + def compute_crawler_resources(self) -> None: """compute memory / cpu resources for crawlers""" p = self.shared_params num_workers = max(int(p["crawler_browser_instances"]), 1) @@ -105,7 +105,7 @@ class K8sOpAPI(K8sAPI): p["qa_memory"] = qa_memory p["qa_workers"] = qa_num_workers - def compute_profile_resources(self): + def compute_profile_resources(self) -> None: """compute memory /cpu resources for a single profile browser""" p = self.shared_params # if no profile specific options provided, default to crawler base for one browser @@ -122,7 +122,7 @@ class K8sOpAPI(K8sAPI): print(f"cpu = {profile_cpu}") print(f"memory = {profile_memory}") - async def async_init(self): + async def async_init(self) -> None: """perform any async init here""" self.has_pod_metrics = await self.is_pod_metrics_available() print("Pod Metrics Available:", self.has_pod_metrics) @@ -172,16 +172,16 @@ class BaseOperator: # see: https://stackoverflow.com/a/74059981 self.bg_tasks = set() - def init_routes(self, app): + def init_routes(self, app) -> None: """init routes for this operator""" - def run_task(self, func): + def run_task(self, func) -> None: """add bg tasks to set to avoid premature garbage collection""" task = asyncio.create_task(func) self.bg_tasks.add(task) task.add_done_callback(self.bg_tasks.discard) - def load_from_yaml(self, filename, params): + def load_from_yaml(self, filename, params) 
-> list[Any]: """load and parse k8s template from yaml file""" return list( yaml.safe_load_all( diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index c38d2dde..99735e12 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1449,6 +1449,9 @@ class CrawlOperator(BaseOperator): """Increment Crawl Stats""" started = from_k8s_date(crawl.started) + if not started: + print("Missing crawl start time, unable to increment crawl stats") + return duration = int((finished - started).total_seconds()) diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index ba8f40b3..c55b40d9 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -102,6 +102,7 @@ class PageOps: if not status and page_dict.get("loadState"): status = 200 + ts = page_dict.get("ts") p = Page( id=page_id, oid=oid, @@ -111,9 +112,7 @@ class PageOps: loadState=page_dict.get("loadState"), status=status, mime=page_dict.get("mime", "text/html"), - ts=( - from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else dt_now() - ), + ts=(from_k8s_date(ts) if ts else dt_now()), ) p.compute_page_type() return p @@ -403,7 +402,7 @@ class PageOps: remaining_notes = [] for note in page_notes: - if not note.get("id") in delete.delete_list: + if note.get("id") not in delete.delete_list: remaining_notes.append(note) modified = dt_now() diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index 852296e0..0d367ed6 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -1,12 +1,13 @@ """ Profile Management """ -from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List +from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple from uuid import UUID, uuid4 import os from urllib.parse import urlencode from fastapi import APIRouter, Depends, Request, HTTPException +from starlette.requests import Headers import aiohttp from .pagination import DEFAULT_PAGE_SIZE, paginated_format @@ -30,6 +31,7 @@ from .models import ( SuccessResponseStorageQuota, ProfilePingResponse, ProfileBrowserGetUrlResponse, + CrawlConfigProfileOut, ) from .utils import dt_now @@ -58,6 +60,9 @@ class ProfileOps: crawlconfigs: CrawlConfigOps background_job_ops: BackgroundJobOps + browser_fqdn_suffix: str + router: APIRouter + def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops): self.profiles = mdb["profiles"] self.orgs = orgs @@ -66,7 +71,7 @@ class ProfileOps: self.crawl_manager = crawl_manager self.storage_ops = storage_ops - self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX") + self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "") self.router = APIRouter( prefix="/profiles", @@ -82,16 +87,16 @@ class ProfileOps: async def create_new_browser( self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn - ): + ) -> BrowserId: """Create new profile""" - prev_profile = "" + prev_profile_path = "" prev_profile_id = "" if profile_launch.profileId: - prev_profile = await self.get_profile_storage_path( + prev_profile_path = await self.get_profile_storage_path( profile_launch.profileId, org ) - if not prev_profile: + if not prev_profile_path: raise HTTPException(status_code=400, detail="invalid_base_profile") prev_profile_id = str(profile_launch.profileId) @@ -109,7 +114,7 @@ class ProfileOps: storage=org.storage, crawler_image=crawler_image, baseprofile=prev_profile_id, - profile_filename=prev_profile, + 
profile_filename=prev_profile_path, ) if not browserid: @@ -117,7 +122,9 @@ class ProfileOps: return BrowserId(browserid=browserid) - async def get_profile_browser_url(self, browserid, oid, headers): + async def get_profile_browser_url( + self, browserid: str, oid: str, headers: Headers + ) -> dict[str, str | int]: """get profile browser url""" json = await self._send_browser_req(browserid, "/vncpass") @@ -130,7 +137,7 @@ class ProfileOps: host = headers.get("Host") or "localhost" # ws_scheme = "wss" if scheme == "https" else "ws" - auth_bearer = headers.get("Authorization").split(" ")[1] + auth_bearer = headers.get("Authorization", "").split(" ")[1] params = { "path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}", @@ -144,7 +151,7 @@ class ProfileOps: params["url"] = url return params - async def ping_profile_browser(self, browserid): + async def ping_profile_browser(self, browserid: str) -> dict[str, Any]: """ping profile browser to keep it running""" await self.crawl_manager.ping_profile_browser(browserid) @@ -152,7 +159,9 @@ class ProfileOps: return {"success": True, "origins": json.get("origins") or []} - async def navigate_profile_browser(self, browserid, urlin: UrlIn): + async def navigate_profile_browser( + self, browserid: str, urlin: UrlIn + ) -> dict[str, bool]: """ping profile browser to keep it running""" await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict()) @@ -255,7 +264,7 @@ class ProfileOps: async def update_profile_metadata( self, profileid: UUID, update: ProfileUpdate, user: User - ): + ) -> dict[str, bool]: """Update name and description metadata only on existing profile""" query = { "name": update.name, @@ -282,7 +291,7 @@ class ProfileOps: page: int = 1, sort_by: str = "modified", sort_direction: int = -1, - ): + ) -> Tuple[list[Profile], int]: """list all profiles""" # pylint: disable=too-many-locals,duplicate-code @@ -334,7 +343,9 @@ class ProfileOps: profiles = [Profile.from_dict(res) for res in items] return profiles, total - async def get_profile(self, profileid: UUID, org: Optional[Organization] = None): + async def get_profile( + self, profileid: UUID, org: Optional[Organization] = None + ) -> Profile: """get profile by id and org""" query: dict[str, object] = {"_id": profileid} if org: @@ -346,7 +357,9 @@ class ProfileOps: return Profile.from_dict(res) - async def get_profile_with_configs(self, profileid: UUID, org: Organization): + async def get_profile_with_configs( + self, profileid: UUID, org: Organization + ) -> ProfileWithCrawlConfigs: """get profile for api output, with crawlconfigs""" profile = await self.get_profile(profileid, org) @@ -357,27 +370,33 @@ class ProfileOps: async def get_profile_storage_path( self, profileid: UUID, org: Optional[Organization] = None - ): + ) -> str: """return profile path filename (relative path) for given profile id and org""" try: profile = await self.get_profile(profileid, org) - return profile.resource.filename + return profile.resource.filename if profile.resource else "" # pylint: disable=bare-except except: - return None + pass + + return "" async def get_profile_name( self, profileid: UUID, org: Optional[Organization] = None - ): + ) -> str: """return profile for given profile id and org""" try: profile = await self.get_profile(profileid, org) return profile.name # pylint: disable=bare-except except: - return None + pass - async def get_crawl_configs_for_profile(self, profileid: UUID, org: Organization): + return "" + + async def get_crawl_configs_for_profile( + self, 
profileid: UUID, org: Organization + ) -> list[CrawlConfigProfileOut]: """Get list of crawl configs with basic info for that use a particular profile""" crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile( @@ -386,7 +405,9 @@ class ProfileOps: return crawlconfig_info - async def delete_profile(self, profileid: UUID, org: Organization): + async def delete_profile( + self, profileid: UUID, org: Organization + ) -> dict[str, Any]: """delete profile, if not used in active crawlconfig""" profile = await self.get_profile_with_configs(profileid, org) @@ -403,27 +424,32 @@ class ProfileOps: await self.orgs.inc_org_bytes_stored( org.id, -profile.resource.size, "profile" ) + await self.background_job_ops.create_delete_replica_jobs( + org, profile.resource, str(profile.id), "profile" + ) res = await self.profiles.delete_one(query) if not res or res.deleted_count != 1: raise HTTPException(status_code=404, detail="profile_not_found") - await self.background_job_ops.create_delete_replica_jobs( - org, profile.resource, profile.id, "profile" - ) - quota_reached = await self.orgs.storage_quota_reached(org.id) return {"success": True, "storageQuotaReached": quota_reached} - async def delete_profile_browser(self, browserid): + async def delete_profile_browser(self, browserid: str) -> dict[str, bool]: """delete profile browser immediately""" if not await self.crawl_manager.delete_profile_browser(browserid): raise HTTPException(status_code=404, detail="browser_not_found") return {"success": True} - async def _send_browser_req(self, browserid, path, method="GET", json=None): + async def _send_browser_req( + self, + browserid: str, + path: str, + method: str = "GET", + json: Optional[dict[str, Any]] = None, + ) -> dict[str, Any]: """make request to browser api to get state""" try: async with aiohttp.ClientSession() as session: @@ -438,7 +464,7 @@ class ProfileOps: # pylint: disable=raise-missing-from raise HTTPException(status_code=200, detail="waiting_for_browser") - return json + return json or {} async def add_profile_file_replica( self, profileid: UUID, filename: str, ref: StorageRef @@ -453,7 +479,12 @@ class ProfileOps: # ============================================================================ # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments def init_profiles_api( - mdb, org_ops, crawl_manager, storage_ops, background_job_ops, user_dep + mdb, + org_ops: OrgOps, + crawl_manager: CrawlManager, + storage_ops: StorageOps, + background_job_ops: BackgroundJobOps, + user_dep, ): """init profile ops system""" ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops) @@ -584,6 +615,7 @@ def init_profiles_api( async def delete_profile_browser(browserid: str = Depends(browser_dep)): return await ops.delete_profile_browser(browserid) - org_ops.router.include_router(router) + if org_ops.router: + org_ops.router.include_router(router) return ops diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 0db57a65..8d86d127 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -37,6 +37,7 @@ from mypy_boto3_s3.type_defs import CompletedPartTypeDef from types_aiobotocore_s3 import S3Client as AIOS3Client from .models import ( + BaseFile, CrawlFile, CrawlFileOut, Organization, @@ -504,7 +505,7 @@ class StorageOps: return presigned_url async def delete_crawl_file_object( - self, org: Organization, crawlfile: CrawlFile + self, org: Organization, crawlfile: BaseFile ) -> bool: """delete 
crawl file from storage.""" return await self._delete_file(org, crawlfile.filename, crawlfile.storage) diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py index 5b0f60e6..f6bafa4e 100644 --- a/backend/btrixcloud/utils.py +++ b/backend/btrixcloud/utils.py @@ -40,12 +40,12 @@ def get_templates_dir(): return os.path.join(os.path.dirname(__file__), "templates") -def from_k8s_date(string): +def from_k8s_date(string: str) -> Optional[datetime]: """convert k8s date string to datetime""" return datetime.fromisoformat(string[:-1]) if string else None -def to_k8s_date(dt_val): +def to_k8s_date(dt_val: datetime) -> str: """convert datetime to string for k8s""" return dt_val.isoformat("T") + "Z"
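
Note on the utils.py hunk above: typing from_k8s_date as returning Optional[datetime] is what forces callers to handle the None case, which is exactly the guard added in operator/crawls.py before computing the crawl duration. A minimal runnable sketch of that pattern, using the two helpers as patched (the sample timestamps are illustrative, not from the patch):

from datetime import datetime
from typing import Optional

def from_k8s_date(string: str) -> Optional[datetime]:
    """convert k8s date string to datetime"""
    # k8s timestamps end in "Z"; strip it before fromisoformat()
    return datetime.fromisoformat(string[:-1]) if string else None

def to_k8s_date(dt_val: datetime) -> str:
    """convert datetime to string for k8s"""
    return dt_val.isoformat("T") + "Z"

started = from_k8s_date("2024-07-17T10:49:22Z")
finished = datetime(2024, 7, 17, 11, 0, 0)
if started:  # mypy requires narrowing Optional[datetime] before subtraction
    print(int((finished - started).total_seconds()))  # 638
else:
    print("Missing crawl start time, unable to increment crawl stats")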
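Similarly, the one-line assert added in background_jobs.py ("assert profile.resource") is the assert-narrowing idiom: Profile.resource is an Optional field, and the assert tells the type checker it is non-None past that point. A self-contained sketch of the idiom, assuming simplified stand-ins for the btrixcloud models (file_from_profile is a hypothetical helper, not a function from the patch):

from typing import Optional
from pydantic import BaseModel

class BaseFile(BaseModel):
    filename: str

class Profile(BaseModel):
    name: str
    resource: Optional[BaseFile] = None  # unset until a profile file is saved

def file_from_profile(profile: Profile) -> BaseFile:
    # hypothetical helper illustrating the pattern used in get_replica_job_file
    assert profile.resource  # narrows Optional[BaseFile] to BaseFile for mypy
    return BaseFile(**profile.resource.dict())

At runtime the assert fails fast with an AssertionError instead of an AttributeError further down; where a missing value is an expected condition rather than a bug, the patch instead checks and returns early, as in the crawl-stats guard above.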