Additional typing cleanup (#1938)

Misc typing fixes, including in profile management and the k8s date/time helper functions

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-07-17 10:49:22 -07:00 committed by GitHub
parent 4db3053a9f
commit 335700e683
10 changed files with 95 additions and 66 deletions

View File

@ -406,6 +406,7 @@ class BackgroundJobOps:
try:
if job.object_type == "profile":
profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
assert profile.resource
return BaseFile(**profile.resource.dict())
item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org)
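
Since get_profile (updated later in this diff) now raises when a profile is missing, the only remaining Optional here is profile.resource; the added assert narrows it for mypy before the attribute access. A minimal sketch of that narrowing pattern, using simplified stand-in models and a hypothetical to_file helper rather than the real Browsertrix classes:

from typing import Optional
from pydantic import BaseModel


class ProfileFile(BaseModel):
    filename: str
    size: int = 0


class Profile(BaseModel):
    name: str
    resource: Optional[ProfileFile] = None


def to_file(profile: Profile) -> ProfileFile:
    # mypy sees profile.resource as Optional[ProfileFile]; the assert
    # narrows it to ProfileFile so .dict() and ** are type-safe
    assert profile.resource
    return ProfileFile(**profile.resource.dict())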

View File

@ -783,11 +783,13 @@ class BaseCrawlOps:
for cid in cids:
if not cid:
continue
config = await self.crawl_configs.get_crawl_config(cid, org.id)
if not config:
continue
first_seed = config.config.seeds[0]
first_seeds.add(first_seed.url)
try:
config = await self.crawl_configs.get_crawl_config(cid, org.id)
first_seed = config.config.seeds[0]
first_seeds.add(first_seed.url)
# pylint: disable=bare-except
except:
pass
return {
"names": names,

View File

@ -115,7 +115,7 @@ class CrawlConfigOps:
self.crawler_images_map = {}
channels = []
with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
crawler_list: list[dict] = json.loads(fh.read())
crawler_list = json.loads(fh.read())
for channel_data in crawler_list:
channel = CrawlerChannel(**channel_data)
channels.append(channel)
@ -297,8 +297,6 @@ class CrawlConfigOps:
"""Update name, scale, schedule, and/or tags for an existing crawl config"""
orig_crawl_config = await self.get_crawl_config(cid, org.id)
if not orig_crawl_config:
raise HTTPException(status_code=400, detail="config_not_found")
# indicates if any k8s crawl config settings changed
changed = False
@ -437,7 +435,7 @@ class CrawlConfigOps:
schedule: Optional[bool] = None,
sort_by: str = "lastRun",
sort_direction: int = -1,
):
) -> tuple[list[CrawlConfigOut], int]:
"""Get all crawl configs for an organization is a member of"""
# pylint: disable=too-many-locals,too-many-branches
# Zero-index page for query
@ -535,7 +533,7 @@ class CrawlConfigOps:
async def get_crawl_config_info_for_profile(
self, profileid: UUID, org: Organization
):
) -> list[CrawlConfigProfileOut]:
"""Return all crawl configs that are associated with a given profileid"""
query = {"profileid": profileid, "inactive": {"$ne": True}}
if org:
@ -633,10 +631,6 @@ class CrawlConfigOps:
crawlconfig = await self.get_crawl_config(
cid, org.id, active_only=False, config_cls=CrawlConfigOut
)
if not crawlconfig:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
if not crawlconfig.inactive:
self._add_curr_crawl_stats(
@ -1136,11 +1130,6 @@ def init_crawl_config_api(
async def make_inactive(cid: UUID, org: Organization = Depends(org_crawl_dep)):
crawlconfig = await ops.get_crawl_config(cid, org.id)
if not crawlconfig:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
return await ops.do_make_inactive(crawlconfig)
org_ops.router.include_router(router)
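
The listing methods gain explicit return annotations such as tuple[list[CrawlConfigOut], int], i.e. the page of items plus the total count. A toy illustration of that annotation style, using a hypothetical list_crawl_configs over a plain list of names rather than the real query code:

from typing import List, Tuple


def list_crawl_configs(names: List[str], page: int, page_size: int) -> Tuple[List[str], int]:
    # return the requested page plus the total, so a paginated response can be built
    total = len(names)
    start = (page - 1) * page_size
    return names[start : start + page_size], total


items, total = list_crawl_configs(["a", "b", "c"], page=1, page_size=2)
assert items == ["a", "b"] and total == 3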

View File

@ -33,6 +33,8 @@ class EmailSender:
log_sent_emails: bool
default_origin: str
def __init__(self):
self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
self.password = os.environ.get("EMAIL_PASSWORD") or ""
@ -44,7 +46,7 @@ class EmailSender:
self.log_sent_emails = is_bool(os.environ.get("LOG_SENT_EMAILS"))
self.default_origin = os.environ.get("APP_ORIGIN")
self.default_origin = os.environ.get("APP_ORIGIN", "")
self.templates = Jinja2Templates(
directory=os.path.join(os.path.dirname(__file__), "email-templates")
@ -99,7 +101,7 @@ class EmailSender:
server.send_message(msg)
# server.sendmail(self.sender, receiver, message)
def get_origin(self, headers):
def get_origin(self, headers) -> str:
"""Return origin of the received request"""
if not headers:
return self.default_origin
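
The EmailSender changes declare the attribute types up front and give os.environ.get a default so the attributes are plain str rather than Optional[str]. A condensed sketch of the same idea; a plain bool() stands in for the project's is_bool helper:

import os


class EmailSender:
    # class-level annotations let mypy check every assignment made in __init__
    sender: str
    default_origin: str
    log_sent_emails: bool

    def __init__(self) -> None:
        # os.environ.get(key) is Optional[str]; an explicit default keeps these str
        self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
        self.default_origin = os.environ.get("APP_ORIGIN", "")
        self.log_sent_emails = bool(os.environ.get("LOG_SENT_EMAILS"))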

View File

@ -2,7 +2,7 @@
import asyncio
import os
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from kubernetes.utils import parse_quantity
import yaml
@ -44,7 +44,7 @@ class K8sOpAPI(K8sAPI):
self.compute_crawler_resources()
self.compute_profile_resources()
def compute_crawler_resources(self):
def compute_crawler_resources(self) -> None:
"""compute memory / cpu resources for crawlers"""
p = self.shared_params
num_workers = max(int(p["crawler_browser_instances"]), 1)
@ -105,7 +105,7 @@ class K8sOpAPI(K8sAPI):
p["qa_memory"] = qa_memory
p["qa_workers"] = qa_num_workers
def compute_profile_resources(self):
def compute_profile_resources(self) -> None:
"""compute memory /cpu resources for a single profile browser"""
p = self.shared_params
# if no profile specific options provided, default to crawler base for one browser
@ -122,7 +122,7 @@ class K8sOpAPI(K8sAPI):
print(f"cpu = {profile_cpu}")
print(f"memory = {profile_memory}")
async def async_init(self):
async def async_init(self) -> None:
"""perform any async init here"""
self.has_pod_metrics = await self.is_pod_metrics_available()
print("Pod Metrics Available:", self.has_pod_metrics)
@ -172,16 +172,16 @@ class BaseOperator:
# see: https://stackoverflow.com/a/74059981
self.bg_tasks = set()
def init_routes(self, app):
def init_routes(self, app) -> None:
"""init routes for this operator"""
def run_task(self, func):
def run_task(self, func) -> None:
"""add bg tasks to set to avoid premature garbage collection"""
task = asyncio.create_task(func)
self.bg_tasks.add(task)
task.add_done_callback(self.bg_tasks.discard)
def load_from_yaml(self, filename, params):
def load_from_yaml(self, filename, params) -> list[Any]:
"""load and parse k8s template from yaml file"""
return list(
yaml.safe_load_all(
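
load_from_yaml now returns list[Any]: yaml.safe_load_all yields documents of unknown shape, so Any is the honest element type. A simplified stand-in that loads from a string rather than a templated file, with a quick check:

from typing import Any, List
import yaml


def load_from_yaml(text: str) -> List[Any]:
    # safe_load_all yields one object per YAML document; their shape is unknown,
    # so list[Any] is the accurate annotation
    return list(yaml.safe_load_all(text))


docs = load_from_yaml("name: crawler\n---\nname: redis\n")
assert docs == [{"name": "crawler"}, {"name": "redis"}]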

View File

@ -1449,6 +1449,9 @@ class CrawlOperator(BaseOperator):
"""Increment Crawl Stats"""
started = from_k8s_date(crawl.started)
if not started:
print("Missing crawl start time, unable to increment crawl stats")
return
duration = int((finished - started).total_seconds())
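
Because from_k8s_date (retyped at the end of this diff) returns Optional[datetime], the stats code now guards before computing a duration. A small sketch of the guard, with a hypothetical crawl_duration wrapper and the helper's logic repeated inline:

from datetime import datetime
from typing import Optional


def from_k8s_date(string: str) -> Optional[datetime]:
    # same behavior as the utils helper below: empty input yields None
    return datetime.fromisoformat(string[:-1]) if string else None


def crawl_duration(started_str: str, finished: datetime) -> Optional[int]:
    started = from_k8s_date(started_str)
    if not started:
        print("Missing crawl start time, unable to increment crawl stats")
        return None
    return int((finished - started).total_seconds())


assert crawl_duration("2024-07-17T00:00:00Z", datetime(2024, 7, 17, 0, 5, 0)) == 300
assert crawl_duration("", datetime(2024, 7, 17)) is None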

View File

@ -102,6 +102,7 @@ class PageOps:
if not status and page_dict.get("loadState"):
status = 200
ts = page_dict.get("ts")
p = Page(
id=page_id,
oid=oid,
@ -111,9 +112,7 @@ class PageOps:
loadState=page_dict.get("loadState"),
status=status,
mime=page_dict.get("mime", "text/html"),
ts=(
from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else dt_now()
),
ts=(from_k8s_date(ts) if ts else dt_now()),
)
p.compute_page_type()
return p
@ -403,7 +402,7 @@ class PageOps:
remaining_notes = []
for note in page_notes:
if not note.get("id") in delete.delete_list:
if note.get("id") not in delete.delete_list:
remaining_notes.append(note)
modified = dt_now()
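
Two small idiom fixes here: the ts value is read from the dict once and bound to a local, so the truthiness check and the parse operate on the same value and mypy can narrow it, and "not x in y" becomes the clearer "x not in y". A toy version of the ts handling, with a hypothetical page_timestamp helper:

from datetime import datetime, timezone
from typing import Optional


def page_timestamp(page_dict: dict) -> datetime:
    # bind the value once; the check and the parse then agree on its type
    ts: Optional[str] = page_dict.get("ts")
    return datetime.fromisoformat(ts[:-1]) if ts else datetime.now(timezone.utc)


assert page_timestamp({"ts": "2024-07-17T10:49:22Z"}) == datetime(2024, 7, 17, 10, 49, 22)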

View File

@ -1,12 +1,13 @@
""" Profile Management """
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple
from uuid import UUID, uuid4
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
from starlette.requests import Headers
import aiohttp
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@ -30,6 +31,7 @@ from .models import (
SuccessResponseStorageQuota,
ProfilePingResponse,
ProfileBrowserGetUrlResponse,
CrawlConfigProfileOut,
)
from .utils import dt_now
@ -58,6 +60,9 @@ class ProfileOps:
crawlconfigs: CrawlConfigOps
background_job_ops: BackgroundJobOps
browser_fqdn_suffix: str
router: APIRouter
def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
self.profiles = mdb["profiles"]
self.orgs = orgs
@ -66,7 +71,7 @@ class ProfileOps:
self.crawl_manager = crawl_manager
self.storage_ops = storage_ops
self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX")
self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "")
self.router = APIRouter(
prefix="/profiles",
@ -82,16 +87,16 @@ class ProfileOps:
async def create_new_browser(
self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
):
) -> BrowserId:
"""Create new profile"""
prev_profile = ""
prev_profile_path = ""
prev_profile_id = ""
if profile_launch.profileId:
prev_profile = await self.get_profile_storage_path(
prev_profile_path = await self.get_profile_storage_path(
profile_launch.profileId, org
)
if not prev_profile:
if not prev_profile_path:
raise HTTPException(status_code=400, detail="invalid_base_profile")
prev_profile_id = str(profile_launch.profileId)
@ -109,7 +114,7 @@ class ProfileOps:
storage=org.storage,
crawler_image=crawler_image,
baseprofile=prev_profile_id,
profile_filename=prev_profile,
profile_filename=prev_profile_path,
)
if not browserid:
@ -117,7 +122,9 @@ class ProfileOps:
return BrowserId(browserid=browserid)
async def get_profile_browser_url(self, browserid, oid, headers):
async def get_profile_browser_url(
self, browserid: str, oid: str, headers: Headers
) -> dict[str, str | int]:
"""get profile browser url"""
json = await self._send_browser_req(browserid, "/vncpass")
@ -130,7 +137,7 @@ class ProfileOps:
host = headers.get("Host") or "localhost"
# ws_scheme = "wss" if scheme == "https" else "ws"
auth_bearer = headers.get("Authorization").split(" ")[1]
auth_bearer = headers.get("Authorization", "").split(" ")[1]
params = {
"path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
@ -144,7 +151,7 @@ class ProfileOps:
params["url"] = url
return params
async def ping_profile_browser(self, browserid):
async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
"""ping profile browser to keep it running"""
await self.crawl_manager.ping_profile_browser(browserid)
@ -152,7 +159,9 @@ class ProfileOps:
return {"success": True, "origins": json.get("origins") or []}
async def navigate_profile_browser(self, browserid, urlin: UrlIn):
async def navigate_profile_browser(
self, browserid: str, urlin: UrlIn
) -> dict[str, bool]:
"""ping profile browser to keep it running"""
await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())
@ -255,7 +264,7 @@ class ProfileOps:
async def update_profile_metadata(
self, profileid: UUID, update: ProfileUpdate, user: User
):
) -> dict[str, bool]:
"""Update name and description metadata only on existing profile"""
query = {
"name": update.name,
@ -282,7 +291,7 @@ class ProfileOps:
page: int = 1,
sort_by: str = "modified",
sort_direction: int = -1,
):
) -> Tuple[list[Profile], int]:
"""list all profiles"""
# pylint: disable=too-many-locals,duplicate-code
@ -334,7 +343,9 @@ class ProfileOps:
profiles = [Profile.from_dict(res) for res in items]
return profiles, total
async def get_profile(self, profileid: UUID, org: Optional[Organization] = None):
async def get_profile(
self, profileid: UUID, org: Optional[Organization] = None
) -> Profile:
"""get profile by id and org"""
query: dict[str, object] = {"_id": profileid}
if org:
@ -346,7 +357,9 @@ class ProfileOps:
return Profile.from_dict(res)
async def get_profile_with_configs(self, profileid: UUID, org: Organization):
async def get_profile_with_configs(
self, profileid: UUID, org: Organization
) -> ProfileWithCrawlConfigs:
"""get profile for api output, with crawlconfigs"""
profile = await self.get_profile(profileid, org)
@ -357,27 +370,33 @@ class ProfileOps:
async def get_profile_storage_path(
self, profileid: UUID, org: Optional[Organization] = None
):
) -> str:
"""return profile path filename (relative path) for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
return profile.resource.filename
return profile.resource.filename if profile.resource else ""
# pylint: disable=bare-except
except:
return None
pass
return ""
async def get_profile_name(
self, profileid: UUID, org: Optional[Organization] = None
):
) -> str:
"""return profile for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
return profile.name
# pylint: disable=bare-except
except:
return None
pass
async def get_crawl_configs_for_profile(self, profileid: UUID, org: Organization):
return ""
async def get_crawl_configs_for_profile(
self, profileid: UUID, org: Organization
) -> list[CrawlConfigProfileOut]:
"""Get list of crawl configs with basic info for that use a particular profile"""
crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile(
@ -386,7 +405,9 @@ class ProfileOps:
return crawlconfig_info
async def delete_profile(self, profileid: UUID, org: Organization):
async def delete_profile(
self, profileid: UUID, org: Organization
) -> dict[str, Any]:
"""delete profile, if not used in active crawlconfig"""
profile = await self.get_profile_with_configs(profileid, org)
@ -403,27 +424,32 @@ class ProfileOps:
await self.orgs.inc_org_bytes_stored(
org.id, -profile.resource.size, "profile"
)
await self.background_job_ops.create_delete_replica_jobs(
org, profile.resource, str(profile.id), "profile"
)
res = await self.profiles.delete_one(query)
if not res or res.deleted_count != 1:
raise HTTPException(status_code=404, detail="profile_not_found")
await self.background_job_ops.create_delete_replica_jobs(
org, profile.resource, profile.id, "profile"
)
quota_reached = await self.orgs.storage_quota_reached(org.id)
return {"success": True, "storageQuotaReached": quota_reached}
async def delete_profile_browser(self, browserid):
async def delete_profile_browser(self, browserid: str) -> dict[str, bool]:
"""delete profile browser immediately"""
if not await self.crawl_manager.delete_profile_browser(browserid):
raise HTTPException(status_code=404, detail="browser_not_found")
return {"success": True}
async def _send_browser_req(self, browserid, path, method="GET", json=None):
async def _send_browser_req(
self,
browserid: str,
path: str,
method: str = "GET",
json: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""make request to browser api to get state"""
try:
async with aiohttp.ClientSession() as session:
@ -438,7 +464,7 @@ class ProfileOps:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=200, detail="waiting_for_browser")
return json
return json or {}
async def add_profile_file_replica(
self, profileid: UUID, filename: str, ref: StorageRef
@ -453,7 +479,12 @@ class ProfileOps:
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_profiles_api(
mdb, org_ops, crawl_manager, storage_ops, background_job_ops, user_dep
mdb,
org_ops: OrgOps,
crawl_manager: CrawlManager,
storage_ops: StorageOps,
background_job_ops: BackgroundJobOps,
user_dep,
):
"""init profile ops system"""
ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)
@ -584,6 +615,7 @@ def init_profiles_api(
async def delete_profile_browser(browserid: str = Depends(browser_dep)):
return await ops.delete_profile_browser(browserid)
org_ops.router.include_router(router)
if org_ops.router:
org_ops.router.include_router(router)
return ops
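
A recurring pattern in this file: return an empty value of the declared type ("" or {}) instead of None, so signatures like -> str and -> dict[str, Any] stay non-Optional. A toy sketch of both cases, with hypothetical profile_name and browser_response helpers:

from typing import Any, Optional


def profile_name(names: dict[str, str], profileid: str) -> str:
    # "" on a miss keeps the return type a plain str
    try:
        return names[profileid]
    except KeyError:
        return ""


def browser_response(json: Optional[dict[str, Any]]) -> dict[str, Any]:
    # "json or {}" turns a possible None into an empty dict
    return json or {}


assert profile_name({"p1": "My Profile"}, "p2") == ""
assert browser_response(None) == {}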

View File

@ -37,6 +37,7 @@ from mypy_boto3_s3.type_defs import CompletedPartTypeDef
from types_aiobotocore_s3 import S3Client as AIOS3Client
from .models import (
BaseFile,
CrawlFile,
CrawlFileOut,
Organization,
@ -504,7 +505,7 @@ class StorageOps:
return presigned_url
async def delete_crawl_file_object(
self, org: Organization, crawlfile: CrawlFile
self, org: Organization, crawlfile: BaseFile
) -> bool:
"""delete crawl file from storage."""
return await self._delete_file(org, crawlfile.filename, crawlfile.storage)
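
delete_crawl_file_object now accepts BaseFile, the common parent of crawl and profile files, since it only touches fields defined on the base class. A minimal sketch of why widening the parameter satisfies the type checker for both callers; these are simplified models, with ProfileFile added purely for illustration:

from pydantic import BaseModel


class BaseFile(BaseModel):
    filename: str


class CrawlFile(BaseFile):
    hash: str = ""


class ProfileFile(BaseFile):
    pass


def delete_file_object(file: BaseFile) -> str:
    # only BaseFile fields are used, so any subclass may be passed
    return file.filename


assert delete_file_object(CrawlFile(filename="crawl.wacz")) == "crawl.wacz"
assert delete_file_object(ProfileFile(filename="profile.tar.gz")) == "profile.tar.gz"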

View File

@ -40,12 +40,12 @@ def get_templates_dir():
return os.path.join(os.path.dirname(__file__), "templates")
def from_k8s_date(string):
def from_k8s_date(string: str) -> Optional[datetime]:
"""convert k8s date string to datetime"""
return datetime.fromisoformat(string[:-1]) if string else None
def to_k8s_date(dt_val):
def to_k8s_date(dt_val: datetime) -> str:
"""convert datetime to string for k8s"""
return dt_val.isoformat("T") + "Z"
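
With these annotations, callers can see that from_k8s_date may return None while to_k8s_date always needs a real datetime. A quick round trip using the two helpers exactly as defined above:

from datetime import datetime
from typing import Optional


def from_k8s_date(string: str) -> Optional[datetime]:
    return datetime.fromisoformat(string[:-1]) if string else None


def to_k8s_date(dt_val: datetime) -> str:
    return dt_val.isoformat("T") + "Z"


dt = from_k8s_date("2024-07-17T10:49:22Z")
assert dt is not None  # narrow the Optional before reuse
assert to_k8s_date(dt) == "2024-07-17T10:49:22Z"
assert from_k8s_date("") is None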