Additional typing cleanup (#1938)

Misc typing fixes, including in profile management and the k8s date/time helper functions

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-07-17 10:49:22 -07:00 committed by GitHub
parent 4db3053a9f
commit 335700e683
10 changed files with 95 additions and 66 deletions

View File

@ -406,6 +406,7 @@ class BackgroundJobOps:
try:
if job.object_type == "profile":
profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
assert profile.resource
return BaseFile(**profile.resource.dict())
item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org)
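
Since get_profile (updated later in this diff) now raises when a profile is missing, the only remaining Optional here is profile.resource; the added assert narrows it for mypy before the attribute access. A minimal sketch of that narrowing pattern, using simplified stand-in models and a hypothetical to_file helper rather than the real Browsertrix classes:

from typing import Optional
from pydantic import BaseModel


class ProfileFile(BaseModel):
    filename: str
    size: int = 0


class Profile(BaseModel):
    name: str
    resource: Optional[ProfileFile] = None


def to_file(profile: Profile) -> ProfileFile:
    # mypy sees profile.resource as Optional[ProfileFile]; the assert
    # narrows it to ProfileFile so .dict() and ** are type-safe
    assert profile.resource
    return ProfileFile(**profile.resource.dict())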

View File

@ -783,11 +783,13 @@ class BaseCrawlOps:
for cid in cids:
if not cid:
continue
config = await self.crawl_configs.get_crawl_config(cid, org.id)
if not config:
continue
first_seed = config.config.seeds[0]
first_seeds.add(first_seed.url)
try:
config = await self.crawl_configs.get_crawl_config(cid, org.id)
first_seed = config.config.seeds[0]
first_seeds.add(first_seed.url)
# pylint: disable=bare-except
except:
pass
return {
"names": names,

View File

@ -115,7 +115,7 @@ class CrawlConfigOps:
self.crawler_images_map = {}
channels = []
with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
crawler_list: list[dict] = json.loads(fh.read())
crawler_list = json.loads(fh.read())
for channel_data in crawler_list:
channel = CrawlerChannel(**channel_data)
channels.append(channel)
@ -297,8 +297,6 @@ class CrawlConfigOps:
"""Update name, scale, schedule, and/or tags for an existing crawl config"""
orig_crawl_config = await self.get_crawl_config(cid, org.id)
if not orig_crawl_config:
raise HTTPException(status_code=400, detail="config_not_found")
# indicates if any k8s crawl config settings changed
changed = False
@ -437,7 +435,7 @@ class CrawlConfigOps:
schedule: Optional[bool] = None,
sort_by: str = "lastRun",
sort_direction: int = -1,
):
) -> tuple[list[CrawlConfigOut], int]:
"""Get all crawl configs for an organization is a member of"""
# pylint: disable=too-many-locals,too-many-branches
# Zero-index page for query
@ -535,7 +533,7 @@ class CrawlConfigOps:
async def get_crawl_config_info_for_profile(
self, profileid: UUID, org: Organization
):
) -> list[CrawlConfigProfileOut]:
"""Return all crawl configs that are associated with a given profileid"""
query = {"profileid": profileid, "inactive": {"$ne": True}}
if org:
@ -633,10 +631,6 @@ class CrawlConfigOps:
crawlconfig = await self.get_crawl_config(
cid, org.id, active_only=False, config_cls=CrawlConfigOut
)
if not crawlconfig:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
if not crawlconfig.inactive:
self._add_curr_crawl_stats(
@ -1136,11 +1130,6 @@ def init_crawl_config_api(
async def make_inactive(cid: UUID, org: Organization = Depends(org_crawl_dep)):
crawlconfig = await ops.get_crawl_config(cid, org.id)
if not crawlconfig:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
return await ops.do_make_inactive(crawlconfig)
org_ops.router.include_router(router)
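
The listing methods gain explicit return annotations such as tuple[list[CrawlConfigOut], int], i.e. the page of items plus the total count. A toy illustration of that annotation style, using a hypothetical list_crawl_configs over a plain list of names rather than the real query code:

from typing import List, Tuple


def list_crawl_configs(names: List[str], page: int, page_size: int) -> Tuple[List[str], int]:
    # return the requested page plus the total, so a paginated response can be built
    total = len(names)
    start = (page - 1) * page_size
    return names[start : start + page_size], total


items, total = list_crawl_configs(["a", "b", "c"], page=1, page_size=2)
assert items == ["a", "b"] and total == 3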

View File

@ -33,6 +33,8 @@ class EmailSender:
log_sent_emails: bool
default_origin: str
def __init__(self):
self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
self.password = os.environ.get("EMAIL_PASSWORD") or ""
@ -44,7 +46,7 @@ class EmailSender:
self.log_sent_emails = is_bool(os.environ.get("LOG_SENT_EMAILS"))
self.default_origin = os.environ.get("APP_ORIGIN")
self.default_origin = os.environ.get("APP_ORIGIN", "")
self.templates = Jinja2Templates(
directory=os.path.join(os.path.dirname(__file__), "email-templates")
@ -99,7 +101,7 @@ class EmailSender:
server.send_message(msg)
# server.sendmail(self.sender, receiver, message)
def get_origin(self, headers):
def get_origin(self, headers) -> str:
"""Return origin of the received request"""
if not headers:
return self.default_origin
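
The EmailSender changes declare the attribute types up front and give os.environ.get a default so the attributes are plain str rather than Optional[str]. A condensed sketch of the same idea; a plain bool() stands in for the project's is_bool helper:

import os


class EmailSender:
    # class-level annotations let mypy check every assignment made in __init__
    sender: str
    default_origin: str
    log_sent_emails: bool

    def __init__(self) -> None:
        # os.environ.get(key) is Optional[str]; an explicit default keeps these str
        self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
        self.default_origin = os.environ.get("APP_ORIGIN", "")
        self.log_sent_emails = bool(os.environ.get("LOG_SENT_EMAILS"))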

View File

@ -2,7 +2,7 @@
import asyncio
import os
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any
from kubernetes.utils import parse_quantity
import yaml
@ -44,7 +44,7 @@ class K8sOpAPI(K8sAPI):
self.compute_crawler_resources()
self.compute_profile_resources()
def compute_crawler_resources(self):
def compute_crawler_resources(self) -> None:
"""compute memory / cpu resources for crawlers"""
p = self.shared_params
num_workers = max(int(p["crawler_browser_instances"]), 1)
@ -105,7 +105,7 @@ class K8sOpAPI(K8sAPI):
p["qa_memory"] = qa_memory
p["qa_workers"] = qa_num_workers
def compute_profile_resources(self):
def compute_profile_resources(self) -> None:
"""compute memory /cpu resources for a single profile browser"""
p = self.shared_params
# if no profile specific options provided, default to crawler base for one browser
@ -122,7 +122,7 @@ class K8sOpAPI(K8sAPI):
print(f"cpu = {profile_cpu}")
print(f"memory = {profile_memory}")
async def async_init(self):
async def async_init(self) -> None:
"""perform any async init here"""
self.has_pod_metrics = await self.is_pod_metrics_available()
print("Pod Metrics Available:", self.has_pod_metrics)
@ -172,16 +172,16 @@ class BaseOperator:
# see: https://stackoverflow.com/a/74059981
self.bg_tasks = set()
def init_routes(self, app):
def init_routes(self, app) -> None:
"""init routes for this operator"""
def run_task(self, func):
def run_task(self, func) -> None:
"""add bg tasks to set to avoid premature garbage collection"""
task = asyncio.create_task(func)
self.bg_tasks.add(task)
task.add_done_callback(self.bg_tasks.discard)
def load_from_yaml(self, filename, params):
def load_from_yaml(self, filename, params) -> list[Any]:
"""load and parse k8s template from yaml file"""
return list(
yaml.safe_load_all(
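
load_from_yaml now returns list[Any]: yaml.safe_load_all yields documents of unknown shape, so Any is the honest element type. A simplified stand-in that loads from a string rather than a templated file, with a quick check:

from typing import Any, List
import yaml


def load_from_yaml(text: str) -> List[Any]:
    # safe_load_all yields one object per YAML document; their shape is unknown,
    # so list[Any] is the accurate annotation
    return list(yaml.safe_load_all(text))


docs = load_from_yaml("name: crawler\n---\nname: redis\n")
assert docs == [{"name": "crawler"}, {"name": "redis"}]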

View File

@ -1449,6 +1449,9 @@ class CrawlOperator(BaseOperator):
"""Increment Crawl Stats"""
started = from_k8s_date(crawl.started)
if not started:
print("Missing crawl start time, unable to increment crawl stats")
return
duration = int((finished - started).total_seconds())
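
Because from_k8s_date (retyped at the end of this diff) returns Optional[datetime], the stats code now guards before computing a duration. A small sketch of the guard, with a hypothetical crawl_duration wrapper and the helper's logic repeated inline:

from datetime import datetime
from typing import Optional


def from_k8s_date(string: str) -> Optional[datetime]:
    # same behavior as the utils helper below: empty input yields None
    return datetime.fromisoformat(string[:-1]) if string else None


def crawl_duration(started_str: str, finished: datetime) -> Optional[int]:
    started = from_k8s_date(started_str)
    if not started:
        print("Missing crawl start time, unable to increment crawl stats")
        return None
    return int((finished - started).total_seconds())


assert crawl_duration("2024-07-17T00:00:00Z", datetime(2024, 7, 17, 0, 5, 0)) == 300
assert crawl_duration("", datetime(2024, 7, 17)) is None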

View File

@ -102,6 +102,7 @@ class PageOps:
if not status and page_dict.get("loadState"):
status = 200
ts = page_dict.get("ts")
p = Page(
id=page_id,
oid=oid,
@ -111,9 +112,7 @@ class PageOps:
loadState=page_dict.get("loadState"),
status=status,
mime=page_dict.get("mime", "text/html"),
ts=(
from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else dt_now()
),
ts=(from_k8s_date(ts) if ts else dt_now()),
)
p.compute_page_type()
return p
@ -403,7 +402,7 @@ class PageOps:
remaining_notes = []
for note in page_notes:
if not note.get("id") in delete.delete_list:
if note.get("id") not in delete.delete_list:
remaining_notes.append(note)
modified = dt_now()
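
Two small idiom fixes here: the ts value is read from the dict once and bound to a local, so the truthiness check and the parse operate on the same value and mypy can narrow it, and "not x in y" becomes the clearer "x not in y". A toy version of the ts handling, with a hypothetical page_timestamp helper:

from datetime import datetime, timezone
from typing import Optional


def page_timestamp(page_dict: dict) -> datetime:
    # bind the value once; the check and the parse then agree on its type
    ts: Optional[str] = page_dict.get("ts")
    return datetime.fromisoformat(ts[:-1]) if ts else datetime.now(timezone.utc)


assert page_timestamp({"ts": "2024-07-17T10:49:22Z"}) == datetime(2024, 7, 17, 10, 49, 22)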

View File

@ -1,12 +1,13 @@
""" Profile Management """
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple
from uuid import UUID, uuid4
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
from starlette.requests import Headers
import aiohttp
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@ -30,6 +31,7 @@ from .models import (
SuccessResponseStorageQuota,
ProfilePingResponse,
ProfileBrowserGetUrlResponse,
CrawlConfigProfileOut,
)
from .utils import dt_now
@ -58,6 +60,9 @@ class ProfileOps:
crawlconfigs: CrawlConfigOps
background_job_ops: BackgroundJobOps
browser_fqdn_suffix: str
router: APIRouter
def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
self.profiles = mdb["profiles"]
self.orgs = orgs
@ -66,7 +71,7 @@ class ProfileOps:
self.crawl_manager = crawl_manager
self.storage_ops = storage_ops
self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX")
self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "")
self.router = APIRouter(
prefix="/profiles",
@ -82,16 +87,16 @@ class ProfileOps:
async def create_new_browser(
self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
):
) -> BrowserId:
"""Create new profile"""
prev_profile = ""
prev_profile_path = ""
prev_profile_id = ""
if profile_launch.profileId:
prev_profile = await self.get_profile_storage_path(
prev_profile_path = await self.get_profile_storage_path(
profile_launch.profileId, org
)
if not prev_profile:
if not prev_profile_path:
raise HTTPException(status_code=400, detail="invalid_base_profile")
prev_profile_id = str(profile_launch.profileId)
@ -109,7 +114,7 @@ class ProfileOps:
storage=org.storage,
crawler_image=crawler_image,
baseprofile=prev_profile_id,
profile_filename=prev_profile,
profile_filename=prev_profile_path,
)
if not browserid:
@ -117,7 +122,9 @@ class ProfileOps:
return BrowserId(browserid=browserid)
async def get_profile_browser_url(self, browserid, oid, headers):
async def get_profile_browser_url(
self, browserid: str, oid: str, headers: Headers
) -> dict[str, str | int]:
"""get profile browser url"""
json = await self._send_browser_req(browserid, "/vncpass")
@ -130,7 +137,7 @@ class ProfileOps:
host = headers.get("Host") or "localhost"
# ws_scheme = "wss" if scheme == "https" else "ws"
auth_bearer = headers.get("Authorization").split(" ")[1]
auth_bearer = headers.get("Authorization", "").split(" ")[1]
params = {
"path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
@ -144,7 +151,7 @@ class ProfileOps:
params["url"] = url
return params
async def ping_profile_browser(self, browserid):
async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
"""ping profile browser to keep it running"""
await self.crawl_manager.ping_profile_browser(browserid)
@ -152,7 +159,9 @@ class ProfileOps:
return {"success": True, "origins": json.get("origins") or []}
async def navigate_profile_browser(self, browserid, urlin: UrlIn):
async def navigate_profile_browser(
self, browserid: str, urlin: UrlIn
) -> dict[str, bool]:
"""ping profile browser to keep it running"""
await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())
@ -255,7 +264,7 @@ class ProfileOps:
async def update_profile_metadata(
self, profileid: UUID, update: ProfileUpdate, user: User
):
) -> dict[str, bool]:
"""Update name and description metadata only on existing profile"""
query = {
"name": update.name,
@ -282,7 +291,7 @@ class ProfileOps:
page: int = 1,
sort_by: str = "modified",
sort_direction: int = -1,
):
) -> Tuple[list[Profile], int]:
"""list all profiles"""
# pylint: disable=too-many-locals,duplicate-code
@ -334,7 +343,9 @@ class ProfileOps:
profiles = [Profile.from_dict(res) for res in items]
return profiles, total
async def get_profile(self, profileid: UUID, org: Optional[Organization] = None):
async def get_profile(
self, profileid: UUID, org: Optional[Organization] = None
) -> Profile:
"""get profile by id and org"""
query: dict[str, object] = {"_id": profileid}
if org:
@ -346,7 +357,9 @@ class ProfileOps:
return Profile.from_dict(res)
async def get_profile_with_configs(self, profileid: UUID, org: Organization):
async def get_profile_with_configs(
self, profileid: UUID, org: Organization
) -> ProfileWithCrawlConfigs:
"""get profile for api output, with crawlconfigs"""
profile = await self.get_profile(profileid, org)
@ -357,27 +370,33 @@ class ProfileOps:
async def get_profile_storage_path(
self, profileid: UUID, org: Optional[Organization] = None
):
) -> str:
"""return profile path filename (relative path) for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
return profile.resource.filename
return profile.resource.filename if profile.resource else ""
# pylint: disable=bare-except
except:
return None
pass
return ""
async def get_profile_name(
self, profileid: UUID, org: Optional[Organization] = None
):
) -> str:
"""return profile for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
return profile.name
# pylint: disable=bare-except
except:
return None
pass
async def get_crawl_configs_for_profile(self, profileid: UUID, org: Organization):
return ""
async def get_crawl_configs_for_profile(
self, profileid: UUID, org: Organization
) -> list[CrawlConfigProfileOut]:
"""Get list of crawl configs with basic info for that use a particular profile"""
crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile(
@ -386,7 +405,9 @@ class ProfileOps:
return crawlconfig_info
async def delete_profile(self, profileid: UUID, org: Organization):
async def delete_profile(
self, profileid: UUID, org: Organization
) -> dict[str, Any]:
"""delete profile, if not used in active crawlconfig"""
profile = await self.get_profile_with_configs(profileid, org)
@ -403,27 +424,32 @@ class ProfileOps:
await self.orgs.inc_org_bytes_stored(
org.id, -profile.resource.size, "profile"
)
await self.background_job_ops.create_delete_replica_jobs(
org, profile.resource, str(profile.id), "profile"
)
res = await self.profiles.delete_one(query)
if not res or res.deleted_count != 1:
raise HTTPException(status_code=404, detail="profile_not_found")
await self.background_job_ops.create_delete_replica_jobs(
org, profile.resource, profile.id, "profile"
)
quota_reached = await self.orgs.storage_quota_reached(org.id)
return {"success": True, "storageQuotaReached": quota_reached}
async def delete_profile_browser(self, browserid):
async def delete_profile_browser(self, browserid: str) -> dict[str, bool]:
"""delete profile browser immediately"""
if not await self.crawl_manager.delete_profile_browser(browserid):
raise HTTPException(status_code=404, detail="browser_not_found")
return {"success": True}
async def _send_browser_req(self, browserid, path, method="GET", json=None):
async def _send_browser_req(
self,
browserid: str,
path: str,
method: str = "GET",
json: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""make request to browser api to get state"""
try:
async with aiohttp.ClientSession() as session:
@ -438,7 +464,7 @@ class ProfileOps:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=200, detail="waiting_for_browser")
return json
return json or {}
async def add_profile_file_replica(
self, profileid: UUID, filename: str, ref: StorageRef
@ -453,7 +479,12 @@ class ProfileOps:
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_profiles_api(
mdb, org_ops, crawl_manager, storage_ops, background_job_ops, user_dep
mdb,
org_ops: OrgOps,
crawl_manager: CrawlManager,
storage_ops: StorageOps,
background_job_ops: BackgroundJobOps,
user_dep,
):
"""init profile ops system"""
ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)
@ -584,6 +615,7 @@ def init_profiles_api(
async def delete_profile_browser(browserid: str = Depends(browser_dep)):
return await ops.delete_profile_browser(browserid)
org_ops.router.include_router(router)
if org_ops.router:
org_ops.router.include_router(router)
return ops
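
A recurring pattern in this file: return an empty value of the declared type ("" or {}) instead of None, so signatures like -> str and -> dict[str, Any] stay non-Optional. A toy sketch of both cases, with hypothetical profile_name and browser_response helpers:

from typing import Any, Optional


def profile_name(names: dict[str, str], profileid: str) -> str:
    # "" on a miss keeps the return type a plain str
    try:
        return names[profileid]
    except KeyError:
        return ""


def browser_response(json: Optional[dict[str, Any]]) -> dict[str, Any]:
    # "json or {}" turns a possible None into an empty dict
    return json or {}


assert profile_name({"p1": "My Profile"}, "p2") == ""
assert browser_response(None) == {}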

View File

@ -37,6 +37,7 @@ from mypy_boto3_s3.type_defs import CompletedPartTypeDef
from types_aiobotocore_s3 import S3Client as AIOS3Client
from .models import (
BaseFile,
CrawlFile,
CrawlFileOut,
Organization,
@ -504,7 +505,7 @@ class StorageOps:
return presigned_url
async def delete_crawl_file_object(
self, org: Organization, crawlfile: CrawlFile
self, org: Organization, crawlfile: BaseFile
) -> bool:
"""delete crawl file from storage."""
return await self._delete_file(org, crawlfile.filename, crawlfile.storage)
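
delete_crawl_file_object now accepts BaseFile, the common parent of crawl and profile files, since it only touches fields defined on the base class. A minimal sketch of why widening the parameter satisfies the type checker for both callers; these are simplified models, with ProfileFile added purely for illustration:

from pydantic import BaseModel


class BaseFile(BaseModel):
    filename: str


class CrawlFile(BaseFile):
    hash: str = ""


class ProfileFile(BaseFile):
    pass


def delete_file_object(file: BaseFile) -> str:
    # only BaseFile fields are used, so any subclass may be passed
    return file.filename


assert delete_file_object(CrawlFile(filename="crawl.wacz")) == "crawl.wacz"
assert delete_file_object(ProfileFile(filename="profile.tar.gz")) == "profile.tar.gz"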

View File

@ -40,12 +40,12 @@ def get_templates_dir():
return os.path.join(os.path.dirname(__file__), "templates")
def from_k8s_date(string):
def from_k8s_date(string: str) -> Optional[datetime]:
"""convert k8s date string to datetime"""
return datetime.fromisoformat(string[:-1]) if string else None
def to_k8s_date(dt_val):
def to_k8s_date(dt_val: datetime) -> str:
"""convert datetime to string for k8s"""
return dt_val.isoformat("T") + "Z"
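
With these annotations, callers can see that from_k8s_date may return None while to_k8s_date always needs a real datetime. A quick round trip using the two helpers exactly as defined above:

from datetime import datetime
from typing import Optional


def from_k8s_date(string: str) -> Optional[datetime]:
    return datetime.fromisoformat(string[:-1]) if string else None


def to_k8s_date(dt_val: datetime) -> str:
    return dt_val.isoformat("T") + "Z"


dt = from_k8s_date("2024-07-17T10:49:22Z")
assert dt is not None  # narrow the Optional before reuse
assert to_k8s_date(dt) == "2024-07-17T10:49:22Z"
assert from_k8s_date("") is None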