Additional typing cleanup (#1938)
Misc typing fixes, including in profiles and time functions

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
parent 4db3053a9f
commit 335700e683
@@ -406,6 +406,7 @@ class BackgroundJobOps:
         try:
             if job.object_type == "profile":
                 profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
+                assert profile.resource
                 return BaseFile(**profile.resource.dict())

             item_res = await self.base_crawl_ops.get_base_crawl(job.object_id, org)
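The added `assert profile.resource` is there for the type checker: if `resource` is an Optional field on the profile model, mypy will not allow attribute access on it until the `None` case is ruled out. A minimal sketch of the narrowing pattern, using made-up stand-in classes rather than the real models:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Resource:
    filename: str


@dataclass
class Profile:
    # Optional field: the checker treats access as "Resource | None"
    resource: Optional[Resource] = None


def resource_filename(profile: Profile) -> str:
    # Without the assert, mypy reports something like:
    #   Item "None" of "Optional[Resource]" has no attribute "filename"
    # The assert narrows the type to "Resource" for the line below.
    assert profile.resource
    return profile.resource.filename
```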
@@ -783,11 +783,13 @@ class BaseCrawlOps:
         for cid in cids:
             if not cid:
                 continue
-            config = await self.crawl_configs.get_crawl_config(cid, org.id)
-            if not config:
-                continue
-            first_seed = config.config.seeds[0]
-            first_seeds.add(first_seed.url)
+            try:
+                config = await self.crawl_configs.get_crawl_config(cid, org.id)
+                first_seed = config.config.seeds[0]
+                first_seeds.add(first_seed.url)
+            # pylint: disable=bare-except
+            except:
+                pass

         return {
             "names": names,
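This caller change appears to follow from `get_crawl_config` now guaranteeing a non-`None` result (several `if not config` checks are dropped elsewhere in this commit), so a missing config is skipped via `try`/`except` instead of a `None` check. A rough sketch of the two caller styles, with hypothetical names standing in for the real ops classes:

```python
from typing import Optional


class ConfigNotFound(Exception):
    """Raised when a config id cannot be resolved (stand-in for HTTPException)."""


CONFIGS = {"c1": {"first_seed": "https://example.com/"}}


def get_config_or_none(cid: str) -> Optional[dict]:
    # Old style: the Optional return forces every caller to re-check for None.
    return CONFIGS.get(cid)


def get_config(cid: str) -> dict:
    # New style: the return type is non-Optional; absence becomes an exception.
    config = CONFIGS.get(cid)
    if not config:
        raise ConfigNotFound(cid)
    return config


def collect_first_seeds(cids: list[str]) -> set[str]:
    seeds: set[str] = set()
    for cid in cids:
        try:
            seeds.add(get_config(cid)["first_seed"])
        except ConfigNotFound:
            # skip ids that no longer resolve, mirroring the `continue` above
            pass
    return seeds


print(collect_first_seeds(["c1", "missing"]))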
@@ -115,7 +115,7 @@ class CrawlConfigOps:
         self.crawler_images_map = {}
         channels = []
         with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
-            crawler_list: list[dict] = json.loads(fh.read())
+            crawler_list = json.loads(fh.read())
             for channel_data in crawler_list:
                 channel = CrawlerChannel(**channel_data)
                 channels.append(channel)
@@ -297,8 +297,6 @@ class CrawlConfigOps:
         """Update name, scale, schedule, and/or tags for an existing crawl config"""

         orig_crawl_config = await self.get_crawl_config(cid, org.id)
-        if not orig_crawl_config:
-            raise HTTPException(status_code=400, detail="config_not_found")

         # indicates if any k8s crawl config settings changed
         changed = False
@@ -437,7 +435,7 @@ class CrawlConfigOps:
         schedule: Optional[bool] = None,
         sort_by: str = "lastRun",
         sort_direction: int = -1,
-    ):
+    ) -> tuple[list[CrawlConfigOut], int]:
         """Get all crawl configs for an organization is a member of"""
         # pylint: disable=too-many-locals,too-many-branches
         # Zero-index page for query
@@ -535,7 +533,7 @@ class CrawlConfigOps:

     async def get_crawl_config_info_for_profile(
         self, profileid: UUID, org: Organization
-    ):
+    ) -> list[CrawlConfigProfileOut]:
         """Return all crawl configs that are associated with a given profileid"""
         query = {"profileid": profileid, "inactive": {"$ne": True}}
         if org:
@@ -633,10 +631,6 @@ class CrawlConfigOps:
         crawlconfig = await self.get_crawl_config(
             cid, org.id, active_only=False, config_cls=CrawlConfigOut
         )
-        if not crawlconfig:
-            raise HTTPException(
-                status_code=404, detail=f"Crawl Config '{cid}' not found"
-            )

         if not crawlconfig.inactive:
             self._add_curr_crawl_stats(
@@ -1136,11 +1130,6 @@ def init_crawl_config_api(
     async def make_inactive(cid: UUID, org: Organization = Depends(org_crawl_dep)):
         crawlconfig = await ops.get_crawl_config(cid, org.id)

-        if not crawlconfig:
-            raise HTTPException(
-                status_code=404, detail=f"Crawl Config '{cid}' not found"
-            )
-
         return await ops.do_make_inactive(crawlconfig)

     org_ops.router.include_router(router)
@@ -33,6 +33,8 @@ class EmailSender:

     log_sent_emails: bool

+    default_origin: str
+
     def __init__(self):
         self.sender = os.environ.get("EMAIL_SENDER") or "Browsertrix admin"
         self.password = os.environ.get("EMAIL_PASSWORD") or ""
@@ -44,7 +46,7 @@ class EmailSender:

         self.log_sent_emails = is_bool(os.environ.get("LOG_SENT_EMAILS"))

-        self.default_origin = os.environ.get("APP_ORIGIN")
+        self.default_origin = os.environ.get("APP_ORIGIN", "")

         self.templates = Jinja2Templates(
             directory=os.path.join(os.path.dirname(__file__), "email-templates")
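`os.environ.get(key)` is typed as returning `Optional[str]`, so assigning it directly to the newly declared `default_origin: str` attribute would fail type checking; passing an explicit `""` default keeps the attribute a plain `str`. A small illustration of the same idea (the class and variable names here are made up):

```python
import os


class Settings:
    # declared as plain str, not Optional[str]
    origin: str

    def __init__(self) -> None:
        # os.environ.get("APP_ORIGIN") is Optional[str]; mypy would reject
        # assigning it to a `str` attribute. Supplying a default fixes that.
        self.origin = os.environ.get("APP_ORIGIN", "")


print(Settings().origin or "<unset>")
```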
@@ -99,7 +101,7 @@ class EmailSender:
             server.send_message(msg)
             # server.sendmail(self.sender, receiver, message)

-    def get_origin(self, headers):
+    def get_origin(self, headers) -> str:
         """Return origin of the received request"""
         if not headers:
             return self.default_origin
@@ -2,7 +2,7 @@

 import asyncio
 import os
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 from kubernetes.utils import parse_quantity

 import yaml
@@ -44,7 +44,7 @@ class K8sOpAPI(K8sAPI):
         self.compute_crawler_resources()
         self.compute_profile_resources()

-    def compute_crawler_resources(self):
+    def compute_crawler_resources(self) -> None:
         """compute memory / cpu resources for crawlers"""
         p = self.shared_params
         num_workers = max(int(p["crawler_browser_instances"]), 1)
@@ -105,7 +105,7 @@ class K8sOpAPI(K8sAPI):
         p["qa_memory"] = qa_memory
         p["qa_workers"] = qa_num_workers

-    def compute_profile_resources(self):
+    def compute_profile_resources(self) -> None:
         """compute memory /cpu resources for a single profile browser"""
         p = self.shared_params
         # if no profile specific options provided, default to crawler base for one browser
@@ -122,7 +122,7 @@ class K8sOpAPI(K8sAPI):
         print(f"cpu = {profile_cpu}")
         print(f"memory = {profile_memory}")

-    async def async_init(self):
+    async def async_init(self) -> None:
         """perform any async init here"""
         self.has_pod_metrics = await self.is_pod_metrics_available()
         print("Pod Metrics Available:", self.has_pod_metrics)
@@ -172,16 +172,16 @@ class BaseOperator:
         # see: https://stackoverflow.com/a/74059981
         self.bg_tasks = set()

-    def init_routes(self, app):
+    def init_routes(self, app) -> None:
         """init routes for this operator"""

-    def run_task(self, func):
+    def run_task(self, func) -> None:
         """add bg tasks to set to avoid premature garbage collection"""
         task = asyncio.create_task(func)
         self.bg_tasks.add(task)
         task.add_done_callback(self.bg_tasks.discard)

-    def load_from_yaml(self, filename, params):
+    def load_from_yaml(self, filename, params) -> list[Any]:
         """load and parse k8s template from yaml file"""
         return list(
             yaml.safe_load_all(
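The `run_task` helper above keeps a strong reference to each fire-and-forget task; asyncio itself only holds weak references, so a task can otherwise be garbage-collected before it finishes (the linked Stack Overflow answer covers this). A standalone sketch of the same pattern, independent of the operator classes:

```python
import asyncio


class TaskRunner:
    def __init__(self) -> None:
        # Strong references to in-flight tasks; without these, a background
        # task could be collected mid-run.
        self.bg_tasks: set[asyncio.Task] = set()

    def run_task(self, coro) -> None:
        task = asyncio.create_task(coro)
        self.bg_tasks.add(task)
        # Drop the reference once the task completes.
        task.add_done_callback(self.bg_tasks.discard)


async def main() -> None:
    runner = TaskRunner()
    runner.run_task(asyncio.sleep(0.1))
    await asyncio.sleep(0.2)  # give the background task time to finish


asyncio.run(main())
```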
@@ -1449,6 +1449,9 @@ class CrawlOperator(BaseOperator):
         """Increment Crawl Stats"""

         started = from_k8s_date(crawl.started)
+        if not started:
+            print("Missing crawl start time, unable to increment crawl stats")
+            return

         duration = int((finished - started).total_seconds())

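Because `from_k8s_date` is now annotated to return `Optional[datetime]` (see the utils hunk later in this commit), callers doing arithmetic on the result have to rule out `None` first, hence the early return added here. A condensed sketch of the guard, with a simplified parser standing in for the real helper:

```python
from datetime import datetime
from typing import Optional


def from_k8s_date(string: str) -> Optional[datetime]:
    # simplified stand-in: returns None for an empty/missing timestamp
    return datetime.fromisoformat(string[:-1]) if string else None


def crawl_duration_seconds(started_str: str, finished: datetime) -> Optional[int]:
    started = from_k8s_date(started_str)
    if not started:
        # mirrors the early return above: nothing to compute without a start time
        return None
    return int((finished - started).total_seconds())


print(crawl_duration_seconds("2024-05-01T10:00:00Z", datetime(2024, 5, 1, 10, 5)))
```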
@@ -102,6 +102,7 @@ class PageOps:
         if not status and page_dict.get("loadState"):
             status = 200

+        ts = page_dict.get("ts")
         p = Page(
             id=page_id,
             oid=oid,
@@ -111,9 +112,7 @@ class PageOps:
             loadState=page_dict.get("loadState"),
             status=status,
             mime=page_dict.get("mime", "text/html"),
-            ts=(
-                from_k8s_date(page_dict.get("ts")) if page_dict.get("ts") else dt_now()
-            ),
+            ts=(from_k8s_date(ts) if ts else dt_now()),
         )
         p.compute_page_type()
         return p
@@ -403,7 +402,7 @@ class PageOps:

         remaining_notes = []
         for note in page_notes:
-            if not note.get("id") in delete.delete_list:
+            if note.get("id") not in delete.delete_list:
                 remaining_notes.append(note)

         modified = dt_now()
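The change from `not note.get("id") in ...` to `note.get("id") not in ...` does not alter behaviour, since `not x in y` already parses as `not (x in y)`; `not in` is simply the idiomatic spelling and avoids the misreading that `not x` is evaluated first. A tiny demonstration:

```python
delete_list = ["a", "b"]

# `not` binds more loosely than `in`, so both spellings give the same result.
assert (not "c" in delete_list) is ("c" not in delete_list)
assert (not "a" in delete_list) is ("a" not in delete_list)
```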
@@ -1,12 +1,13 @@
 """ Profile Management """

-from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List
+from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple
 from uuid import UUID, uuid4
 import os

 from urllib.parse import urlencode

 from fastapi import APIRouter, Depends, Request, HTTPException
+from starlette.requests import Headers
 import aiohttp

 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -30,6 +31,7 @@ from .models import (
     SuccessResponseStorageQuota,
     ProfilePingResponse,
     ProfileBrowserGetUrlResponse,
+    CrawlConfigProfileOut,
 )
 from .utils import dt_now

@@ -58,6 +60,9 @@ class ProfileOps:
     crawlconfigs: CrawlConfigOps
     background_job_ops: BackgroundJobOps

+    browser_fqdn_suffix: str
+    router: APIRouter
+
     def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
         self.profiles = mdb["profiles"]
         self.orgs = orgs
@@ -66,7 +71,7 @@ class ProfileOps:
         self.crawl_manager = crawl_manager
         self.storage_ops = storage_ops

-        self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX")
+        self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "")

         self.router = APIRouter(
             prefix="/profiles",
@@ -82,16 +87,16 @@ class ProfileOps:

     async def create_new_browser(
         self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
-    ):
+    ) -> BrowserId:
         """Create new profile"""
-        prev_profile = ""
+        prev_profile_path = ""
         prev_profile_id = ""
         if profile_launch.profileId:
-            prev_profile = await self.get_profile_storage_path(
+            prev_profile_path = await self.get_profile_storage_path(
                 profile_launch.profileId, org
             )

-            if not prev_profile:
+            if not prev_profile_path:
                 raise HTTPException(status_code=400, detail="invalid_base_profile")

             prev_profile_id = str(profile_launch.profileId)
@@ -109,7 +114,7 @@ class ProfileOps:
             storage=org.storage,
             crawler_image=crawler_image,
             baseprofile=prev_profile_id,
-            profile_filename=prev_profile,
+            profile_filename=prev_profile_path,
         )

         if not browserid:
@@ -117,7 +122,9 @@ class ProfileOps:

         return BrowserId(browserid=browserid)

-    async def get_profile_browser_url(self, browserid, oid, headers):
+    async def get_profile_browser_url(
+        self, browserid: str, oid: str, headers: Headers
+    ) -> dict[str, str | int]:
         """get profile browser url"""
         json = await self._send_browser_req(browserid, "/vncpass")

@@ -130,7 +137,7 @@ class ProfileOps:
         host = headers.get("Host") or "localhost"
         # ws_scheme = "wss" if scheme == "https" else "ws"

-        auth_bearer = headers.get("Authorization").split(" ")[1]
+        auth_bearer = headers.get("Authorization", "").split(" ")[1]

         params = {
             "path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
@@ -144,7 +151,7 @@ class ProfileOps:
         params["url"] = url
         return params

-    async def ping_profile_browser(self, browserid):
+    async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
         """ping profile browser to keep it running"""
         await self.crawl_manager.ping_profile_browser(browserid)

@@ -152,7 +159,9 @@ class ProfileOps:

         return {"success": True, "origins": json.get("origins") or []}

-    async def navigate_profile_browser(self, browserid, urlin: UrlIn):
+    async def navigate_profile_browser(
+        self, browserid: str, urlin: UrlIn
+    ) -> dict[str, bool]:
         """ping profile browser to keep it running"""
         await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())

@@ -255,7 +264,7 @@ class ProfileOps:

     async def update_profile_metadata(
         self, profileid: UUID, update: ProfileUpdate, user: User
-    ):
+    ) -> dict[str, bool]:
         """Update name and description metadata only on existing profile"""
         query = {
             "name": update.name,
@@ -282,7 +291,7 @@ class ProfileOps:
         page: int = 1,
         sort_by: str = "modified",
         sort_direction: int = -1,
-    ):
+    ) -> Tuple[list[Profile], int]:
         """list all profiles"""
         # pylint: disable=too-many-locals,duplicate-code

@@ -334,7 +343,9 @@ class ProfileOps:
         profiles = [Profile.from_dict(res) for res in items]
         return profiles, total

-    async def get_profile(self, profileid: UUID, org: Optional[Organization] = None):
+    async def get_profile(
+        self, profileid: UUID, org: Optional[Organization] = None
+    ) -> Profile:
         """get profile by id and org"""
         query: dict[str, object] = {"_id": profileid}
         if org:
@@ -346,7 +357,9 @@ class ProfileOps:

         return Profile.from_dict(res)

-    async def get_profile_with_configs(self, profileid: UUID, org: Organization):
+    async def get_profile_with_configs(
+        self, profileid: UUID, org: Organization
+    ) -> ProfileWithCrawlConfigs:
         """get profile for api output, with crawlconfigs"""

         profile = await self.get_profile(profileid, org)
@@ -357,27 +370,33 @@ class ProfileOps:

     async def get_profile_storage_path(
         self, profileid: UUID, org: Optional[Organization] = None
-    ):
+    ) -> str:
         """return profile path filename (relative path) for given profile id and org"""
         try:
             profile = await self.get_profile(profileid, org)
-            return profile.resource.filename
+            return profile.resource.filename if profile.resource else ""
         # pylint: disable=bare-except
         except:
-            return None
+            pass
+
+        return ""

     async def get_profile_name(
         self, profileid: UUID, org: Optional[Organization] = None
-    ):
+    ) -> str:
         """return profile for given profile id and org"""
         try:
             profile = await self.get_profile(profileid, org)
             return profile.name
         # pylint: disable=bare-except
         except:
-            return None
+            pass
+
+        return ""

-    async def get_crawl_configs_for_profile(self, profileid: UUID, org: Organization):
+    async def get_crawl_configs_for_profile(
+        self, profileid: UUID, org: Organization
+    ) -> list[CrawlConfigProfileOut]:
         """Get list of crawl configs with basic info for that use a particular profile"""

         crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile(
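`get_profile_storage_path` and `get_profile_name` previously returned `None` on failure, which conflicts with their new `-> str` annotations; the failure value becomes an empty string so callers always receive a `str`. A compact sketch of the before/after shape, with illustrative names and an in-memory dict standing in for the database lookup:

```python
from typing import Optional

PROFILES = {"p1": {"name": "Default profile", "filename": "profiles/p1.tar.gz"}}


def get_profile_name_old(profileid: str) -> Optional[str]:
    # Old shape: failure surfaces as None, so callers must handle Optional[str].
    try:
        return PROFILES[profileid]["name"]
    except KeyError:
        return None


def get_profile_name(profileid: str) -> str:
    # New shape: failure collapses to "", keeping the return type a plain str.
    try:
        return PROFILES[profileid]["name"]
    except KeyError:
        return ""


print(get_profile_name("p1"), repr(get_profile_name("missing")))
```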
@@ -386,7 +405,9 @@ class ProfileOps:

         return crawlconfig_info

-    async def delete_profile(self, profileid: UUID, org: Organization):
+    async def delete_profile(
+        self, profileid: UUID, org: Organization
+    ) -> dict[str, Any]:
         """delete profile, if not used in active crawlconfig"""
         profile = await self.get_profile_with_configs(profileid, org)

@@ -403,27 +424,32 @@ class ProfileOps:
             await self.orgs.inc_org_bytes_stored(
                 org.id, -profile.resource.size, "profile"
             )
+            await self.background_job_ops.create_delete_replica_jobs(
+                org, profile.resource, str(profile.id), "profile"
+            )

         res = await self.profiles.delete_one(query)
         if not res or res.deleted_count != 1:
             raise HTTPException(status_code=404, detail="profile_not_found")

-        await self.background_job_ops.create_delete_replica_jobs(
-            org, profile.resource, profile.id, "profile"
-        )
-
         quota_reached = await self.orgs.storage_quota_reached(org.id)

         return {"success": True, "storageQuotaReached": quota_reached}

-    async def delete_profile_browser(self, browserid):
+    async def delete_profile_browser(self, browserid: str) -> dict[str, bool]:
         """delete profile browser immediately"""
         if not await self.crawl_manager.delete_profile_browser(browserid):
             raise HTTPException(status_code=404, detail="browser_not_found")

         return {"success": True}

-    async def _send_browser_req(self, browserid, path, method="GET", json=None):
+    async def _send_browser_req(
+        self,
+        browserid: str,
+        path: str,
+        method: str = "GET",
+        json: Optional[dict[str, Any]] = None,
+    ) -> dict[str, Any]:
         """make request to browser api to get state"""
         try:
             async with aiohttp.ClientSession() as session:
@@ -438,7 +464,7 @@ class ProfileOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=200, detail="waiting_for_browser")

-        return json
+        return json or {}

     async def add_profile_file_replica(
         self, profileid: UUID, filename: str, ref: StorageRef
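Since `_send_browser_req` is now declared to return `dict[str, Any]`, the possibly-unset `json` value is coalesced with `or {}` before returning; otherwise the type checker would flag a potential `None` return. In isolation the pattern looks like this (hypothetical function name):

```python
from typing import Any, Optional


def fetch_state(response_json: Optional[dict[str, Any]]) -> dict[str, Any]:
    # `x or {}` maps both None and an empty value to {}, which is enough
    # to satisfy a non-Optional dict return type.
    return response_json or {}


print(fetch_state(None), fetch_state({"origins": ["https://example.com/"]}))
```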
@@ -453,7 +479,12 @@ class ProfileOps:
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
 def init_profiles_api(
-    mdb, org_ops, crawl_manager, storage_ops, background_job_ops, user_dep
+    mdb,
+    org_ops: OrgOps,
+    crawl_manager: CrawlManager,
+    storage_ops: StorageOps,
+    background_job_ops: BackgroundJobOps,
+    user_dep,
 ):
     """init profile ops system"""
     ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)
@@ -584,6 +615,7 @@ def init_profiles_api(
     async def delete_profile_browser(browserid: str = Depends(browser_dep)):
         return await ops.delete_profile_browser(browserid)

-    org_ops.router.include_router(router)
+    if org_ops.router:
+        org_ops.router.include_router(router)

     return ops
@@ -37,6 +37,7 @@ from mypy_boto3_s3.type_defs import CompletedPartTypeDef
 from types_aiobotocore_s3 import S3Client as AIOS3Client

 from .models import (
+    BaseFile,
     CrawlFile,
     CrawlFileOut,
     Organization,
@@ -504,7 +505,7 @@ class StorageOps:
         return presigned_url

     async def delete_crawl_file_object(
-        self, org: Organization, crawlfile: CrawlFile
+        self, org: Organization, crawlfile: BaseFile
     ) -> bool:
         """delete crawl file from storage."""
         return await self._delete_file(org, crawlfile.filename, crawlfile.storage)
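Widening `delete_crawl_file_object` to accept `BaseFile` rather than the narrower `CrawlFile` lets the same deletion path serve profile files as well (see the `BaseFile(**profile.resource.dict())` conversion in the first hunk), since the method only touches fields defined on the base model. A schematic of the idea, with simplified stand-in classes instead of the real pydantic models:

```python
from dataclasses import dataclass


@dataclass
class BaseFile:
    filename: str
    storage: str


@dataclass
class CrawlFile(BaseFile):
    # extra crawl-only metadata that the delete path never needs
    crc32: int = 0


def delete_file_object(file: BaseFile) -> str:
    # Only base fields are used, so BaseFile itself or any subclass is accepted.
    return f"deleting {file.filename} from {file.storage}"


print(delete_file_object(CrawlFile("crawl.wacz", "default", crc32=123)))
print(delete_file_object(BaseFile("profile.tar.gz", "default")))
```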
@@ -40,12 +40,12 @@ def get_templates_dir():
     return os.path.join(os.path.dirname(__file__), "templates")


-def from_k8s_date(string):
+def from_k8s_date(string: str) -> Optional[datetime]:
     """convert k8s date string to datetime"""
     return datetime.fromisoformat(string[:-1]) if string else None


-def to_k8s_date(dt_val):
+def to_k8s_date(dt_val: datetime) -> str:
     """convert datetime to string for k8s"""
     return dt_val.isoformat("T") + "Z"
