""" Profile Management """ from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple from uuid import UUID, uuid4 import os from urllib.parse import urlencode from fastapi import APIRouter, Depends, Request, HTTPException from starlette.requests import Headers import aiohttp from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( Profile, ProfileWithCrawlConfigs, ProfileFile, UrlIn, ProfileLaunchBrowserIn, BrowserId, ProfileCreate, ProfileUpdate, Organization, User, PaginatedProfileResponse, StorageRef, EmptyResponse, SuccessResponse, AddedResponseIdQuota, UpdatedResponse, SuccessResponseStorageQuota, ProfilePingResponse, ProfileBrowserGetUrlResponse, CrawlConfigProfileOut, ) from .utils import dt_now if TYPE_CHECKING: from .orgs import OrgOps from .crawlmanager import CrawlManager from .storages import StorageOps from .crawlconfigs import CrawlConfigOps from .background_jobs import BackgroundJobOps else: OrgOps = CrawlManager = StorageOps = CrawlConfigOps = BackgroundJobOps = object BROWSER_EXPIRE = 300 # ============================================================================ # pylint: disable=too-many-instance-attributes, too-many-arguments class ProfileOps: """Profile management""" orgs: OrgOps crawl_manager: CrawlManager storage_ops: StorageOps crawlconfigs: CrawlConfigOps background_job_ops: BackgroundJobOps browser_fqdn_suffix: str router: APIRouter def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops): self.profiles = mdb["profiles"] self.orgs = orgs self.background_job_ops = background_job_ops self.crawl_manager = crawl_manager self.storage_ops = storage_ops self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "") self.router = APIRouter( prefix="/profiles", tags=["profiles"], responses={404: {"description": "Not found"}}, ) self.crawlconfigs = cast(CrawlConfigOps, None) def set_crawlconfigs(self, crawlconfigs): """set crawlconfigs ops""" self.crawlconfigs = crawlconfigs async def create_new_browser( self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn ) -> BrowserId: """Create new profile""" prev_profile_path = "" prev_profile_id = "" if profile_launch.profileId: prev_profile_path = await self.get_profile_storage_path( profile_launch.profileId, org ) if not prev_profile_path: raise HTTPException(status_code=400, detail="invalid_base_profile") prev_profile_id = str(profile_launch.profileId) crawler_image = self.crawlconfigs.get_channel_crawler_image( profile_launch.crawlerChannel ) if not crawler_image: raise HTTPException(status_code=404, detail="crawler_not_found") browserid = await self.crawl_manager.run_profile_browser( str(user.id), str(org.id), url=str(profile_launch.url), storage=org.storage, crawler_image=crawler_image, baseprofile=prev_profile_id, profile_filename=prev_profile_path, ) if not browserid: raise HTTPException(status_code=400, detail="browser_not_created") return BrowserId(browserid=browserid) async def get_profile_browser_url( self, browserid: str, oid: str, headers: Headers ) -> dict[str, str | int]: """get profile browser url""" json = await self._send_browser_req(browserid, "/vncpass") password = json.get("password") if not password: raise HTTPException(status_code=400, detail="browser_not_available") scheme = headers.get("X-Forwarded-Proto") or "http" host = headers.get("Host") or "localhost" # ws_scheme = "wss" if scheme == "https" else "ws" auth_bearer = headers.get("Authorization", "").split(" ")[1] params = { "path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}", "password": password, "oid": oid, "auth_bearer": auth_bearer, "scale": 0.75, } url = f"{scheme}://{host}/browser/{browserid}/?{urlencode(params)}" params["url"] = url return params async def ping_profile_browser(self, browserid: str) -> dict[str, Any]: """ping profile browser to keep it running""" await self.crawl_manager.ping_profile_browser(browserid) json = await self._send_browser_req(browserid, "/ping") return {"success": True, "origins": json.get("origins") or []} async def navigate_profile_browser( self, browserid: str, urlin: UrlIn ) -> dict[str, bool]: """ping profile browser to keep it running""" await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict()) return {"success": True} async def commit_to_profile( self, browser_commit: ProfileCreate, org: Organization, user: User, metadata: dict, existing_profile: Optional[Profile] = None, ) -> dict[str, Any]: """commit profile and shutdown profile browser""" # pylint: disable=too-many-locals now = dt_now() if existing_profile: profileid = existing_profile.id created = existing_profile.created created_by = existing_profile.createdBy created_by_name = existing_profile.createdByName else: profileid = uuid4() created = now created_by = user.id created_by_name = user.name if user.name else user.email filename_data = {"filename": f"profiles/profile-{profileid}.tar.gz"} json = await self._send_browser_req( browser_commit.browserid, "/createProfileJS", "POST", json=filename_data ) try: resource = json["resource"] except: # pylint: disable=raise-missing-from raise HTTPException(status_code=400, detail="browser_not_valid") await self.crawl_manager.delete_profile_browser(browser_commit.browserid) # backwards compatibility file_size = resource.get("size") or resource.get("bytes") profile_file = ProfileFile( hash=resource["hash"], size=file_size, filename=resource["path"], storage=org.storage, ) baseid = metadata.get("btrix.baseprofile") if baseid: print("baseid", baseid) baseid = UUID(baseid) self.orgs.can_write_data(org, include_time=False) profile = Profile( id=profileid, name=browser_commit.name, description=browser_commit.description, created=created, createdBy=created_by, createdByName=created_by_name, modified=now, modifiedBy=user.id, modifiedByName=user.name if user.name else user.email, origins=json["origins"], resource=profile_file, userid=UUID(metadata.get("btrix.user")), oid=org.id, baseid=baseid, crawlerChannel=browser_commit.crawlerChannel, ) await self.profiles.find_one_and_update( {"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True ) await self.background_job_ops.create_replica_jobs( org.id, profile_file, str(profileid), "profile" ) await self.orgs.inc_org_bytes_stored(org.id, file_size, "profile") return { "added": True, "id": str(profile.id), "storageQuotaReached": self.orgs.storage_quota_reached(org), } async def update_profile_metadata( self, profileid: UUID, update: ProfileUpdate, user: User ) -> dict[str, bool]: """Update name and description metadata only on existing profile""" query = { "name": update.name, "modified": dt_now(), "modifiedBy": user.id, "modifiedByName": user.name if user.name else user.email, } if update.description is not None: query["description"] = update.description if not await self.profiles.find_one_and_update( {"_id": profileid}, {"$set": query} ): raise HTTPException(status_code=404, detail="profile_not_found") return {"updated": True} async def list_profiles( self, org: Organization, userid: Optional[UUID] = None, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: str = "modified", sort_direction: int = -1, ) -> Tuple[list[Profile], int]: """list all profiles""" # pylint: disable=too-many-locals,duplicate-code # Zero-index page for query page = page - 1 skip = page_size * page match_query = {"oid": org.id} if userid: match_query["userid"] = userid aggregate: List[Dict[str, Any]] = [{"$match": match_query}] if sort_by: if sort_by not in ("modified", "created", "name", "url"): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") if sort_by == "url": sort_by = "origins.0" aggregate.extend([{"$sort": {sort_by: sort_direction}}]) aggregate.extend( [ { "$facet": { "items": [ {"$skip": skip}, {"$limit": page_size}, ], "total": [{"$count": "count"}], } }, ] ) cursor = self.profiles.aggregate(aggregate) results = await cursor.to_list(length=1) result = results[0] items = result["items"] try: total = int(result["total"][0]["count"]) except (IndexError, ValueError): total = 0 profiles = [Profile.from_dict(res) for res in items] return profiles, total async def get_profile( self, profileid: UUID, org: Optional[Organization] = None ) -> Profile: """get profile by id and org""" query: dict[str, object] = {"_id": profileid} if org: query["oid"] = org.id res = await self.profiles.find_one(query) if not res: raise HTTPException(status_code=404, detail="profile_not_found") return Profile.from_dict(res) async def get_profile_with_configs( self, profileid: UUID, org: Organization ) -> ProfileWithCrawlConfigs: """get profile for api output, with crawlconfigs""" profile = await self.get_profile(profileid, org) crawlconfigs = await self.get_crawl_configs_for_profile(profileid, org) return ProfileWithCrawlConfigs(crawlconfigs=crawlconfigs, **profile.dict()) async def get_profile_storage_path( self, profileid: UUID, org: Optional[Organization] = None ) -> str: """return profile path filename (relative path) for given profile id and org""" try: profile = await self.get_profile(profileid, org) return profile.resource.filename if profile.resource else "" # pylint: disable=bare-except except: pass return "" async def get_profile_name( self, profileid: UUID, org: Optional[Organization] = None ) -> str: """return profile for given profile id and org""" try: profile = await self.get_profile(profileid, org) return profile.name # pylint: disable=bare-except except: pass return "" async def get_crawl_configs_for_profile( self, profileid: UUID, org: Organization ) -> list[CrawlConfigProfileOut]: """Get list of crawl configs with basic info for that use a particular profile""" crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile( profileid, org ) return crawlconfig_info async def delete_profile( self, profileid: UUID, org: Organization ) -> dict[str, Any]: """delete profile, if not used in active crawlconfig""" profile = await self.get_profile_with_configs(profileid, org) if len(profile.crawlconfigs) > 0: return {"error": "in_use", "crawlconfigs": profile.crawlconfigs} query: dict[str, object] = {"_id": profileid} if org: query["oid"] = org.id # Delete file from storage if profile.resource: await self.storage_ops.delete_crawl_file_object(org, profile.resource) await self.orgs.inc_org_bytes_stored( org.id, -profile.resource.size, "profile" ) await self.background_job_ops.create_delete_replica_jobs( org, profile.resource, str(profile.id), "profile" ) res = await self.profiles.delete_one(query) if not res or res.deleted_count != 1: raise HTTPException(status_code=404, detail="profile_not_found") quota_reached = self.orgs.storage_quota_reached(org) return {"success": True, "storageQuotaReached": quota_reached} async def delete_profile_browser(self, browserid: str) -> dict[str, bool]: """delete profile browser immediately""" if not await self.crawl_manager.delete_profile_browser(browserid): raise HTTPException(status_code=404, detail="browser_not_found") return {"success": True} async def _send_browser_req( self, browserid: str, path: str, method: str = "GET", json: Optional[dict[str, Any]] = None, ) -> dict[str, Any]: """make request to browser api to get state""" try: async with aiohttp.ClientSession() as session: async with session.request( method, f"http://browser-{browserid}.browser{self.browser_fqdn_suffix}:9223{path}", json=json, ) as resp: json = await resp.json() except Exception: # pylint: disable=raise-missing-from raise HTTPException(status_code=200, detail="waiting_for_browser") return json or {} async def add_profile_file_replica( self, profileid: UUID, filename: str, ref: StorageRef ) -> dict[str, object]: """Add replica StorageRef to existing ProfileFile""" return await self.profiles.find_one_and_update( {"_id": profileid, "resource.filename": filename}, {"$push": {"resource.replicas": {"name": ref.name, "custom": ref.custom}}}, ) async def calculate_org_profile_file_storage(self, oid: UUID) -> int: """Calculate and return total size of profile files in org""" total_size = 0 cursor = self.profiles.find({"oid": oid}) async for profile_dict in cursor: file_ = profile_dict.get("resource") if file_: total_size += file_.get("size", 0) return total_size # ============================================================================ # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments def init_profiles_api( mdb, org_ops: OrgOps, crawl_manager: CrawlManager, storage_ops: StorageOps, background_job_ops: BackgroundJobOps, user_dep, ): """init profile ops system""" ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops) router = ops.router org_crawl_dep = org_ops.org_crawl_dep async def browser_get_metadata( browserid: str, org: Organization = Depends(org_crawl_dep) ): # if await ops.redis.hget(f"br:{browserid}", "org") != str(org.id): metadata = await crawl_manager.get_profile_browser_metadata(browserid) if metadata.get("btrix.org") != str(org.id): raise HTTPException(status_code=404, detail="no_such_browser") return metadata async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)): await browser_get_metadata(browserid, org) return browserid @router.get("", response_model=PaginatedProfileResponse) async def list_profiles( org: Organization = Depends(org_crawl_dep), userid: Optional[UUID] = None, pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, sortBy: str = "modified", sortDirection: int = -1, ): profiles, total = await ops.list_profiles( org, userid, page_size=pageSize, page=page, sort_by=sortBy, sort_direction=sortDirection, ) return paginated_format(profiles, total, page, pageSize) @router.post("", response_model=AddedResponseIdQuota) async def commit_browser_to_new( browser_commit: ProfileCreate, org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): metadata = await browser_get_metadata(browser_commit.browserid, org) return await ops.commit_to_profile(browser_commit, org, user, metadata) @router.patch("/{profileid}", response_model=UpdatedResponse) async def commit_browser_to_existing( browser_commit: ProfileUpdate, profileid: UUID, org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): if not browser_commit.browserid: await ops.update_profile_metadata(profileid, browser_commit, user) else: metadata = await browser_get_metadata(browser_commit.browserid, org) profile = await ops.get_profile(profileid) await ops.commit_to_profile( browser_commit=ProfileCreate( browserid=browser_commit.browserid, name=browser_commit.name, description=browser_commit.description or profile.description, crawlerChannel=profile.crawlerChannel, ), org=org, user=user, metadata=metadata, existing_profile=profile, ) return {"updated": True} @router.get("/{profileid}", response_model=ProfileWithCrawlConfigs) async def get_profile( profileid: UUID, org: Organization = Depends(org_crawl_dep), ): return await ops.get_profile_with_configs(profileid, org) @router.delete("/{profileid}", response_model=SuccessResponseStorageQuota) async def delete_profile( profileid: UUID, org: Organization = Depends(org_crawl_dep), ): return await ops.delete_profile(profileid, org) @router.post("/browser", response_model=BrowserId) async def create_new( profile_launch: ProfileLaunchBrowserIn, org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): return await ops.create_new_browser(org, user, profile_launch) @router.post("/browser/{browserid}/ping", response_model=ProfilePingResponse) async def ping_profile_browser(browserid: str = Depends(browser_dep)): return await ops.ping_profile_browser(browserid) @router.post("/browser/{browserid}/navigate", response_model=SuccessResponse) async def navigate_profile_browser( urlin: UrlIn, browserid: str = Depends(browser_dep) ): return await ops.navigate_profile_browser(browserid, urlin) @router.get("/browser/{browserid}", response_model=ProfileBrowserGetUrlResponse) async def get_profile_browser_url( request: Request, browserid: str = Depends(browser_dep), org: Organization = Depends(org_crawl_dep), ): return await ops.get_profile_browser_url( browserid, str(org.id), request.headers ) # pylint: disable=unused-argument @router.get("/browser/{browserid}/access", response_model=EmptyResponse) async def access_check(browserid: str = Depends(browser_dep)): return {} @router.delete("/browser/{browserid}", response_model=SuccessResponse) async def delete_profile_browser(browserid: str = Depends(browser_dep)): return await ops.delete_profile_browser(browserid) if org_ops.router: org_ops.router.include_router(router) return ops