- instead of looking up storage and exec minute quotas by oid and loading the org each time, load the org once and then check quotas on the org object - in many cases the org was already available and was being looked up again (see sketches below)
- storage and exec quota checks become sync
- rename can_run_crawl() to the more generic can_write_data(), which optionally also checks exec minutes
- typing: get_org_by_id() always returns an org or throws; adjust methods accordingly (don't check for None, catch the exception)
- typing: fix typo in BaseOperator, catch type errors in operator 'org_ops'
- operator quota check: use up-to-date 'status.size' for the current job, and ignore the current job in the all-jobs list to avoid double-counting
- follow up to #1969
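A minimal sketch of the quota-check pattern described above. `Org`, `OrgOpsSketch`, and the field names are illustrative stand-ins, not the real Browsertrix models; only the method names `can_write_data()` and `storage_quota_reached()` come from this change. The point is that the caller passes an already-loaded org, so the checks are plain synchronous attribute reads with no extra lookup by oid:

```python
from dataclasses import dataclass

from fastapi import HTTPException


@dataclass
class Org:
    """illustrative stand-in for the loaded Organization"""

    bytes_stored: int = 0
    storage_quota: int = 0  # 0 = unlimited
    exec_mins_used: int = 0
    exec_mins_quota: int = 0  # 0 = unlimited


class OrgOpsSketch:
    """hypothetical subset of OrgOps showing the sync checks"""

    def storage_quota_reached(self, org: Org) -> bool:
        # reads the already-loaded org; no db round-trip
        return 0 < org.storage_quota <= org.bytes_stored

    def exec_mins_quota_reached(self, org: Org) -> bool:
        return 0 < org.exec_mins_quota <= org.exec_mins_used

    def can_write_data(self, org: Org, include_time: bool = True) -> None:
        """rename of can_run_crawl(): raises instead of returning a flag"""
        if self.storage_quota_reached(org):
            raise HTTPException(status_code=403, detail="storage_quota_reached")
        if include_time and self.exec_mins_quota_reached(org):
            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
```

The operator-side fix can be sketched the same way (again with assumed names): sum the stored sizes of all jobs *except* the current one, then add the current job's live `status.size`, so the running job's bytes are never counted twice:

```python
def org_bytes_with_current_job(
    stored_job_sizes: dict[str, int], current_job_id: str, current_status_size: int
) -> int:
    # skip the current job in the stored list; its fresh size comes from status.size
    other = sum(size for jid, size in stored_job_sizes.items() if jid != current_job_id)
    return other + current_status_size
```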
""" Profile Management """
|
|
|
|
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple
|
|
from uuid import UUID, uuid4
|
|
import os
|
|
|
|
from urllib.parse import urlencode
|
|
|
|
from fastapi import APIRouter, Depends, Request, HTTPException
|
|
from starlette.requests import Headers
|
|
import aiohttp
|
|
|
|
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
|
|
from .models import (
|
|
Profile,
|
|
ProfileWithCrawlConfigs,
|
|
ProfileFile,
|
|
UrlIn,
|
|
ProfileLaunchBrowserIn,
|
|
BrowserId,
|
|
ProfileCreate,
|
|
ProfileUpdate,
|
|
Organization,
|
|
User,
|
|
PaginatedProfileResponse,
|
|
StorageRef,
|
|
EmptyResponse,
|
|
SuccessResponse,
|
|
AddedResponseIdQuota,
|
|
UpdatedResponse,
|
|
SuccessResponseStorageQuota,
|
|
ProfilePingResponse,
|
|
ProfileBrowserGetUrlResponse,
|
|
CrawlConfigProfileOut,
|
|
)
|
|
from .utils import dt_now
|
|
|
|
if TYPE_CHECKING:
|
|
from .orgs import OrgOps
|
|
from .crawlmanager import CrawlManager
|
|
from .storages import StorageOps
|
|
from .crawlconfigs import CrawlConfigOps
|
|
from .background_jobs import BackgroundJobOps
|
|
else:
|
|
OrgOps = CrawlManager = StorageOps = CrawlConfigOps = BackgroundJobOps = object
|
|
|
|
|
|
BROWSER_EXPIRE = 300
|
|
|
|
|
|
# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-arguments
class ProfileOps:
    """Profile management"""

    orgs: OrgOps
    crawl_manager: CrawlManager
    storage_ops: StorageOps

    crawlconfigs: CrawlConfigOps
    background_job_ops: BackgroundJobOps

    browser_fqdn_suffix: str
    router: APIRouter

    def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
        self.profiles = mdb["profiles"]
        self.orgs = orgs
        self.background_job_ops = background_job_ops

        self.crawl_manager = crawl_manager
        self.storage_ops = storage_ops

        self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "")

        self.router = APIRouter(
            prefix="/profiles",
            tags=["profiles"],
            responses={404: {"description": "Not found"}},
        )

        # set later via set_crawlconfigs(), after crawlconfigs ops exists
        self.crawlconfigs = cast(CrawlConfigOps, None)

    def set_crawlconfigs(self, crawlconfigs):
        """set crawlconfigs ops"""
        self.crawlconfigs = crawlconfigs

    async def create_new_browser(
        self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
    ) -> BrowserId:
        """Create new profile"""
        prev_profile_path = ""
        prev_profile_id = ""
        if profile_launch.profileId:
            prev_profile_path = await self.get_profile_storage_path(
                profile_launch.profileId, org
            )

            if not prev_profile_path:
                raise HTTPException(status_code=400, detail="invalid_base_profile")

            prev_profile_id = str(profile_launch.profileId)

        crawler_image = self.crawlconfigs.get_channel_crawler_image(
            profile_launch.crawlerChannel
        )
        if not crawler_image:
            raise HTTPException(status_code=404, detail="crawler_not_found")

        browserid = await self.crawl_manager.run_profile_browser(
            str(user.id),
            str(org.id),
            url=str(profile_launch.url),
            storage=org.storage,
            crawler_image=crawler_image,
            baseprofile=prev_profile_id,
            profile_filename=prev_profile_path,
        )

        if not browserid:
            raise HTTPException(status_code=400, detail="browser_not_created")

        return BrowserId(browserid=browserid)

    async def get_profile_browser_url(
        self, browserid: str, oid: str, headers: Headers
    ) -> dict[str, str | float]:
        """get profile browser url"""
        json = await self._send_browser_req(browserid, "/vncpass")

        password = json.get("password")

        if not password:
            raise HTTPException(status_code=400, detail="browser_not_available")

        scheme = headers.get("X-Forwarded-Proto") or "http"
        host = headers.get("Host") or "localhost"
        # ws_scheme = "wss" if scheme == "https" else "ws"

        auth_bearer = headers.get("Authorization", "").split(" ")[1]

        params = {
            "path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
            "password": password,
            "oid": oid,
            "auth_bearer": auth_bearer,
            "scale": 0.75,
        }

        url = f"{scheme}://{host}/browser/{browserid}/?{urlencode(params)}"
        params["url"] = url
        return params

    async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
        """ping profile browser to keep it running"""
        await self.crawl_manager.ping_profile_browser(browserid)

        json = await self._send_browser_req(browserid, "/ping")

        return {"success": True, "origins": json.get("origins") or []}

    async def navigate_profile_browser(
        self, browserid: str, urlin: UrlIn
    ) -> dict[str, bool]:
        """navigate profile browser to url"""
        await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())

        return {"success": True}

    async def commit_to_profile(
        self,
        browser_commit: ProfileCreate,
        org: Organization,
        user: User,
        metadata: dict,
        existing_profile: Optional[Profile] = None,
    ) -> dict[str, Any]:
        """commit profile and shutdown profile browser"""
        # pylint: disable=too-many-locals

        now = dt_now()

        if existing_profile:
            profileid = existing_profile.id
            created = existing_profile.created
            created_by = existing_profile.createdBy
            created_by_name = existing_profile.createdByName
        else:
            profileid = uuid4()
            created = now
            created_by = user.id
            created_by_name = user.name if user.name else user.email

        filename_data = {"filename": f"profiles/profile-{profileid}.tar.gz"}

        json = await self._send_browser_req(
            browser_commit.browserid, "/createProfileJS", "POST", json=filename_data
        )

        try:
            resource = json["resource"]
        except:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=400, detail="browser_not_valid")

        await self.crawl_manager.delete_profile_browser(browser_commit.browserid)

        # backwards compatibility
        file_size = resource.get("size") or resource.get("bytes")

        profile_file = ProfileFile(
            hash=resource["hash"],
            size=file_size,
            filename=resource["path"],
            storage=org.storage,
        )

        baseid = metadata.get("btrix.baseprofile")
        if baseid:
            print("baseid", baseid)
            baseid = UUID(baseid)

        # sync check on the already-loaded org: raises if the storage quota
        # has been reached (exec minutes deliberately not checked here)
        self.orgs.can_write_data(org, include_time=False)

        profile = Profile(
            id=profileid,
            name=browser_commit.name,
            description=browser_commit.description,
            created=created,
            createdBy=created_by,
            createdByName=created_by_name,
            modified=now,
            modifiedBy=user.id,
            modifiedByName=user.name if user.name else user.email,
            origins=json["origins"],
            resource=profile_file,
            userid=UUID(metadata.get("btrix.user")),
            oid=org.id,
            baseid=baseid,
            crawlerChannel=browser_commit.crawlerChannel,
        )

        await self.profiles.find_one_and_update(
            {"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True
        )

        await self.background_job_ops.create_replica_jobs(
            org.id, profile_file, str(profileid), "profile"
        )

        await self.orgs.inc_org_bytes_stored(org.id, file_size, "profile")

        return {
            "added": True,
            "id": str(profile.id),
            "storageQuotaReached": self.orgs.storage_quota_reached(org),
        }

    async def update_profile_metadata(
        self, profileid: UUID, update: ProfileUpdate, user: User
    ) -> dict[str, bool]:
        """Update name and description metadata only on existing profile"""
        query = {
            "name": update.name,
            "modified": dt_now(),
            "modifiedBy": user.id,
            "modifiedByName": user.name if user.name else user.email,
        }

        if update.description is not None:
            query["description"] = update.description

        if not await self.profiles.find_one_and_update(
            {"_id": profileid}, {"$set": query}
        ):
            raise HTTPException(status_code=404, detail="profile_not_found")

        return {"updated": True}

    async def list_profiles(
        self,
        org: Organization,
        userid: Optional[UUID] = None,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: str = "modified",
        sort_direction: int = -1,
    ) -> Tuple[list[Profile], int]:
        """list all profiles"""
        # pylint: disable=too-many-locals,duplicate-code

        # Zero-index page for query
        page = page - 1
        skip = page_size * page

        match_query = {"oid": org.id}
        if userid:
            match_query["userid"] = userid

        aggregate: List[Dict[str, Any]] = [{"$match": match_query}]

        if sort_by:
            if sort_by not in ("modified", "created", "name", "url"):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")

            if sort_by == "url":
                sort_by = "origins.0"

            aggregate.extend([{"$sort": {sort_by: sort_direction}}])

        # fetch the requested page and the total count in one query via $facet
        aggregate.extend(
            [
                {
                    "$facet": {
                        "items": [
                            {"$skip": skip},
                            {"$limit": page_size},
                        ],
                        "total": [{"$count": "count"}],
                    }
                },
            ]
        )

        cursor = self.profiles.aggregate(aggregate)
        results = await cursor.to_list(length=1)
        result = results[0]
        items = result["items"]

        try:
            total = int(result["total"][0]["count"])
        except (IndexError, ValueError):
            total = 0

        profiles = [Profile.from_dict(res) for res in items]
        return profiles, total

    async def get_profile(
        self, profileid: UUID, org: Optional[Organization] = None
    ) -> Profile:
        """get profile by id and org"""
        query: dict[str, object] = {"_id": profileid}
        if org:
            query["oid"] = org.id

        res = await self.profiles.find_one(query)
        if not res:
            raise HTTPException(status_code=404, detail="profile_not_found")

        return Profile.from_dict(res)

    async def get_profile_with_configs(
        self, profileid: UUID, org: Organization
    ) -> ProfileWithCrawlConfigs:
        """get profile for api output, with crawlconfigs"""

        profile = await self.get_profile(profileid, org)

        crawlconfigs = await self.get_crawl_configs_for_profile(profileid, org)

        return ProfileWithCrawlConfigs(crawlconfigs=crawlconfigs, **profile.dict())

    async def get_profile_storage_path(
        self, profileid: UUID, org: Optional[Organization] = None
    ) -> str:
        """return profile path filename (relative path) for given profile id and org"""
        try:
            profile = await self.get_profile(profileid, org)
            return profile.resource.filename if profile.resource else ""
        # pylint: disable=bare-except
        except:
            pass

        return ""

    async def get_profile_name(
        self, profileid: UUID, org: Optional[Organization] = None
    ) -> str:
        """return profile name for given profile id and org"""
        try:
            profile = await self.get_profile(profileid, org)
            return profile.name
        # pylint: disable=bare-except
        except:
            pass

        return ""

    async def get_crawl_configs_for_profile(
        self, profileid: UUID, org: Organization
    ) -> list[CrawlConfigProfileOut]:
        """Get list of crawl configs, with basic info, that use a particular profile"""

        crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile(
            profileid, org
        )

        return crawlconfig_info

    async def delete_profile(
        self, profileid: UUID, org: Organization
    ) -> dict[str, Any]:
        """delete profile, if not used in active crawlconfig"""
        profile = await self.get_profile_with_configs(profileid, org)

        if len(profile.crawlconfigs) > 0:
            return {"error": "in_use", "crawlconfigs": profile.crawlconfigs}

        query: dict[str, object] = {"_id": profileid}
        if org:
            query["oid"] = org.id

        # Delete file from storage
        if profile.resource:
            await self.storage_ops.delete_crawl_file_object(org, profile.resource)
            await self.orgs.inc_org_bytes_stored(
                org.id, -profile.resource.size, "profile"
            )
            await self.background_job_ops.create_delete_replica_jobs(
                org, profile.resource, str(profile.id), "profile"
            )

        res = await self.profiles.delete_one(query)
        if not res or res.deleted_count != 1:
            raise HTTPException(status_code=404, detail="profile_not_found")

        quota_reached = self.orgs.storage_quota_reached(org)

        return {"success": True, "storageQuotaReached": quota_reached}

    async def delete_profile_browser(self, browserid: str) -> dict[str, bool]:
        """delete profile browser immediately"""
        if not await self.crawl_manager.delete_profile_browser(browserid):
            raise HTTPException(status_code=404, detail="browser_not_found")

        return {"success": True}

    async def _send_browser_req(
        self,
        browserid: str,
        path: str,
        method: str = "GET",
        json: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """make request to browser api to get state"""
        try:
            async with aiohttp.ClientSession() as session:
                async with session.request(
                    method,
                    f"http://browser-{browserid}.browser{self.browser_fqdn_suffix}:9223{path}",
                    json=json,
                ) as resp:
                    json = await resp.json()

        except Exception:
            # browser pod may still be starting: report 'waiting_for_browser'
            # with a 200 (not an error status) so clients can retry
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=200, detail="waiting_for_browser")

        return json or {}

    async def add_profile_file_replica(
        self, profileid: UUID, filename: str, ref: StorageRef
    ) -> dict[str, object]:
        """Add replica StorageRef to existing ProfileFile"""
        return await self.profiles.find_one_and_update(
            {"_id": profileid, "resource.filename": filename},
            {"$push": {"resource.replicas": {"name": ref.name, "custom": ref.custom}}},
        )

    async def calculate_org_profile_file_storage(self, oid: UUID) -> int:
        """Calculate and return total size of profile files in org"""
        total_size = 0

        cursor = self.profiles.find({"oid": oid})
        async for profile_dict in cursor:
            file_ = profile_dict.get("resource")
            if file_:
                total_size += file_.get("size", 0)

        return total_size


# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_profiles_api(
    mdb,
    org_ops: OrgOps,
    crawl_manager: CrawlManager,
    storage_ops: StorageOps,
    background_job_ops: BackgroundJobOps,
    user_dep,
):
    """init profile ops system"""
    ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)

    router = ops.router

    org_crawl_dep = org_ops.org_crawl_dep

    async def browser_get_metadata(
        browserid: str, org: Organization = Depends(org_crawl_dep)
    ):
        # if await ops.redis.hget(f"br:{browserid}", "org") != str(org.id):
        metadata = await crawl_manager.get_profile_browser_metadata(browserid)
        if metadata.get("btrix.org") != str(org.id):
            raise HTTPException(status_code=404, detail="no_such_browser")

        return metadata

    async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)):
        await browser_get_metadata(browserid, org)
        return browserid

    @router.get("", response_model=PaginatedProfileResponse)
    async def list_profiles(
        org: Organization = Depends(org_crawl_dep),
        userid: Optional[UUID] = None,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sortBy: str = "modified",
        sortDirection: int = -1,
    ):
        profiles, total = await ops.list_profiles(
            org,
            userid,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(profiles, total, page, pageSize)

    @router.post("", response_model=AddedResponseIdQuota)
    async def commit_browser_to_new(
        browser_commit: ProfileCreate,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        metadata = await browser_get_metadata(browser_commit.browserid, org)

        return await ops.commit_to_profile(browser_commit, org, user, metadata)

    @router.patch("/{profileid}", response_model=UpdatedResponse)
    async def commit_browser_to_existing(
        browser_commit: ProfileUpdate,
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        if not browser_commit.browserid:
            await ops.update_profile_metadata(profileid, browser_commit, user)

        else:
            metadata = await browser_get_metadata(browser_commit.browserid, org)
            profile = await ops.get_profile(profileid)
            await ops.commit_to_profile(
                browser_commit=ProfileCreate(
                    browserid=browser_commit.browserid,
                    name=browser_commit.name,
                    description=browser_commit.description or profile.description,
                    crawlerChannel=profile.crawlerChannel,
                ),
                org=org,
                user=user,
                metadata=metadata,
                existing_profile=profile,
            )

        return {"updated": True}

    @router.get("/{profileid}", response_model=ProfileWithCrawlConfigs)
    async def get_profile(
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.get_profile_with_configs(profileid, org)

    @router.delete("/{profileid}", response_model=SuccessResponseStorageQuota)
    async def delete_profile(
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.delete_profile(profileid, org)

    @router.post("/browser", response_model=BrowserId)
    async def create_new(
        profile_launch: ProfileLaunchBrowserIn,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.create_new_browser(org, user, profile_launch)

    @router.post("/browser/{browserid}/ping", response_model=ProfilePingResponse)
    async def ping_profile_browser(browserid: str = Depends(browser_dep)):
        return await ops.ping_profile_browser(browserid)

    @router.post("/browser/{browserid}/navigate", response_model=SuccessResponse)
    async def navigate_profile_browser(
        urlin: UrlIn, browserid: str = Depends(browser_dep)
    ):
        return await ops.navigate_profile_browser(browserid, urlin)

    @router.get("/browser/{browserid}", response_model=ProfileBrowserGetUrlResponse)
    async def get_profile_browser_url(
        request: Request,
        browserid: str = Depends(browser_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.get_profile_browser_url(
            browserid, str(org.id), request.headers
        )

    # pylint: disable=unused-argument
    @router.get("/browser/{browserid}/access", response_model=EmptyResponse)
    async def access_check(browserid: str = Depends(browser_dep)):
        return {}

    @router.delete("/browser/{browserid}", response_model=SuccessResponse)
    async def delete_profile_browser(browserid: str = Depends(browser_dep)):
        return await ops.delete_profile_browser(browserid)

    if org_ops.router:
        org_ops.router.include_router(router)

    return ops