browsertrix/backend/btrixcloud/profiles.py
Tessa Walsh 032859f361
Support multiple crawler versions (#1420)
Fixes #1385 

## Changes
Supports multiple crawler 'channels' which can be configured to
different browsertrix-crawler versions
- Replaces `crawler_image` in helm chart with `crawler_channels` array
similar to how storages are handled
- The `default` crawler channel must always be provided and specifies
the default crawler image
- Adds backend `/orgs/{oid}/crawlconfigs/crawler-channels` API endpoint
to fetch information about available crawler versions (name, image, and
label), along with a test for it
- Adds crawler channel select to workflow creation/edit screens and
profile creation dialog, and updates related API endpoints and
configmaps accordingly. The select dropdown is shown only if more than
one channel is configured.
- Adds `crawlerChannel` to workflow and crawl details.
- Adds `image` to crawl details, used to display the actual crawler image
used for the crawl.
- Modifies `crawler_crawl_id` backend test fixture to use `test` crawler
version to ensure crawler versions other than latest work
- Adds migration to add `crawlerChannel` set to `default` to existing
workflow and profile objects and workflow configmaps

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
2024-01-16 15:32:12 -08:00

512 lines
17 KiB
Python

""" Profile Management """
from typing import Optional, TYPE_CHECKING, Any, cast
from datetime import datetime
from uuid import UUID, uuid4
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
import aiohttp
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
Profile,
ProfileWithCrawlConfigs,
ProfileFile,
UrlIn,
ProfileLaunchBrowserIn,
BrowserId,
ProfileCreate,
ProfileUpdate,
Organization,
User,
PaginatedResponse,
StorageRef,
)
if TYPE_CHECKING:
    # Only imported while type checking; presumably this avoids circular
    # imports between the ops modules at runtime -- TODO confirm.
    from .orgs import OrgOps
    from .crawlmanager import CrawlManager
    from .storages import StorageOps
    from .crawlconfigs import CrawlConfigOps
    from .background_jobs import BackgroundJobOps
else:
    # Runtime placeholders so annotations that name these types still resolve.
    OrgOps = CrawlManager = StorageOps = CrawlConfigOps = BackgroundJobOps = object

# Not referenced in the visible part of this file; presumably the profile
# browser expiry in seconds -- TODO confirm.
BROWSER_EXPIRE = 300
# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-arguments
class ProfileOps:
    """Profile management

    Keeps profile records in the ``profiles`` collection of ``mdb`` and talks
    to running profile browsers through ``crawl_manager`` and through the
    browser's own HTTP API (see ``_send_browser_req``).
    """

    orgs: OrgOps
    crawl_manager: CrawlManager
    storage_ops: StorageOps
    crawlconfigs: CrawlConfigOps
    background_job_ops: BackgroundJobOps

    def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
        self.profiles = mdb["profiles"]
        self.orgs = orgs
        self.background_job_ops = background_job_ops

        self.crawl_manager = crawl_manager
        self.storage_ops = storage_ops

        # Suffix appended to the browser hostname in _send_browser_req()
        self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX")

        self.router = APIRouter(
            prefix="/profiles",
            tags=["profiles"],
            responses={404: {"description": "Not found"}},
        )

        # Not available at construction time; injected via set_crawlconfigs()
        self.crawlconfigs = cast(CrawlConfigOps, None)

    def set_crawlconfigs(self, crawlconfigs):
        """set crawlconfigs ops"""
        self.crawlconfigs = crawlconfigs

    async def create_new_browser(
        self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
    ):
        """Launch a new profile browser, optionally based on an existing profile

        Returns a BrowserId for the started browser.

        Raises HTTPException:
        - 400 ``invalid_base_profile`` if ``profileId`` is set but no stored
          profile file can be found for it in this org
        - 404 ``crawler_not_found`` if the crawler channel has no image
        - 400 ``browser_not_created`` if the crawl manager returns no id
        """
        prev_profile = ""
        prev_profile_id = ""
        if profile_launch.profileId:
            prev_profile = await self.get_profile_storage_path(
                profile_launch.profileId, org
            )

            if not prev_profile:
                raise HTTPException(status_code=400, detail="invalid_base_profile")

            prev_profile_id = str(profile_launch.profileId)

        crawler_image = self.crawlconfigs.get_channel_crawler_image(
            profile_launch.crawlerChannel
        )
        if not crawler_image:
            raise HTTPException(status_code=404, detail="crawler_not_found")

        browserid = await self.crawl_manager.run_profile_browser(
            str(user.id),
            str(org.id),
            url=profile_launch.url,
            storage=org.storage,
            crawler_image=crawler_image,
            baseprofile=prev_profile_id,
            profile_filename=prev_profile,
        )

        if not browserid:
            raise HTTPException(status_code=400, detail="browser_not_created")

        return BrowserId(browserid=browserid)

    async def get_profile_browser_url(self, browserid, oid, headers):
        """get profile browser url

        Fetches the browser's VNC password and returns the connection params
        (path, password, oid, auth_bearer, scale) together with a ``url`` that
        has those params urlencoded into its query string.

        Raises HTTPException 400 ``browser_not_available`` if the browser
        returns no password.
        """
        res = await self._send_browser_req(browserid, "/vncpass")

        password = res.get("password")

        if not password:
            raise HTTPException(status_code=400, detail="browser_not_available")

        scheme = headers.get("X-Forwarded-Proto") or "http"
        host = headers.get("Host") or "localhost"

        # Second space-separated field of the Authorization header
        auth_bearer = headers.get("Authorization").split(" ")[1]

        params = {
            "path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
            "password": password,
            "oid": oid,
            "auth_bearer": auth_bearer,
            "scale": 0.75,
        }

        url = f"{scheme}://{host}/browser/{browserid}/?{urlencode(params)}"
        # Added after urlencode() so the url does not embed itself
        params["url"] = url
        return params

    async def ping_profile_browser(self, browserid):
        """ping profile browser to keep it running"""
        await self.crawl_manager.ping_profile_browser(browserid)

        res = await self._send_browser_req(browserid, "/ping")

        return {"success": True, "origins": res.get("origins") or []}

    async def navigate_profile_browser(self, browserid, urlin: UrlIn):
        """navigate profile browser to the given url"""
        await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())

        return {"success": True}

    async def commit_to_profile(
        self,
        browser_commit: ProfileCreate,
        storage: StorageRef,
        metadata: dict,
        profileid: Optional[UUID] = None,
    ) -> dict[str, Any]:
        """commit profile and shutdown profile browser

        Asks the browser to write its profile archive, deletes the browser,
        upserts the Profile record, creates replica jobs and adds the file
        size to the org's stored bytes.

        ``metadata`` is the profile browser metadata: ``btrix.org`` and
        ``btrix.user`` are parsed as UUIDs, ``btrix.baseprofile`` is optional.
        If ``profileid`` is omitted a new id is generated.

        Returns a dict with ``added``, ``id`` and ``storageQuotaReached``.

        Raises HTTPException:
        - 403 ``storage_quota_reached`` if the org is already over quota
        - 400 ``browser_not_valid`` if the browser response has no resource
        """
        if not profileid:
            profileid = uuid4()

        oid = UUID(metadata.get("btrix.org"))

        # Check the quota before touching the browser: failing afterwards
        # would leave a written profile file with no record and would have
        # already shut down the user's browser session.
        if await self.orgs.storage_quota_reached(oid):
            raise HTTPException(status_code=403, detail="storage_quota_reached")

        filename_data = {"filename": f"profiles/profile-{profileid}.tar.gz"}

        res = await self._send_browser_req(
            browser_commit.browserid, "/createProfileJS", "POST", json=filename_data
        )

        try:
            resource = res["resource"]
        except (KeyError, TypeError):
            # Missing key, or the response was not a JSON object at all
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=400, detail="browser_not_valid")

        await self.crawl_manager.delete_profile_browser(browser_commit.browserid)

        # backwards compatibility
        file_size = resource.get("size") or resource.get("bytes")

        profile_file = ProfileFile(
            hash=resource["hash"],
            size=file_size,
            filename=resource["path"],
            storage=storage,
        )

        baseid = metadata.get("btrix.baseprofile")
        if baseid:
            print("baseid", baseid)
            baseid = UUID(baseid)

        profile = Profile(
            id=profileid,
            name=browser_commit.name,
            description=browser_commit.description,
            created=datetime.utcnow().replace(microsecond=0, tzinfo=None),
            origins=res["origins"],
            resource=profile_file,
            userid=UUID(metadata.get("btrix.user")),
            oid=oid,
            baseid=baseid,
            crawlerChannel=browser_commit.crawlerChannel,
        )

        # Upsert so the same call handles both new and existing profile ids
        await self.profiles.find_one_and_update(
            {"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True
        )

        await self.background_job_ops.create_replica_jobs(
            oid, profile_file, str(profileid), "profile"
        )

        quota_reached = await self.orgs.inc_org_bytes_stored(oid, file_size, "profile")

        return {
            "added": True,
            "id": str(profile.id),
            "storageQuotaReached": quota_reached,
        }

    async def update_profile_metadata(self, profileid: UUID, update: ProfileUpdate):
        """Update name and description metadata only on existing profile

        The description is only changed when one is provided.
        Raises HTTPException 404 ``profile_not_found`` if no profile matches.
        """
        fields = {"name": update.name}
        if update.description is not None:
            fields["description"] = update.description

        if not await self.profiles.find_one_and_update(
            {"_id": profileid}, {"$set": fields}
        ):
            raise HTTPException(status_code=404, detail="profile_not_found")

        return {"updated": True}

    async def list_profiles(
        self,
        org: Organization,
        userid: Optional[UUID] = None,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        """list all profiles

        Returns one page of the org's profiles (optionally limited to those
        created by ``userid``) and the total number of matching profiles.
        ``page`` is 1-based.
        """
        # Zero-index page for query
        page = page - 1
        skip = page_size * page

        query: dict[str, object] = {"oid": org.id}
        if userid:
            query["userid"] = userid

        total = await self.profiles.count_documents(query)

        cursor = self.profiles.find(query, skip=skip, limit=page_size)
        results = await cursor.to_list(length=page_size)
        profiles = [Profile.from_dict(res) for res in results]

        return profiles, total

    async def get_profile(self, profileid: UUID, org: Optional[Organization] = None):
        """get profile by id and org

        The lookup is restricted to ``org`` only when one is given.
        Raises HTTPException 404 ``profile_not_found`` if nothing matches.
        """
        query: dict[str, object] = {"_id": profileid}
        if org:
            query["oid"] = org.id

        res = await self.profiles.find_one(query)
        if not res:
            raise HTTPException(status_code=404, detail="profile_not_found")

        return Profile.from_dict(res)

    async def get_profile_with_configs(
        self, profileid: UUID, org: Optional[Organization] = None
    ):
        """get profile for api output, with crawlconfigs"""
        profile = await self.get_profile(profileid, org)

        crawlconfigs = await self.get_crawl_configs_for_profile(profileid, org)

        return ProfileWithCrawlConfigs(crawlconfigs=crawlconfigs, **profile.dict())

    async def get_profile_storage_path(
        self, profileid: UUID, org: Optional[Organization] = None
    ):
        """return profile path filename (relative path) for given profile id and org

        Best-effort: returns None if the profile cannot be loaded.
        """
        try:
            profile = await self.get_profile(profileid, org)
            return profile.resource.filename
        # Exception rather than a bare except, so task cancellation and
        # interpreter shutdown are not swallowed
        # pylint: disable=broad-except
        except Exception:
            return None

    async def get_profile_name(
        self, profileid: UUID, org: Optional[Organization] = None
    ):
        """return profile name for given profile id and org

        Best-effort: returns None if the profile cannot be loaded.
        """
        try:
            profile = await self.get_profile(profileid, org)
            return profile.name
        # Exception rather than a bare except, so task cancellation and
        # interpreter shutdown are not swallowed
        # pylint: disable=broad-except
        except Exception:
            return None

    async def get_crawl_configs_for_profile(
        self, profileid: UUID, org: Optional[Organization] = None
    ):
        """Get list of crawl config ids and names that use a particular profile"""
        return await self.crawlconfigs.get_crawl_config_ids_for_profile(
            profileid, org
        )

    async def delete_profile(self, profileid: UUID, org: Organization):
        """delete profile, if not used in active crawlconfig

        If crawl configs still reference the profile, nothing is deleted and
        ``{"error": "in_use", "crawlconfigs": [...]}`` is returned instead.

        Raises HTTPException 404 ``profile_not_found`` if the profile does not
        exist in this org or the delete did not remove exactly one record.
        """
        profile = await self.get_profile_with_configs(profileid, org)

        if len(profile.crawlconfigs) > 0:
            return {"error": "in_use", "crawlconfigs": profile.crawlconfigs}

        query: dict[str, object] = {"_id": profileid, "oid": org.id}

        # Delete file from storage
        if profile.resource:
            await self.storage_ops.delete_crawl_file_object(org, profile.resource)
            await self.orgs.inc_org_bytes_stored(
                org.id, -profile.resource.size, "profile"
            )

        res = await self.profiles.delete_one(query)
        if not res or res.deleted_count != 1:
            raise HTTPException(status_code=404, detail="profile_not_found")

        # NOTE(review): create_replica_jobs() above is given str(profileid)
        # while this call passes the id object as-is -- confirm which form
        # the background job ops expect.
        await self.background_job_ops.create_delete_replica_jobs(
            org, profile.resource, profile.id, "profile"
        )

        quota_reached = await self.orgs.storage_quota_reached(org.id)

        return {"success": True, "storageQuotaReached": quota_reached}

    async def delete_profile_browser(self, browserid):
        """delete profile browser immediately

        Raises HTTPException 404 ``browser_not_found`` if the crawl manager
        reports that nothing was deleted.
        """
        if not await self.crawl_manager.delete_profile_browser(browserid):
            raise HTTPException(status_code=404, detail="browser_not_found")

        return {"success": True}

    async def _send_browser_req(self, browserid, path, method="GET", json=None):
        """make request to browser api to get state

        Sends ``method`` to ``path`` on port 9223 of the browser host, with
        ``json`` as the request body, and returns the decoded JSON response.

        Any failure is reported as HTTPException with status 200 and detail
        ``waiting_for_browser``.
        """
        try:
            async with aiohttp.ClientSession() as session:
                async with session.request(
                    method,
                    f"http://browser-{browserid}.browser{self.browser_fqdn_suffix}:9223{path}",
                    json=json,
                ) as resp:
                    # Separate name so the request body argument is not clobbered
                    result = await resp.json()

        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=200, detail="waiting_for_browser")

        return result

    async def add_profile_file_replica(
        self, profileid: UUID, filename: str, ref: StorageRef
    ) -> dict[str, object]:
        """Add replica StorageRef to existing ProfileFile

        The profile is matched on both its id and its resource filename.
        """
        return await self.profiles.find_one_and_update(
            {"_id": profileid, "resource.filename": filename},
            {"$push": {"resource.replicas": {"name": ref.name, "custom": ref.custom}}},
        )
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_profiles_api(
    mdb, org_ops, crawl_manager, storage_ops, background_job_ops, user_dep
):
    """init profile ops system

    Creates the ProfileOps instance, registers the profile and profile
    browser routes on its router, and includes that router in
    ``org_ops.router``.

    Returns the ProfileOps instance.
    """
    ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)

    router = ops.router

    org_crawl_dep = org_ops.org_crawl_dep

    async def browser_get_metadata(
        browserid: str, org: Organization = Depends(org_crawl_dep)
    ):
        """Return the browser's metadata, or 404 if it is not this org's browser"""
        metadata = await crawl_manager.get_profile_browser_metadata(browserid)
        if metadata.get("btrix.org") != str(org.id):
            raise HTTPException(status_code=404, detail="no_such_browser")

        return metadata

    async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)):
        """Dependency yielding the browser id once org ownership is verified"""
        await browser_get_metadata(browserid, org)
        return browserid

    @router.get("", response_model=PaginatedResponse)
    async def list_profiles(
        org: Organization = Depends(org_crawl_dep),
        userid: Optional[UUID] = None,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        profiles, total = await ops.list_profiles(
            org, userid, page_size=pageSize, page=page
        )
        return paginated_format(profiles, total, page, pageSize)

    @router.post("")
    async def commit_browser_to_new(
        browser_commit: ProfileCreate,
        org: Organization = Depends(org_crawl_dep),
    ):
        metadata = await browser_get_metadata(browser_commit.browserid, org)

        return await ops.commit_to_profile(browser_commit, org.storage, metadata)

    @router.patch("/{profileid}")
    async def commit_browser_to_existing(
        browser_commit: ProfileUpdate,
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        if not browser_commit.browserid:
            # update_profile_metadata() matches on id alone, so first confirm
            # the profile belongs to the caller's org (404 otherwise).
            await ops.get_profile(profileid, org)
            await ops.update_profile_metadata(profileid, browser_commit)

        else:
            metadata = await browser_get_metadata(browser_commit.browserid, org)
            # Scoped to org so another org's profile cannot be overwritten
            profile = await ops.get_profile(profileid, org)
            await ops.commit_to_profile(
                browser_commit=ProfileCreate(
                    browserid=browser_commit.browserid,
                    name=browser_commit.name,
                    # Keep the stored description when none is supplied
                    description=browser_commit.description or profile.description,
                    crawlerChannel=profile.crawlerChannel,
                ),
                storage=org.storage,
                metadata=metadata,
                profileid=profileid,
            )

        return {"updated": True}

    @router.get("/{profileid}", response_model=ProfileWithCrawlConfigs)
    async def get_profile(
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.get_profile_with_configs(profileid, org)

    @router.delete("/{profileid}")
    async def delete_profile(
        profileid: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.delete_profile(profileid, org)

    @router.post("/browser", response_model=BrowserId)
    async def create_new(
        profile_launch: ProfileLaunchBrowserIn,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.create_new_browser(org, user, profile_launch)

    @router.post("/browser/{browserid}/ping")
    async def ping_profile_browser(browserid: str = Depends(browser_dep)):
        return await ops.ping_profile_browser(browserid)

    @router.post("/browser/{browserid}/navigate")
    async def navigate_profile_browser(
        urlin: UrlIn, browserid: str = Depends(browser_dep)
    ):
        return await ops.navigate_profile_browser(browserid, urlin)

    @router.get("/browser/{browserid}")
    async def get_profile_browser_url(
        request: Request,
        browserid: str = Depends(browser_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.get_profile_browser_url(
            browserid, str(org.id), request.headers
        )

    # The dependency does the actual access check; the body has nothing to add
    # pylint: disable=unused-argument
    @router.get("/browser/{browserid}/access")
    async def access_check(browserid: str = Depends(browser_dep)):
        return {}

    @router.delete("/browser/{browserid}")
    async def delete_profile_browser(browserid: str = Depends(browser_dep)):
        return await ops.delete_profile_browser(browserid)

    org_ops.router.include_router(router)

    return ops