browsertrix/backend/btrixcloud/profiles.py
Tessa Walsh a031fab313
Backend work for public collections (#2198)
Fixes #2182 

This rather large PR adds the rest of what should be needed for public
collections work in the frontend.

New API endpoints include:

- Public collections endpoints: GET, streaming download
- Paginated list of URLs in collection with snapshot (page) info for
each
- Collection endpoint to set home URL
- Collection endpoint to upload thumbnail as stream
- DELETE endpoint to remove collection thumbnail
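
As a rough sketch of how a client might exercise these new endpoints (all
paths, params, and payload shapes here are illustrative only; see the actual
route definitions for the real ones):

    import requests

    API = "https://btrix.example.com/api"  # hypothetical deployment
    OID, COLL = "<org-id>", "<coll-id>"
    AUTH = {"Authorization": "Bearer <token>"}

    # public collection metadata, then streaming WACZ download
    requests.get(f"{API}/public/orgs/{OID}/collections/{COLL}")
    with requests.get(
        f"{API}/public/orgs/{OID}/collections/{COLL}/download", stream=True
    ) as resp:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            pass  # write chunk to disk, etc.

    # paginated list of URLs in collection, with snapshot (page) info
    requests.get(
        f"{API}/orgs/{OID}/collections/{COLL}/urls",
        params={"page": 1, "pageSize": 25},
        headers=AUTH,
    )

    # set home URL; upload thumbnail as a stream; delete thumbnail
    requests.post(
        f"{API}/orgs/{OID}/collections/{COLL}/home-url",
        json={"pageId": "<page-id>"},
        headers=AUTH,
    )
    with open("thumbnail.jpg", "rb") as fh:
        requests.put(
            f"{API}/orgs/{OID}/collections/{COLL}/thumbnail",
            data=fh,
            headers=AUTH,
        )
    requests.delete(f"{API}/orgs/{OID}/collections/{COLL}/thumbnail", headers=AUTH)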

Changes to existing API endpoints include:

- Paginating public collection list results
- Several `pages` endpoints that previously only supported `/crawls/` in
their path, e.g. `/orgs/{oid}/crawls/all/pages/reAdd`, now support
`/uploads/` and `/all-crawls/` namespaces as well. This is necessitated
by adding pages for uploads to the database (see below). For
`/orgs/{oid}/namespace/all/pages/reAdd`, `crawls` or `uploads` will
serve as a filter to only affect crawls of that given type. Other
endpoints are more liberal at this point, and will perform the same
action regardless of the namespace used in the route (we'll likely want
to change this in a follow-up to be more consistent).
- `/orgs/{oid}/namespace/all/pages/reAdd` now kicks off a background job
rather than doing all of the computation in an asyncio task in the
backend container. The background job additionally updates collection
date ranges, page/size counts, and tags for each collection in the org
after pages have been (re)added.
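
For instance, kicking off the new background job for a single namespace might
look like this (sketch; HTTP method and auth header are assumed):

    import requests

    API = "https://btrix.example.com/api"  # hypothetical deployment
    AUTH = {"Authorization": "Bearer <token>"}

    # re-add pages only for crawls
    requests.post(f"{API}/orgs/<org-id>/crawls/all/pages/reAdd", headers=AUTH)
    # re-add pages only for uploads
    requests.post(f"{API}/orgs/<org-id>/uploads/all/pages/reAdd", headers=AUTH)
    # re-add pages for all archived items
    requests.post(f"{API}/orgs/<org-id>/all-crawls/all/pages/reAdd", headers=AUTH)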

Other big changes:

- New uploads will now have their pages read into the database!
Collection page counts now also include uploads
- A migration was added to start a background job for each org that will
add the pages for previously-uploaded WACZ files to the database and
update collections accordingly
- Adds a new `ImageFile` subclass of `BaseFile` for thumbnails that we
can use for other user-uploaded image files moving forward, with
separate output models for authenticated and public endpoints
2025-01-13 15:15:48 -08:00


""" Profile Management """
from typing import Optional, TYPE_CHECKING, Any, cast, Dict, List, Tuple
from uuid import UUID, uuid4
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
from starlette.requests import Headers
import aiohttp
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
Profile,
ProfileWithCrawlConfigs,
ProfileFile,
UrlIn,
ProfileLaunchBrowserIn,
BrowserId,
ProfileCreate,
ProfileUpdate,
Organization,
User,
PaginatedProfileResponse,
StorageRef,
EmptyResponse,
SuccessResponse,
AddedResponseIdQuota,
UpdatedResponse,
SuccessResponseStorageQuota,
ProfilePingResponse,
ProfileBrowserGetUrlResponse,
CrawlConfigProfileOut,
)
from .utils import dt_now

if TYPE_CHECKING:
    from .orgs import OrgOps
    from .crawlmanager import CrawlManager
    from .storages import StorageOps
    from .crawlconfigs import CrawlConfigOps
    from .background_jobs import BackgroundJobOps
else:
    OrgOps = CrawlManager = StorageOps = CrawlConfigOps = BackgroundJobOps = object

BROWSER_EXPIRE = 300


# ============================================================================
# pylint: disable=too-many-instance-attributes, too-many-arguments
class ProfileOps:
    """Profile management"""
orgs: OrgOps
crawl_manager: CrawlManager
storage_ops: StorageOps
crawlconfigs: CrawlConfigOps
background_job_ops: BackgroundJobOps
browser_fqdn_suffix: str
router: APIRouter
def __init__(self, mdb, orgs, crawl_manager, storage_ops, background_job_ops):
self.profiles = mdb["profiles"]
self.orgs = orgs
self.background_job_ops = background_job_ops
self.crawl_manager = crawl_manager
self.storage_ops = storage_ops
self.browser_fqdn_suffix = os.environ.get("CRAWLER_FQDN_SUFFIX", "")
self.router = APIRouter(
prefix="/profiles",
tags=["profiles"],
responses={404: {"description": "Not found"}},
)
self.crawlconfigs = cast(CrawlConfigOps, None)
def set_crawlconfigs(self, crawlconfigs):
"""set crawlconfigs ops"""
self.crawlconfigs = crawlconfigs
async def create_new_browser(
self, org: Organization, user: User, profile_launch: ProfileLaunchBrowserIn
) -> BrowserId:
"""Create new profile"""
prev_profile_path = ""
prev_profile_id = ""
prev_proxy_id = ""
if profile_launch.profileId:
prev_profile_path, prev_proxy_id = (
await self.get_profile_storage_path_and_proxy(
profile_launch.profileId, org
)
)
if not prev_profile_path:
raise HTTPException(status_code=400, detail="invalid_base_profile")
prev_profile_id = str(profile_launch.profileId)
crawler_image = self.crawlconfigs.get_channel_crawler_image(
profile_launch.crawlerChannel
)
if not crawler_image:
raise HTTPException(status_code=404, detail="crawler_not_found")
# use either specified proxyId or if none, use proxyId from existing profile
proxy_id = profile_launch.proxyId or prev_proxy_id
if proxy_id and not self.crawlconfigs.can_org_use_proxy(org, proxy_id):
raise HTTPException(status_code=404, detail="proxy_not_found")
browserid = await self.crawl_manager.run_profile_browser(
str(user.id),
str(org.id),
url=str(profile_launch.url),
storage=org.storage,
crawler_image=crawler_image,
baseprofile=prev_profile_id,
profile_filename=prev_profile_path,
proxy_id=proxy_id,
)
if not browserid:
raise HTTPException(status_code=400, detail="browser_not_created")
return BrowserId(browserid=browserid)
async def get_profile_browser_url(
self, browserid: str, oid: str, headers: Headers
) -> dict[str, str | int]:
"""get profile browser url"""
json = await self._send_browser_req(browserid, "/vncpass")
password = json.get("password")
if not password:
raise HTTPException(status_code=400, detail="browser_not_available")
scheme = headers.get("X-Forwarded-Proto") or "http"
host = headers.get("Host") or "localhost"
# ws_scheme = "wss" if scheme == "https" else "ws"
        auth_header = headers.get("Authorization", "")
        auth_bearer = auth_header.split(" ")[1] if " " in auth_header else ""
params = {
"path": f"browser/{browserid}/ws?oid={oid}&auth_bearer={auth_bearer}",
"password": password,
"oid": oid,
"auth_bearer": auth_bearer,
"scale": 0.75,
}
url = f"{scheme}://{host}/browser/{browserid}/?{urlencode(params)}"
params["url"] = url
return params
async def ping_profile_browser(self, browserid: str) -> dict[str, Any]:
"""ping profile browser to keep it running"""
await self.crawl_manager.ping_profile_browser(browserid)
json = await self._send_browser_req(browserid, "/ping")
return {"success": True, "origins": json.get("origins") or []}
async def navigate_profile_browser(
self, browserid: str, urlin: UrlIn
) -> dict[str, bool]:
"""ping profile browser to keep it running"""
await self._send_browser_req(browserid, "/navigate", "POST", json=urlin.dict())
return {"success": True}
async def commit_to_profile(
self,
browser_commit: ProfileCreate,
org: Organization,
user: User,
metadata: dict,
existing_profile: Optional[Profile] = None,
) -> dict[str, Any]:
"""commit profile and shutdown profile browser"""
# pylint: disable=too-many-locals
now = dt_now()
if existing_profile:
profileid = existing_profile.id
created = existing_profile.created
created_by = existing_profile.createdBy
created_by_name = existing_profile.createdByName
else:
profileid = uuid4()
created = now
created_by = user.id
created_by_name = user.name if user.name else user.email
filename_data = {"filename": f"profiles/profile-{profileid}.tar.gz"}
json = await self._send_browser_req(
browser_commit.browserid, "/createProfileJS", "POST", json=filename_data
)
        try:
            resource = json["resource"]
        except KeyError:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=400, detail="browser_not_valid")
await self.crawl_manager.delete_profile_browser(browser_commit.browserid)
        # backwards compatibility: older browsers return "bytes" rather than "size"
file_size = resource.get("size") or resource.get("bytes")
profile_file = ProfileFile(
hash=resource["hash"],
size=file_size,
filename=resource["path"],
storage=org.storage,
)
baseid = metadata.get("btrix.baseprofile")
if baseid:
print("baseid", baseid)
baseid = UUID(baseid)
self.orgs.can_write_data(org, include_time=False)
profile = Profile(
id=profileid,
name=browser_commit.name,
description=browser_commit.description,
created=created,
createdBy=created_by,
createdByName=created_by_name,
modified=now,
modifiedBy=user.id,
modifiedByName=user.name if user.name else user.email,
origins=json["origins"],
resource=profile_file,
userid=UUID(metadata.get("btrix.user")),
oid=org.id,
baseid=baseid,
crawlerChannel=browser_commit.crawlerChannel,
proxyId=browser_commit.proxyId,
)
await self.profiles.find_one_and_update(
{"_id": profile.id}, {"$set": profile.to_dict()}, upsert=True
)
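        # queue background job(s) to replicate the new profile file to replica storage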
await self.background_job_ops.create_replica_jobs(
org.id, profile_file, str(profileid), "profile"
)
await self.orgs.inc_org_bytes_stored(org.id, file_size, "profile")
return {
"added": True,
"id": str(profile.id),
"storageQuotaReached": self.orgs.storage_quota_reached(org),
}
async def update_profile_metadata(
self, profileid: UUID, update: ProfileUpdate, user: User
) -> dict[str, bool]:
"""Update name and description metadata only on existing profile"""
query = {
"name": update.name,
"modified": dt_now(),
"modifiedBy": user.id,
"modifiedByName": user.name if user.name else user.email,
}
if update.description is not None:
query["description"] = update.description
if not await self.profiles.find_one_and_update(
{"_id": profileid}, {"$set": query}
):
raise HTTPException(status_code=404, detail="profile_not_found")
return {"updated": True}
async def list_profiles(
self,
org: Organization,
userid: Optional[UUID] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: str = "modified",
sort_direction: int = -1,
) -> Tuple[list[Profile], int]:
"""list all profiles"""
# pylint: disable=too-many-locals,duplicate-code
# Zero-index page for query
page = page - 1
skip = page_size * page
match_query = {"oid": org.id}
if userid:
match_query["userid"] = userid
aggregate: List[Dict[str, Any]] = [{"$match": match_query}]
if sort_by:
if sort_by not in ("modified", "created", "name", "url"):
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")
if sort_by == "url":
sort_by = "origins.0"
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)
cursor = self.profiles.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]
try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0
profiles = [Profile.from_dict(res) for res in items]
return profiles, total
async def get_profile(
self, profileid: UUID, org: Optional[Organization] = None
) -> Profile:
"""get profile by id and org"""
query: dict[str, object] = {"_id": profileid}
if org:
query["oid"] = org.id
res = await self.profiles.find_one(query)
if not res:
raise HTTPException(status_code=404, detail="profile_not_found")
return Profile.from_dict(res)
async def get_profile_with_configs(
self, profileid: UUID, org: Organization
) -> ProfileWithCrawlConfigs:
"""get profile for api output, with crawlconfigs"""
profile = await self.get_profile(profileid, org)
crawlconfigs = await self.get_crawl_configs_for_profile(profileid, org)
return ProfileWithCrawlConfigs(crawlconfigs=crawlconfigs, **profile.dict())
async def get_profile_storage_path_and_proxy(
self, profileid: UUID, org: Optional[Organization] = None
) -> tuple[str, str]:
"""return profile path filename (relative path) for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
storage_path = profile.resource.filename if profile.resource else ""
return storage_path, profile.proxyId or ""
# pylint: disable=bare-except
except:
pass
return "", ""
async def get_profile_name(
self, profileid: UUID, org: Optional[Organization] = None
) -> str:
"""return profile for given profile id and org"""
try:
profile = await self.get_profile(profileid, org)
return profile.name
# pylint: disable=bare-except
except:
pass
return ""
async def get_crawl_configs_for_profile(
self, profileid: UUID, org: Organization
) -> list[CrawlConfigProfileOut]:
"""Get list of crawl configs with basic info for that use a particular profile"""
crawlconfig_info = await self.crawlconfigs.get_crawl_config_info_for_profile(
profileid, org
)
return crawlconfig_info
async def delete_profile(
self, profileid: UUID, org: Organization
) -> dict[str, Any]:
"""delete profile, if not used in active crawlconfig"""
profile = await self.get_profile_with_configs(profileid, org)
if len(profile.crawlconfigs) > 0:
return {"error": "in_use", "crawlconfigs": profile.crawlconfigs}
query: dict[str, object] = {"_id": profileid}
if org:
query["oid"] = org.id
# Delete file from storage
if profile.resource:
await self.storage_ops.delete_file_object(org, profile.resource)
await self.orgs.inc_org_bytes_stored(
org.id, -profile.resource.size, "profile"
)
await self.background_job_ops.create_delete_replica_jobs(
org, profile.resource, str(profile.id), "profile"
)
res = await self.profiles.delete_one(query)
if not res or res.deleted_count != 1:
raise HTTPException(status_code=404, detail="profile_not_found")
quota_reached = self.orgs.storage_quota_reached(org)
return {"success": True, "storageQuotaReached": quota_reached}
async def delete_profile_browser(self, browserid: str) -> dict[str, bool]:
"""delete profile browser immediately"""
if not await self.crawl_manager.delete_profile_browser(browserid):
raise HTTPException(status_code=404, detail="browser_not_found")
return {"success": True}
async def _send_browser_req(
self,
browserid: str,
path: str,
method: str = "GET",
json: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
"""make request to browser api to get state"""
try:
async with aiohttp.ClientSession() as session:
async with session.request(
method,
f"http://browser-{browserid}.browser{self.browser_fqdn_suffix}:9223{path}",
json=json,
) as resp:
json = await resp.json()
except Exception:
# pylint: disable=raise-missing-from
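                # browser may still be starting up: report waiting_for_browser
                # (with a 200) so the client can retry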
raise HTTPException(status_code=200, detail="waiting_for_browser")
return json or {}
async def add_profile_file_replica(
self, profileid: UUID, filename: str, ref: StorageRef
) -> dict[str, object]:
"""Add replica StorageRef to existing ProfileFile"""
return await self.profiles.find_one_and_update(
{"_id": profileid, "resource.filename": filename},
{"$push": {"resource.replicas": {"name": ref.name, "custom": ref.custom}}},
)
async def calculate_org_profile_file_storage(self, oid: UUID) -> int:
"""Calculate and return total size of profile files in org"""
total_size = 0
cursor = self.profiles.find({"oid": oid})
async for profile_dict in cursor:
file_ = profile_dict.get("resource")
if file_:
total_size += file_.get("size", 0)
return total_size


# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_profiles_api(
mdb,
org_ops: OrgOps,
crawl_manager: CrawlManager,
storage_ops: StorageOps,
background_job_ops: BackgroundJobOps,
user_dep,
):
"""init profile ops system"""
ops = ProfileOps(mdb, org_ops, crawl_manager, storage_ops, background_job_ops)
router = ops.router
org_crawl_dep = org_ops.org_crawl_dep
async def browser_get_metadata(
browserid: str, org: Organization = Depends(org_crawl_dep)
):
# if await ops.redis.hget(f"br:{browserid}", "org") != str(org.id):
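        # verify the browser belongs to this org before returning its metadata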
metadata = await crawl_manager.get_profile_browser_metadata(browserid)
if metadata.get("btrix.org") != str(org.id):
raise HTTPException(status_code=404, detail="no_such_browser")
return metadata
async def browser_dep(browserid: str, org: Organization = Depends(org_crawl_dep)):
await browser_get_metadata(browserid, org)
return browserid
@router.get("", response_model=PaginatedProfileResponse)
async def list_profiles(
org: Organization = Depends(org_crawl_dep),
userid: Optional[UUID] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: str = "modified",
sortDirection: int = -1,
):
profiles, total = await ops.list_profiles(
org,
userid,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
return paginated_format(profiles, total, page, pageSize)
@router.post("", response_model=AddedResponseIdQuota)
async def commit_browser_to_new(
browser_commit: ProfileCreate,
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
metadata = await browser_get_metadata(browser_commit.browserid, org)
return await ops.commit_to_profile(browser_commit, org, user, metadata)
@router.patch("/{profileid}", response_model=UpdatedResponse)
async def commit_browser_to_existing(
browser_commit: ProfileUpdate,
profileid: UUID,
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
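        # with no browserid, just update the profile's name/description metadata;
        # otherwise, commit the browser to the existing profile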
if not browser_commit.browserid:
await ops.update_profile_metadata(profileid, browser_commit, user)
else:
metadata = await browser_get_metadata(browser_commit.browserid, org)
profile = await ops.get_profile(profileid)
await ops.commit_to_profile(
browser_commit=ProfileCreate(
browserid=browser_commit.browserid,
name=browser_commit.name,
description=browser_commit.description or profile.description,
crawlerChannel=profile.crawlerChannel,
proxyId=profile.proxyId,
),
org=org,
user=user,
metadata=metadata,
existing_profile=profile,
)
return {"updated": True}
@router.get("/{profileid}", response_model=ProfileWithCrawlConfigs)
async def get_profile(
profileid: UUID,
org: Organization = Depends(org_crawl_dep),
):
return await ops.get_profile_with_configs(profileid, org)
@router.delete("/{profileid}", response_model=SuccessResponseStorageQuota)
async def delete_profile(
profileid: UUID,
org: Organization = Depends(org_crawl_dep),
):
return await ops.delete_profile(profileid, org)
@router.post("/browser", response_model=BrowserId)
async def create_new(
profile_launch: ProfileLaunchBrowserIn,
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
return await ops.create_new_browser(org, user, profile_launch)
@router.post("/browser/{browserid}/ping", response_model=ProfilePingResponse)
async def ping_profile_browser(browserid: str = Depends(browser_dep)):
return await ops.ping_profile_browser(browserid)
@router.post("/browser/{browserid}/navigate", response_model=SuccessResponse)
async def navigate_profile_browser(
urlin: UrlIn, browserid: str = Depends(browser_dep)
):
return await ops.navigate_profile_browser(browserid, urlin)
@router.get("/browser/{browserid}", response_model=ProfileBrowserGetUrlResponse)
async def get_profile_browser_url(
request: Request,
browserid: str = Depends(browser_dep),
org: Organization = Depends(org_crawl_dep),
):
return await ops.get_profile_browser_url(
browserid, str(org.id), request.headers
)
# pylint: disable=unused-argument
@router.get("/browser/{browserid}/access", response_model=EmptyResponse)
async def access_check(browserid: str = Depends(browser_dep)):
return {}
@router.delete("/browser/{browserid}", response_model=SuccessResponse)
async def delete_profile_browser(browserid: str = Depends(browser_dep)):
return await ops.delete_profile_browser(browserid)
if org_ops.router:
org_ops.router.include_router(router)
return ops