browsertrix/backend/btrixcloud/webhooks.py
Ilya Kreymer 702c9ab3b7
Better cacheing of presigned URLs + support for thumbnails (#2446)
Overhauls URL presigning by:
- cache the presigned urls in a flat, separate mongodb collection which
has an expiring index
- update presigned urls if not found / expired automatically in index
- remove logic on storing presignedUrl in files
- support cacheing presigned URL for thumbnails.
- add endpoints to clear presigned urls for org or for all files in all
orgs (superadmin only)
- supersedes #2438, fix for #2437
- removes previous presignedUrl and expireAt data from crawls and QA
runs

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2025-03-03 12:05:23 -08:00

663 lines
21 KiB
Python

"""Webhook management"""
import asyncio
from typing import List, Union, Optional, TYPE_CHECKING, cast
from uuid import UUID, uuid4
import aiohttp
import backoff
from fastapi import APIRouter, Depends, HTTPException
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
WebhookEventType,
WebhookNotification,
CrawlStartedBody,
CrawlFinishedBody,
CrawlDeletedBody,
QaAnalysisStartedBody,
QaAnalysisFinishedBody,
CrawlReviewedBody,
UploadFinishedBody,
UploadDeletedBody,
CollectionItemAddedBody,
CollectionItemRemovedBody,
CollectionDeletedBody,
PaginatedWebhookNotificationResponse,
Organization,
QARun,
)
from .utils import dt_now
if TYPE_CHECKING:
from .orgs import OrgOps
from .crawls import CrawlOps
else:
OrgOps = CrawlOps = object
# ============================================================================
class EventWebhookOps:
    """Event webhook notification management

    Every notification is first stored in the "webhooks" collection and then
    delivered (via HTTP POST) to the URL the org has configured for that
    event type. Delivery attempts are recorded on the stored notification.
    """

    # pylint: disable=invalid-name, too-many-arguments, too-many-locals

    org_ops: OrgOps
    crawl_ops: CrawlOps

    def __init__(self, mdb, org_ops):
        """Bind the mongo collections, org ops and API router"""
        self.webhooks = mdb["webhooks"]
        self.colls = mdb["collections"]
        self.crawls = mdb["crawls"]

        self.org_ops = org_ops
        # crawl ops are not available at construction time,
        # they are injected afterwards through set_crawl_ops()
        self.crawl_ops = cast(CrawlOps, None)

        self.origin = None

        self.router = APIRouter(
            prefix="/webhooks",
            tags=["webhooks"],
            responses={404: {"description": "Not found"}},
        )

    def set_crawl_ops(self, ops):
        """set crawl ops"""
        self.crawl_ops = ops

    async def list_notifications(
        self,
        org: Organization,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        success: Optional[bool] = None,
        event: Optional[str] = None,
        sort_by: Optional[str] = None,
        sort_direction: Optional[int] = -1,
    ):
        """List all webhook notifications

        Returns a (notifications, total) tuple: one page of the org's
        notifications, optionally filtered by success and event, and the
        total number of notifications matching the filter.

        page is 1-indexed. Raises HTTPException (400) when sort_by is not
        one of the sortable fields or sort_direction is not 1 / -1.
        """
        # pylint: disable=duplicate-code
        # Zero-index page for query
        page = page - 1
        skip = page_size * page

        query: dict[str, object] = {"oid": org.id}

        # None means "do not filter on success"
        if success in (True, False):
            query["success"] = success

        if event:
            query["event"] = event

        aggregate = [{"$match": query}]

        if sort_by:
            SORT_FIELDS = ("success", "event", "attempts", "created", "lastAttempted")
            if sort_by not in SORT_FIELDS:
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")

            aggregate.extend([{"$sort": {sort_by: sort_direction}}])

        # A single $facet stage yields both the requested page and the count
        aggregate.extend(
            [
                {
                    "$facet": {
                        "items": [
                            {"$skip": skip},
                            {"$limit": page_size},
                        ],
                        "total": [{"$count": "count"}],
                    }
                },
            ]
        )

        # Get total
        cursor = self.webhooks.aggregate(aggregate)
        results = await cursor.to_list(length=1)
        result = results[0]
        items = result["items"]

        # "total" is an empty list when nothing matched
        try:
            total = int(result["total"][0]["count"])
        except (IndexError, ValueError):
            total = 0

        notifications = [WebhookNotification.from_dict(res) for res in items]

        return notifications, total

    async def get_notification(self, org: Organization, notificationid: UUID):
        """Get webhook notification by id and org

        Raises HTTPException (404) if no notification with this id exists
        in the org.
        """
        query = {"_id": notificationid, "oid": org.id}

        res = await self.webhooks.find_one(query)
        if not res:
            raise HTTPException(status_code=404, detail="notification_not_found")

        return WebhookNotification.from_dict(res)

    # NOTE(review): every exception raised while sending is caught by the
    # broad except inside send_notification, so this decorator never sees an
    # aiohttp error to retry on -- confirm whether retries are intended.
    @backoff.on_exception(
        backoff.expo,
        (aiohttp.ClientError, aiohttp.client_exceptions.ClientConnectorError),
        max_tries=5,
        max_time=60,
    )
    async def send_notification(
        self, org: Organization, notification: WebhookNotification
    ):
        """Send notification

        POSTs the notification body as JSON to the URL the org has
        configured for notification.event. Nothing is sent if the org has
        no webhook URLs or none for this event.

        On success the stored notification is marked successful; on any
        failure the error is printed. In both cases lastAttempted is
        refreshed and attempts is incremented.
        """
        if not org.webhookUrls:
            print(
                "Webhook URLs not configured - skipping sending notification",
                flush=True,
            )
            return

        # the event name doubles as the attribute name on org.webhookUrls
        webhook_url = getattr(org.webhookUrls, notification.event)
        if not webhook_url:
            print(
                f"Webhook URL for event {notification.event} not configured, skipping",
                flush=True,
            )
            return

        try:
            async with aiohttp.ClientSession() as session:
                async with session.request(
                    "POST",
                    webhook_url,
                    json=notification.body.dict(),
                    raise_for_status=True,
                ):
                    await self.webhooks.find_one_and_update(
                        {"_id": notification.id},
                        {
                            "$set": {
                                "success": True,
                                "lastAttempted": dt_now(),
                            },
                            "$inc": {"attempts": 1},
                        },
                    )

        # pylint: disable=broad-exception-caught
        except Exception as err:
            print(f"Webhook notification failed: {err}", flush=True)
            # record the attempt; the stored success value is left untouched
            await self.webhooks.find_one_and_update(
                {"_id": notification.id},
                {
                    "$set": {"lastAttempted": dt_now()},
                    "$inc": {"attempts": 1},
                },
            )

    async def _create_item_finished_notification(
        self,
        crawl_id: str,
        org: Organization,
        event: str,
        body: Union[CrawlFinishedBody, QaAnalysisFinishedBody, UploadFinishedBody],
    ):
        """Create webhook notification for finished crawl/upload.

        Fills body.resources from the crawl, stores and sends the
        notification, then sends an added-to-collection notification for
        each collection the item belongs to. Skipped (with a log line) if
        the crawl cannot be found.
        """
        crawl = await self.crawl_ops.get_crawl_out(crawl_id, org)
        if not crawl:
            print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True)
            return

        body.resources = crawl.resources or []

        notification = WebhookNotification(
            id=uuid4(),
            event=event,
            oid=org.id,
            body=body,
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

        if crawl.collectionIds:
            for coll_id in crawl.collectionIds:
                await self.create_added_to_collection_notification(
                    crawl_ids=[crawl_id], coll_id=coll_id, org=org
                )

    async def _create_deleted_notification(
        self,
        org: Organization,
        event: str,
        body: Union[CrawlDeletedBody, UploadDeletedBody, CollectionDeletedBody],
    ):
        """Create webhook notification for deleted crawl/upload/collection."""
        notification = WebhookNotification(
            id=uuid4(),
            event=event,
            oid=org.id,
            body=body,
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def create_crawl_finished_notification(
        self, crawl_id: str, oid: UUID, state: str
    ) -> None:
        """Create webhook notification for finished crawl."""
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.crawlFinished:
            return

        await self._create_item_finished_notification(
            crawl_id,
            org,
            event=WebhookEventType.CRAWL_FINISHED,
            body=CrawlFinishedBody(
                itemId=crawl_id,
                orgId=str(org.id),
                state=state,
                # filled in by _create_item_finished_notification
                resources=[],
            ),
        )

    async def create_qa_analysis_finished_notification(
        self, qa_run: QARun, oid: UUID, crawl_id: str
    ) -> None:
        """Create webhook notification for finished qa analysis run."""
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.qaAnalysisFinished:
            return

        qa_resources = []

        # Resolving the signed URLs of the QA run files is best-effort:
        # on failure the notification is still sent, with no resources
        try:
            qa_resources = await self.crawl_ops.resolve_signed_urls(
                qa_run.files, org, crawl_id
            )

        # pylint: disable=broad-exception-caught
        except Exception as err:
            print(f"Error trying to get QA run resources: {err}", flush=True)

        notification = WebhookNotification(
            id=uuid4(),
            event=WebhookEventType.QA_ANALYSIS_FINISHED,
            oid=oid,
            body=QaAnalysisFinishedBody(
                itemId=crawl_id,
                qaRunId=qa_run.id,
                orgId=str(org.id),
                state=qa_run.state,
                resources=qa_resources,
            ),
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def create_crawl_deleted_notification(
        self, crawl_id: str, org: Organization
    ) -> None:
        """Create webhook notification for deleted crawl."""
        if not org.webhookUrls or not org.webhookUrls.crawlDeleted:
            return

        await self._create_deleted_notification(
            org,
            event=WebhookEventType.CRAWL_DELETED,
            body=CrawlDeletedBody(
                itemId=crawl_id,
                orgId=str(org.id),
            ),
        )

    async def create_upload_finished_notification(
        self, crawl_id: str, oid: UUID
    ) -> None:
        """Create webhook notification for finished upload."""
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.uploadFinished:
            return

        await self._create_item_finished_notification(
            crawl_id,
            org,
            event=WebhookEventType.UPLOAD_FINISHED,
            body=UploadFinishedBody(
                itemId=crawl_id, orgId=str(org.id), state="complete", resources=[]
            ),
        )

    async def create_upload_deleted_notification(
        self, crawl_id: str, org: Organization
    ) -> None:
        """Create webhook notification for deleted upload."""
        if not org.webhookUrls or not org.webhookUrls.uploadDeleted:
            return

        await self._create_deleted_notification(
            org,
            event=WebhookEventType.UPLOAD_DELETED,
            body=UploadDeletedBody(itemId=crawl_id, orgId=str(org.id)),
        )

    async def create_crawl_started_notification(
        self, crawl_id: str, oid: UUID, scheduled: bool = False
    ) -> None:
        """Create webhook notification for started crawl.

        At most one crawl-started notification is created per crawl id.
        """
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.crawlStarted:
            return

        # Check if already created this event
        existing_notification = await self.webhooks.find_one(
            {
                "event": WebhookEventType.CRAWL_STARTED,
                "body.itemId": crawl_id,
            }
        )
        if existing_notification:
            return

        notification = WebhookNotification(
            id=uuid4(),
            event=WebhookEventType.CRAWL_STARTED,
            oid=oid,
            body=CrawlStartedBody(
                itemId=crawl_id,
                orgId=str(oid),
                scheduled=scheduled,
            ),
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def create_qa_analysis_started_notification(
        self, qa_run_id: str, oid: UUID, crawl_id: str
    ) -> None:
        """Create webhook notification for started qa analysis run.

        At most one qa-analysis-started notification is created per
        qa run id.
        """
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.qaAnalysisStarted:
            return

        # Check if already created this event
        existing_notification = await self.webhooks.find_one(
            {
                "event": WebhookEventType.QA_ANALYSIS_STARTED,
                "body.qaRunId": qa_run_id,
            }
        )
        if existing_notification:
            return

        notification = WebhookNotification(
            id=uuid4(),
            event=WebhookEventType.QA_ANALYSIS_STARTED,
            oid=oid,
            body=QaAnalysisStartedBody(
                itemId=crawl_id,
                qaRunId=qa_run_id,
                orgId=str(oid),
            ),
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def create_crawl_reviewed_notification(
        self,
        crawl_id: str,
        oid: UUID,
        review_status: Optional[int],
        description: Optional[str],
    ) -> None:
        """Create webhook notification for crawl being reviewed in qa"""
        org = await self.org_ops.get_org_by_id(oid)

        if not org.webhookUrls or not org.webhookUrls.crawlReviewed:
            return

        review_status_labels = {
            1: "Bad",
            2: "Poor",
            3: "Fair",
            4: "Good",
            5: "Excellent",
        }

        notification = WebhookNotification(
            id=uuid4(),
            event=WebhookEventType.CRAWL_REVIEWED,
            oid=oid,
            body=CrawlReviewedBody(
                itemId=crawl_id,
                orgId=str(oid),
                reviewStatus=review_status,
                # empty label when there is no status or it is not a known one
                reviewStatusLabel=(
                    review_status_labels.get(review_status, "") if review_status else ""
                ),
                description=description,
            ),
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def _create_collection_items_modified_notification(
        self,
        coll_id: UUID,
        org: Organization,
        event: str,
        body: Union[CollectionItemAddedBody, CollectionItemRemovedBody],
    ):
        """Create webhook notification for item added/removed to collection.

        Sets body.downloadUrl to the collection download endpoint, prefixed
        with org.origin when the org has one.
        """
        coll_download_url = f"/api/orgs/{org.id}/collections/{coll_id}/download"
        if org.origin:
            coll_download_url = (
                f"{org.origin}/api/orgs/{org.id}/collections/{coll_id}/download"
            )

        body.downloadUrl = coll_download_url

        notification = WebhookNotification(
            id=uuid4(),
            event=event,
            oid=org.id,
            body=body,
            created=dt_now(),
        )

        await self.webhooks.insert_one(notification.to_dict())

        await self.send_notification(org, notification)

    async def create_added_to_collection_notification(
        self,
        crawl_ids: List[str],
        coll_id: UUID,
        org: Organization,
    ) -> None:
        """Create webhook notification for item added to collection"""
        if not org.webhookUrls or not org.webhookUrls.addedToCollection:
            return

        await self._create_collection_items_modified_notification(
            coll_id,
            org,
            event=WebhookEventType.ADDED_TO_COLLECTION,
            body=CollectionItemAddedBody(
                itemIds=crawl_ids,
                # filled in by _create_collection_items_modified_notification
                downloadUrl="",
                collectionId=str(coll_id),
                orgId=str(org.id),
            ),
        )

    async def create_removed_from_collection_notification(
        self,
        crawl_ids: List[str],
        coll_id: UUID,
        org: Organization,
    ) -> None:
        """Create webhook notification for item removed from collection"""
        if not org.webhookUrls or not org.webhookUrls.removedFromCollection:
            return

        await self._create_collection_items_modified_notification(
            coll_id,
            org,
            event=WebhookEventType.REMOVED_FROM_COLLECTION,
            body=CollectionItemRemovedBody(
                itemIds=crawl_ids,
                # filled in by _create_collection_items_modified_notification
                downloadUrl="",
                collectionId=str(coll_id),
                orgId=str(org.id),
            ),
        )

    async def create_collection_deleted_notification(
        self,
        coll_id: UUID,
        org: Organization,
    ) -> None:
        """Create webhook notification for deleted collection"""
        if not org.webhookUrls or not org.webhookUrls.collectionDeleted:
            return

        # The event must be COLLECTION_DELETED: send_notification looks up
        # the target URL on org.webhookUrls by event name, so any other event
        # type would deliver this payload to the wrong webhook URL
        await self._create_deleted_notification(
            org,
            event=WebhookEventType.COLLECTION_DELETED,
            body=CollectionDeletedBody(
                collectionId=str(coll_id),
                orgId=str(org.id),
            ),
        )
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_event_webhooks_api(mdb, org_ops, app):
    """init event webhooks system

    Creates the EventWebhookOps instance, registers the list / get / retry
    notification endpoints on its router, declares the OpenAPI webhooks on
    the app and mounts the router on the org router.

    Returns the EventWebhookOps instance.
    """
    # pylint: disable=invalid-name

    ops = EventWebhookOps(mdb, org_ops)

    router = ops.router

    org_owner_dep = org_ops.org_owner_dep

    # The event loop only keeps weak references to tasks, so a task with no
    # other reference may be garbage-collected before it finishes. Keep each
    # retry task here until it is done.
    retry_tasks: set[asyncio.Task] = set()

    @router.get("", response_model=PaginatedWebhookNotificationResponse)
    async def list_notifications(
        org: Organization = Depends(org_owner_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        success: Optional[bool] = None,
        event: Optional[str] = None,
        sortBy: Optional[str] = None,
        sortDirection: Optional[int] = -1,
    ):
        notifications, total = await ops.list_notifications(
            org,
            page_size=pageSize,
            page=page,
            success=success,
            event=event,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(notifications, total, page, pageSize)

    @router.get("/{notificationid}", response_model=WebhookNotification)
    async def get_notification(
        notificationid: UUID,
        org: Organization = Depends(org_owner_dep),
    ):
        return await ops.get_notification(org, notificationid)

    @router.post("/{notificationid}/retry")
    async def retry_notification(
        notificationid: UUID,
        org: Organization = Depends(org_owner_dep),
    ):
        notification = await ops.get_notification(org, notificationid)
        # resend in the background; the response does not wait for delivery
        task = asyncio.create_task(ops.send_notification(org, notification))
        retry_tasks.add(task)
        task.add_done_callback(retry_tasks.discard)
        return {"success": True}

    init_openapi_webhooks(app)

    org_ops.router.include_router(router)

    return ops
def init_openapi_webhooks(app):
    """add webhooks declarations for openapi

    Registers one function per webhook event type on app.webhooks. The
    functions have no body beyond their docstring: each one only declares
    the payload model (its body parameter) of the corresponding webhook.
    """

    # the body parameters are never used, they only declare the payload type
    # pylint: disable=unused-argument
    @app.webhooks.post(WebhookEventType.CRAWL_STARTED)
    def crawl_started(body: CrawlStartedBody):
        """Sent when a crawl is started"""

    @app.webhooks.post(WebhookEventType.CRAWL_FINISHED)
    def crawl_finished(body: CrawlFinishedBody):
        """Sent when a crawl has finished"""

    @app.webhooks.post(WebhookEventType.CRAWL_DELETED)
    def crawl_deleted(body: CrawlDeletedBody):
        """Sent when a crawl is deleted"""

    @app.webhooks.post(WebhookEventType.QA_ANALYSIS_STARTED)
    def qa_analysis_started(body: QaAnalysisStartedBody):
        """Sent when a qa analysis run is started"""

    @app.webhooks.post(WebhookEventType.QA_ANALYSIS_FINISHED)
    def qa_analysis_finished(body: QaAnalysisFinishedBody):
        """Sent when a qa analysis run has finished"""

    @app.webhooks.post(WebhookEventType.CRAWL_REVIEWED)
    def crawl_reviewed(body: CrawlReviewedBody):
        """Sent when a crawl has been reviewed in qa"""

    @app.webhooks.post(WebhookEventType.UPLOAD_FINISHED)
    def upload_finished(body: UploadFinishedBody):
        """Sent when an upload has finished"""

    @app.webhooks.post(WebhookEventType.UPLOAD_DELETED)
    def upload_deleted(body: UploadDeletedBody):
        """Sent when an upload is deleted"""

    @app.webhooks.post(WebhookEventType.ADDED_TO_COLLECTION)
    def added_to_collection(body: CollectionItemAddedBody):
        """Sent when an archived item (crawl or upload)
        is added to a collection"""

    @app.webhooks.post(WebhookEventType.REMOVED_FROM_COLLECTION)
    def remove_from_collection(body: CollectionItemRemovedBody):
        """Sent when an archived item (crawl or upload)
        is removed from a collection"""

    @app.webhooks.post(WebhookEventType.COLLECTION_DELETED)
    def collection_deleted(body: CollectionDeletedBody):
        """Sent when a collection is deleted"""