Provide full resources in archived items finished webhooks (#1308)

Fixes #1306 

- Include full `resources` (with `expireAt` as an ISO-format string) in
  crawlFinished and uploadFinished webhook notifications instead of the
  `downloadUrls` field, which is retained for collection webhooks
  (see the illustrative payload sketch below)
- Set the default presigned URL duration to one minute short of one week and
  enforce the maximum supported by S3
- Add `storage_presign_duration_minutes`, commented out, to the helm values.yaml
- Update tests

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Authored by Tessa Walsh on 2023-10-23 22:01:58 -04:00, committed by GitHub
parent 2e5952a444
commit d58747dfa2
6 changed files with 48 additions and 33 deletions
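
For illustration only, a rough sketch of what a crawlFinished notification body might look like after this change. Only itemId, state, resources, and the resource fields path/hash/size/crawlId/expireAt come from this diff; the event field, ids, and URL are placeholders, and whether path carries the presigned URL is an assumption here.

# Hypothetical crawlFinished payload (sketch only; the real shape is defined by
# BaseArchivedItemBody / CrawlFileOut in the diff below)
example_body = {
    "event": "crawlFinished",      # assumed discriminator field
    "itemId": "example-crawl-id",  # placeholder
    "state": "complete",
    "resources": [
        {
            "path": "https://example-bucket/crawl.wacz?X-Amz-Signature=...",  # presigned URL (assumed)
            "hash": "sha256:...",
            "size": 123456789,
            "crawlId": "example-crawl-id",
            "expireAt": "2023-10-30T21:59:00",  # ISO string set one week minus a minute out
        }
    ],
}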

View File

@@ -1,6 +1,5 @@
 """ base crawl type """
-import asyncio
 import uuid
 import os
 from datetime import timedelta
@@ -44,6 +43,12 @@ NON_RUNNING_STATES = (*FAILED_STATES, *SUCCESSFUL_STATES)
 ALL_CRAWL_STATES = (*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES)

+# Presign duration must be less than 604800 seconds (one week),
+# so set this one minute short of a week.
+PRESIGN_MINUTES_MAX = 10079
+PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX
+
 # ============================================================================
 # pylint: disable=too-many-instance-attributes
 class BaseCrawlOps:
@@ -62,8 +67,12 @@ class BaseCrawlOps:
         self.colls = colls
         self.storage_ops = storage_ops

+        presign_duration_minutes = int(
+            os.environ.get("PRESIGN_DURATION_MINUTES") or PRESIGN_MINUTES_DEFAULT
+        )
+
         self.presign_duration_seconds = (
-            int(os.environ.get("PRESIGN_DURATION_MINUTES", 60)) * 60
+            min(presign_duration_minutes, PRESIGN_MINUTES_MAX) * 60
         )

     async def get_crawl_raw(
@@ -362,7 +371,6 @@
         delta = timedelta(seconds=self.presign_duration_seconds)

-        updates = []
         out_files = []

         for file_ in files:
@@ -374,17 +382,20 @@
                 presigned_url = await self.storage_ops.get_presigned_url(
                     org, file_, self.presign_duration_seconds
                 )
-                updates.append(
-                    (
-                        {"files.filename": file_.filename},
-                        {
-                            "$set": {
-                                "files.$.presignedUrl": presigned_url,
-                                "files.$.expireAt": exp,
-                            }
-                        },
-                    )
+                await self.crawls.find_one_and_update(
+                    {"files.filename": file_.filename},
+                    {
+                        "$set": {
+                            "files.$.presignedUrl": presigned_url,
+                            "files.$.expireAt": exp,
+                        }
+                    },
                 )
+                file_.expireAt = exp
+
+            expire_at_str = ""
+            if file_.expireAt:
+                expire_at_str = file_.expireAt.isoformat()

             out_files.append(
                 CrawlFileOut(
@@ -393,20 +404,12 @@
                     hash=file_.hash,
                     size=file_.size,
                     crawlId=crawl_id,
+                    expireAt=expire_at_str,
                 )
             )

-        if updates:
-            asyncio.create_task(self._update_presigned(updates))
-
-        # print("presigned", out_files)
-
         return out_files

-    async def _update_presigned(self, updates):
-        for update in updates:
-            await self.crawls.find_one_and_update(*update)
-
     @contextlib.asynccontextmanager
     async def get_redis(self, crawl_id):
         """get redis url for crawl id"""

View File

@@ -406,7 +406,9 @@ class CrawlFileOut(BaseModel):
     path: str
     hash: str
     size: int
     crawlId: Optional[str]
+    expireAt: Optional[str]

 # ============================================================================
@@ -1053,6 +1055,7 @@ class BaseArchivedItemBody(WebhookNotificationBody):
     """Webhook notification POST body for when archived item is started or finished"""

     itemId: str
+    resources: Optional[List[CrawlFileOut]] = None

 # ============================================================================

View File

@@ -189,12 +189,7 @@ class EventWebhookOps:
             print(f"Crawl {crawl_id} not found, skipping event webhook", flush=True)
             return

-        download_urls = []
-        for resource in crawl.resources:
-            download_url = f"{org.origin}{resource.path}"
-            download_urls.append(download_url)
-
-        body.downloadUrls = download_urls
+        body.resources = crawl.resources

         notification = WebhookNotification(
             id=uuid.uuid4(),

View File

@@ -81,14 +81,17 @@ def test_get_webhook_event(admin_auth_headers, default_org_id):
         assert event

         if event in ("crawlFinished", "uploadFinished"):
-            assert len(body["downloadUrls"]) >= 1
+            assert len(body["resources"]) >= 1
+            assert len(body.get("downloadUrls", [])) == 0
             assert body["itemId"]

         elif event in ("crawlStarted"):
-            assert len(body["downloadUrls"]) == 0
+            assert len(body.get("resources", [])) == 0
+            assert len(body.get("downloadUrls", [])) == 0
             assert body["itemId"]

         elif event in ("addedToCollection", "removedFromCollection"):
+            assert len(body.get("resources", [])) == 0
             assert len(body["downloadUrls"]) == 1
             assert body["collectionId"]
             assert len(body["itemIds"]) >= 1
@@ -246,28 +249,33 @@ def test_webhooks_sent(
             assert post["itemId"]
             assert post["scheduled"] in (True, False)
             assert post.get("downloadUrls") is None
+            assert post.get("resources") is None

         elif event == "crawlFinished":
             crawl_finished_count += 1
             assert post["itemId"]
             assert post["state"]
-            assert post["downloadUrls"]
+            assert post["resources"]
+            assert post.get("downloadUrls") is None

         elif event == "uploadFinished":
             upload_finished_count += 1
             assert post["itemId"]
             assert post["state"]
-            assert post["downloadUrls"]
+            assert post["resources"]
+            assert post.get("downloadUrls") is None

         elif event == "addedToCollection":
             added_to_collection_count += 1
             assert post["downloadUrls"] and len(post["downloadUrls"]) == 1
+            assert post.get("resources") is None
             assert post["itemIds"]
             assert post["collectionId"]

         elif event == "removedFromCollection":
             removed_from_collection_count += 1
             assert post["downloadUrls"] and len(post["downloadUrls"]) == 1
+            assert post.get("resources") is None
             assert post["itemIds"]
             assert post["collectionId"]

View File

@@ -36,7 +36,7 @@ data:
   RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
-  PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes | default 60 }}"
+  PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes }}"
   FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"

View File

@@ -276,6 +276,12 @@ storages:
   # shared_storage_profile:

+# optional: duration in minutes for WACZ download links to be valid
+# used by webhooks and replay
+# max value = 10079 (one week minus one minute)
+# storage_presign_duration_minutes: 10079
+
 # Email Options
 # =========================================
 email: