browsertrix/backend/test/test_webhooks.py
Tessa Walsh f3cbd9e179
Add crawl, upload, and collection delete webhook event notifications (#1363)
Fixes #1307
Fixes #1132
Related to #1306

Deleted webhook notifications include the org id and item/collection id.
This PR also adds API docs for the new webhooks and extends the
existing tests to cover them.

It also includes some cleanup of the existing webhooks:
- Remove `downloadUrls` from item finished webhook bodies
- Rename collection webhook body `downloadUrls` to `downloadUrl`, since
we only ever have one per collection
- Fix API docs for existing webhooks, one of which had the wrong
response body
2023-11-09 18:19:08 -08:00
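
For illustration, a rough sketch of the new delete notification bodies, inferred from the assertions in the test below; ids are placeholders, not real values:

# Hypothetical payload sketches only
crawl_deleted = {"event": "crawlDeleted", "orgId": "<org-uuid>", "itemId": "<crawl-id>"}
upload_deleted = {"event": "uploadDeleted", "orgId": "<org-uuid>", "itemId": "<upload-id>"}
collection_deleted = {"event": "collectionDeleted", "orgId": "<org-uuid>", "collectionId": "<collection-id>"}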


import json
import os
import time

import requests

from .conftest import API_PREFIX
from .utils import read_in_chunks
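# Webhook event id shared across the tests below; set in test_list_webhook_events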
_webhook_event_id = None

curr_dir = os.path.dirname(os.path.realpath(__file__))

ECHO_SERVER_URL = "http://localhost:18080"

# Pull address to echo server running on host from CI env var.
# If not set, default to host.docker.internal (for local testing with
# Docker Desktop).
ECHO_SERVER_URL_FROM_K8S = os.environ.get(
    "ECHO_SERVER_HOST_URL", "http://host.docker.internal:18080"
)


def test_list_webhook_events(admin_auth_headers, default_org_id):
    # Verify that webhook URLs have been set in previous tests
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    urls = data["webhookUrls"]
    assert urls["crawlStarted"]
    assert urls["crawlFinished"]
    assert urls["crawlDeleted"]
    assert urls["uploadFinished"]
    assert urls["uploadDeleted"]
    assert urls["addedToCollection"]
    assert urls["removedFromCollection"]
    assert urls["collectionDeleted"]

    # Verify list endpoint works as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    for item in data["items"]:
        assert item["id"]
        assert item["event"]
        assert item["oid"]
        assert item["body"]
        assert item["success"] is False
        assert item["attempts"] == 1
        assert item["created"]
        assert item["lastAttempted"]
    global _webhook_event_id
    _webhook_event_id = data["items"][0]["id"]
    assert _webhook_event_id


def test_get_webhook_event(admin_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    item = r.json()

    assert item["id"]
    assert item["oid"]
    assert item["success"] is False
    assert item["attempts"] == 1
    assert item["created"]
    assert item["lastAttempted"]

    body = item["body"]
    assert body

    event = item["event"]
    assert event

    if event in ("crawlFinished", "uploadFinished"):
        assert len(body["resources"]) >= 1
        assert body["itemId"]

    elif event == "crawlStarted":
        assert len(body.get("resources", [])) == 0
        assert body["itemId"]

    elif event in ("addedToCollection", "removedFromCollection"):
        assert len(body.get("resources", [])) == 0
        assert body["downloadUrl"]
        assert body["collectionId"]
        assert len(body["itemIds"]) >= 1


def test_retry_webhook_event(admin_auth_headers, default_org_id):
    # Expect to fail because we haven't set up URLs that accept webhooks
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Give it some time to run with exponential backoff retries
    time.sleep(90)

    # Verify attempts have been increased
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    item = r.json()
    assert item["id"]
    assert item["event"]
    assert item["oid"]
    assert item["body"]
    assert item["success"] is False
    assert item["attempts"] == 2
    assert item["created"]
    assert item["lastAttempted"]


def test_webhooks_sent(
    admin_auth_headers,
    default_org_id,
    all_crawls_crawl_id,
    echo_server,
):
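    # The echo_server fixture should ensure the local echo server that
    # receives the webhook POSTs verified below is running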
    # Reconfigure event webhooks to use echo server
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
        headers=admin_auth_headers,
        json={
            "crawlStarted": ECHO_SERVER_URL_FROM_K8S,
            "crawlFinished": ECHO_SERVER_URL_FROM_K8S,
            "crawlDeleted": ECHO_SERVER_URL_FROM_K8S,
            "uploadFinished": ECHO_SERVER_URL_FROM_K8S,
            "uploadDeleted": ECHO_SERVER_URL_FROM_K8S,
            "addedToCollection": ECHO_SERVER_URL_FROM_K8S,
            "removedFromCollection": ECHO_SERVER_URL_FROM_K8S,
            "collectionDeleted": ECHO_SERVER_URL_FROM_K8S,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Create collection with all_crawls_crawl_id already in it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "name": "Event webhooks test collection",
            "crawlIds": [all_crawls_crawl_id],
        },
    )
    assert r.status_code == 200
    webhooks_coll_id = r.json()["id"]
    assert webhooks_coll_id

    # Create and run workflow that adds crawl to collection
    crawl_data = {
        "runNow": True,
        "name": "Webhook crawl test",
        "autoAddCollections": [webhooks_coll_id],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    assert r.status_code == 200
    data = r.json()

    webhooks_config_id = data["id"]
    assert webhooks_config_id
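    # runNow=True also starts a crawl; its id is returned as "run_now_job"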
    webhooks_crawl_id = data["run_now_job"]

    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Create upload and add to collection
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=webhookstest.wacz&name=Webhooks%20Upload&collections={webhooks_coll_id}",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )
        assert r.status_code == 200
        data = r.json()
        assert data["added"]
        webhooks_upload_id = data["id"]

    # Remove upload from collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
        json={"crawlIds": [webhooks_upload_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"]

    # Delete upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        json={"crawl_ids": [webhooks_upload_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Remove crawls from collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
        json={"crawlIds": [webhooks_crawl_id, all_crawls_crawl_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"]

    # Delete crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        json={"crawl_ids": [webhooks_crawl_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Delete collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    # Wait to ensure async notifications are all sent
    time.sleep(30)

    # Send GET request to echo server to retrieve and verify POSTed data
    r = requests.get(ECHO_SERVER_URL)
    assert r.status_code == 200
    data = r.json()

    crawl_started_count = 0
    crawl_finished_count = 0
    crawl_deleted_count = 0
    upload_finished_count = 0
    upload_deleted_count = 0
    added_to_collection_count = 0
    removed_from_collection_count = 0
    collection_deleted_count = 0

    for post in data["post_bodies"]:
        assert post["orgId"]
        event = post["event"]
        assert event

        if event == "crawlStarted":
            crawl_started_count += 1
            assert post["itemId"]
            assert post["scheduled"] in (True, False)
            assert post.get("resources") is None

        elif event == "crawlFinished":
            crawl_finished_count += 1
            assert post["itemId"]
            assert post["state"]
            assert post["resources"]

        elif event == "crawlDeleted":
            crawl_deleted_count += 1
            assert post["itemId"]

        elif event == "uploadFinished":
            upload_finished_count += 1
            assert post["itemId"]
            assert post["state"]
            assert post["resources"]
            assert post.get("downloadUrls") is None

        elif event == "uploadDeleted":
            upload_deleted_count += 1
            assert post["itemId"]

        elif event == "addedToCollection":
            added_to_collection_count += 1
            assert post["downloadUrl"]
            assert post.get("resources") is None
            assert post["itemIds"]
            assert post["collectionId"]

        elif event == "removedFromCollection":
            removed_from_collection_count += 1
            assert post["downloadUrl"]
            assert post.get("resources") is None
            assert post["itemIds"]
            assert post["collectionId"]

        elif event == "collectionDeleted":
            collection_deleted_count += 1
            assert post["collectionId"]

    # Allow for some variability here due to timing of crawls
    assert crawl_started_count >= 1
    assert crawl_finished_count >= 1
    assert crawl_deleted_count == 1
    assert upload_finished_count == 1
    assert upload_deleted_count == 1
    assert added_to_collection_count >= 2
    assert removed_from_collection_count == 2
    assert collection_deleted_count == 1

    # Check that we've had expected number of successful webhook notifications
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks?success=True",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] >= 7