"""Tests for event webhooks (backend/test/test_webhooks.py).

Covers webhook notifications for crawls, uploads, and collections,
including the QA-related events qaAnalysisStarted, qaAnalysisFinished,
and crawlReviewed (#1974, fixes #1957).
"""

import os
import time

import requests

from .conftest import API_PREFIX
from .utils import read_in_chunks

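# Module-level state shared between tests: set in test_list_webhook_events
# and reused by the tests that follow.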
_webhook_event_id = None

curr_dir = os.path.dirname(os.path.realpath(__file__))

ECHO_SERVER_URL = "http://localhost:18080"

# Pull the address of the echo server running on the host from a CI env var.
# If not set, default to host.docker.internal (for local testing with
# Docker Desktop).
ECHO_SERVER_URL_FROM_K8S = os.environ.get(
    "ECHO_SERVER_HOST_URL", "http://host.docker.internal:18080"
)


def test_list_webhook_events(admin_auth_headers, default_org_id):
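    """Verify webhook URLs set in previous tests and list stored events."""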
    # Verify that webhook URLs have been set in previous tests
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    urls = data["webhookUrls"]
    assert urls["crawlStarted"]
    assert urls["crawlFinished"]
    assert urls["crawlDeleted"]
    assert urls["uploadFinished"]
    assert urls["uploadDeleted"]
    assert urls["addedToCollection"]
    assert urls["removedFromCollection"]
    assert urls["collectionDeleted"]

    # Verify list endpoint works as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] > 0
    for item in data["items"]:
        assert item["id"]
        assert item["event"]
        assert item["oid"]
        assert item["body"]
        assert item["success"] is False
        assert item["attempts"] == 1
        assert item["created"]
        assert item["lastAttempted"]

    global _webhook_event_id
    _webhook_event_id = data["items"][0]["id"]
    assert _webhook_event_id


def test_get_webhook_event(admin_auth_headers, default_org_id):
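    """Fetch a single webhook event and check its fields by event type."""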
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    item = r.json()
    assert item["id"]
    assert item["oid"]
    assert item["success"] is False
    assert item["attempts"] == 1
    assert item["created"]
    assert item["lastAttempted"]

    body = item["body"]
    assert body

    event = item["event"]
    assert event

    if event in ("crawlFinished", "uploadFinished"):
        assert len(body["resources"]) >= 1
        assert body["itemId"]
    # Note: `event in ("crawlStarted")` would be a substring check on a
    # string, not tuple membership, so compare for equality instead
    elif event == "crawlStarted":
        assert len(body.get("resources", [])) == 0
        assert body["itemId"]
    elif event in ("addedToCollection", "removedFromCollection"):
        assert len(body.get("resources", [])) == 0
        assert body["downloadUrl"]
        assert body["collectionId"]
        assert len(body["itemIds"]) >= 1


def test_retry_webhook_event(admin_auth_headers, default_org_id):
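    """Retry a failed webhook event and verify its attempt count increases."""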
    # Expect to fail because we haven't set up URLs that accept webhooks
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Give it some time to run with exponential backoff retries
    time.sleep(90)

    # Verify attempts have been increased
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    item = r.json()
    assert item["id"]
    assert item["event"]
    assert item["oid"]
    assert item["body"]
    assert item["success"] is False
    assert item["attempts"] == 2
    assert item["created"]
    assert item["lastAttempted"]


def test_webhooks_sent(
    admin_auth_headers,
    default_org_id,
    all_crawls_crawl_id,
    echo_server,
):
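    """Point webhooks at the echo server, trigger every event, verify payloads."""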
    # Reconfigure event webhooks to use echo server
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
        headers=admin_auth_headers,
        json={
            "crawlStarted": ECHO_SERVER_URL_FROM_K8S,
            "crawlFinished": ECHO_SERVER_URL_FROM_K8S,
            "crawlDeleted": ECHO_SERVER_URL_FROM_K8S,
            "qaAnalysisStarted": ECHO_SERVER_URL_FROM_K8S,
            "qaAnalysisFinished": ECHO_SERVER_URL_FROM_K8S,
            "crawlReviewed": ECHO_SERVER_URL_FROM_K8S,
            "uploadFinished": ECHO_SERVER_URL_FROM_K8S,
            "uploadDeleted": ECHO_SERVER_URL_FROM_K8S,
            "addedToCollection": ECHO_SERVER_URL_FROM_K8S,
            "removedFromCollection": ECHO_SERVER_URL_FROM_K8S,
            "collectionDeleted": ECHO_SERVER_URL_FROM_K8S,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Create collection with all_crawls_crawl_id already in it
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={
            "name": "Event webhooks test collection",
            "crawlIds": [all_crawls_crawl_id],
        },
    )
    assert r.status_code == 200
    webhooks_coll_id = r.json()["id"]
    assert webhooks_coll_id

    # Create and run workflow that adds crawl to collection
    crawl_data = {
        "runNow": True,
        "name": "Webhook crawl test",
        "autoAddCollections": [webhooks_coll_id],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    assert r.status_code == 200
    data = r.json()
    webhooks_config_id = data["id"]
    assert webhooks_config_id
    webhooks_crawl_id = data["run_now_job"]

    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Run QA analysis on crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/qa/start",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    qa_run_id = r.json()["started"]

    # Wait for QA to complete, polling activeQA until it clears
    count = 0
    max_attempts = 24
    while count < max_attempts:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/qa/activeQA",
            headers=admin_auth_headers,
        )
        data = r.json()
        if not data["qa"]:
            break
        if count + 1 == max_attempts:
            assert False, "QA run did not complete in time"
        time.sleep(5)
        count += 1

    # Review crawl
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}",
        headers=admin_auth_headers,
        json={"reviewStatus": 5, "description": "Perfect crawl"},
    )
    assert r.status_code == 200

    # Create upload and add to collection
    with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
        r = requests.put(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=webhookstest.wacz&name=Webhooks%20Upload&collections={webhooks_coll_id}",
            headers=admin_auth_headers,
            data=read_in_chunks(fh),
        )
    assert r.status_code == 200
    data = r.json()
    assert data["added"]
    webhooks_upload_id = data["id"]

    # Remove upload from collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
        json={"crawlIds": [webhooks_upload_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"]

    # Delete upload
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
        json={"crawl_ids": [webhooks_upload_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Remove crawls from collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
        json={"crawlIds": [webhooks_crawl_id, all_crawls_crawl_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"]

    # Delete crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        json={"crawl_ids": [webhooks_crawl_id]},
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Delete collection
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    # Wait to ensure async notifications are all sent
    time.sleep(30)

    # Send GET request to echo server to retrieve and verify POSTed data
    r = requests.get(ECHO_SERVER_URL)
    assert r.status_code == 200
    data = r.json()

    crawl_started_count = 0
    crawl_finished_count = 0
    crawl_deleted_count = 0
    qa_analysis_started_count = 0
    qa_analysis_finished_count = 0
    crawl_reviewed_count = 0
    upload_finished_count = 0
    upload_deleted_count = 0
    added_to_collection_count = 0
    removed_from_collection_count = 0
    collection_deleted_count = 0
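
    # Each recorded POST body is one webhook notification payload. For
    # example, a crawlFinished payload looks roughly like (illustrative
    # values, not asserted verbatim):
    #   {"event": "crawlFinished", "orgId": "...", "itemId": "...",
    #    "state": "complete", "resources": [...]}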
    for post in data["post_bodies"]:
        assert post["orgId"]
        event = post["event"]
        assert event

        if event == "crawlStarted":
            crawl_started_count += 1
            assert post["itemId"]
            assert post["scheduled"] in (True, False)
            assert post.get("resources") is None

        elif event == "crawlFinished":
            crawl_finished_count += 1
            assert post["itemId"]
            assert post["state"]
            assert post["resources"]

        elif event == "crawlDeleted":
            crawl_deleted_count += 1
            assert post["itemId"]

        elif event == "qaAnalysisStarted":
            qa_analysis_started_count += 1
            assert post["itemId"] == webhooks_crawl_id
            assert post["qaRunId"] == qa_run_id

        elif event == "qaAnalysisFinished":
            qa_analysis_finished_count += 1
            assert post["itemId"] == webhooks_crawl_id
            assert post["qaRunId"] == qa_run_id
            assert post["resources"]

        elif event == "crawlReviewed":
            crawl_reviewed_count += 1
            assert post["itemId"] == webhooks_crawl_id

        elif event == "uploadFinished":
            upload_finished_count += 1
            assert post["itemId"]
            assert post["state"]
            assert post["resources"]
            assert post.get("downloadUrls") is None

        elif event == "uploadDeleted":
            upload_deleted_count += 1
            assert post["itemId"]

        elif event == "addedToCollection":
            added_to_collection_count += 1
            assert post["downloadUrl"]
            assert post.get("resources") is None
            assert post["itemIds"]
            assert post["collectionId"]

        elif event == "removedFromCollection":
            removed_from_collection_count += 1
            assert post["downloadUrl"]
            assert post.get("resources") is None
            assert post["itemIds"]
            assert post["collectionId"]

        elif event == "collectionDeleted":
            collection_deleted_count += 1
            assert post["collectionId"]

    # Allow for some variability here due to timing of crawls
    assert crawl_started_count >= 1
    assert crawl_finished_count >= 1
    assert crawl_deleted_count == 1
    assert qa_analysis_started_count == 1
    assert qa_analysis_finished_count == 1
    assert crawl_reviewed_count == 1
    assert upload_finished_count == 1
    assert upload_deleted_count == 1
    assert added_to_collection_count >= 2
    assert removed_from_collection_count == 2
    assert collection_deleted_count == 1

    # Check that we've had the expected number of successful webhook notifications
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/webhooks?success=True",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] >= 10