browsertrix/backend/test_nightly/test_webhooks.py
Tessa Walsh d91a3bc088
Run webhook tests nightly (#2738)
Fixes #2737 

- Moves webhook-related tests to run nightly, to speed up CI runs and
avoid the periodic failures we've been getting lately.
- Also ensures that every try/except block with a time.sleep in the 'try'
also sleeps in the 'except', so a failed request can't turn the polling loop
into a fast-spinning retry (see the sketch below)
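
A minimal sketch of that pattern (illustrative only; `wait_for_state` and its
parameters are placeholders, not code from this repo):

```python
import time

import requests


def wait_for_state(url, finished_states, delay=5):
    """Poll url until its JSON 'state' field reaches a finished state."""
    while True:
        try:
            r = requests.get(url)
            if r.json()["state"] in finished_states:
                return
            time.sleep(delay)
        except Exception:
            # The fix this PR applies: sleep in the 'except' branch too,
            # so a failing request doesn't spin the loop hot.
            time.sleep(delay)
```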

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2025-07-15 18:05:57 -07:00
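
For context, the tests below assume `echo_server.py` implements roughly the
following contract, inferred from how `test_webhooks_sent` uses it (a sketch,
not the actual script): every POSTed JSON body is recorded and returned on
GET as `{"post_bodies": [...]}`.

```python
# Sketch of the assumed echo_server.py contract, not the real script.
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

post_bodies = []


class EchoHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        # Record each webhook notification body as parsed JSON
        length = int(self.headers.get("Content-Length", 0))
        post_bodies.append(json.loads(self.rfile.read(length)))
        self.send_response(200)
        self.end_headers()

    def do_GET(self):
        # Return everything POSTed so far, as the test assertions expect
        body = json.dumps({"post_bodies": post_bodies}).encode()
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(body)


if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 18080), EchoHandler).serve_forever()
```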


import json
import os
import subprocess
import time

import pytest
import requests

from .conftest import API_PREFIX
from .utils import read_in_chunks

_webhook_event_id = None
curr_dir = os.path.dirname(os.path.realpath(__file__))
ECHO_SERVER_URL = "http://localhost:18080"
# Pull address to echo server running on host from CI env var.
# If not set, default to host.docker.internal (for local testing with
# Docker Desktop).
ECHO_SERVER_URL_FROM_K8S = os.environ.get(
"ECHO_SERVER_HOST_URL", "http://host.docker.internal:18080"
)
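# Example (hypothetical, for illustration): a CI job could export
#   ECHO_SERVER_HOST_URL="http://<runner-ip>:18080"
# so that pods inside the k8s cluster can reach the echo server on the host.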
FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]
SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]
FINISHED_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]

@pytest.fixture(scope="function")
def echo_server():
    print("Echo server starting", flush=True)
    p = subprocess.Popen(["python3", os.path.join(curr_dir, "echo_server.py")])
    print("Echo server started", flush=True)
    # Give the server a moment to bind before tests POST to it
    time.sleep(1)
    yield p
    # Let any in-flight webhook deliveries land before tearing down
    time.sleep(10)
    print("Echo server terminating", flush=True)
    p.terminate()
    print("Echo server terminated", flush=True)

@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "All Crawls Test Crawl",
"description": "Lorem ipsum",
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"exclude": "community",
"limit": 3,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] in FINISHED_STATES:
break
time.sleep(5)
# Add description to crawl
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
headers=crawler_auth_headers,
json={"description": "Lorem ipsum"},
)
assert r.status_code == 200
return crawl_id

def test_update_event_webhook_urls_org_admin(admin_auth_headers, default_org_id):
# Verify no URLs are configured
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
    if data.get("webhookUrls"):
        webhooks = data["webhookUrls"]
assert webhooks.get("crawlStarted") is None
assert webhooks.get("crawlFinished") is None
assert webhooks.get("crawlDeleted") is None
assert webhooks.get("uploadFinished") is None
assert webhooks.get("uploadDeleted") is None
assert webhooks.get("addedToCollection") is None
assert webhooks.get("removedFromCollection") is None
assert webhooks.get("collectionDeleted") is None
# Set URLs and verify
CRAWL_STARTED_URL = "https://example.com/crawl/started"
CRAWL_FINISHED_URL = "https://example.com/crawl/finished"
CRAWL_DELETED_URL = "https://example.com/crawl/deleted"
UPLOAD_FINISHED_URL = "https://example.com/upload/finished"
UPLOAD_DELETED_URL = "https://example.com/upload/deleted"
COLL_ADDED_URL = "https://example.com/coll/added"
COLL_REMOVED_URL = "http://example.com/coll/removed"
COLL_DELETED_URL = "http://example.com/coll/deleted"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=admin_auth_headers,
json={
"crawlStarted": CRAWL_STARTED_URL,
"crawlFinished": CRAWL_FINISHED_URL,
"crawlDeleted": CRAWL_DELETED_URL,
"uploadFinished": UPLOAD_FINISHED_URL,
"uploadDeleted": UPLOAD_DELETED_URL,
"addedToCollection": COLL_ADDED_URL,
"removedFromCollection": COLL_REMOVED_URL,
"collectionDeleted": COLL_DELETED_URL,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
urls = data["webhookUrls"]
assert urls["crawlStarted"] == CRAWL_STARTED_URL
assert urls["crawlFinished"] == CRAWL_FINISHED_URL
assert urls["crawlDeleted"] == CRAWL_DELETED_URL
assert urls["uploadFinished"] == UPLOAD_FINISHED_URL
assert urls["uploadDeleted"] == UPLOAD_DELETED_URL
assert urls["addedToCollection"] == COLL_ADDED_URL
assert urls["removedFromCollection"] == COLL_REMOVED_URL
assert urls["collectionDeleted"] == COLL_DELETED_URL

def test_update_event_webhook_urls_org_crawler(crawler_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=crawler_auth_headers,
json={
"crawlStarted": "https://example.com/crawlstarted",
"crawlFinished": "https://example.com/crawlfinished",
"uploadFinished": "https://example.com/uploadfinished",
"addedToCollection": "https://example.com/added",
"removedFromCollection": "https://example.com/removed",
},
)
assert r.status_code == 403
assert r.json()["detail"] == "User does not have permission to perform this action"

def test_list_webhook_events(admin_auth_headers, default_org_id, crawl_id_wr):
# Verify that webhook URLs have been set in previous tests
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
urls = data["webhookUrls"]
assert urls["crawlStarted"]
assert urls["crawlFinished"]
assert urls["crawlDeleted"]
assert urls["uploadFinished"]
assert urls["uploadDeleted"]
assert urls["addedToCollection"]
assert urls["removedFromCollection"]
assert urls["collectionDeleted"]
# Verify list endpoint works as expected
# At this point we expect webhook attempts to fail since they're not
# configured against a valid endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
for item in data["items"]:
assert item["id"]
assert item["event"]
assert item["oid"]
assert item["body"]
assert item["success"] is False
assert item["attempts"] == 1
assert item["created"]
assert item["lastAttempted"]
global _webhook_event_id
_webhook_event_id = data["items"][0]["id"]
assert _webhook_event_id

def test_get_webhook_event(admin_auth_headers, default_org_id, crawl_id_wr):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
item = r.json()
assert item["id"]
assert item["oid"]
assert item["success"] is False
assert item["attempts"] == 1
assert item["created"]
assert item["lastAttempted"]
body = item["body"]
assert body
event = item["event"]
assert event
if event in ("crawlFinished", "uploadFinished"):
assert len(body["resources"]) >= 1
assert body["resources"][0]["expireAt"]
assert body["itemId"]
elif event in ("crawlStarted"):
assert len(body.get("resources", [])) == 0
assert body["itemId"]
elif event in ("addedToCollection", "removedFromCollection"):
assert len(body.get("resources", [])) == 0
assert body["downloadUrl"]
assert body["collectionId"]
assert len(body["itemIds"]) >= 1

def test_retry_webhook_event(admin_auth_headers, default_org_id, crawl_id_wr):
# Expect to fail because we haven't set up URLs that accept webhooks
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
    # Give the retry time to run; the backend uses exponential backoff, so 90
    # seconds is a generous window for the next attempt to land
time.sleep(90)
# Verify attempts have been increased
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
item = r.json()
assert item["id"]
assert item["event"]
assert item["oid"]
assert item["body"]
assert item["success"] is False
assert item["attempts"] == 2
assert item["created"]
assert item["lastAttempted"]

def test_webhooks_sent(
admin_auth_headers,
default_org_id,
all_crawls_crawl_id,
echo_server,
):
# Reconfigure event webhooks to use echo server
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=admin_auth_headers,
json={
"crawlStarted": ECHO_SERVER_URL_FROM_K8S,
"crawlFinished": ECHO_SERVER_URL_FROM_K8S,
"crawlDeleted": ECHO_SERVER_URL_FROM_K8S,
"qaAnalysisStarted": ECHO_SERVER_URL_FROM_K8S,
"qaAnalysisFinished": ECHO_SERVER_URL_FROM_K8S,
"crawlReviewed": ECHO_SERVER_URL_FROM_K8S,
"uploadFinished": ECHO_SERVER_URL_FROM_K8S,
"uploadDeleted": ECHO_SERVER_URL_FROM_K8S,
"addedToCollection": ECHO_SERVER_URL_FROM_K8S,
"removedFromCollection": ECHO_SERVER_URL_FROM_K8S,
"collectionDeleted": ECHO_SERVER_URL_FROM_K8S,
},
)
assert r.status_code == 200
assert r.json()["updated"]
# Create collection with all_crawls_crawl_id already in it
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=admin_auth_headers,
json={
"name": "Event webhooks test collection",
"crawlIds": [all_crawls_crawl_id],
},
)
assert r.status_code == 200
webhooks_coll_id = r.json()["id"]
assert webhooks_coll_id
# Create and run workflow that adds crawl to collection
crawl_data = {
"runNow": True,
"name": "Webhook crawl test",
"autoAddCollections": [webhooks_coll_id],
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"limit": 2,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
assert r.status_code == 200
data = r.json()
webhooks_config_id = data["id"]
assert webhooks_config_id
webhooks_crawl_id = data["run_now_job"]
    # Wait for crawl to complete, failing fast if it ends in an error state
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        if data["state"] in FAILED_STATES:
            pytest.fail(f"crawl unexpectedly finished in state {data['state']}")
        time.sleep(5)
# Run QA analysis on crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/qa/start",
headers=admin_auth_headers,
)
assert r.status_code == 200
qa_run_id = r.json()["started"]
    # Wait for QA to complete: 24 attempts x 5s = up to ~2 minutes
    count = 0
    max_attempts = 24
    while count < max_attempts:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}/qa/activeQA",
            headers=admin_auth_headers,
        )
        data = r.json()
        if not data["qa"]:
            break
        if count + 1 == max_attempts:
            pytest.fail("QA analysis did not complete in time")
        time.sleep(5)
        count += 1
# Review crawl
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{webhooks_crawl_id}",
headers=admin_auth_headers,
json={"reviewStatus": 5, "description": "Perfect crawl"},
)
assert r.status_code == 200
# Create upload and add to collection
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=webhookstest.wacz&name=Webhooks%20Upload&collections={webhooks_coll_id}",
headers=admin_auth_headers,
data=read_in_chunks(fh),
)
assert r.status_code == 200
data = r.json()
assert data["added"]
webhooks_upload_id = data["id"]
# Remove upload from collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
json={"crawlIds": [webhooks_upload_id]},
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"]
# Delete upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
json={"crawl_ids": [webhooks_upload_id]},
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]
# Remove crawls from collection
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}/remove",
json={"crawlIds": [webhooks_crawl_id, all_crawls_crawl_id]},
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["id"]
# Delete crawl
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
json={"crawl_ids": [webhooks_crawl_id]},
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["deleted"]
# Delete collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{webhooks_coll_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
# Wait to ensure async notifications are all sent
time.sleep(30)
# Send GET request to echo server to retrieve and verify POSTed data
r = requests.get(ECHO_SERVER_URL)
assert r.status_code == 200
data = r.json()
crawl_started_count = 0
crawl_finished_count = 0
crawl_deleted_count = 0
qa_analysis_started_count = 0
qa_analysis_finished_count = 0
crawl_reviewed_count = 0
upload_finished_count = 0
upload_deleted_count = 0
added_to_collection_count = 0
removed_from_collection_count = 0
collection_deleted_count = 0
for post in data["post_bodies"]:
assert post["orgId"]
event = post["event"]
assert event
if event == "crawlStarted":
crawl_started_count += 1
assert post["itemId"]
assert post["scheduled"] in (True, False)
assert post.get("resources") is None
elif event == "crawlFinished":
crawl_finished_count += 1
assert post["itemId"]
assert post["state"]
assert post["resources"]
elif event == "crawlDeleted":
crawl_deleted_count += 1
assert post["itemId"]
elif event == "qaAnalysisStarted":
qa_analysis_started_count += 1
assert post["itemId"] == webhooks_crawl_id
assert post["qaRunId"] == qa_run_id
elif event == "qaAnalysisFinished":
qa_analysis_finished_count += 1
assert post["itemId"] == webhooks_crawl_id
assert post["qaRunId"] == qa_run_id
assert post["resources"]
elif event == "crawlReviewed":
crawl_reviewed_count += 1
assert post["itemId"] == webhooks_crawl_id
elif event == "uploadFinished":
upload_finished_count += 1
assert post["itemId"]
assert post["state"]
assert post["resources"]
assert post.get("downloadUrls") is None
elif event == "uploadDeleted":
upload_deleted_count += 1
assert post["itemId"]
elif event == "addedToCollection":
added_to_collection_count += 1
assert post["downloadUrl"]
assert post.get("resources") is None
assert post["itemIds"]
assert post["collectionId"]
elif event == "removedFromCollection":
removed_from_collection_count += 1
assert post["downloadUrl"]
assert post.get("resources") is None
assert post["itemIds"]
assert post["collectionId"]
elif event == "collectionDeleted":
collection_deleted_count += 1
assert post["collectionId"]
# Allow for some variability here due to timing of crawls
assert crawl_started_count >= 1
assert crawl_finished_count >= 1
assert crawl_deleted_count == 1
assert qa_analysis_started_count == 1
assert qa_analysis_finished_count == 1
assert crawl_reviewed_count == 1
assert upload_finished_count == 1
assert upload_deleted_count == 1
assert added_to_collection_count >= 2
assert removed_from_collection_count == 2
assert collection_deleted_count == 1
# Check that we've had expected number of successful webhook notifications
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks?success=True",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["total"] >= 10
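# Note: these tests run in the nightly suite (see #2738). To exercise them ad
# hoc (a sketch; exact steps depend on the local cluster setup), make sure
# ECHO_SERVER_HOST_URL points at an address reachable from inside k8s, then
# run pytest against backend/test_nightly/test_webhooks.py.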