Run webhook tests nightly (#2738)

Fixes #2737 

- Moves webhook-related tests to run nightly, to speed up CI runs and
avoid the periodic failures we've been getting lately.
- Also ensures that every try/except block with a time.sleep in the 'try' branch also has a time.sleep in the 'except' branch, to avoid fast-looping retries (sketched below).
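
A minimal sketch of that retry pattern (the `check_ready` callable, attempt limit, and interval here are illustrative, not part of this change): sleeping in the `except` branch as well as the `try` branch means a call that keeps raising backs off between attempts instead of spinning.

```python
import time


def wait_until(check_ready, max_attempts=24, interval=5):
    """Poll check_ready() until it returns True or attempts run out.

    Sleeps in both the try and except paths so that a call that keeps
    raising (e.g. a connection error while a service restarts) does not
    turn into a fast retry loop.
    """
    attempts = 0
    while attempts < max_attempts:
        try:
            if check_ready():
                return True
            time.sleep(interval)
        except Exception:
            # Mirror the sleep from the try branch to avoid fast-looping retries
            time.sleep(interval)
        attempts += 1
    return False
```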

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2025-07-15 21:05:57 -04:00 committed by GitHub
parent 4e0e9c87c2
commit d91a3bc088
9 changed files with 202 additions and 92 deletions


@@ -7,6 +7,9 @@ on:
workflow_dispatch:
env:
ECHO_SERVER_HOST_URL: http://host.k3d.internal:18080
jobs:
collect-test-modules:
runs-on: ubuntu-latest


@@ -1,7 +1,6 @@
import os
import pytest
import requests
import socket
import subprocess
import time
from typing import Dict
@@ -691,7 +690,7 @@ def prepare_browser_for_profile_commit(
break
time.sleep(5)
except:
pass
time.sleep(5)
attempts += 1


@@ -485,87 +485,6 @@ def test_delete_invite_by_email(admin_auth_headers, non_default_org_id):
assert data["detail"] == "invite_not_found"
def test_update_event_webhook_urls_org_admin(admin_auth_headers, default_org_id):
# Verify no URLs are configured
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
if data.get("webhooks"):
webhooks = data.get("webhooks")
assert webhooks.get("crawlStarted") is None
assert webhooks.get("crawlFinished") is None
assert webhooks.get("crawlDeleted") is None
assert webhooks.get("uploadFinished") is None
assert webhooks.get("uploadDeleted") is None
assert webhooks.get("addedToCollection") is None
assert webhooks.get("removedFromCollection") is None
assert webhooks.get("collectionDeleted") is None
# Set URLs and verify
CRAWL_STARTED_URL = "https://example.com/crawl/started"
CRAWL_FINISHED_URL = "https://example.com/crawl/finished"
CRAWL_DELETED_URL = "https://example.com/crawl/deleted"
UPLOAD_FINISHED_URL = "https://example.com/upload/finished"
UPLOAD_DELETED_URL = "https://example.com/upload/deleted"
COLL_ADDED_URL = "https://example.com/coll/added"
COLL_REMOVED_URL = "http://example.com/coll/removed"
COLL_DELETED_URL = "http://example.com/coll/deleted"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=admin_auth_headers,
json={
"crawlStarted": CRAWL_STARTED_URL,
"crawlFinished": CRAWL_FINISHED_URL,
"crawlDeleted": CRAWL_DELETED_URL,
"uploadFinished": UPLOAD_FINISHED_URL,
"uploadDeleted": UPLOAD_DELETED_URL,
"addedToCollection": COLL_ADDED_URL,
"removedFromCollection": COLL_REMOVED_URL,
"collectionDeleted": COLL_DELETED_URL,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
urls = data["webhookUrls"]
assert urls["crawlStarted"] == CRAWL_STARTED_URL
assert urls["crawlFinished"] == CRAWL_FINISHED_URL
assert urls["crawlDeleted"] == CRAWL_DELETED_URL
assert urls["uploadFinished"] == UPLOAD_FINISHED_URL
assert urls["uploadDeleted"] == UPLOAD_DELETED_URL
assert urls["addedToCollection"] == COLL_ADDED_URL
assert urls["removedFromCollection"] == COLL_REMOVED_URL
assert urls["collectionDeleted"] == COLL_DELETED_URL
def test_update_event_webhook_urls_org_crawler(crawler_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=crawler_auth_headers,
json={
"crawlStarted": "https://example.com/crawlstarted",
"crawlFinished": "https://example.com/crawlfinished",
"uploadFinished": "https://example.com/uploadfinished",
"addedToCollection": "https://example.com/added",
"removedFromCollection": "https://example.com/removed",
},
)
assert r.status_code == 403
assert r.json()["detail"] == "User does not have permission to perform this action"
def test_org_metrics(crawler_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/metrics",


@@ -308,7 +308,7 @@ def test_uploads_collection_updated(
assert data["totalSize"] > 0
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["modified"] > data["created"]
assert data["modified"] >= data["created"]
def test_replace_upload(


@@ -1,4 +1,5 @@
import requests
import time
from .conftest import API_PREFIX
@@ -39,7 +40,7 @@ def test_recalculate_org_storage(admin_auth_headers, default_org_id):
time.sleep(10)
except:
pass
time.sleep(10)
attempts += 1
@@ -112,7 +113,7 @@ def test_delete_org_superadmin(admin_auth_headers, default_org_id):
time.sleep(10)
except:
pass
time.sleep(10)
attempts += 1



@@ -0,0 +1,39 @@
#!/usr/bin/env python3
"""
A web server to record POST requests and return them on a GET request
"""
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
BIND_HOST = "0.0.0.0"
PORT = 18080
post_bodies = []
class EchoServerHTTPRequestHandler(BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200)
self.end_headers()
self.wfile.write(json.dumps({"post_bodies": post_bodies}).encode("utf-8"))
def do_POST(self):
content_length = int(self.headers.get("content-length", 0))
body = self.rfile.read(content_length)
self.send_response(200)
if self.path.endswith("/portalUrl"):
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(
json.dumps({"portalUrl": "https://portal.example.com/path/"}).encode(
"utf-8"
)
)
else:
self.end_headers()
post_bodies.append(json.loads(body.decode("utf-8").replace("'", '"')))
httpd = HTTPServer((BIND_HOST, PORT), EchoServerHTTPRequestHandler)
httpd.serve_forever()
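
As a usage sketch (assuming the server above is reachable on localhost:18080; the URL and the "post_bodies" field follow the handler code shown), a test can read back every recorded webhook notification with a single GET:

```python
import requests

# The echo server returns all recorded POST bodies as JSON on any GET request
r = requests.get("http://localhost:18080/")
post_bodies = r.json()["post_bodies"]
print(f"Recorded {len(post_bodies)} webhook notifications")
```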


@@ -168,7 +168,9 @@ def test_delete_org_crawl_running(
time.sleep(10)
except:
pass
time.sleep(10)
attempts += 1
@@ -214,7 +216,7 @@ def test_delete_org_qa_running(
time.sleep(10)
except:
pass
time.sleep(10)
attempts += 1
@@ -260,7 +262,7 @@ def test_delete_org_profile_running(
time.sleep(10)
except:
pass
time.sleep(10)
attempts += 1


@@ -1,7 +1,9 @@
import json
import os
import subprocess
import time
import pytest
import requests
from .conftest import API_PREFIX
@@ -20,8 +22,150 @@ ECHO_SERVER_URL_FROM_K8S = os.environ.get(
"ECHO_SERVER_HOST_URL", "http://host.docker.internal:18080"
)
FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]
def test_list_webhook_events(admin_auth_headers, default_org_id):
SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]
FINISHED_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]
@pytest.fixture(scope="function")
def echo_server():
print(f"Echo server starting", flush=True)
p = subprocess.Popen(["python3", os.path.join(curr_dir, "echo_server.py")])
print(f"Echo server started", flush=True)
time.sleep(1)
yield p
time.sleep(10)
print(f"Echo server terminating", flush=True)
p.terminate()
print(f"Echo server terminated", flush=True)
@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "All Crawls Test Crawl",
"description": "Lorem ipsum",
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"exclude": "community",
"limit": 3,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] in FINISHED_STATES:
break
time.sleep(5)
# Add description to crawl
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
headers=crawler_auth_headers,
json={"description": "Lorem ipsum"},
)
assert r.status_code == 200
return crawl_id
def test_update_event_webhook_urls_org_admin(admin_auth_headers, default_org_id):
# Verify no URLs are configured
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
if data.get("webhooks"):
webhooks = data.get("webhooks")
assert webhooks.get("crawlStarted") is None
assert webhooks.get("crawlFinished") is None
assert webhooks.get("crawlDeleted") is None
assert webhooks.get("uploadFinished") is None
assert webhooks.get("uploadDeleted") is None
assert webhooks.get("addedToCollection") is None
assert webhooks.get("removedFromCollection") is None
assert webhooks.get("collectionDeleted") is None
# Set URLs and verify
CRAWL_STARTED_URL = "https://example.com/crawl/started"
CRAWL_FINISHED_URL = "https://example.com/crawl/finished"
CRAWL_DELETED_URL = "https://example.com/crawl/deleted"
UPLOAD_FINISHED_URL = "https://example.com/upload/finished"
UPLOAD_DELETED_URL = "https://example.com/upload/deleted"
COLL_ADDED_URL = "https://example.com/coll/added"
COLL_REMOVED_URL = "http://example.com/coll/removed"
COLL_DELETED_URL = "http://example.com/coll/deleted"
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=admin_auth_headers,
json={
"crawlStarted": CRAWL_STARTED_URL,
"crawlFinished": CRAWL_FINISHED_URL,
"crawlDeleted": CRAWL_DELETED_URL,
"uploadFinished": UPLOAD_FINISHED_URL,
"uploadDeleted": UPLOAD_DELETED_URL,
"addedToCollection": COLL_ADDED_URL,
"removedFromCollection": COLL_REMOVED_URL,
"collectionDeleted": COLL_DELETED_URL,
},
)
assert r.status_code == 200
assert r.json()["updated"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
urls = data["webhookUrls"]
assert urls["crawlStarted"] == CRAWL_STARTED_URL
assert urls["crawlFinished"] == CRAWL_FINISHED_URL
assert urls["crawlDeleted"] == CRAWL_DELETED_URL
assert urls["uploadFinished"] == UPLOAD_FINISHED_URL
assert urls["uploadDeleted"] == UPLOAD_DELETED_URL
assert urls["addedToCollection"] == COLL_ADDED_URL
assert urls["removedFromCollection"] == COLL_REMOVED_URL
assert urls["collectionDeleted"] == COLL_DELETED_URL
def test_update_event_webhook_urls_org_crawler(crawler_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/event-webhook-urls",
headers=crawler_auth_headers,
json={
"crawlStarted": "https://example.com/crawlstarted",
"crawlFinished": "https://example.com/crawlfinished",
"uploadFinished": "https://example.com/uploadfinished",
"addedToCollection": "https://example.com/added",
"removedFromCollection": "https://example.com/removed",
},
)
assert r.status_code == 403
assert r.json()["detail"] == "User does not have permission to perform this action"
def test_list_webhook_events(admin_auth_headers, default_org_id, crawl_id_wr):
# Verify that webhook URLs have been set in previous tests
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}",
@@ -40,6 +184,8 @@ def test_list_webhook_events(admin_auth_headers, default_org_id):
assert urls["collectionDeleted"]
# Verify list endpoint works as expected
# At this point we expect webhook attempts to fail since they're not
# configured against a valid endpoint
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks",
headers=admin_auth_headers,
@@ -62,7 +208,7 @@ def test_list_webhook_events(admin_auth_headers, default_org_id):
assert _webhook_event_id
def test_get_webhook_event(admin_auth_headers, default_org_id):
def test_get_webhook_event(admin_auth_headers, default_org_id, crawl_id_wr):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}",
headers=admin_auth_headers,
@@ -99,7 +245,7 @@ def test_get_webhook_event(admin_auth_headers, default_org_id):
assert len(body["itemIds"]) >= 1
def test_retry_webhook_event(admin_auth_headers, default_org_id):
def test_retry_webhook_event(admin_auth_headers, default_org_id, crawl_id_wr):
# Expect to fail because we haven't set up URLs that accept webhooks
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
@@ -175,6 +321,7 @@ def test_webhooks_sent(
"autoAddCollections": [webhooks_coll_id],
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"limit": 2,
},
}
r = requests.post(