Fix nightly tests: modify kubectl exec syntax for creating new minio bucket (#2097)
Fixes #2096. For an example failing test run, see: https://github.com/webrecorder/browsertrix/actions/runs/11121185534/job/30899729448

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
parent 1b1819ba5a
commit f7426cc46a

.github/workflows/k3d-nightly-ci.yaml
@@ -79,7 +79,7 @@ jobs:
         run: kubectl wait --for=condition=ready pod --all --timeout=240s

       - name: Create Extra Test Buckets
-        run: kubectl exec -i deployment/local-minio -c minio mkdir /data/replica-0
+        run: kubectl exec -i deployment/local-minio -c minio -- mkdir /data/replica-0

       - name: Run Tests
         run: pytest -vv ./backend/test_nightly/test_*.py
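For context: `kubectl exec` treats everything after `--` as the command to run inside the container. Without the separator, kubectl has to guess where its own arguments end and the container command begins (a long-deprecated form), which is presumably why the bucket-creation step failed. A minimal, illustrative sketch of the corrected invocation as an explicit argument vector (not part of the PR; assumes `kubectl` is on PATH and the `local-minio` deployment exists):

```python
# Illustrative only: the corrected command as an argument vector. Everything
# after "--" is handed to the container verbatim, so kubectl cannot mistake
# "mkdir" or "/data/replica-0" for its own arguments.
import subprocess

subprocess.run(
    [
        "kubectl", "exec", "-i", "deployment/local-minio", "-c", "minio",
        "--", "mkdir", "/data/replica-0",
    ],
    check=True,  # raise if the bucket directory cannot be created
)
```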
@@ -592,8 +592,9 @@ class CrawlConfigOps:
         update_query: dict[str, object] = {}

         running_crawl = await self.get_running_crawl(cid)
-        # only look up last finished crawl if no crawls running, otherwise
-        # lastCrawl* stats are already for running crawl
+
+        # If crawl is running, lastCrawl* stats are already for running crawl,
+        # so there's nothing to update other than size and crawl count
         if not running_crawl:
             match_query = {
                 "cid": cid,
@@ -603,26 +604,36 @@ class CrawlConfigOps:
             last_crawl = await self.crawls.find_one(
                 match_query, sort=[("finished", pymongo.DESCENDING)]
             )
         else:
             last_crawl = None

-        if last_crawl:
-            last_crawl_finished = last_crawl.get("finished")
+        # Update to reflect last crawl
+        if last_crawl:
+            last_crawl_finished = last_crawl.get("finished")

-            update_query["lastCrawlId"] = str(last_crawl.get("_id"))
-            update_query["lastCrawlStartTime"] = last_crawl.get("started")
-            update_query["lastStartedBy"] = last_crawl.get("userid")
-            update_query["lastStartedByName"] = last_crawl.get("userName")
-            update_query["lastCrawlTime"] = last_crawl_finished
-            update_query["lastCrawlState"] = last_crawl.get("state")
-            update_query["lastCrawlSize"] = sum(
-                file_.get("size", 0) for file_ in last_crawl.get("files", [])
-            )
-            update_query["lastCrawlStopping"] = False
-            update_query["isCrawlRunning"] = False
+            update_query["lastCrawlId"] = str(last_crawl.get("_id"))
+            update_query["lastCrawlStartTime"] = last_crawl.get("started")
+            update_query["lastStartedBy"] = last_crawl.get("userid")
+            update_query["lastStartedByName"] = last_crawl.get("userName")
+            update_query["lastCrawlTime"] = last_crawl_finished
+            update_query["lastCrawlState"] = last_crawl.get("state")
+            update_query["lastCrawlSize"] = sum(
+                file_.get("size", 0) for file_ in last_crawl.get("files", [])
+            )
+            update_query["lastCrawlStopping"] = False
+            update_query["isCrawlRunning"] = False

-            if last_crawl_finished:
-                update_query["lastRun"] = last_crawl_finished
+            if last_crawl_finished:
+                update_query["lastRun"] = last_crawl_finished
+        # If no last crawl exists and no running crawl, reset stats
+        else:
+            update_query["lastCrawlId"] = None
+            update_query["lastCrawlStartTime"] = None
+            update_query["lastStartedBy"] = None
+            update_query["lastStartedByName"] = None
+            update_query["lastCrawlTime"] = None
+            update_query["lastCrawlState"] = None
+            update_query["lastCrawlSize"] = 0
+            update_query["lastRun"] = None
+            update_query["isCrawlRunning"] = False

         result = await self.crawl_configs.find_one_and_update(
             {"_id": cid, "inactive": {"$ne": True}},
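The practical effect of the new `else` branch: once a workflow's last remaining crawl is deleted and nothing is running, its lastCrawl* stats are cleared instead of keeping stale values. As a quick reference, the reset written into `update_query` (field names copied from the diff above; this block is illustrative, not code from the PR):

```python
# Illustrative summary of the reset performed when a workflow has no remaining
# finished crawls and no running crawl.
RESET_STATS = {
    "lastCrawlId": None,
    "lastCrawlStartTime": None,
    "lastStartedBy": None,
    "lastStartedByName": None,
    "lastCrawlTime": None,
    "lastCrawlState": None,
    "lastCrawlSize": 0,
    "lastRun": None,
    "isCrawlRunning": False,
}
```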
@@ -1,4 +1,5 @@
 import requests
+import time

 from .conftest import API_PREFIX

@@ -70,6 +71,8 @@ def test_crawlconfig_crawl_stats(admin_auth_headers, default_org_id, crawl_confi
     data = r.json()
     assert data["deleted"]

+    time.sleep(10)
+
     # Verify crawl stats from /crawlconfigs
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
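The new `time.sleep(10)` gives the backend time to finish recomputing the workflow's stats after the crawl deletion before the test re-reads them (the recompute presumably completes after the delete response returns). A hypothetical continuation of the check that follows (the actual assertions lie outside the hunk above, and the exact response fields are an assumption mirroring the `update_query` keys in the backend diff):

```python
# Hypothetical follow-up assertions (not shown in the hunk): after the sleep,
# the workflow's lastCrawl* stats should reflect the reset performed by the
# backend change above.
r = requests.get(
    f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
    headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["lastCrawlId"] is None
assert data["lastCrawlState"] is None
assert data["lastCrawlSize"] == 0
```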
@@ -8,18 +8,19 @@ from .conftest import API_PREFIX
 from .utils import get_crawl_status


-STORAGE_QUOTA_KB = 5
-STORAGE_QUOTA_BYTES = STORAGE_QUOTA_KB * 1000
+STORAGE_QUOTA_MB_TO_INCREASE = 5
+STORAGE_QUOTA_BYTES_INC = STORAGE_QUOTA_MB_TO_INCREASE * 1000 * 1000

 config_id = None

+storage_quota = None
+

 def run_crawl(org_id, headers):
     crawl_data = {
         "runNow": True,
         "name": "Storage Quota",
         "config": {
-            "seeds": [{"url": "https://webrecorder.net/"}],
+            "seeds": [{"url": "https://specs.webrecorder.net/"}],
             "extraHops": 1,
         },
     }
@@ -34,10 +35,22 @@ def run_crawl(org_id, headers):


 def test_storage_quota(org_with_quotas, admin_auth_headers):
+    # Get current storage usage
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{org_with_quotas}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    bytes_stored = r.json()["bytesStored"]
+
+    global storage_quota
+    storage_quota = bytes_stored + STORAGE_QUOTA_BYTES_INC
+
+    # Set storage quota higher than bytesStored
     r = requests.post(
         f"{API_PREFIX}/orgs/{org_with_quotas}/quotas",
         headers=admin_auth_headers,
-        json={"storageQuota": STORAGE_QUOTA_BYTES},
+        json={"storageQuota": storage_quota},
     )
     assert r.status_code == 200
     assert r.json()["updated"]
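Instead of a fixed 5 KB quota, the test now reads the org's current `bytesStored` and sets the quota 5 MB above it, so the outcome no longer depends on how much storage earlier nightly tests have already consumed. A worked example of the arithmetic, using a made-up starting value:

```python
# Worked example of the new quota calculation; bytes_stored is a made-up value
# standing in for the org's current usage as reported by the org endpoint.
STORAGE_QUOTA_MB_TO_INCREASE = 5
STORAGE_QUOTA_BYTES_INC = STORAGE_QUOTA_MB_TO_INCREASE * 1000 * 1000  # 5,000,000 bytes

bytes_stored = 2_000_000  # hypothetical current usage
storage_quota = bytes_stored + STORAGE_QUOTA_BYTES_INC

print(storage_quota)  # 7000000 -- the quota the test posts back to the quotas endpoint
```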
@@ -49,9 +62,12 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     crawl_id, config_id = run_crawl(org_with_quotas, admin_auth_headers)
     time.sleep(1)

+    assert crawl_id
+
     while get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers) in (
         "starting",
         "waiting_capacity",
+        "waiting_org_limit",
     ):
         time.sleep(2)

@@ -63,14 +79,11 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     ):
         time.sleep(2)

     # Ensure that crawl was stopped by quota
     assert (
         get_crawl_status(org_with_quotas, crawl_id, admin_auth_headers)
         == "stopped_storage_quota_reached"
     )
-
-    time.sleep(10)
-
     # Ensure crawl storage went over quota
     r = requests.get(
         f"{API_PREFIX}/orgs/{org_with_quotas}",
@@ -78,7 +91,7 @@ def test_crawl_stopped_when_storage_quota_reached(org_with_quotas, admin_auth_he
     )
     data = r.json()
     bytes_stored = data["bytesStored"]
-    assert bytes_stored >= STORAGE_QUOTA_BYTES
+    assert bytes_stored >= storage_quota

     time.sleep(5)
