browsertrix/backend/test/test_workflow_auto_add_to_collection.py
Tessa Walsh 9c7a312a4c
Rework collections to track collections in Crawl (#878)
* Track collections in Crawl rather than crawls in Collection
* Add delete collection API endpoint and tests
* Precompute collection crawlCount, pageCount, and tags and add them to
GET collection responses
* Add modified field to Collection
* Update collection replay.json method
* Make add and remove crawls accept list of crawl ids
* Auto-add new workflow crawls to collections when they successfully
complete via CrawlConfig.autoAddCollections field
* Move long-running post-crawl operator tasks into asyncio task
* Make CrawlConfig.autoAddCollections updatable via /update API endpoint
2023-05-25 15:41:50 -04:00

55 lines
1.5 KiB
Python

import requests
import time
from .conftest import API_PREFIX
def test_workflow_crawl_auto_added_to_collection(
    crawler_auth_headers,
    default_org_id,
    auto_add_collection_id,
    auto_add_crawl_id,
):
    """Check that the crawl's replay.json lists the auto-add collection.

    All arguments are injected by name (presumably pytest fixtures; their
    definitions are outside this file).
    """
    replay_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{auto_add_crawl_id}/replay.json"
    )
    response = requests.get(replay_url, headers=crawler_auth_headers)

    # The request itself must succeed before the payload is inspected.
    assert response.status_code == 200

    # The crawl reports the collections it belongs to; ours must be among them.
    crawl_collections = response.json()["collections"]
    assert auto_add_collection_id in crawl_collections
def test_workflow_crawl_auto_added_subsequent_runs(
    crawler_auth_headers,
    default_org_id,
    auto_add_collection_id,
    auto_add_crawl_id,
    auto_add_config_id,
):
    """Run the workflow again and check the new crawl is also in the collection.

    Starts a new run of the crawl config, polls the new crawl's replay.json
    until its state is "complete", then asserts that the auto-add collection
    id appears in the crawl's "collections" list.

    All arguments are injected by name (presumably pytest fixtures; their
    definitions are outside this file). ``auto_add_crawl_id`` is not read in
    the body; it is kept in the signature so whatever setup it triggers still
    runs before this test.

    Raises:
        AssertionError: if any request returns a non-200 status, if no crawl
            is started, if the crawl does not reach "complete" before the
            polling deadline, or if the collection id is missing.
    """
    # Upper bound on how long to wait for the crawl; without one a failed or
    # stalled crawl would hang the whole test run. The value is a generous
    # guess -- tune it to the slowest crawl expected in CI.
    max_wait_seconds = 600
    poll_interval_seconds = 5

    # Run workflow again and make sure new crawl is also in collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{auto_add_config_id}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("started")
    crawl_id = data["started"]

    replay_url = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json"

    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + max_wait_seconds
    while True:
        r = requests.get(replay_url, headers=crawler_auth_headers)
        # Fail with a readable message instead of a KeyError on "state" below.
        assert r.status_code == 200, (
            f"polling crawl {crawl_id} returned HTTP {r.status_code}"
        )
        data = r.json()
        if data["state"] == "complete":
            break
        if time.monotonic() >= deadline:
            raise AssertionError(
                f"crawl {crawl_id} did not complete within "
                f"{max_wait_seconds}s (last state: {data['state']!r})"
            )
        time.sleep(poll_interval_seconds)

    # Fetch once more after completion rather than reusing the polled payload,
    # in case collection membership is recorded just after the state changes.
    r = requests.get(replay_url, headers=crawler_auth_headers)
    assert r.status_code == 200
    assert auto_add_collection_id in r.json()["collections"]