From 7b5d82936db01d22bf46c878d7b23b35a9de6a4f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jan 2023 13:29:35 -0800 Subject: [PATCH] backend: initial tags api support (addresses #365): (#434) * backend: initial tags api support (addresses #365): - add 'tags' field to crawlconfig (array of strings) - allow querying crawlconfigs to specify multiple 'tag' query args, eg. tag=A&tag=B - add /archives//crawlconfigs/tags api to query by distinct tag, include index on aid + tag tests: add tests for adding configs, querying by tags tests: fix fixtures to retry login if initial attempts fails, use test seed of https://webrecorder.net instead of https://example.com/ --- backend/btrixcloud/crawlconfigs.py | 34 ++++++++++-- backend/test/conftest.py | 42 ++++++++------ backend/test/test_crawl_config_tags.py | 76 ++++++++++++++++++++++++++ backend/test/test_run_crawl.py | 10 ++-- 4 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 backend/test/test_crawl_config_tags.py diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index a883cb86..da95f427 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -11,7 +11,7 @@ from datetime import datetime import pymongo from pydantic import BaseModel, UUID4, conint, HttpUrl -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from .users import User from .archives import Archive, MAX_CRAWL_SCALE @@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel): profileid: Optional[UUID4] colls: Optional[List[str]] = [] + tags: Optional[List[str]] = [] crawlTimeout: Optional[int] = 0 scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 @@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel): created: Optional[datetime] colls: Optional[List[str]] = [] + tags: Optional[List[str]] = [] crawlTimeout: Optional[int] = 0 scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 @@ -222,6 +224,10 @@ class CrawlConfigOps: [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)] ) + await self.crawl_configs.create_index( + [("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)] + ) + def set_coll_ops(self, coll_ops): """set collection ops""" self.coll_ops = coll_ops @@ -359,12 +365,19 @@ class CrawlConfigOps: return {"success": True} - async def get_crawl_configs(self, archive: Archive): + async def get_crawl_configs( + self, archive: Archive, tags: Optional[List[str]] = None + ): """Get all crawl configs for an archive is a member of""" + match_query = {"aid": archive.id, "inactive": {"$ne": True}} + + if tags: + match_query["tags"] = {"$all": tags} + # pylint: disable=duplicate-code cursor = self.crawl_configs.aggregate( [ - {"$match": {"aid": archive.id, "inactive": {"$ne": True}}}, + {"$match": match_query}, { "$lookup": { "from": "users", @@ -564,6 +577,10 @@ class CrawlConfigOps: return result.inserted_id + async def get_crawl_config_tags(self, archive): + """get distinct tags from all crawl configs for this archive""" + return await self.crawl_configs.distinct("tags", {"aid": archive.id}) + # ============================================================================ # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments @@ -580,8 +597,15 @@ def init_crawl_config_api( archive_crawl_dep = archive_ops.archive_crawl_dep @router.get("", response_model=CrawlConfigsResponse) - async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)): - return await ops.get_crawl_configs(archive) + 
async def get_crawl_configs( + archive: Archive = Depends(archive_crawl_dep), + tag: Union[List[str], None] = Query(default=None), + ): + return await ops.get_crawl_configs(archive, tag) + + @router.get("/tags") + async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)): + return await ops.get_crawl_config_tags(archive) @router.get("/{cid}", response_model=CrawlConfigOut) async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)): diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 21d868ad..ed03e607 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -3,7 +3,8 @@ import requests import time -API_PREFIX = "http://127.0.0.1:30870/api" +HOST_PREFIX = "http://127.0.0.1:30870" +API_PREFIX = HOST_PREFIX + "/api" ADMIN_USERNAME = "admin@example.com" ADMIN_PW = "PASSW0RD!" @@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!" @pytest.fixture(scope="session") def admin_auth_headers(): - r = requests.post( - f"{API_PREFIX}/auth/jwt/login", - data={ - "username": ADMIN_USERNAME, - "password": ADMIN_PW, - "grant_type": "password", - }, - ) - data = r.json() - access_token = data.get("access_token") - return {"Authorization": f"Bearer {access_token}"} + while True: + r = requests.post( + f"{API_PREFIX}/auth/jwt/login", + data={ + "username": ADMIN_USERNAME, + "password": ADMIN_PW, + "grant_type": "password", + }, + ) + data = r.json() + try: + return {"Authorization": f"Bearer {data['access_token']}"} + except: + print("Waiting for admin_auth_headers") + time.sleep(5) @pytest.fixture(scope="session") def admin_aid(admin_auth_headers): - r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers) - data = r.json() - return data["archives"][0]["id"] + while True: + r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers) + data = r.json() + try: + return data["archives"][0]["id"] + except: + print("Waiting for admin_aid") + time.sleep(5) @pytest.fixture(scope="session") @@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid): crawl_data = { "runNow": True, "name": "Admin Test Crawl", - "config": {"seeds": ["https://example.com/"]}, + "config": {"seeds": ["https://webrecorder.net/"], "limit": 1}, } r = requests.post( f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", diff --git a/backend/test/test_crawl_config_tags.py b/backend/test/test_crawl_config_tags.py new file mode 100644 index 00000000..71f2d07f --- /dev/null +++ b/backend/test/test_crawl_config_tags.py @@ -0,0 +1,76 @@ +import requests + +from .conftest import API_PREFIX + +new_cid_1 = None +new_cid_2 = None + +def get_sample_crawl_data(tags): + return { + "runNow": False, + "name": "Test Crawl", + "config": {"seeds": ["https://example.com/"]}, + "tags": tags, + } + +def test_create_new_config_1(admin_auth_headers, admin_aid): + r = requests.post( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", + headers=admin_auth_headers, + json=get_sample_crawl_data(["tag-1", "tag-2"]) + ) + + assert r.status_code == 200 + + data = r.json() + assert data["added"] + assert data["run_now_job"] == None + + global new_cid_1 + new_cid_1 = data["added"] + +def test_get_config_1(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}", + headers=admin_auth_headers, + ) + assert r.json()["tags"] == ["tag-1", "tag-2"] + +def test_get_config_by_tag_1(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags", + headers=admin_auth_headers, 
+ ) + assert r.json() == ["tag-1", "tag-2"] + +def test_create_new_config_2(admin_auth_headers, admin_aid): + r = requests.post( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", + headers=admin_auth_headers, + json=get_sample_crawl_data(["tag-3", "tag-0"]) + ) + + assert r.status_code == 200 + + data = r.json() + assert data["added"] + assert data["run_now_job"] == None + + global new_cid_2 + new_cid_2 = data["added"] + +def test_get_config_by_tag_2(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags", + headers=admin_auth_headers, + ) + assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"] + +def test_get_config_2(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}", + headers=admin_auth_headers, + ) + assert r.json()["tags"] == ["tag-3", "tag-0"] + + diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 7418b44a..82c0d28b 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -4,9 +4,7 @@ import time import io import zipfile -from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW - -host_prefix = "http://127.0.0.1:30870" +from .conftest import API_PREFIX, HOST_PREFIX wacz_path = None wacz_size = None @@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid): crawl_data = { "runNow": True, "name": "Test Crawl", - "config": {"seeds": ["https://example.com/"]}, + "config": {"seeds": ["https://webrecorder.net/"]}, } r = requests.post( f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", @@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id): def test_download_wacz(): - r = requests.get(host_prefix + wacz_path) + r = requests.get(HOST_PREFIX + wacz_path) assert r.status_code == 200 assert len(r.content) == wacz_size @@ -110,4 +108,4 @@ def test_verify_wacz(): assert "pages/pages.jsonl" in z.namelist() pages = z.open("pages/pages.jsonl").read().decode("utf-8") - assert '"https://example.com/"' in pages + assert '"https://webrecorder.net/"' in pages
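
Note (illustrative, not part of the patch): a minimal sketch of how a client could exercise the two endpoints added above, written in the same requests-based style as the tests. The base URL matches conftest.py; the bearer token and archive id placeholders are assumptions and would normally come from /auth/jwt/login and GET /api/archives.

    # Illustrative usage of the new tag endpoints -- not part of the commit.
    import requests

    API_PREFIX = "http://127.0.0.1:30870/api"          # same host as conftest.py
    auth_headers = {"Authorization": "Bearer <access_token>"}  # placeholder token
    aid = "<archive-id>"                                # placeholder archive id

    # List crawl configs that carry BOTH tag-1 and tag-2: repeated 'tag' query
    # args map to the $all match added in get_crawl_configs().
    r = requests.get(
        f"{API_PREFIX}/archives/{aid}/crawlconfigs/",
        headers=auth_headers,
        params=[("tag", "tag-1"), ("tag", "tag-2")],
    )
    print(r.json())

    # List the distinct tags defined across all crawl configs in this archive.
    r = requests.get(
        f"{API_PREFIX}/archives/{aid}/crawlconfigs/tags",
        headers=auth_headers,
    )
    print(r.json())  # e.g. ["tag-0", "tag-1", "tag-2", "tag-3"]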