From 7b5d82936db01d22bf46c878d7b23b35a9de6a4f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 11 Jan 2023 13:29:35 -0800 Subject: [PATCH] backend: initial tags api support (addresses #365): (#434) * backend: initial tags api support (addresses #365): - add 'tags' field to crawlconfig (array of strings) - allow querying crawlconfigs to specify multiple 'tag' query args, eg. tag=A&tag=B - add /archives//crawlconfigs/tags api to query by distinct tag, include index on aid + tag tests: add tests for adding configs, querying by tags tests: fix fixtures to retry login if initial attempts fails, use test seed of https://webrecorder.net instead of https://example.com/ --- backend/btrixcloud/crawlconfigs.py | 34 ++++++++++-- backend/test/conftest.py | 42 ++++++++------ backend/test/test_crawl_config_tags.py | 76 ++++++++++++++++++++++++++ backend/test/test_run_crawl.py | 10 ++-- 4 files changed, 135 insertions(+), 27 deletions(-) create mode 100644 backend/test/test_crawl_config_tags.py diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index a883cb86..da95f427 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -11,7 +11,7 @@ from datetime import datetime import pymongo from pydantic import BaseModel, UUID4, conint, HttpUrl -from fastapi import APIRouter, Depends, HTTPException +from fastapi import APIRouter, Depends, HTTPException, Query from .users import User from .archives import Archive, MAX_CRAWL_SCALE @@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel): profileid: Optional[UUID4] colls: Optional[List[str]] = [] + tags: Optional[List[str]] = [] crawlTimeout: Optional[int] = 0 scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 @@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel): created: Optional[datetime] colls: Optional[List[str]] = [] + tags: Optional[List[str]] = [] crawlTimeout: Optional[int] = 0 scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 @@ -222,6 +224,10 @@ class CrawlConfigOps: [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)] ) + await self.crawl_configs.create_index( + [("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)] + ) + def set_coll_ops(self, coll_ops): """set collection ops""" self.coll_ops = coll_ops @@ -359,12 +365,19 @@ class CrawlConfigOps: return {"success": True} - async def get_crawl_configs(self, archive: Archive): + async def get_crawl_configs( + self, archive: Archive, tags: Optional[List[str]] = None + ): """Get all crawl configs for an archive is a member of""" + match_query = {"aid": archive.id, "inactive": {"$ne": True}} + + if tags: + match_query["tags"] = {"$all": tags} + # pylint: disable=duplicate-code cursor = self.crawl_configs.aggregate( [ - {"$match": {"aid": archive.id, "inactive": {"$ne": True}}}, + {"$match": match_query}, { "$lookup": { "from": "users", @@ -564,6 +577,10 @@ class CrawlConfigOps: return result.inserted_id + async def get_crawl_config_tags(self, archive): + """get distinct tags from all crawl configs for this archive""" + return await self.crawl_configs.distinct("tags", {"aid": archive.id}) + # ============================================================================ # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments @@ -580,8 +597,15 @@ def init_crawl_config_api( archive_crawl_dep = archive_ops.archive_crawl_dep @router.get("", response_model=CrawlConfigsResponse) - async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)): - return await ops.get_crawl_configs(archive) + 
async def get_crawl_configs( + archive: Archive = Depends(archive_crawl_dep), + tag: Union[List[str], None] = Query(default=None), + ): + return await ops.get_crawl_configs(archive, tag) + + @router.get("/tags") + async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)): + return await ops.get_crawl_config_tags(archive) @router.get("/{cid}", response_model=CrawlConfigOut) async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)): diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 21d868ad..ed03e607 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -3,7 +3,8 @@ import requests import time -API_PREFIX = "http://127.0.0.1:30870/api" +HOST_PREFIX = "http://127.0.0.1:30870" +API_PREFIX = HOST_PREFIX + "/api" ADMIN_USERNAME = "admin@example.com" ADMIN_PW = "PASSW0RD!" @@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!" @pytest.fixture(scope="session") def admin_auth_headers(): - r = requests.post( - f"{API_PREFIX}/auth/jwt/login", - data={ - "username": ADMIN_USERNAME, - "password": ADMIN_PW, - "grant_type": "password", - }, - ) - data = r.json() - access_token = data.get("access_token") - return {"Authorization": f"Bearer {access_token}"} + while True: + r = requests.post( + f"{API_PREFIX}/auth/jwt/login", + data={ + "username": ADMIN_USERNAME, + "password": ADMIN_PW, + "grant_type": "password", + }, + ) + data = r.json() + try: + return {"Authorization": f"Bearer {data['access_token']}"} + except: + print("Waiting for admin_auth_headers") + time.sleep(5) @pytest.fixture(scope="session") def admin_aid(admin_auth_headers): - r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers) - data = r.json() - return data["archives"][0]["id"] + while True: + r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers) + data = r.json() + try: + return data["archives"][0]["id"] + except: + print("Waiting for admin_aid") + time.sleep(5) @pytest.fixture(scope="session") @@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid): crawl_data = { "runNow": True, "name": "Admin Test Crawl", - "config": {"seeds": ["https://example.com/"]}, + "config": {"seeds": ["https://webrecorder.net/"], "limit": 1}, } r = requests.post( f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", diff --git a/backend/test/test_crawl_config_tags.py b/backend/test/test_crawl_config_tags.py new file mode 100644 index 00000000..71f2d07f --- /dev/null +++ b/backend/test/test_crawl_config_tags.py @@ -0,0 +1,76 @@ +import requests + +from .conftest import API_PREFIX + +new_cid_1 = None +new_cid_2 = None + +def get_sample_crawl_data(tags): + return { + "runNow": False, + "name": "Test Crawl", + "config": {"seeds": ["https://example.com/"]}, + "tags": tags, + } + +def test_create_new_config_1(admin_auth_headers, admin_aid): + r = requests.post( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", + headers=admin_auth_headers, + json=get_sample_crawl_data(["tag-1", "tag-2"]) + ) + + assert r.status_code == 200 + + data = r.json() + assert data["added"] + assert data["run_now_job"] == None + + global new_cid_1 + new_cid_1 = data["added"] + +def test_get_config_1(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}", + headers=admin_auth_headers, + ) + assert r.json()["tags"] == ["tag-1", "tag-2"] + +def test_get_config_by_tag_1(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags", + headers=admin_auth_headers, 
+ ) + assert r.json() == ["tag-1", "tag-2"] + +def test_create_new_config_2(admin_auth_headers, admin_aid): + r = requests.post( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", + headers=admin_auth_headers, + json=get_sample_crawl_data(["tag-3", "tag-0"]) + ) + + assert r.status_code == 200 + + data = r.json() + assert data["added"] + assert data["run_now_job"] == None + + global new_cid_2 + new_cid_2 = data["added"] + +def test_get_config_by_tag_2(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags", + headers=admin_auth_headers, + ) + assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"] + +def test_get_config_2(admin_auth_headers, admin_aid): + r = requests.get( + f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}", + headers=admin_auth_headers, + ) + assert r.json()["tags"] == ["tag-3", "tag-0"] + + diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 7418b44a..82c0d28b 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -4,9 +4,7 @@ import time import io import zipfile -from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW - -host_prefix = "http://127.0.0.1:30870" +from .conftest import API_PREFIX, HOST_PREFIX wacz_path = None wacz_size = None @@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid): crawl_data = { "runNow": True, "name": "Test Crawl", - "config": {"seeds": ["https://example.com/"]}, + "config": {"seeds": ["https://webrecorder.net/"]}, } r = requests.post( f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/", @@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id): def test_download_wacz(): - r = requests.get(host_prefix + wacz_path) + r = requests.get(HOST_PREFIX + wacz_path) assert r.status_code == 200 assert len(r.content) == wacz_size @@ -110,4 +108,4 @@ def test_verify_wacz(): assert "pages/pages.jsonl" in z.namelist() pages = z.open("pages/pages.jsonl").read().decode("utf-8") - assert '"https://example.com/"' in pages + assert '"https://webrecorder.net/"' in pages
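
Note (illustrative, not part of the patch): a minimal sketch of how a client could exercise the two endpoints added above, written in the same requests-based style as the tests. The base URL matches conftest.py; the bearer token and archive id placeholders are assumptions and would normally come from /auth/jwt/login and GET /api/archives.

    # Illustrative usage of the new tag endpoints -- not part of the commit.
    import requests

    API_PREFIX = "http://127.0.0.1:30870/api"          # same host as conftest.py
    auth_headers = {"Authorization": "Bearer <access_token>"}  # placeholder token
    aid = "<archive-id>"                                # placeholder archive id

    # List crawl configs that carry BOTH tag-1 and tag-2: repeated 'tag' query
    # args map to the $all match added in get_crawl_configs().
    r = requests.get(
        f"{API_PREFIX}/archives/{aid}/crawlconfigs/",
        headers=auth_headers,
        params=[("tag", "tag-1"), ("tag", "tag-2")],
    )
    print(r.json())

    # List the distinct tags defined across all crawl configs in this archive.
    r = requests.get(
        f"{API_PREFIX}/archives/{aid}/crawlconfigs/tags",
        headers=auth_headers,
    )
    print(r.json())  # e.g. ["tag-0", "tag-1", "tag-2", "tag-3"]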