backend: initial tags api support (addresses #365): (#434)

* backend: initial tags api support (addresses #365):
- add 'tags' field to crawlconfig (array of strings)
- allow querying crawlconfigs with multiple 'tag' query args, e.g. tag=A&tag=B
- add /archives/<aid>/crawlconfigs/tags api to list distinct tags, with an index on aid + tags
tests: add tests for adding configs and querying by tags
tests: fix fixtures to retry login if the initial attempt fails; use test seed https://webrecorder.net/ instead of https://example.com/
Ilya Kreymer 2023-01-11 13:29:35 -08:00 committed by GitHub
parent edfb1bd513
commit 7b5d82936d
4 changed files with 135 additions and 27 deletions
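As a usage sketch for the API added in this commit (not part of the diff below): filtering configs by repeated 'tag' query args and listing an archive's distinct tags from a client might look like the following. The host and API prefix come from the test conftest; the archive id and access token are placeholders.

import requests

API_PREFIX = "http://127.0.0.1:30870/api"  # test host used in conftest.py
AID = "<archive-id>"                       # placeholder archive id
HEADERS = {"Authorization": "Bearer <access_token>"}  # placeholder token

# configs that carry BOTH tag-1 and tag-2 (?tag=tag-1&tag=tag-2)
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/crawlconfigs",
    params={"tag": ["tag-1", "tag-2"]},
    headers=HEADERS,
)
print(r.json())

# all distinct tags across this archive's crawl configs
r = requests.get(f"{API_PREFIX}/archives/{AID}/crawlconfigs/tags", headers=HEADERS)
print(r.json())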

View File

@@ -11,7 +11,7 @@ from datetime import datetime
 import pymongo
 from pydantic import BaseModel, UUID4, conint, HttpUrl
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, Query
 
 from .users import User
 from .archives import Archive, MAX_CRAWL_SCALE
@@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel):
     profileid: Optional[UUID4]
 
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []
 
     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel):
     created: Optional[datetime]
 
     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []
 
     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -222,6 +224,10 @@ class CrawlConfigOps:
             [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
         )
 
+        await self.crawl_configs.create_index(
+            [("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
+        )
+
     def set_coll_ops(self, coll_ops):
         """set collection ops"""
         self.coll_ops = coll_ops
@@ -359,12 +365,19 @@ class CrawlConfigOps:
         return {"success": True}
 
-    async def get_crawl_configs(self, archive: Archive):
+    async def get_crawl_configs(
+        self, archive: Archive, tags: Optional[List[str]] = None
+    ):
         """Get all crawl configs for an archive is a member of"""
+        match_query = {"aid": archive.id, "inactive": {"$ne": True}}
+
+        if tags:
+            match_query["tags"] = {"$all": tags}
+
         # pylint: disable=duplicate-code
         cursor = self.crawl_configs.aggregate(
             [
-                {"$match": {"aid": archive.id, "inactive": {"$ne": True}}},
+                {"$match": match_query},
                 {
                     "$lookup": {
                         "from": "users",
@@ -564,6 +577,10 @@ class CrawlConfigOps:
         return result.inserted_id
 
+    async def get_crawl_config_tags(self, archive):
+        """get distinct tags from all crawl configs for this archive"""
+        return await self.crawl_configs.distinct("tags", {"aid": archive.id})
+
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
@@ -580,8 +597,15 @@ def init_crawl_config_api(
     archive_crawl_dep = archive_ops.archive_crawl_dep
 
     @router.get("", response_model=CrawlConfigsResponse)
-    async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.get_crawl_configs(archive)
+    async def get_crawl_configs(
+        archive: Archive = Depends(archive_crawl_dep),
+        tag: Union[List[str], None] = Query(default=None),
+    ):
+        return await ops.get_crawl_configs(archive, tag)
+
+    @router.get("/tags")
+    async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl_config_tags(archive)
 
     @router.get("/{cid}", response_model=CrawlConfigOut)
     async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)):
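The Union[List[str], None] = Query(default=None) declaration is what makes FastAPI collect a repeated query parameter into a list (a bare List parameter without Query() would instead be read from the request body). A minimal standalone sketch of the same pattern, using a hypothetical app rather than this repo's router:

from typing import List, Union

from fastapi import FastAPI, Query

app = FastAPI()


@app.get("/items")
async def list_items(tag: Union[List[str], None] = Query(default=None)):
    # GET /items?tag=a&tag=b -> tag == ["a", "b"]; GET /items -> tag is None
    return {"tag": tag}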

View File

@@ -3,7 +3,8 @@ import requests
 import time
 
-API_PREFIX = "http://127.0.0.1:30870/api"
+HOST_PREFIX = "http://127.0.0.1:30870"
+API_PREFIX = HOST_PREFIX + "/api"
 
 ADMIN_USERNAME = "admin@example.com"
 ADMIN_PW = "PASSW0RD!"
@@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!"
 
 @pytest.fixture(scope="session")
 def admin_auth_headers():
-    r = requests.post(
-        f"{API_PREFIX}/auth/jwt/login",
-        data={
-            "username": ADMIN_USERNAME,
-            "password": ADMIN_PW,
-            "grant_type": "password",
-        },
-    )
-    data = r.json()
-    access_token = data.get("access_token")
-    return {"Authorization": f"Bearer {access_token}"}
+    while True:
+        r = requests.post(
+            f"{API_PREFIX}/auth/jwt/login",
+            data={
+                "username": ADMIN_USERNAME,
+                "password": ADMIN_PW,
+                "grant_type": "password",
+            },
+        )
+        data = r.json()
+        try:
+            return {"Authorization": f"Bearer {data['access_token']}"}
+        except:
+            print("Waiting for admin_auth_headers")
+            time.sleep(5)
 
 
 @pytest.fixture(scope="session")
 def admin_aid(admin_auth_headers):
-    r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
-    data = r.json()
-    return data["archives"][0]["id"]
+    while True:
+        r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
+        data = r.json()
+        try:
+            return data["archives"][0]["id"]
+        except:
+            print("Waiting for admin_aid")
+            time.sleep(5)
 
 
 @pytest.fixture(scope="session")
@@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Admin Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",

View File

@@ -0,0 +1,76 @@
+import requests
+
+from .conftest import API_PREFIX
+
+new_cid_1 = None
+new_cid_2 = None
+
+
+def get_sample_crawl_data(tags):
+    return {
+        "runNow": False,
+        "name": "Test Crawl",
+        "config": {"seeds": ["https://example.com/"]},
+        "tags": tags,
+    }
+
+
+def test_create_new_config_1(admin_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=get_sample_crawl_data(["tag-1", "tag-2"])
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+    global new_cid_1
+    new_cid_1 = data["added"]
+
+
+def test_get_config_1(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
+        headers=admin_auth_headers,
+    )
+    assert r.json()["tags"] == ["tag-1", "tag-2"]
+
+
+def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
+        headers=admin_auth_headers,
+    )
+    assert r.json() == ["tag-1", "tag-2"]
+
+
+def test_create_new_config_2(admin_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=get_sample_crawl_data(["tag-3", "tag-0"])
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+    global new_cid_2
+    new_cid_2 = data["added"]
+
+
+def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
+        headers=admin_auth_headers,
+    )
+    assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]
+
+
+def test_get_config_2(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
+        headers=admin_auth_headers,
+    )
+    assert r.json()["tags"] == ["tag-3", "tag-0"]

View File

@@ -4,9 +4,7 @@ import time
 import io
 import zipfile
 
-from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW
-
-host_prefix = "http://127.0.0.1:30870"
+from .conftest import API_PREFIX, HOST_PREFIX
 
 wacz_path = None
 wacz_size = None
@@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
        "name": "Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"]},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
@@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id):
 def test_download_wacz():
-    r = requests.get(host_prefix + wacz_path)
+    r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
     assert len(r.content) == wacz_size
@@ -110,4 +108,4 @@ def test_verify_wacz():
     assert "pages/pages.jsonl" in z.namelist()
 
     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
-    assert '"https://example.com/"' in pages
+    assert '"https://webrecorder.net/"' in pages