backend: initial tags api support (addresses #365) (#434)

* backend: initial tags api support (addresses #365):
- add 'tags' field to crawlconfig (array of strings)
- allow querying crawlconfigs with multiple 'tag' query args, e.g. tag=A&tag=B (see the usage sketch below)
- add /archives/<aid>/crawlconfigs/tags api to list the distinct tags in use, with an index on aid + tags
tests: add tests for adding configs and querying by tags
tests: fix fixtures to retry login if initial attempts fail; use a test seed of https://webrecorder.net instead of https://example.com/
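
The sketch below shows how the new endpoints are exercised over HTTP. It is a minimal illustration only, assuming the local test deployment used by the fixtures in this commit (http://127.0.0.1:30870); the access token and archive id are placeholders, not values from the commit.

import requests

API_PREFIX = "http://127.0.0.1:30870/api"  # same base URL as the test fixtures
headers = {"Authorization": "Bearer <access_token>"}  # placeholder JWT from /auth/jwt/login
aid = "<archive-id>"  # placeholder archive id

# repeated 'tag' query args (?tag=tag-1&tag=tag-2) return only configs carrying all requested tags
r = requests.get(
    f"{API_PREFIX}/archives/{aid}/crawlconfigs",
    params=[("tag", "tag-1"), ("tag", "tag-2")],
    headers=headers,
)
print(r.json())

# the new /tags endpoint lists the distinct tags across the archive's crawl configs
r = requests.get(f"{API_PREFIX}/archives/{aid}/crawlconfigs/tags", headers=headers)
print(r.json())  # e.g. ["tag-1", "tag-2"]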
Ilya Kreymer 2023-01-11 13:29:35 -08:00 committed by GitHub
parent edfb1bd513
commit 7b5d82936d
4 changed files with 135 additions and 27 deletions

View File

@@ -11,7 +11,7 @@ from datetime import datetime
import pymongo
from pydantic import BaseModel, UUID4, conint, HttpUrl
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from .users import User
from .archives import Archive, MAX_CRAWL_SCALE
@@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel):
profileid: Optional[UUID4]
colls: Optional[List[str]] = []
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel):
created: Optional[datetime]
colls: Optional[List[str]] = []
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -222,6 +224,10 @@ class CrawlConfigOps:
[("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
)
await self.crawl_configs.create_index(
[("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
)
def set_coll_ops(self, coll_ops):
"""set collection ops"""
self.coll_ops = coll_ops
@@ -359,12 +365,19 @@ class CrawlConfigOps:
return {"success": True}
async def get_crawl_configs(self, archive: Archive):
async def get_crawl_configs(
self, archive: Archive, tags: Optional[List[str]] = None
):
"""Get all crawl configs for an archive is a member of"""
match_query = {"aid": archive.id, "inactive": {"$ne": True}}
if tags:
match_query["tags"] = {"$all": tags}
# pylint: disable=duplicate-code
cursor = self.crawl_configs.aggregate(
[
{"$match": {"aid": archive.id, "inactive": {"$ne": True}}},
{"$match": match_query},
{
"$lookup": {
"from": "users",
@@ -564,6 +577,10 @@ class CrawlConfigOps:
return result.inserted_id
async def get_crawl_config_tags(self, archive):
"""get distinct tags from all crawl configs for this archive"""
return await self.crawl_configs.distinct("tags", {"aid": archive.id})
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
@@ -580,8 +597,15 @@ def init_crawl_config_api(
archive_crawl_dep = archive_ops.archive_crawl_dep
@router.get("", response_model=CrawlConfigsResponse)
async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
return await ops.get_crawl_configs(archive)
async def get_crawl_configs(
archive: Archive = Depends(archive_crawl_dep),
tag: Union[List[str], None] = Query(default=None),
):
return await ops.get_crawl_configs(archive, tag)
@router.get("/tags")
async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
return await ops.get_crawl_config_tags(archive)
@router.get("/{cid}", response_model=CrawlConfigOut)
async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)):
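
The two queries added above reduce to an $all array match and a distinct lookup in MongoDB. Below is a minimal standalone pymongo sketch of that behavior, assuming a local MongoDB instance; the database and collection names and the sample documents are illustrative only, not taken from this codebase.

from pymongo import ASCENDING, MongoClient

client = MongoClient("localhost", 27017)  # assumes a local MongoDB
crawl_configs = client["tags_demo"]["crawl_configs"]
crawl_configs.drop()

# compound index like the one created on ("aid", "tags") above
crawl_configs.create_index([("aid", ASCENDING), ("tags", ASCENDING)])

aid = "example-archive-id"  # placeholder archive id
crawl_configs.insert_many([
    {"aid": aid, "tags": ["tag-1", "tag-2"], "inactive": False},
    {"aid": aid, "tags": ["tag-3", "tag-0"], "inactive": False},
])

# ?tag=tag-1&tag=tag-2 becomes {"tags": {"$all": [...]}} -- a config must carry every requested tag
match_query = {"aid": aid, "inactive": {"$ne": True}, "tags": {"$all": ["tag-1", "tag-2"]}}
print(crawl_configs.count_documents(match_query))  # 1

# the /crawlconfigs/tags endpoint uses distinct() to list tags across the archive
print(sorted(crawl_configs.distinct("tags", {"aid": aid})))  # ['tag-0', 'tag-1', 'tag-2', 'tag-3']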

View File

@@ -3,7 +3,8 @@ import requests
import time
API_PREFIX = "http://127.0.0.1:30870/api"
HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"
ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"
@@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!"
@pytest.fixture(scope="session")
def admin_auth_headers():
r = requests.post(
f"{API_PREFIX}/auth/jwt/login",
data={
"username": ADMIN_USERNAME,
"password": ADMIN_PW,
"grant_type": "password",
},
)
data = r.json()
access_token = data.get("access_token")
return {"Authorization": f"Bearer {access_token}"}
while True:
r = requests.post(
f"{API_PREFIX}/auth/jwt/login",
data={
"username": ADMIN_USERNAME,
"password": ADMIN_PW,
"grant_type": "password",
},
)
data = r.json()
try:
return {"Authorization": f"Bearer {data['access_token']}"}
except:
print("Waiting for admin_auth_headers")
time.sleep(5)
@pytest.fixture(scope="session")
def admin_aid(admin_auth_headers):
r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
data = r.json()
return data["archives"][0]["id"]
while True:
r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
data = r.json()
try:
return data["archives"][0]["id"]
except:
print("Waiting for admin_aid")
time.sleep(5)
@pytest.fixture(scope="session")
@@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
crawl_data = {
"runNow": True,
"name": "Admin Test Crawl",
"config": {"seeds": ["https://example.com/"]},
"config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",

View File

@@ -0,0 +1,76 @@
import requests
from .conftest import API_PREFIX
new_cid_1 = None
new_cid_2 = None
def get_sample_crawl_data(tags):
return {
"runNow": False,
"name": "Test Crawl",
"config": {"seeds": ["https://example.com/"]},
"tags": tags,
}
def test_create_new_config_1(admin_auth_headers, admin_aid):
r = requests.post(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
headers=admin_auth_headers,
json=get_sample_crawl_data(["tag-1", "tag-2"])
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["run_now_job"] == None
global new_cid_1
new_cid_1 = data["added"]
def test_get_config_1(admin_auth_headers, admin_aid):
r = requests.get(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
headers=admin_auth_headers,
)
assert r.json()["tags"] == ["tag-1", "tag-2"]
def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
r = requests.get(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
headers=admin_auth_headers,
)
assert r.json() == ["tag-1", "tag-2"]
def test_create_new_config_2(admin_auth_headers, admin_aid):
r = requests.post(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
headers=admin_auth_headers,
json=get_sample_crawl_data(["tag-3", "tag-0"])
)
assert r.status_code == 200
data = r.json()
assert data["added"]
assert data["run_now_job"] == None
global new_cid_2
new_cid_2 = data["added"]
def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
r = requests.get(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
headers=admin_auth_headers,
)
assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]
def test_get_config_2(admin_auth_headers, admin_aid):
r = requests.get(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
headers=admin_auth_headers,
)
assert r.json()["tags"] == ["tag-3", "tag-0"]

View File

@@ -4,9 +4,7 @@ import time
import io
import zipfile
from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW
host_prefix = "http://127.0.0.1:30870"
from .conftest import API_PREFIX, HOST_PREFIX
wacz_path = None
wacz_size = None
@@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid):
crawl_data = {
"runNow": True,
"name": "Test Crawl",
"config": {"seeds": ["https://example.com/"]},
"config": {"seeds": ["https://webrecorder.net/"]},
}
r = requests.post(
f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
@@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id):
def test_download_wacz():
r = requests.get(host_prefix + wacz_path)
r = requests.get(HOST_PREFIX + wacz_path)
assert r.status_code == 200
assert len(r.content) == wacz_size
@@ -110,4 +108,4 @@ def test_verify_wacz():
assert "pages/pages.jsonl" in z.namelist()
pages = z.open("pages/pages.jsonl").read().decode("utf-8")
assert '"https://example.com/"' in pages
assert '"https://webrecorder.net/"' in pages