backend: initial tags api support (addresses #365):
- add 'tags' field to crawlconfig (array of strings)
- allow querying crawlconfigs with multiple 'tag' query args, eg. tag=A&tag=B
- add /archives/<aid>/crawlconfigs/tags api to query the distinct tags in use, with an index on aid + tags

tests: add tests for adding configs and querying by tags
tests: fix fixtures to retry login if the initial attempt fails; use a test seed of https://webrecorder.net instead of https://example.com/
parent edfb1bd513
commit 7b5d82936d
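
For context, a minimal sketch of how a client could exercise the new endpoints, assuming a deployment reachable at the host used by the test fixtures and an already-issued JWT; the token and archive id below are placeholders:

import requests

API_PREFIX = "http://127.0.0.1:30870/api"  # host used by the test fixtures; adjust for your deployment
HEADERS = {"Authorization": "Bearer <access_token>"}  # placeholder token
AID = "<archive-id>"  # placeholder archive id

# repeated 'tag' query args (tag=A&tag=B); only configs carrying all given tags match
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/crawlconfigs",
    params=[("tag", "tag-1"), ("tag", "tag-2")],
    headers=HEADERS,
)
print(r.json())

# distinct tags across all crawl configs in the archive
r = requests.get(f"{API_PREFIX}/archives/{AID}/crawlconfigs/tags", headers=HEADERS)
print(r.json())  # eg. ["tag-1", "tag-2"]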
@@ -11,7 +11,7 @@ from datetime import datetime

 import pymongo
 from pydantic import BaseModel, UUID4, conint, HttpUrl
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, Query

 from .users import User
 from .archives import Archive, MAX_CRAWL_SCALE
@@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel):
     profileid: Optional[UUID4]

     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
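
The new field defaults to an empty list when omitted; a small illustrative pydantic model (a toy, not the project's CrawlConfigIn) showing that behavior:

from typing import List, Optional

from pydantic import BaseModel


class ConfigIn(BaseModel):
    # mirrors the new field: tags default to an empty list when not supplied
    tags: Optional[List[str]] = []


print(ConfigIn().tags)                         # []
print(ConfigIn(tags=["tag-1", "tag-2"]).tags)  # ['tag-1', 'tag-2']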
@@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel):
     created: Optional[datetime]

     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -222,6 +224,10 @@ class CrawlConfigOps:
             [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
         )

+        await self.crawl_configs.create_index(
+            [("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
+        )
+
     def set_coll_ops(self, coll_ops):
         """set collection ops"""
         self.coll_ops = coll_ops
@@ -359,12 +365,19 @@ class CrawlConfigOps:

         return {"success": True}

-    async def get_crawl_configs(self, archive: Archive):
+    async def get_crawl_configs(
+        self, archive: Archive, tags: Optional[List[str]] = None
+    ):
         """Get all crawl configs for an archive is a member of"""
+        match_query = {"aid": archive.id, "inactive": {"$ne": True}}
+
+        if tags:
+            match_query["tags"] = {"$all": tags}
+
         # pylint: disable=duplicate-code
         cursor = self.crawl_configs.aggregate(
             [
-                {"$match": {"aid": archive.id, "inactive": {"$ne": True}}},
+                {"$match": match_query},
                 {
                     "$lookup": {
                         "from": "users",
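
The {"$all": tags} clause above means a config matches only if it carries every requested tag; the same semantics expressed in plain Python (illustrative only, not code from this commit):

configs = [
    {"name": "a", "tags": ["tag-1", "tag-2"]},
    {"name": "b", "tags": ["tag-1"]},
]
requested = ["tag-1", "tag-2"]

# mirrors {"tags": {"$all": requested}}: every requested tag must be present
matching = [c for c in configs if all(t in c.get("tags", []) for t in requested)]
print([c["name"] for c in matching])  # ['a']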
@@ -564,6 +577,10 @@ class CrawlConfigOps:

         return result.inserted_id

+    async def get_crawl_config_tags(self, archive):
+        """get distinct tags from all crawl configs for this archive"""
+        return await self.crawl_configs.distinct("tags", {"aid": archive.id})
+

 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
@@ -580,8 +597,15 @@ def init_crawl_config_api(
     archive_crawl_dep = archive_ops.archive_crawl_dep

     @router.get("", response_model=CrawlConfigsResponse)
-    async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.get_crawl_configs(archive)
+    async def get_crawl_configs(
+        archive: Archive = Depends(archive_crawl_dep),
+        tag: Union[List[str], None] = Query(default=None),
+    ):
+        return await ops.get_crawl_configs(archive, tag)
+
+    @router.get("/tags")
+    async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl_config_tags(archive)

     @router.get("/{cid}", response_model=CrawlConfigOut)
     async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)):
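
FastAPI turns repeated query parameters into a list when the parameter is declared with a list type and Query, which is how tag=A&tag=B arrives as ["A", "B"]. A standalone sketch with a toy app (not the project's router):

from typing import List, Union

from fastapi import FastAPI, Query
from fastapi.testclient import TestClient

app = FastAPI()


@app.get("/crawlconfigs")
async def list_configs(tag: Union[List[str], None] = Query(default=None)):
    # repeated ?tag=... values arrive here as a list, or None when absent
    return {"tags": tag}


client = TestClient(app)
print(client.get("/crawlconfigs?tag=A&tag=B").json())  # {'tags': ['A', 'B']}
print(client.get("/crawlconfigs").json())              # {'tags': None}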
@@ -3,7 +3,8 @@ import requests
 import time


-API_PREFIX = "http://127.0.0.1:30870/api"
+HOST_PREFIX = "http://127.0.0.1:30870"
+API_PREFIX = HOST_PREFIX + "/api"

 ADMIN_USERNAME = "admin@example.com"
 ADMIN_PW = "PASSW0RD!"
@@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!"

 @pytest.fixture(scope="session")
 def admin_auth_headers():
-    r = requests.post(
-        f"{API_PREFIX}/auth/jwt/login",
-        data={
-            "username": ADMIN_USERNAME,
-            "password": ADMIN_PW,
-            "grant_type": "password",
-        },
-    )
-    data = r.json()
-    access_token = data.get("access_token")
-    return {"Authorization": f"Bearer {access_token}"}
+    while True:
+        r = requests.post(
+            f"{API_PREFIX}/auth/jwt/login",
+            data={
+                "username": ADMIN_USERNAME,
+                "password": ADMIN_PW,
+                "grant_type": "password",
+            },
+        )
+        data = r.json()
+        try:
+            return {"Authorization": f"Bearer {data['access_token']}"}
+        except:
+            print("Waiting for admin_auth_headers")
+            time.sleep(5)


 @pytest.fixture(scope="session")
 def admin_aid(admin_auth_headers):
-    r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
-    data = r.json()
-    return data["archives"][0]["id"]
+    while True:
+        r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
+        data = r.json()
+        try:
+            return data["archives"][0]["id"]
+        except:
+            print("Waiting for admin_aid")
+            time.sleep(5)


 @pytest.fixture(scope="session")
@@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Admin Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
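
The reworked fixtures above simply loop until login and archive listing succeed, because the backend may still be coming up when the test session starts. The same pattern as a reusable helper (a sketch; the helper name and limits are invented here, not part of the commit):

import time


def wait_for(fetch, attempts=60, delay=5):
    # call fetch() until it returns without raising, sleeping between attempts
    last_exc = None
    for _ in range(attempts):
        try:
            return fetch()
        except Exception as exc:  # e.g. KeyError/IndexError while the API is not ready yet
            last_exc = exc
            time.sleep(delay)
    raise RuntimeError("service did not become ready") from last_exc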
backend/test/test_crawl_config_tags.py (new file, 76 lines)
@@ -0,0 +1,76 @@
+import requests
+
+from .conftest import API_PREFIX
+
+new_cid_1 = None
+new_cid_2 = None
+
+def get_sample_crawl_data(tags):
+    return {
+        "runNow": False,
+        "name": "Test Crawl",
+        "config": {"seeds": ["https://example.com/"]},
+        "tags": tags,
+    }
+
+def test_create_new_config_1(admin_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=get_sample_crawl_data(["tag-1", "tag-2"])
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+    global new_cid_1
+    new_cid_1 = data["added"]
+
+def test_get_config_1(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
+        headers=admin_auth_headers,
+    )
+    assert r.json()["tags"] == ["tag-1", "tag-2"]
+
+def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
+        headers=admin_auth_headers,
+    )
+    assert r.json() == ["tag-1", "tag-2"]
+
+def test_create_new_config_2(admin_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=get_sample_crawl_data(["tag-3", "tag-0"])
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+    global new_cid_2
+    new_cid_2 = data["added"]
+
+def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
+        headers=admin_auth_headers,
+    )
+    assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]
+
+def test_get_config_2(admin_auth_headers, admin_aid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
+        headers=admin_auth_headers,
+    )
+    assert r.json()["tags"] == ["tag-3", "tag-0"]
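
Assuming a test deployment is reachable at the nodeport the fixtures expect (127.0.0.1:30870), the new tests can be run with pytest, for example invoked programmatically:

import sys

import pytest

# runs only the new tag tests; requires a live backend at the fixture host
sys.exit(pytest.main(["-v", "backend/test/test_crawl_config_tags.py"]))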
@@ -4,9 +4,7 @@ import time
 import io
 import zipfile

-from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW
-
-host_prefix = "http://127.0.0.1:30870"
+from .conftest import API_PREFIX, HOST_PREFIX

 wacz_path = None
 wacz_size = None
@@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"]},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
@@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id):


 def test_download_wacz():
-    r = requests.get(host_prefix + wacz_path)
+    r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
     assert len(r.content) == wacz_size

@@ -110,4 +108,4 @@ def test_verify_wacz():
     assert "pages/pages.jsonl" in z.namelist()

     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
-    assert '"https://example.com/"' in pages
+    assert '"https://webrecorder.net/"' in pages