* backend: initial tags API support (addresses #365):
  - add 'tags' field to crawlconfig (array of strings)
  - allow crawlconfig queries to pass multiple 'tag' query args, e.g. tag=A&tag=B
  - add /archives/<aid>/crawlconfigs/tags API to list distinct tags; add index on aid + tags
  - tests: add tests for adding configs and querying by tags
  - tests: fix fixtures to retry login if the initial attempt fails; use test seed https://webrecorder.net/ instead of https://example.com/
Commit 7b5d82936d (parent edfb1bd513)
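As a rough illustration of the resulting API surface (paths follow the /archives/<aid>/crawlconfigs prefix from the commit message; the host, archive id, and auth token below are placeholders, not values from this commit):

import requests

# Placeholder values for illustration only
API_PREFIX = "http://127.0.0.1:30870/api"
AID = "<archive-id>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# List crawl configs that carry BOTH tag-1 and tag-2;
# a list value is encoded as repeated query args: ?tag=tag-1&tag=tag-2
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/crawlconfigs",
    params={"tag": ["tag-1", "tag-2"]},
    headers=HEADERS,
)

# List the distinct tags used across this archive's crawl configs
r = requests.get(f"{API_PREFIX}/archives/{AID}/crawlconfigs/tags", headers=HEADERS)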
Backend crawl config module:

@@ -11,7 +11,7 @@ from datetime import datetime

 import pymongo
 from pydantic import BaseModel, UUID4, conint, HttpUrl
-from fastapi import APIRouter, Depends, HTTPException
+from fastapi import APIRouter, Depends, HTTPException, Query

 from .users import User
 from .archives import Archive, MAX_CRAWL_SCALE
@@ -102,6 +102,7 @@ class CrawlConfigIn(BaseModel):
     profileid: Optional[UUID4]

     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -124,6 +125,7 @@ class CrawlConfig(BaseMongoModel):
     created: Optional[datetime]

     colls: Optional[List[str]] = []
+    tags: Optional[List[str]] = []

     crawlTimeout: Optional[int] = 0
     scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
@@ -222,6 +224,10 @@ class CrawlConfigOps:
             [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
         )

+        await self.crawl_configs.create_index(
+            [("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)]
+        )
+
     def set_coll_ops(self, coll_ops):
        """set collection ops"""
        self.coll_ops = coll_ops
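For reference, a standalone sketch of the same index and the query shape it is meant to serve (connection string, database, and collection names here are illustrative, not taken from this commit):

import pymongo

# Illustrative connection and names only
coll = pymongo.MongoClient("mongodb://localhost:27017")["demo"]["crawl_configs"]

# Compound index on archive id + tags; because 'tags' holds an array,
# MongoDB builds this as a multikey index automatically.
coll.create_index([("aid", pymongo.ASCENDING), ("tags", pymongo.ASCENDING)])

# Archive-scoped queries requiring all requested tags can use this index:
docs = coll.find({"aid": "some-archive-id", "tags": {"$all": ["tag-1", "tag-2"]}})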
@@ -359,12 +365,19 @@ class CrawlConfigOps:

         return {"success": True}

-    async def get_crawl_configs(self, archive: Archive):
+    async def get_crawl_configs(
+        self, archive: Archive, tags: Optional[List[str]] = None
+    ):
         """Get all crawl configs for an archive is a member of"""
+        match_query = {"aid": archive.id, "inactive": {"$ne": True}}
+
+        if tags:
+            match_query["tags"] = {"$all": tags}
+
         # pylint: disable=duplicate-code
         cursor = self.crawl_configs.aggregate(
             [
-                {"$match": {"aid": archive.id, "inactive": {"$ne": True}}},
+                {"$match": match_query},
                 {
                     "$lookup": {
                         "from": "users",
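A note on the filter semantics: $all requires every requested tag to be present on a config, so tag=A&tag=B narrows the result rather than widening it ($in would give or-semantics instead). A minimal sketch with made-up documents:

# Hypothetical documents, for illustration only
configs = [
    {"name": "c1", "tags": ["A", "B", "C"]},
    {"name": "c2", "tags": ["A"]},
]

# {"tags": {"$all": ["A", "B"]}} matches only c1 (every tag must be present),
# while {"tags": {"$in": ["A", "B"]}} would match both c1 and c2.
def matches_all(doc, tags):
    return all(t in doc["tags"] for t in tags)

assert [d["name"] for d in configs if matches_all(d, ["A", "B"])] == ["c1"]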
@@ -564,6 +577,10 @@ class CrawlConfigOps:

         return result.inserted_id

+    async def get_crawl_config_tags(self, archive):
+        """get distinct tags from all crawl configs for this archive"""
+        return await self.crawl_configs.distinct("tags", {"aid": archive.id})
+

 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
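For intuition, distinct("tags", ...) unwinds the array values and de-duplicates them across matching documents, which is why the /tags endpoint can return one flat list per archive. A rough pure-Python equivalent over hypothetical data (the tests later in this commit observe the values in sorted order, though ordering is a server behavior rather than something this code enforces):

# Hypothetical per-config tag lists for one archive
tag_lists = [["tag-1", "tag-2"], ["tag-3", "tag-0"]]

# Roughly what distinct("tags", {"aid": ...}) yields: a flat, de-duplicated list
distinct_tags = sorted({t for tags in tag_lists for t in tags})
assert distinct_tags == ["tag-0", "tag-1", "tag-2", "tag-3"]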
@@ -580,8 +597,15 @@ def init_crawl_config_api(
     archive_crawl_dep = archive_ops.archive_crawl_dep

     @router.get("", response_model=CrawlConfigsResponse)
-    async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
-        return await ops.get_crawl_configs(archive)
+    async def get_crawl_configs(
+        archive: Archive = Depends(archive_crawl_dep),
+        tag: Union[List[str], None] = Query(default=None),
+    ):
+        return await ops.get_crawl_configs(archive, tag)
+
+    @router.get("/tags")
+    async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
+        return await ops.get_crawl_config_tags(archive)

     @router.get("/{cid}", response_model=CrawlConfigOut)
     async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)):
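Declaring the parameter as a list via Query is what makes FastAPI collect repeated ?tag=... arguments into a Python list. A minimal, self-contained sketch of the same pattern (app, path, and handler names here are illustrative, not this codebase's):

from typing import List, Union

from fastapi import FastAPI, Query

app = FastAPI()

@app.get("/demo/crawlconfigs")
async def list_configs(tag: Union[List[str], None] = Query(default=None)):
    # /demo/crawlconfigs?tag=A&tag=B  ->  tag == ["A", "B"]
    # /demo/crawlconfigs              ->  tag is None
    return {"tags_requested": tag}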
Test fixtures (backend/test/conftest.py):

@@ -3,7 +3,8 @@ import requests
 import time


-API_PREFIX = "http://127.0.0.1:30870/api"
+HOST_PREFIX = "http://127.0.0.1:30870"
+API_PREFIX = HOST_PREFIX + "/api"

 ADMIN_USERNAME = "admin@example.com"
 ADMIN_PW = "PASSW0RD!"
@@ -14,24 +15,33 @@ VIEWER_PW = "viewerPASSW0RD!"

 @pytest.fixture(scope="session")
 def admin_auth_headers():
-    r = requests.post(
-        f"{API_PREFIX}/auth/jwt/login",
-        data={
-            "username": ADMIN_USERNAME,
-            "password": ADMIN_PW,
-            "grant_type": "password",
-        },
-    )
-    data = r.json()
-    access_token = data.get("access_token")
-    return {"Authorization": f"Bearer {access_token}"}
+    while True:
+        r = requests.post(
+            f"{API_PREFIX}/auth/jwt/login",
+            data={
+                "username": ADMIN_USERNAME,
+                "password": ADMIN_PW,
+                "grant_type": "password",
+            },
+        )
+        data = r.json()
+        try:
+            return {"Authorization": f"Bearer {data['access_token']}"}
+        except:
+            print("Waiting for admin_auth_headers")
+            time.sleep(5)


 @pytest.fixture(scope="session")
 def admin_aid(admin_auth_headers):
-    r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
-    data = r.json()
-    return data["archives"][0]["id"]
+    while True:
+        r = requests.get(f"{API_PREFIX}/archives", headers=admin_auth_headers)
+        data = r.json()
+        try:
+            return data["archives"][0]["id"]
+        except:
+            print("Waiting for admin_aid")
+            time.sleep(5)


 @pytest.fixture(scope="session")
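The fixture change is a poll-until-ready loop: keep retrying until the backend answers, sleeping between attempts. A generic, bounded variant of the same idea (not part of this commit; the helper name and limits are illustrative):

import time

def wait_for(fetch, attempts=60, delay=5):
    """Call fetch() until it returns without raising, sleeping between tries."""
    last_err = None
    for _ in range(attempts):
        try:
            return fetch()
        except Exception as err:  # e.g. connection errors or missing response keys
            last_err = err
            time.sleep(delay)
    raise RuntimeError(f"service not ready after {attempts} attempts") from last_err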
@@ -40,7 +50,7 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Admin Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
New file: backend/test/test_crawl_config_tags.py (76 lines)

import requests

from .conftest import API_PREFIX

new_cid_1 = None
new_cid_2 = None


def get_sample_crawl_data(tags):
    return {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": ["https://example.com/"]},
        "tags": tags,
    }


def test_create_new_config_1(admin_auth_headers, admin_aid):
    r = requests.post(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
        headers=admin_auth_headers,
        json=get_sample_crawl_data(["tag-1", "tag-2"])
    )

    assert r.status_code == 200

    data = r.json()
    assert data["added"]
    assert data["run_now_job"] == None

    global new_cid_1
    new_cid_1 = data["added"]


def test_get_config_1(admin_auth_headers, admin_aid):
    r = requests.get(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
        headers=admin_auth_headers,
    )
    assert r.json()["tags"] == ["tag-1", "tag-2"]


def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
    r = requests.get(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
        headers=admin_auth_headers,
    )
    assert r.json() == ["tag-1", "tag-2"]


def test_create_new_config_2(admin_auth_headers, admin_aid):
    r = requests.post(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
        headers=admin_auth_headers,
        json=get_sample_crawl_data(["tag-3", "tag-0"])
    )

    assert r.status_code == 200

    data = r.json()
    assert data["added"]
    assert data["run_now_job"] == None

    global new_cid_2
    new_cid_2 = data["added"]


def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
    r = requests.get(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
        headers=admin_auth_headers,
    )
    assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]


def test_get_config_2(admin_auth_headers, admin_aid):
    r = requests.get(
        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
        headers=admin_auth_headers,
    )
    assert r.json()["tags"] == ["tag-3", "tag-0"]
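A note on these tests: they share the created config ids through module globals and assert the /tags output in a fixed order, so they rely on running in file order (pytest's default) against a live backend reachable via the fixtures above. To run just this module programmatically (path as given in this commit):

# Assumes the local backend used by the fixtures above is reachable
import pytest

# Runs only this module, in file order, which the shared globals depend on
pytest.main(["-x", "backend/test/test_crawl_config_tags.py"])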
@ -4,9 +4,7 @@ import time
|
|||||||
import io
|
import io
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
from .conftest import API_PREFIX, ADMIN_USERNAME, ADMIN_PW
|
from .conftest import API_PREFIX, HOST_PREFIX
|
||||||
|
|
||||||
host_prefix = "http://127.0.0.1:30870"
|
|
||||||
|
|
||||||
wacz_path = None
|
wacz_path = None
|
||||||
wacz_size = None
|
wacz_size = None
|
||||||
@@ -35,7 +33,7 @@ def test_create_new_config(admin_auth_headers, admin_aid):
     crawl_data = {
         "runNow": True,
         "name": "Test Crawl",
-        "config": {"seeds": ["https://example.com/"]},
+        "config": {"seeds": ["https://webrecorder.net/"]},
     }
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
@@ -91,7 +89,7 @@ def test_crawl_info(admin_auth_headers, admin_aid, admin_crawl_id):


 def test_download_wacz():
-    r = requests.get(host_prefix + wacz_path)
+    r = requests.get(HOST_PREFIX + wacz_path)
     assert r.status_code == 200
     assert len(r.content) == wacz_size

@@ -110,4 +108,4 @@ def test_verify_wacz():
     assert "pages/pages.jsonl" in z.namelist()

     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
-    assert '"https://example.com/"' in pages
+    assert '"https://webrecorder.net/"' in pages