API filters by user + crawl collection ids (#462)

backend: object filtering:
- add filtering of crawls, crawlconfigs and profiles by userid= query arg, fixes #460
- add filtering of crawls by crawlconfig via cid= query arg, fixes #400 (usage sketch below)
- tests: add test_filter_results test suite to test filtering crawls and crawlconfigs by user; also create a user with 'crawler' permissions and run a second crawl with that user.
Ilya Kreymer 2023-01-11 16:50:38 -08:00 committed by GitHub
parent 7b5d82936d
commit 5efeaa58b1
7 changed files with 214 additions and 20 deletions
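
As a quick illustration of the two new query args described above, a minimal client sketch (not part of this commit; the base URL, ids and token below are placeholders):

import requests

API_PREFIX = "http://localhost:8000"           # placeholder base URL for a local deployment
AID = "<archive id>"                           # placeholder archive id
USERID = "<user uuid>"                         # placeholder user id
CID = "<crawlconfig uuid>"                     # placeholder crawlconfig id
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder JWT from /auth/jwt/login

# Crawls within an archive, limited to those started by one user (new userid= filter)
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/crawls",
    params={"userid": USERID},
    headers=HEADERS,
)
print(r.json()["crawls"])

# Crawls belonging to a single crawl config (new cid= filter)
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/crawls",
    params={"cid": CID},
    headers=HEADERS,
)
print(r.json()["crawls"])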

View File

@@ -366,7 +366,10 @@ class CrawlConfigOps:
         return {"success": True}
 
     async def get_crawl_configs(
-        self, archive: Archive, tags: Optional[List[str]] = None
+        self,
+        archive: Archive,
+        userid: Optional[UUID4] = None,
+        tags: Optional[List[str]] = None,
     ):
         """Get all crawl configs for an archive is a member of"""
         match_query = {"aid": archive.id, "inactive": {"$ne": True}}
@@ -374,6 +377,9 @@ class CrawlConfigOps:
         if tags:
             match_query["tags"] = {"$all": tags}
 
+        if userid:
+            match_query["userid"] = userid
+
         # pylint: disable=duplicate-code
         cursor = self.crawl_configs.aggregate(
             [
@@ -599,9 +605,10 @@ def init_crawl_config_api(
     @router.get("", response_model=CrawlConfigsResponse)
     async def get_crawl_configs(
         archive: Archive = Depends(archive_crawl_dep),
+        userid: Optional[UUID4] = None,
         tag: Union[List[str], None] = Query(default=None),
     ):
-        return await ops.get_crawl_configs(archive, tag)
+        return await ops.get_crawl_configs(archive, userid=userid, tags=tag)
 
     @router.get("/tags")
     async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
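
For reference, a sketch of the Mongo match query that get_crawl_configs builds once both filters are supplied (illustrative placeholder values, not part of the diff):

from uuid import UUID

archive_id = UUID("11111111-1111-1111-1111-111111111111")  # placeholder archive id
userid = UUID("22222222-2222-2222-2222-222222222222")      # placeholder user id

# Equivalent of match_query after the tags and userid branches above have run
match_query = {
    "aid": archive_id,
    "inactive": {"$ne": True},
    "tags": {"$all": ["tag-1", "tag-2"]},
    "userid": userid,
}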

View File

@@ -173,6 +173,7 @@ class CrawlOps:
         archive: Optional[Archive] = None,
         cid: uuid.UUID = None,
         collid: uuid.UUID = None,
+        userid: uuid.UUID = None,
         crawl_id: str = None,
         exclude_files=True,
         running_only=False,
@@ -191,6 +192,9 @@ class CrawlOps:
         if collid:
             query["colls"] = collid
 
+        if userid:
+            query["userid"] = userid
+
         if running_only:
             query["state"] = {"$in": ["running", "starting", "stopping"]}
@@ -573,15 +577,31 @@ def init_crawls_api(
     archive_crawl_dep = archives.archive_crawl_dep
 
     @app.get("/archives/all/crawls", tags=["crawls"], response_model=ListCrawls)
-    async def list_crawls_admin(user: User = Depends(user_dep)):
+    async def list_crawls_admin(
+        user: User = Depends(user_dep),
+        userid: Optional[UUID4] = None,
+        cid: Optional[UUID4] = None,
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return ListCrawls(crawls=await ops.list_crawls(None, running_only=True))
+        return ListCrawls(
+            crawls=await ops.list_crawls(
+                None, userid=userid, cid=cid, running_only=True
+            )
+        )
 
     @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
-    async def list_crawls(archive: Archive = Depends(archive_viewer_dep)):
-        return ListCrawls(crawls=await ops.list_crawls(archive))
+    async def list_crawls(
+        archive: Archive = Depends(archive_viewer_dep),
+        userid: Optional[UUID4] = None,
+        cid: Optional[UUID4] = None,
+    ):
+        return ListCrawls(
+            crawls=await ops.list_crawls(
+                archive, userid=userid, cid=cid, running_only=False
+            )
+        )
 
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -646,7 +666,6 @@ def init_crawls_api(
             raise HTTPException(status_code=403, detail="Not Allowed")
 
         crawls = await ops.list_crawls(crawl_id=crawl_id)
-        print("crawls", crawls)
         if len(crawls) < 1:
             raise HTTPException(status_code=404, detail="crawl_not_found")
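
The superuser endpoint gains the same filters; a minimal sketch of calling it (the endpoint path and its running_only=True behaviour come from the hunk above; base URL, id and token are placeholders):

import requests

API_PREFIX = "http://localhost:8000"                     # placeholder base URL
USERID = "<user uuid>"                                   # placeholder user id
SUPERUSER_HEADERS = {"Authorization": "Bearer <token>"}  # placeholder superuser JWT

# Superuser-only: currently running crawls across all archives, narrowed to one user
r = requests.get(
    f"{API_PREFIX}/archives/all/crawls",
    params={"userid": USERID},
    headers=SUPERUSER_HEADERS,
)
print(r.json()["crawls"])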

View File

@@ -253,9 +253,13 @@ class ProfileOps:
         return {"success": True}
 
-    async def list_profiles(self, archive: Archive):
+    async def list_profiles(self, archive: Archive, userid: Optional[UUID4] = None):
         """list all profiles"""
-        cursor = self.profiles.find({"aid": archive.id})
+        query = {"aid": archive.id}
+        if userid:
+            query["userid"] = userid
+
+        cursor = self.profiles.find(query)
         results = await cursor.to_list(length=1000)
         return [Profile.from_dict(res) for res in results]
@@ -395,8 +399,9 @@ def init_profiles_api(mdb, crawl_manager, archive_ops, user_dep):
     @router.get("", response_model=List[Profile])
     async def list_profiles(
         archive: Archive = Depends(archive_crawl_dep),
+        userid: Optional[UUID4] = None,
     ):
-        return await ops.list_profiles(archive)
+        return await ops.list_profiles(archive, userid)
 
     @router.post("", response_model=Profile)
     async def commit_browser_to_new(
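
And the matching call for profiles, as a sketch; it assumes the profiles router is mounted under /archives/{aid}/profiles, which is not visible in this hunk, and uses placeholder ids and token:

import requests

API_PREFIX = "http://localhost:8000"           # placeholder base URL
AID = "<archive id>"                           # placeholder archive id
USERID = "<user uuid>"                         # placeholder user id
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder JWT

# Assumed route prefix: /archives/{aid}/profiles (not shown in this diff)
r = requests.get(
    f"{API_PREFIX}/archives/{AID}/profiles",
    params={"userid": USERID},
    headers=HEADERS,
)
print(r.json())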

View File

@@ -12,6 +12,12 @@ ADMIN_PW = "PASSW0RD!"
 VIEWER_USERNAME = "viewer@example.com"
 VIEWER_PW = "viewerPASSW0RD!"
+CRAWLER_USERNAME = "crawler@example.com"
+CRAWLER_PW = "crawlerPASSWORD!"
+
+_admin_config_id = None
+_crawler_config_id = None
+
 
 @pytest.fixture(scope="session")
 def admin_auth_headers():
@@ -58,6 +64,10 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
         json=crawl_data,
     )
     data = r.json()
+
+    global _admin_config_id
+    _admin_config_id = data["added"]
+
     crawl_id = data["run_now_job"]
     # Wait for it to complete and then return crawl ID
     while True:
@@ -71,6 +81,11 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
         time.sleep(5)
 
 
+@pytest.fixture(scope="session")
+def admin_config_id(admin_crawl_id):
+    return _admin_config_id
+
+
 @pytest.fixture(scope="session")
 def viewer_auth_headers(admin_auth_headers, admin_aid):
     requests.post(
@@ -90,8 +105,74 @@ def viewer_auth_headers(admin_auth_headers, admin_aid):
             "password": VIEWER_PW,
             "grant_type": "password",
         },
         headers=admin_auth_headers,
     )
     data = r.json()
     access_token = data.get("access_token")
     return {"Authorization": f"Bearer {access_token}"}
+
+
+@pytest.fixture(scope="session")
+def crawler_auth_headers(admin_auth_headers, admin_aid):
+    requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/add-user",
+        json={
+            "email": CRAWLER_USERNAME,
+            "password": CRAWLER_PW,
+            "name": "new-crawler",
+            "role": 20,
+        },
+        headers=admin_auth_headers,
+    )
+    r = requests.post(
+        f"{API_PREFIX}/auth/jwt/login",
+        data={
+            "username": CRAWLER_USERNAME,
+            "password": CRAWLER_PW,
+            "grant_type": "password",
+        },
+    )
+    data = r.json()
+    access_token = data.get("access_token")
+    return {"Authorization": f"Bearer {access_token}"}
+
+
+@pytest.fixture(scope="session")
+def crawler_userid(crawler_auth_headers):
+    r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
+    return r.json()["id"]
+
+
+@pytest.fixture(scope="session")
+def crawler_crawl_id(crawler_auth_headers, admin_aid):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Crawler User Test Crawl",
+        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global _crawler_config_id
+    _crawler_config_id = data["added"]
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/archives/{admin_aid}/crawls/{crawl_id}/replay.json",
+            headers=crawler_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawler_config_id(crawler_crawl_id):
+    return _crawler_config_id

View File

@@ -5,6 +5,7 @@ from .conftest import API_PREFIX
 new_cid_1 = None
 new_cid_2 = None
 
+
 def get_sample_crawl_data(tags):
     return {
         "runNow": False,
@@ -13,11 +14,12 @@ def get_sample_crawl_data(tags):
         "tags": tags,
     }
 
+
 def test_create_new_config_1(admin_auth_headers, admin_aid):
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
         headers=admin_auth_headers,
-        json=get_sample_crawl_data(["tag-1", "tag-2"])
+        json=get_sample_crawl_data(["tag-1", "tag-2"]),
     )
 
     assert r.status_code == 200
@@ -29,6 +31,7 @@ def test_create_new_config_1(admin_auth_headers, admin_aid):
     global new_cid_1
     new_cid_1 = data["added"]
 
+
 def test_get_config_1(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
@@ -36,6 +39,7 @@ def test_get_config_1(admin_auth_headers, admin_aid):
     )
     assert r.json()["tags"] == ["tag-1", "tag-2"]
 
+
 def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
@@ -43,11 +47,12 @@ def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
     )
     assert r.json() == ["tag-1", "tag-2"]
 
+
 def test_create_new_config_2(admin_auth_headers, admin_aid):
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
         headers=admin_auth_headers,
-        json=get_sample_crawl_data(["tag-3", "tag-0"])
+        json=get_sample_crawl_data(["tag-3", "tag-0"]),
     )
 
     assert r.status_code == 200
@@ -59,6 +64,7 @@ def test_create_new_config_2(admin_auth_headers, admin_aid):
     global new_cid_2
     new_cid_2 = data["added"]
 
+
 def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
@@ -66,11 +72,10 @@ def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
     )
     assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]
 
 
 def test_get_config_2(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
         headers=admin_auth_headers,
     )
     assert r.json()["tags"] == ["tag-3", "tag-0"]

View File

@@ -0,0 +1,72 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def get_sample_crawl_data():
+    return {
+        "runNow": False,
+        "name": "Test Crawl",
+        "config": {"seeds": ["https://example.com/"]},
+    }
+
+
+def test_create_new_config_crawler_user(crawler_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=get_sample_crawl_data(),
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+
+def test_get_config_by_user(crawler_auth_headers, admin_aid, crawler_userid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs?userid={crawler_userid}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawlConfigs"]) == 1
+
+
+def test_ensure_crawl_and_admin_user_crawls(
+    admin_aid, crawler_auth_headers, crawler_crawl_id, admin_crawl_id
+):
+    assert crawler_crawl_id
+    assert admin_crawl_id
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 2
+
+
+def test_get_crawl_job_by_user(
+    crawler_auth_headers, admin_aid, crawler_userid, crawler_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?userid={crawler_userid}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1
+
+
+def test_get_crawl_job_by_config(
+    crawler_auth_headers, admin_aid, admin_config_id, crawler_config_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?cid={admin_config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?cid={crawler_config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1

View File

@@ -8,9 +8,13 @@ def test_admin_get_archive_crawls(admin_auth_headers, admin_aid, admin_crawl_id)
         f"{API_PREFIX}/archives/{admin_aid}/crawls", headers=admin_auth_headers
     )
     data = r.json()
-    assert len(data["crawls"]) > 0
-    assert data["crawls"][0]["id"] == admin_crawl_id
-    assert data["crawls"][0]["aid"] == admin_aid
+    crawls = data["crawls"]
+    crawl_ids = []
+    assert len(crawls) > 0
+    for crawl in crawls:
+        assert crawl["aid"] == admin_aid
+        crawl_ids.append(crawl["id"])
+    assert admin_crawl_id in crawl_ids
 
 
 def test_viewer_get_archive_crawls(viewer_auth_headers, admin_aid, admin_crawl_id):
@@ -20,9 +24,10 @@ def test_viewer_get_archive_crawls(viewer_auth_headers, admin_aid, admin_crawl_i
     data = r.json()
     crawls = data["crawls"]
     crawl_ids = []
-    for crawl in crawls:
-        crawl_ids.append(crawl["id"])
     assert len(crawls) > 0
+    for crawl in crawls:
+        assert crawl["aid"] == admin_aid
+        crawl_ids.append(crawl["id"])
     assert admin_crawl_id in crawl_ids