From 5efeaa58b18f7a1a0f5c90c466263d5d51d50efd Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 11 Jan 2023 16:50:38 -0800
Subject: [PATCH] API filters by user + crawl collection ids (#462)

backend: object filtering:
- add filtering crawls, crawlconfigs and profiles by userid= query arg, fixes #460
- add filtering crawls by crawlconfig via cid= query arg, fixes #400
- tests: add test_filter_results test suite to test filtering crawls and
  crawlconfigs by user, also create user with 'crawler' permissions, run
  second crawl with that user.
---
 backend/btrixcloud/crawlconfigs.py     | 11 +++-
 backend/btrixcloud/crawls.py           | 29 +++++++--
 backend/btrixcloud/profiles.py         | 11 +++-
 backend/test/conftest.py               | 83 +++++++++++++++++++++++++-
 backend/test/test_crawl_config_tags.py | 13 ++--
 backend/test/test_filter_results.py    | 72 ++++++++++++++++++++++
 backend/test/test_permissions.py       | 15 +++--
 7 files changed, 214 insertions(+), 20 deletions(-)
 create mode 100644 backend/test/test_filter_results.py

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index da95f427..c8768c07 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -366,7 +366,10 @@ class CrawlConfigOps:
         return {"success": True}
 
     async def get_crawl_configs(
-        self, archive: Archive, tags: Optional[List[str]] = None
+        self,
+        archive: Archive,
+        userid: Optional[UUID4] = None,
+        tags: Optional[List[str]] = None,
     ):
         """Get all crawl configs for an archive is a member of"""
         match_query = {"aid": archive.id, "inactive": {"$ne": True}}
@@ -374,6 +377,9 @@ class CrawlConfigOps:
         if tags:
             match_query["tags"] = {"$all": tags}
 
+        if userid:
+            match_query["userid"] = userid
+
         # pylint: disable=duplicate-code
         cursor = self.crawl_configs.aggregate(
             [
@@ -599,9 +605,10 @@ def init_crawl_config_api(
     @router.get("", response_model=CrawlConfigsResponse)
     async def get_crawl_configs(
         archive: Archive = Depends(archive_crawl_dep),
+        userid: Optional[UUID4] = None,
         tag: Union[List[str], None] = Query(default=None),
     ):
-        return await ops.get_crawl_configs(archive, tag)
+        return await ops.get_crawl_configs(archive, userid=userid, tags=tag)
 
     @router.get("/tags")
     async def get_crawl_config_tags(archive: Archive = Depends(archive_crawl_dep)):
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index da25423c..f288a5b1 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -173,6 +173,7 @@ class CrawlOps:
         archive: Optional[Archive] = None,
         cid: uuid.UUID = None,
         collid: uuid.UUID = None,
+        userid: uuid.UUID = None,
         crawl_id: str = None,
         exclude_files=True,
         running_only=False,
@@ -191,6 +192,9 @@ class CrawlOps:
         if collid:
             query["colls"] = collid
 
+        if userid:
+            query["userid"] = userid
+
         if running_only:
             query["state"] = {"$in": ["running", "starting", "stopping"]}
 
@@ -573,15 +577,31 @@ def init_crawls_api(
     archive_crawl_dep = archives.archive_crawl_dep
 
     @app.get("/archives/all/crawls", tags=["crawls"], response_model=ListCrawls)
-    async def list_crawls_admin(user: User = Depends(user_dep)):
+    async def list_crawls_admin(
+        user: User = Depends(user_dep),
+        userid: Optional[UUID4] = None,
+        cid: Optional[UUID4] = None,
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return ListCrawls(crawls=await ops.list_crawls(None, running_only=True))
+        return ListCrawls(
+            crawls=await ops.list_crawls(
+                None, userid=userid, cid=cid, running_only=True
+            )
+        )
 
     @app.get("/archives/{aid}/crawls", tags=["crawls"], response_model=ListCrawls)
-    async def list_crawls(archive: Archive = Depends(archive_viewer_dep)):
-        return ListCrawls(crawls=await ops.list_crawls(archive))
+    async def list_crawls(
+        archive: Archive = Depends(archive_viewer_dep),
+        userid: Optional[UUID4] = None,
+        cid: Optional[UUID4] = None,
+    ):
+        return ListCrawls(
+            crawls=await ops.list_crawls(
+                archive, userid=userid, cid=cid, running_only=False
+            )
+        )
 
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/cancel",
@@ -646,7 +666,6 @@ def init_crawls_api(
             raise HTTPException(status_code=403, detail="Not Allowed")
 
         crawls = await ops.list_crawls(crawl_id=crawl_id)
-        print("crawls", crawls)
         if len(crawls) < 1:
             raise HTTPException(status_code=404, detail="crawl_not_found")
 
diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py
index 15e6ed3a..e2187163 100644
--- a/backend/btrixcloud/profiles.py
+++ b/backend/btrixcloud/profiles.py
@@ -253,9 +253,13 @@ class ProfileOps:
 
         return {"success": True}
 
-    async def list_profiles(self, archive: Archive):
+    async def list_profiles(self, archive: Archive, userid: Optional[UUID4] = None):
         """list all profiles"""
-        cursor = self.profiles.find({"aid": archive.id})
+        query = {"aid": archive.id}
+        if userid:
+            query["userid"] = userid
+
+        cursor = self.profiles.find(query)
         results = await cursor.to_list(length=1000)
         return [Profile.from_dict(res) for res in results]
 
@@ -395,8 +399,9 @@ def init_profiles_api(mdb, crawl_manager, archive_ops, user_dep):
     @router.get("", response_model=List[Profile])
     async def list_profiles(
         archive: Archive = Depends(archive_crawl_dep),
+        userid: Optional[UUID4] = None,
     ):
-        return await ops.list_profiles(archive)
+        return await ops.list_profiles(archive, userid)
 
     @router.post("", response_model=Profile)
     async def commit_browser_to_new(
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index ed03e607..84ad5471 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -12,6 +12,12 @@ ADMIN_PW = "PASSW0RD!"
 VIEWER_USERNAME = "viewer@example.com"
 VIEWER_PW = "viewerPASSW0RD!"
 
+CRAWLER_USERNAME = "crawler@example.com"
+CRAWLER_PW = "crawlerPASSWORD!"
+
+_admin_config_id = None
+_crawler_config_id = None
+
 
 @pytest.fixture(scope="session")
 def admin_auth_headers():
@@ -58,6 +64,10 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
         json=crawl_data,
     )
     data = r.json()
+
+    global _admin_config_id
+    _admin_config_id = data["added"]
+
     crawl_id = data["run_now_job"]
     # Wait for it to complete and then return crawl ID
     while True:
@@ -71,6 +81,11 @@ def admin_crawl_id(admin_auth_headers, admin_aid):
         time.sleep(5)
 
 
+@pytest.fixture(scope="session")
+def admin_config_id(admin_crawl_id):
+    return _admin_config_id
+
+
 @pytest.fixture(scope="session")
 def viewer_auth_headers(admin_auth_headers, admin_aid):
     requests.post(
@@ -90,8 +105,74 @@ def viewer_auth_headers(admin_auth_headers, admin_aid):
             "password": VIEWER_PW,
             "grant_type": "password",
         },
-        headers=admin_auth_headers,
     )
     data = r.json()
     access_token = data.get("access_token")
     return {"Authorization": f"Bearer {access_token}"}
+
+
+@pytest.fixture(scope="session")
+def crawler_auth_headers(admin_auth_headers, admin_aid):
+    requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/add-user",
+        json={
+            "email": CRAWLER_USERNAME,
+            "password": CRAWLER_PW,
+            "name": "new-crawler",
+            "role": 20,
+        },
+        headers=admin_auth_headers,
+    )
+    r = requests.post(
+        f"{API_PREFIX}/auth/jwt/login",
+        data={
+            "username": CRAWLER_USERNAME,
+            "password": CRAWLER_PW,
+            "grant_type": "password",
+        },
+    )
+    data = r.json()
+    access_token = data.get("access_token")
+    return {"Authorization": f"Bearer {access_token}"}
+
+
+@pytest.fixture(scope="session")
+def crawler_userid(crawler_auth_headers):
+    r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
+    return r.json()["id"]
+
+
+@pytest.fixture(scope="session")
+def crawler_crawl_id(crawler_auth_headers, admin_aid):
+    # Start crawl.
+    crawl_data = {
+        "runNow": True,
+        "name": "Crawler User Test Crawl",
+        "config": {"seeds": ["https://webrecorder.net/"], "limit": 1},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global _crawler_config_id
+    _crawler_config_id = data["added"]
+
+    crawl_id = data["run_now_job"]
+    # Wait for it to complete and then return crawl ID
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/archives/{admin_aid}/crawls/{crawl_id}/replay.json",
+            headers=crawler_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "complete":
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def crawler_config_id(crawler_crawl_id):
+    return _crawler_config_id
diff --git a/backend/test/test_crawl_config_tags.py b/backend/test/test_crawl_config_tags.py
index 71f2d07f..770b1443 100644
--- a/backend/test/test_crawl_config_tags.py
+++ b/backend/test/test_crawl_config_tags.py
@@ -5,6 +5,7 @@ from .conftest import API_PREFIX
 new_cid_1 = None
 new_cid_2 = None
 
+
 def get_sample_crawl_data(tags):
     return {
         "runNow": False,
@@ -13,11 +14,12 @@ def get_sample_crawl_data(tags):
         "tags": tags,
     }
 
+
 def test_create_new_config_1(admin_auth_headers, admin_aid):
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
         headers=admin_auth_headers,
-        json=get_sample_crawl_data(["tag-1", "tag-2"])
+        json=get_sample_crawl_data(["tag-1", "tag-2"]),
     )
 
     assert r.status_code == 200
@@ -29,6 +31,7 @@ def test_create_new_config_1(admin_auth_headers, admin_aid):
     global new_cid_1
     new_cid_1 = data["added"]
 
+
 def test_get_config_1(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_1}",
@@ -36,6 +39,7 @@ def test_get_config_1(admin_auth_headers, admin_aid):
     )
     assert r.json()["tags"] == ["tag-1", "tag-2"]
 
+
 def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
@@ -43,11 +47,12 @@ def test_get_config_by_tag_1(admin_auth_headers, admin_aid):
     )
     assert r.json() == ["tag-1", "tag-2"]
 
+
 def test_create_new_config_2(admin_auth_headers, admin_aid):
     r = requests.post(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
         headers=admin_auth_headers,
-        json=get_sample_crawl_data(["tag-3", "tag-0"])
+        json=get_sample_crawl_data(["tag-3", "tag-0"]),
     )
 
     assert r.status_code == 200
@@ -59,6 +64,7 @@ def test_create_new_config_2(admin_auth_headers, admin_aid):
     global new_cid_2
     new_cid_2 = data["added"]
 
+
 def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/tags",
@@ -66,11 +72,10 @@ def test_get_config_by_tag_2(admin_auth_headers, admin_aid):
     )
     assert r.json() == ["tag-0", "tag-1", "tag-2", "tag-3"]
 
+
 def test_get_config_2(admin_auth_headers, admin_aid):
     r = requests.get(
         f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/{new_cid_2}",
         headers=admin_auth_headers,
     )
     assert r.json()["tags"] == ["tag-3", "tag-0"]
-
-
diff --git a/backend/test/test_filter_results.py b/backend/test/test_filter_results.py
new file mode 100644
index 00000000..68185414
--- /dev/null
+++ b/backend/test/test_filter_results.py
@@ -0,0 +1,72 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def get_sample_crawl_data():
+    return {
+        "runNow": False,
+        "name": "Test Crawl",
+        "config": {"seeds": ["https://example.com/"]},
+    }
+
+
+def test_create_new_config_crawler_user(crawler_auth_headers, admin_aid):
+    r = requests.post(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=get_sample_crawl_data(),
+    )
+
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["added"]
+    assert data["run_now_job"] == None
+
+
+def test_get_config_by_user(crawler_auth_headers, admin_aid, crawler_userid):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawlconfigs?userid={crawler_userid}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawlConfigs"]) == 1
+
+
+def test_ensure_crawl_and_admin_user_crawls(
+    admin_aid, crawler_auth_headers, crawler_crawl_id, admin_crawl_id
+):
+    assert crawler_crawl_id
+    assert admin_crawl_id
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 2
+
+
+def test_get_crawl_job_by_user(
+    crawler_auth_headers, admin_aid, crawler_userid, crawler_crawl_id
+):
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?userid={crawler_userid}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1
+
+
+def test_get_crawl_job_by_config(
+    crawler_auth_headers, admin_aid, admin_config_id, crawler_config_id
+):
+
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?cid={admin_config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1
+
+    r = requests.get(
+        f"{API_PREFIX}/archives/{admin_aid}/crawls?cid={crawler_config_id}",
+        headers=crawler_auth_headers,
+    )
+    assert len(r.json()["crawls"]) == 1
diff --git a/backend/test/test_permissions.py b/backend/test/test_permissions.py
index 2e4418e3..1b757491 100644
--- a/backend/test/test_permissions.py
+++ b/backend/test/test_permissions.py
@@ -8,9 +8,13 @@ def test_admin_get_archive_crawls(admin_auth_headers, admin_aid, admin_crawl_id)
         f"{API_PREFIX}/archives/{admin_aid}/crawls", headers=admin_auth_headers
     )
     data = r.json()
-    assert len(data["crawls"]) > 0
-    assert data["crawls"][0]["id"] == admin_crawl_id
-    assert data["crawls"][0]["aid"] == admin_aid
+    crawls = data["crawls"]
+    crawl_ids = []
+    assert len(crawls) > 0
+    for crawl in crawls:
+        assert crawl["aid"] == admin_aid
+        crawl_ids.append(crawl["id"])
+    assert admin_crawl_id in crawl_ids
 
 
 def test_viewer_get_archive_crawls(viewer_auth_headers, admin_aid, admin_crawl_id):
@@ -20,9 +24,10 @@ def test_viewer_get_archive_crawls(viewer_auth_headers, admin_aid, admin_crawl_i
     data = r.json()
     crawls = data["crawls"]
     crawl_ids = []
-    for crawl in crawls:
-        crawl_ids.append(crawl["id"])
     assert len(crawls) > 0
+    for crawl in crawls:
+        assert crawl["aid"] == admin_aid
+        crawl_ids.append(crawl["id"])
    assert admin_crawl_id in crawl_ids