Add collections and tags to upload API endpoints (#993)

* Add collections and tags to uploads

* Fix order of deletion check test

* Re-add tags to UploadedCrawl model after rebase

* Fix Users model heading
This commit is contained in:
Tessa Walsh 2023-07-21 10:44:56 -04:00 committed by GitHub
parent 4014d98243
commit 9f32aa697b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 126 additions and 71 deletions

View File

@ -437,6 +437,7 @@ class UploadedCrawl(BaseCrawl):
type: str = Field("upload", const=True) type: str = Field("upload", const=True)
name: str name: str
tags: Optional[List[str]] = []
# ============================================================================ # ============================================================================
@ -777,7 +778,7 @@ class ProfileCreateUpdate(BaseModel):
# ============================================================================ # ============================================================================
### PROFILES ### ### USERS ###
# ============================================================================ # ============================================================================

View File

@ -4,6 +4,7 @@ import uuid
import hashlib import hashlib
import os import os
import base64 import base64
from urllib.parse import unquote
from io import BufferedReader from io import BufferedReader
from typing import Optional, List from typing import Optional, List
@ -46,6 +47,8 @@ class UploadOps(BaseCrawlOps):
filename: str, filename: str,
name: Optional[str], name: Optional[str],
notes: Optional[str], notes: Optional[str],
collections: Optional[List[UUID4]],
tags: Optional[List[str]],
org: Organization, org: Organization,
user: User, user: User,
replaceId: Optional[str], replaceId: Optional[str],
@ -92,7 +95,9 @@ class UploadOps(BaseCrawlOps):
except Exception as exc: except Exception as exc:
print("replace file deletion failed", exc) print("replace file deletion failed", exc)
return await self._create_upload(files, name, notes, id_, org, user) return await self._create_upload(
files, name, notes, collections, tags, id_, org, user
)
# pylint: disable=too-many-arguments, too-many-locals # pylint: disable=too-many-arguments, too-many-locals
async def upload_formdata( async def upload_formdata(
@ -100,6 +105,8 @@ class UploadOps(BaseCrawlOps):
uploads: List[UploadFile], uploads: List[UploadFile],
name: Optional[str], name: Optional[str],
notes: Optional[str], notes: Optional[str],
collections: Optional[List[UUID4]],
tags: Optional[List[str]],
org: Organization, org: Organization,
user: User, user: User,
): ):
@ -117,9 +124,13 @@ class UploadOps(BaseCrawlOps):
) )
files.append(file_reader.file_prep.get_crawl_file()) files.append(file_reader.file_prep.get_crawl_file())
return await self._create_upload(files, name, notes, id_, org, user) return await self._create_upload(
files, name, notes, collections, tags, id_, org, user
)
async def _create_upload(self, files, name, notes, id_, org, user): async def _create_upload(
self, files, name, notes, collections, tags, id_, org, user
):
now = dt_now() now = dt_now()
# ts_now = now.strftime("%Y%m%d%H%M%S") # ts_now = now.strftime("%Y%m%d%H%M%S")
# crawl_id = f"upload-{ts_now}-{str(id_)[:12]}" # crawl_id = f"upload-{ts_now}-{str(id_)[:12]}"
@ -127,10 +138,16 @@ class UploadOps(BaseCrawlOps):
file_size = sum(file_.size for file_ in files) file_size = sum(file_.size for file_ in files)
collection_uuids = []
for coll in collections:
collection_uuids.append(uuid.UUID(coll))
uploaded = UploadedCrawl( uploaded = UploadedCrawl(
id=crawl_id, id=crawl_id,
name=name or "New Upload @ " + str(now), name=name or "New Upload @ " + str(now),
notes=notes, notes=notes,
collections=collection_uuids,
tags=tags,
userid=user.id, userid=user.id,
oid=org.id, oid=org.id,
files=files, files=files,
@ -224,10 +241,24 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
uploads: List[UploadFile] = File(...), uploads: List[UploadFile] = File(...),
name: Optional[str] = "", name: Optional[str] = "",
notes: Optional[str] = "", notes: Optional[str] = "",
collections: Optional[str] = "",
tags: Optional[str] = "",
org: Organization = Depends(org_crawl_dep), org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep), user: User = Depends(user_dep),
): ):
return await ops.upload_formdata(uploads, name, notes, org, user) name = unquote(name)
notes = unquote(notes)
colls_list = []
if collections:
colls_list = unquote(collections).split(",")
tags_list = []
if tags:
tags_list = unquote(tags).split(",")
return await ops.upload_formdata(
uploads, name, notes, colls_list, tags_list, org, user
)
@app.put("/orgs/{oid}/uploads/stream", tags=["uploads"]) @app.put("/orgs/{oid}/uploads/stream", tags=["uploads"])
async def upload_stream( async def upload_stream(
@ -235,12 +266,32 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
filename: str, filename: str,
name: Optional[str] = "", name: Optional[str] = "",
notes: Optional[str] = "", notes: Optional[str] = "",
collections: Optional[str] = "",
tags: Optional[str] = "",
replaceId: Optional[str] = "", replaceId: Optional[str] = "",
org: Organization = Depends(org_crawl_dep), org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep), user: User = Depends(user_dep),
): ):
name = unquote(name)
notes = unquote(notes)
colls_list = []
if collections:
colls_list = unquote(collections).split(",")
tags_list = []
if tags:
tags_list = unquote(tags).split(",")
return await ops.upload_stream( return await ops.upload_stream(
request.stream(), filename, name, notes, org, user, replaceId request.stream(),
filename,
name,
notes,
colls_list,
tags_list,
org,
user,
replaceId,
) )
@app.get("/orgs/{oid}/uploads", tags=["uploads"], response_model=PaginatedResponse) @app.get("/orgs/{oid}/uploads", tags=["uploads"], response_model=PaginatedResponse)

View File

@ -329,3 +329,14 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def auto_add_config_id(auto_add_crawl_id): def auto_add_config_id(auto_add_crawl_id):
return _auto_add_config_id return _auto_add_config_id
@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
    """Create the "Upload test collection" collection and return its id.

    Session-scoped, so the POST happens once and the same id is shared by
    every test that requests this fixture.
    """
    collections_url = f"{API_PREFIX}/orgs/{default_org_id}/collections"
    resp = requests.post(
        collections_url,
        headers=crawler_auth_headers,
        json={"name": "Upload test collection"},
    )
    # Fail fast here so dependent tests do not run against a missing collection.
    assert resp.status_code == 200
    return resp.json()["id"]

View File

@ -13,10 +13,10 @@ upload_dl_path = None
curr_dir = os.path.dirname(os.path.realpath(__file__)) curr_dir = os.path.dirname(os.path.realpath(__file__))
def test_upload_stream(admin_auth_headers, default_org_id): def test_upload_stream(admin_auth_headers, default_org_id, uploads_collection_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
r = requests.put( r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&notes=Testing%0AData", f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&notes=Testing%0AData&collections={uploads_collection_id}&tags=one%2Ctwo",
headers=admin_auth_headers, headers=admin_auth_headers,
data=read_in_chunks(fh), data=read_in_chunks(fh),
) )
@ -28,7 +28,7 @@ def test_upload_stream(admin_auth_headers, default_org_id):
upload_id = r.json()["id"] upload_id = r.json()["id"]
def test_list_stream_upload(admin_auth_headers, default_org_id): def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads", f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers, headers=admin_auth_headers,
@ -46,17 +46,20 @@ def test_list_stream_upload(admin_auth_headers, default_org_id):
assert found assert found
assert found["name"] == "My Upload" assert found["name"] == "My Upload"
assert found["notes"] == "Testing\nData" assert found["notes"] == "Testing\nData"
assert found["collections"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["one", "two"]
assert "files" not in found assert "files" not in found
assert "resources" not in found assert "resources" not in found
def test_get_stream_upload(admin_auth_headers, default_org_id): def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json", f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=admin_auth_headers, headers=admin_auth_headers,
) )
assert r.status_code == 200 assert r.status_code == 200
result = r.json() result = r.json()
assert uploads_collection_id in result["collections"]
assert "files" not in result assert "files" not in result
upload_dl_path = result["resources"][0]["path"] upload_dl_path = result["resources"][0]["path"]
assert "test-" in result["resources"][0]["name"] assert "test-" in result["resources"][0]["name"]
@ -79,7 +82,7 @@ def test_get_stream_upload(admin_auth_headers, default_org_id):
assert r.status_code == 200 assert r.status_code == 200
def test_upload_form(admin_auth_headers, default_org_id): def test_upload_form(admin_auth_headers, default_org_id, uploads_collection_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
data = fh.read() data = fh.read()
@ -90,7 +93,7 @@ def test_upload_form(admin_auth_headers, default_org_id):
] ]
r = requests.put( r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz", f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz&collections={uploads_collection_id}&tags=three%2Cfour",
headers=admin_auth_headers, headers=admin_auth_headers,
files=files, files=files,
) )
@ -102,7 +105,7 @@ def test_upload_form(admin_auth_headers, default_org_id):
upload_id_2 = r.json()["id"] upload_id_2 = r.json()["id"]
def test_list_uploads(admin_auth_headers, default_org_id): def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads", f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers, headers=admin_auth_headers,
@ -119,58 +122,42 @@ def test_list_uploads(admin_auth_headers, default_org_id):
assert found assert found
assert found["name"] == "test2.wacz" assert found["name"] == "test2.wacz"
assert found["collections"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["four", "three"]
assert "files" not in res assert "files" not in res
assert "resources" not in res assert "resources" not in res
def test_collection_uploads(admin_auth_headers, default_org_id): def test_collection_uploads(admin_auth_headers, default_org_id, uploads_collection_id):
# Create collection with one upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=admin_auth_headers,
json={
"crawlIds": [upload_id],
"name": "My Test Coll",
},
)
assert r.status_code == 200
data = r.json()
coll_id = data["id"]
assert data["added"]
# Test uploads filtered by collection # Test uploads filtered by collection
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={coll_id}", f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={uploads_collection_id}",
headers=admin_auth_headers, headers=admin_auth_headers,
) )
results = r.json() results = r.json()
assert len(results["items"]) == 1 assert len(results["items"]) == 2
assert results["items"][0]["id"] == upload_id assert results["items"][0]["id"] in (upload_id, upload_id_2)
assert results["items"][1]["id"] in (upload_id, upload_id_2)
# Test all crawls filtered by collection # Test all crawls filtered by collection
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={coll_id}", f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={uploads_collection_id}",
headers=admin_auth_headers, headers=admin_auth_headers,
) )
results = r.json() results = r.json()
assert len(results["items"]) == 1 assert len(results["items"]) == 2
assert results["items"][0]["id"] == upload_id assert results["items"][0]["id"] in (upload_id, upload_id_2)
assert results["items"][1]["id"] in (upload_id, upload_id_2)
# Delete Collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
def test_get_upload_replay_json(admin_auth_headers, default_org_id): def test_get_upload_replay_json(
admin_auth_headers, default_org_id, uploads_collection_id
):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json", f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=admin_auth_headers, headers=admin_auth_headers,
@ -181,6 +168,8 @@ def test_get_upload_replay_json(admin_auth_headers, default_org_id):
assert data assert data
assert data["id"] == upload_id assert data["id"] == upload_id
assert data["name"] == "My Upload" assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"] assert data["resources"]
assert data["resources"][0]["path"] assert data["resources"][0]["path"]
assert data["resources"][0]["size"] assert data["resources"][0]["size"]
@ -189,7 +178,9 @@ def test_get_upload_replay_json(admin_auth_headers, default_org_id):
assert "files" not in data assert "files" not in data
def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id): def test_get_upload_replay_json_admin(
admin_auth_headers, default_org_id, uploads_collection_id
):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json", f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json",
headers=admin_auth_headers, headers=admin_auth_headers,
@ -200,6 +191,8 @@ def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id):
assert data assert data
assert data["id"] == upload_id assert data["id"] == upload_id
assert data["name"] == "My Upload" assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"] assert data["resources"]
assert data["resources"][0]["path"] assert data["resources"][0]["path"]
assert data["resources"][0]["size"] assert data["resources"][0]["size"]
@ -208,16 +201,20 @@ def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id):
assert "files" not in data assert "files" not in data
def test_replace_upload(admin_auth_headers, default_org_id): def test_replace_upload(admin_auth_headers, default_org_id, uploads_collection_id):
actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id) actual_id = do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
)
assert upload_id == actual_id assert upload_id == actual_id
def do_upload_replace(admin_auth_headers, default_org_id, upload_id): def do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
):
with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh: with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
r = requests.put( r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}", f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}&collections={uploads_collection_id}",
headers=admin_auth_headers, headers=admin_auth_headers,
data=read_in_chunks(fh), data=read_in_chunks(fh),
) )
@ -294,11 +291,27 @@ def test_delete_stream_upload(admin_auth_headers, default_org_id):
assert r.json()["deleted"] == True assert r.json()["deleted"] == True
def test_replace_upload_non_existent(admin_auth_headers, default_org_id): def test_ensure_deleted(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers,
)
results = r.json()
for res in results["items"]:
if res["id"] == upload_id:
assert False
def test_replace_upload_non_existent(
admin_auth_headers, default_org_id, uploads_collection_id
):
global upload_id global upload_id
# same replacement, but now to a non-existent upload # same replacement, but now to a non-existent upload
actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id) actual_id = do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
)
# new upload_id created # new upload_id created
assert actual_id != upload_id assert actual_id != upload_id
@ -306,15 +319,6 @@ def test_replace_upload_non_existent(admin_auth_headers, default_org_id):
upload_id = actual_id upload_id = actual_id
def test_delete_stream_upload_2(admin_auth_headers, default_org_id):
    """POST the module-level ``upload_id`` to the uploads delete endpoint.

    Passes when the JSON response's ``deleted`` field compares equal to True.
    """
    delete_url = f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete"
    payload = {"crawl_ids": [upload_id]}
    resp = requests.post(delete_url, headers=admin_auth_headers, json=payload)
    assert resp.json()["deleted"] == True
def test_verify_from_upload_resource_count(admin_auth_headers, default_org_id): def test_verify_from_upload_resource_count(admin_auth_headers, default_org_id):
r = requests.get( r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json", f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
@ -429,15 +433,3 @@ def test_delete_form_upload_from_all_crawls(admin_auth_headers, default_org_id):
json={"crawl_ids": [upload_id_2]}, json={"crawl_ids": [upload_id_2]},
) )
assert r.json()["deleted"] == True assert r.json()["deleted"] == True
def test_ensure_deleted(admin_auth_headers, default_org_id):
    """Check that the uploads listing contains neither tracked upload id.

    Fetches the org's uploads and fails if any returned item has an id equal
    to the module-level ``upload_id_2`` or ``upload_id``.
    """
    resp = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/uploads",
        headers=admin_auth_headers,
    )
    listing = resp.json()
    removed_ids = (upload_id_2, upload_id)
    # Collect any ids that are still listed; an empty list means the test passes.
    still_listed = [item["id"] for item in listing["items"] if item["id"] in removed_ids]
    assert not still_listed