Add collections and tags to upload API endpoints (#993)

* Add collections and tags to uploads

* Fix order of deletion check test

* Re-add tags to UploadedCrawl model after rebase

* Fix Users model heading
This commit is contained in:
Tessa Walsh 2023-07-21 10:44:56 -04:00 committed by GitHub
parent 4014d98243
commit 9f32aa697b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 126 additions and 71 deletions

View File

@@ -437,6 +437,7 @@ class UploadedCrawl(BaseCrawl):
type: str = Field("upload", const=True)
name: str
tags: Optional[List[str]] = []
# ============================================================================
@@ -777,7 +778,7 @@ class ProfileCreateUpdate(BaseModel):
# ============================================================================
### PROFILES ###
### USERS ###
# ============================================================================

View File

@@ -4,6 +4,7 @@ import uuid
import hashlib
import os
import base64
from urllib.parse import unquote
from io import BufferedReader
from typing import Optional, List
@@ -46,6 +47,8 @@ class UploadOps(BaseCrawlOps):
filename: str,
name: Optional[str],
notes: Optional[str],
collections: Optional[List[UUID4]],
tags: Optional[List[str]],
org: Organization,
user: User,
replaceId: Optional[str],
@@ -92,7 +95,9 @@
except Exception as exc:
print("replace file deletion failed", exc)
return await self._create_upload(files, name, notes, id_, org, user)
return await self._create_upload(
files, name, notes, collections, tags, id_, org, user
)
# pylint: disable=too-many-arguments, too-many-locals
async def upload_formdata(
@@ -100,6 +105,8 @@
uploads: List[UploadFile],
name: Optional[str],
notes: Optional[str],
collections: Optional[List[UUID4]],
tags: Optional[List[str]],
org: Organization,
user: User,
):
@@ -117,9 +124,13 @@
)
files.append(file_reader.file_prep.get_crawl_file())
return await self._create_upload(files, name, notes, id_, org, user)
return await self._create_upload(
files, name, notes, collections, tags, id_, org, user
)
async def _create_upload(self, files, name, notes, id_, org, user):
async def _create_upload(
self, files, name, notes, collections, tags, id_, org, user
):
now = dt_now()
# ts_now = now.strftime("%Y%m%d%H%M%S")
# crawl_id = f"upload-{ts_now}-{str(id_)[:12]}"
@@ -127,10 +138,16 @@
file_size = sum(file_.size for file_ in files)
collection_uuids = []
for coll in collections:
collection_uuids.append(uuid.UUID(coll))
uploaded = UploadedCrawl(
id=crawl_id,
name=name or "New Upload @ " + str(now),
notes=notes,
collections=collection_uuids,
tags=tags,
userid=user.id,
oid=org.id,
files=files,
@@ -224,10 +241,24 @@ def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_d
uploads: List[UploadFile] = File(...),
name: Optional[str] = "",
notes: Optional[str] = "",
collections: Optional[str] = "",
tags: Optional[str] = "",
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
return await ops.upload_formdata(uploads, name, notes, org, user)
name = unquote(name)
notes = unquote(notes)
colls_list = []
if collections:
colls_list = unquote(collections).split(",")
tags_list = []
if tags:
tags_list = unquote(tags).split(",")
return await ops.upload_formdata(
uploads, name, notes, colls_list, tags_list, org, user
)
@app.put("/orgs/{oid}/uploads/stream", tags=["uploads"])
async def upload_stream(
@@ -235,12 +266,32 @@
filename: str,
name: Optional[str] = "",
notes: Optional[str] = "",
collections: Optional[str] = "",
tags: Optional[str] = "",
replaceId: Optional[str] = "",
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
name = unquote(name)
notes = unquote(notes)
colls_list = []
if collections:
colls_list = unquote(collections).split(",")
tags_list = []
if tags:
tags_list = unquote(tags).split(",")
return await ops.upload_stream(
request.stream(), filename, name, notes, org, user, replaceId
request.stream(),
filename,
name,
notes,
colls_list,
tags_list,
org,
user,
replaceId,
)
@app.get("/orgs/{oid}/uploads", tags=["uploads"], response_model=PaginatedResponse)

View File

@@ -329,3 +329,14 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_
@pytest.fixture(scope="session")
def auto_add_config_id(auto_add_crawl_id):
return _auto_add_config_id
@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=crawler_auth_headers,
json={"name": "Upload test collection"},
)
assert r.status_code == 200
return r.json()["id"]

View File

@@ -13,10 +13,10 @@ upload_dl_path = None
curr_dir = os.path.dirname(os.path.realpath(__file__))
def test_upload_stream(admin_auth_headers, default_org_id):
def test_upload_stream(admin_auth_headers, default_org_id, uploads_collection_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&notes=Testing%0AData",
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&notes=Testing%0AData&collections={uploads_collection_id}&tags=one%2Ctwo",
headers=admin_auth_headers,
data=read_in_chunks(fh),
)
@@ -28,7 +28,7 @@ def test_upload_stream(admin_auth_headers, default_org_id):
upload_id = r.json()["id"]
def test_list_stream_upload(admin_auth_headers, default_org_id):
def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers,
@@ -46,17 +46,20 @@ def test_list_stream_upload(admin_auth_headers, default_org_id):
assert found
assert found["name"] == "My Upload"
assert found["notes"] == "Testing\nData"
assert found["collections"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["one", "two"]
assert "files" not in found
assert "resources" not in found
def test_get_stream_upload(admin_auth_headers, default_org_id):
def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
result = r.json()
assert uploads_collection_id in result["collections"]
assert "files" not in result
upload_dl_path = result["resources"][0]["path"]
assert "test-" in result["resources"][0]["name"]
@@ -79,7 +82,7 @@ def test_get_stream_upload(admin_auth_headers, default_org_id):
assert r.status_code == 200
def test_upload_form(admin_auth_headers, default_org_id):
def test_upload_form(admin_auth_headers, default_org_id, uploads_collection_id):
with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh:
data = fh.read()
@@ -90,7 +93,7 @@ def test_upload_form(admin_auth_headers, default_org_id):
]
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz",
f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz&collections={uploads_collection_id}&tags=three%2Cfour",
headers=admin_auth_headers,
files=files,
)
@@ -102,7 +105,7 @@ def test_upload_form(admin_auth_headers, default_org_id):
upload_id_2 = r.json()["id"]
def test_list_uploads(admin_auth_headers, default_org_id):
def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers,
@@ -119,58 +122,42 @@ def test_list_uploads(admin_auth_headers, default_org_id):
assert found
assert found["name"] == "test2.wacz"
assert found["collections"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["four", "three"]
assert "files" not in res
assert "resources" not in res
def test_collection_uploads(admin_auth_headers, default_org_id):
# Create collection with one upload
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/collections",
headers=admin_auth_headers,
json={
"crawlIds": [upload_id],
"name": "My Test Coll",
},
)
assert r.status_code == 200
data = r.json()
coll_id = data["id"]
assert data["added"]
def test_collection_uploads(admin_auth_headers, default_org_id, uploads_collection_id):
# Test uploads filtered by collection
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={coll_id}",
f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={uploads_collection_id}",
headers=admin_auth_headers,
)
results = r.json()
assert len(results["items"]) == 1
assert results["items"][0]["id"] == upload_id
assert len(results["items"]) == 2
assert results["items"][0]["id"] in (upload_id, upload_id_2)
assert results["items"][1]["id"] in (upload_id, upload_id_2)
# Test all crawls filtered by collection
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={coll_id}",
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={uploads_collection_id}",
headers=admin_auth_headers,
)
results = r.json()
assert len(results["items"]) == 1
assert results["items"][0]["id"] == upload_id
# Delete Collection
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{coll_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["success"]
assert len(results["items"]) == 2
assert results["items"][0]["id"] in (upload_id, upload_id_2)
assert results["items"][1]["id"] in (upload_id, upload_id_2)
def test_get_upload_replay_json(admin_auth_headers, default_org_id):
def test_get_upload_replay_json(
admin_auth_headers, default_org_id, uploads_collection_id
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=admin_auth_headers,
@@ -181,6 +168,8 @@ def test_get_upload_replay_json(admin_auth_headers, default_org_id):
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]
assert data["resources"][0]["size"]
@@ -189,7 +178,9 @@
assert "files" not in data
def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id):
def test_get_upload_replay_json_admin(
admin_auth_headers, default_org_id, uploads_collection_id
):
r = requests.get(
f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json",
headers=admin_auth_headers,
@@ -200,6 +191,8 @@ def test_get_upload_replay_json_admin(admin_auth_headers, default_org_id):
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]
assert data["resources"][0]["size"]
@@ -208,16 +201,20 @@
assert "files" not in data
def test_replace_upload(admin_auth_headers, default_org_id):
actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id)
def test_replace_upload(admin_auth_headers, default_org_id, uploads_collection_id):
actual_id = do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
)
assert upload_id == actual_id
def do_upload_replace(admin_auth_headers, default_org_id, upload_id):
def do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
):
with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh:
r = requests.put(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}",
f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}&collections={uploads_collection_id}",
headers=admin_auth_headers,
data=read_in_chunks(fh),
)
@@ -294,11 +291,27 @@ def test_delete_stream_upload(admin_auth_headers, default_org_id):
assert r.json()["deleted"] == True
def test_replace_upload_non_existent(admin_auth_headers, default_org_id):
def test_ensure_deleted(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers,
)
results = r.json()
for res in results["items"]:
if res["id"] == upload_id:
assert False
def test_replace_upload_non_existent(
admin_auth_headers, default_org_id, uploads_collection_id
):
global upload_id
# same replacement, but now to a non-existent upload
actual_id = do_upload_replace(admin_auth_headers, default_org_id, upload_id)
actual_id = do_upload_replace(
admin_auth_headers, default_org_id, upload_id, uploads_collection_id
)
# new upload_id created
assert actual_id != upload_id
@@ -306,15 +319,6 @@ def test_replace_upload_non_existent(admin_auth_headers, default_org_id):
upload_id = actual_id
def test_delete_stream_upload_2(admin_auth_headers, default_org_id):
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete",
headers=admin_auth_headers,
json={"crawl_ids": [upload_id]},
)
assert r.json()["deleted"] == True
def test_verify_from_upload_resource_count(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json",
@@ -429,15 +433,3 @@ def test_delete_form_upload_from_all_crawls(admin_auth_headers, default_org_id):
json={"crawl_ids": [upload_id_2]},
)
assert r.json()["deleted"] == True
def test_ensure_deleted(admin_auth_headers, default_org_id):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads",
headers=admin_auth_headers,
)
results = r.json()
for res in results["items"]:
if res["id"] in (upload_id_2, upload_id):
assert False