feat: implement 'collections' array with {name, id} for archived item details (#1098)

- rename 'collections' -> 'collectionIds', adding migration 0014
- populate the 'collections' array with {name, id} pairs only on the get_crawl() / single archived item path, not in aggregate/list methods
- remove Crawl.get_crawl(), redundant with the BaseCrawl.get_crawl() version
- ensure _files_to_resources returns an empty [] instead of None when there are no files (aligning BaseCrawl.get_crawl() behavior with Crawl.get_crawl())
- tests: update tests to use 'collectionIds' for the id list, add 'collections' {name, id} assertions
- frontend: change the Crawl object to use collectionIds instead of collections

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Anish Lakhwara, 2023-08-25 03:26:46 -04:00, committed by GitHub
parent 989ed2a8da
commit 8b16124675
14 changed files with 127 additions and 74 deletions
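
For reference, a minimal sketch of the single archived item response shape described in the bullets above. The field names ('collectionIds', 'collections', the {name, id} pair) come from the diff and tests below; the id and name values are placeholders only.

    # Illustrative excerpt of a GET .../crawls/{crawl_id}/replay.json response after this change
    single_item = {
        "id": "example-crawl-id",
        "collectionIds": [
            "11111111-2222-3333-4444-555555555555"  # stored list of collection UUIDs (renamed field)
        ],
        "collections": [
            # resolved {name, id} pairs, populated only on the single-item path
            {"name": "My Collection", "id": "11111111-2222-3333-4444-555555555555"}
        ],
    }
    # Aggregate/list endpoints return only "collectionIds"; "collections" is not populated there.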


@@ -52,11 +52,12 @@ class BaseCrawlOps:
# pylint: disable=duplicate-code, too-many-arguments, too-many-locals
def __init__(self, mdb, users, crawl_configs, crawl_manager):
def __init__(self, mdb, users, crawl_configs, crawl_manager, colls):
self.crawls = mdb["crawls"]
self.crawl_configs = crawl_configs
self.crawl_manager = crawl_manager
self.user_manager = users
self.colls = colls
self.presign_duration_seconds = (
int(os.environ.get("PRESIGN_DURATION_MINUTES", 60)) * 60
@@ -85,10 +86,11 @@ class BaseCrawlOps:
return res
async def _files_to_resources(self, files, org, crawlid):
if files:
crawl_files = [CrawlFile(**data) for data in files]
if not files:
return []
return await self._resolve_signed_urls(crawl_files, org, crawlid)
crawl_files = [CrawlFile(**data) for data in files]
return await self._resolve_signed_urls(crawl_files, org, crawlid)
async def get_crawl(
self,
@@ -105,6 +107,11 @@ class BaseCrawlOps:
res.get("files"), org, crawlid
)
if res.get("collectionIds"):
res["collections"] = await self.colls.get_collection_names(
res.get("collectionIds")
)
del res["files"]
del res["errors"]
@@ -300,7 +307,7 @@ class BaseCrawlOps:
"""Add crawls to collection."""
for crawl_id in crawl_ids:
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl_collections = crawl_raw.get("collections")
crawl_collections = crawl_raw.get("collectionIds")
if crawl_collections and crawl_id in crawl_collections:
raise HTTPException(
status_code=400, detail="crawl_already_in_collection"
@@ -308,7 +315,7 @@
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{"$push": {"collections": collection_id}},
{"$push": {"collectionIds": collection_id}},
)
async def remove_from_collection(
@@ -318,17 +325,17 @@
for crawl_id in crawl_ids:
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{"$pull": {"collections": collection_id}},
{"$pull": {"collectionIds": collection_id}},
)
async def remove_collection_from_all_crawls(self, collection_id: uuid.UUID):
"""Remove collection id from all crawls it's currently in."""
await self.crawls.update_many(
{"collections": collection_id},
{"$pull": {"collections": collection_id}},
{"collectionIds": collection_id},
{"$pull": {"collectionIds": collection_id}},
)
# pylint: disable=too-many-branches, invalid-name
# pylint: disable=too-many-branches, invalid-name, too-many-statements
async def list_all_base_crawls(
self,
org: Optional[Organization] = None,
@@ -393,7 +400,7 @@ class BaseCrawlOps:
aggregate.extend([{"$match": {"description": description}}])
if collection_id:
aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
if sort_by:
if sort_by not in ("started", "finished", "fileSize"):
@@ -498,12 +505,12 @@ class BaseCrawlOps:
# ============================================================================
def init_base_crawls_api(
app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep
app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
):
"""base crawls api"""
# pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals
ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager)
ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep


@@ -16,6 +16,7 @@ from .models import (
Collection,
CollIn,
CollOut,
CollIdName,
UpdateColl,
AddRemoveCrawlList,
CrawlOutWithResources,
@@ -33,14 +34,18 @@ class CollectionOps:
# pylint: disable=too-many-arguments
def __init__(self, mdb, crawls, crawl_manager, orgs):
def __init__(self, mdb, crawl_manager, orgs):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_ops = None
self.crawl_ops = crawls
self.crawl_manager = crawl_manager
self.orgs = orgs
def set_crawl_ops(self, ops):
"""set crawl ops"""
self.crawl_ops = ops
async def init_index(self):
"""init lookup index"""
await self.collections.create_index(
@@ -253,6 +258,18 @@ class CollectionOps:
return all_files
async def get_collection_names(self, uuids: List[uuid.UUID]):
"""return object of {_id, names} given list of collection ids"""
cursor = self.collections.find(
{"_id": {"$in": uuids}}, projection=["_id", "name"]
)
names = await cursor.to_list(length=1000)
names = [
CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
]
print("names", names)
return names
async def get_collection_search_values(self, org: Organization):
"""Return list of collection names"""
names = await self.collections.distinct("name", {"oid": org.id})
@@ -292,7 +309,7 @@ async def update_collection_counts_and_tags(
total_size = 0
tags = []
cursor = crawls.find({"collections": collection_id})
cursor = crawls.find({"collectionIds": collection_id})
crawls = await cursor.to_list(length=10_000)
for crawl in crawls:
if crawl["state"] not in SUCCESSFUL_STATES:
@@ -325,8 +342,8 @@ async def update_collection_counts_and_tags(
async def update_crawl_collections(collections, crawls, crawl_id: str):
"""Update counts and tags for all collections in crawl"""
crawl = await crawls.find_one({"_id": crawl_id})
crawl_collections = crawl.get("collections")
for collection_id in crawl_collections:
crawl_coll_ids = crawl.get("collectionIds")
for collection_id in crawl_coll_ids:
await update_collection_counts_and_tags(collections, crawls, collection_id)
@@ -340,18 +357,18 @@ async def add_successful_crawl_to_collections(
if auto_add_collections:
await crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": {"collections": auto_add_collections}},
{"$set": {"collectionIds": auto_add_collections}},
)
await update_crawl_collections(collections, crawls, crawl_id)
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(app, mdb, crawls, orgs, crawl_manager):
def init_collections_api(app, mdb, orgs, crawl_manager):
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments
colls = CollectionOps(mdb, crawls, crawl_manager, orgs)
colls = CollectionOps(mdb, crawl_manager, orgs)
org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep


@@ -22,7 +22,6 @@ from .storages import get_wacz_logs
from .utils import dt_now, parse_jsonl_error_messages
from .basecrawls import BaseCrawlOps
from .models import (
CrawlFile,
UpdateCrawl,
DeleteCrawlList,
CrawlConfig,
@@ -43,14 +42,15 @@ class CrawlOps(BaseCrawlOps):
"""Crawl Ops"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs):
super().__init__(mdb, users, crawl_configs, crawl_manager)
def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs, colls):
super().__init__(mdb, users, crawl_configs, crawl_manager, colls)
self.crawls = self.crawls
self.crawl_configs = crawl_configs
self.user_manager = users
self.orgs = orgs
self.crawl_configs.set_crawl_ops(self)
self.colls.set_crawl_ops(self)
async def init_index(self):
"""init index for crawls db collection"""
@@ -146,7 +146,7 @@ class CrawlOps(BaseCrawlOps):
aggregate.extend([{"$match": {"firstSeed": first_seed}}])
if collection_id:
aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
if sort_by:
if sort_by not in (
@@ -210,25 +210,6 @@ class CrawlOps(BaseCrawlOps):
return crawls, total
# pylint: disable=arguments-differ
async def get_crawl(self, crawlid: str, org: Organization):
"""Get data for single crawl"""
res = await self.get_crawl_raw(crawlid, org)
if res.get("files"):
files = [CrawlFile(**data) for data in res["files"]]
del res["files"]
res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
del res["errors"]
crawl = CrawlOutWithResources.from_dict(res)
return await self._resolve_crawl_refs(crawl, org)
async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_="crawl"
):
@@ -627,11 +608,13 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
def init_crawls_api(
app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
):
"""API for crawl management, including crawl done callback"""
# pylint: disable=invalid-name
ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep
@@ -782,7 +765,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
return await ops.get_crawl(crawl_id, None)
return await ops.get_crawl(crawl_id, None, "crawl")
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/replay.json",
@@ -790,7 +773,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
response_model=CrawlOutWithResources,
)
async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl(crawl_id, org)
return await ops.get_crawl(crawl_id, org, "crawl")
@app.get(
"/orgs/all/crawls/{crawl_id}",
@@ -910,7 +893,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
logLevel: Optional[str] = None,
context: Optional[str] = None,
):
crawl = await ops.get_crawl(crawl_id, org)
crawl = await ops.get_crawl(crawl_id, org, "crawl")
log_levels = []
contexts = []


@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0013"
CURR_DB_VERSION = "0014"
# ============================================================================


@@ -100,6 +100,8 @@ def main():
profiles,
)
coll_ops = init_collections_api(app, mdb, org_ops, crawl_manager)
init_base_crawls_api(
app,
mdb,
@@ -107,6 +109,7 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
@@ -117,6 +120,7 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
@@ -127,11 +131,10 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
coll_ops = init_collections_api(app, mdb, crawls, org_ops, crawl_manager)
crawl_config_ops.set_coll_ops(coll_ops)
# run only in first worker


@@ -0,0 +1,30 @@
"""
Migration 0014 - collections to collectionIDs
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0014"
class Migration(BaseMigration):
"""Migration class."""
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up.
Rename crawl 'collections' field to 'collectionIds'
"""
# pylint: disable=duplicate-code
crawls = self.mdb["crawls"]
try:
await crawls.update_many({}, {"$rename": {"collections": "collectionIds"}})
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error renaming crawl 'collections' to 'collectionIds': {err}",
flush=True,
)


@@ -308,12 +308,20 @@ class BaseCrawl(BaseMongoModel):
errors: Optional[List[str]] = []
collections: Optional[List[UUID4]] = []
collectionIds: Optional[List[UUID4]] = []
fileSize: int = 0
fileCount: int = 0
# ============================================================================
class CollIdName(BaseModel):
"""Collection id and name object"""
id: UUID4
name: str
# ============================================================================
class CrawlOut(BaseMongoModel):
"""Crawl output model, shared across all crawl types"""
@@ -346,7 +354,7 @@ class CrawlOut(BaseMongoModel):
errors: Optional[List[str]]
collections: Optional[List[UUID4]] = []
collectionIds: Optional[List[UUID4]] = []
# automated crawl fields
config: Optional[RawCrawlConfig]
@@ -365,6 +373,7 @@ class CrawlOutWithResources(CrawlOut):
"""Crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
collections: Optional[List[CollIdName]] = []
# ============================================================================


@@ -146,7 +146,7 @@ class UploadOps(BaseCrawlOps):
id=crawl_id,
name=name or "New Upload @ " + str(now),
description=description,
collections=collection_uuids,
collectionIds=collection_uuids,
tags=tags,
userid=user.id,
oid=org.id,
@@ -227,11 +227,13 @@ class UploadFileReader(BufferedReader):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name
def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_dep):
def init_uploads_api(
app, mdb, users, crawl_manager, crawl_configs, orgs, colls, user_dep
):
"""uploads api"""
# ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
ops = UploadOps(mdb, users, crawl_configs, crawl_manager)
ops = UploadOps(mdb, users, crawl_configs, crawl_manager, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep


@@ -44,7 +44,8 @@ def test_create_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]
def test_create_collection_taken_name(
@@ -185,7 +186,7 @@ def test_add_remove_crawl_from_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
# Remove crawls
r = requests.post(
@@ -207,13 +208,13 @@ def test_add_remove_crawl_from_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
# Add crawls back for further tests
r = requests.post(
@@ -350,7 +351,8 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]
def test_download_streaming_collection(crawler_auth_headers, default_org_id):
@@ -432,7 +434,7 @@ def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
def test_filter_sort_collections(
@@ -629,7 +631,7 @@ def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_i
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _second_coll_id not in r.json()["collections"]
assert _second_coll_id not in r.json()["collectionIds"]
# Make a new empty (no crawls) collection and delete it
r = requests.post(


@@ -200,7 +200,7 @@ def test_get_crawls_by_collection_id(
)
assert r.json()["total"] >= 1
for crawl in r.json()["items"]:
assert collection_id in crawl["collections"]
assert collection_id in crawl["collectionIds"]
def test_sort_crawls(


@@ -48,7 +48,7 @@ def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collecti
assert found
assert found["name"] == "My Upload"
assert found["description"] == "Testing\nData"
assert found["collections"] == [uploads_collection_id]
assert found["collectionIds"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["one", "two"]
assert "files" not in found
assert "resources" not in found
@@ -61,7 +61,7 @@ def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collectio
)
assert r.status_code == 200
result = r.json()
assert uploads_collection_id in result["collections"]
assert uploads_collection_id in result["collectionIds"]
assert "files" not in result
upload_dl_path = result["resources"][0]["path"]
assert "test-" in result["resources"][0]["name"]
@@ -124,7 +124,7 @@ def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id)
assert found
assert found["name"] == "test2.wacz"
assert found["collections"] == [uploads_collection_id]
assert found["collectionIds"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["four", "three"]
assert "files" not in res
@@ -170,7 +170,7 @@ def test_get_upload_replay_json(
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert data["collectionIds"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]
@@ -193,7 +193,7 @@ def test_get_upload_replay_json_admin(
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert data["collectionIds"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]


@@ -16,7 +16,7 @@ def test_workflow_crawl_auto_added_to_collection(
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert auto_add_collection_id in r.json()["collections"]
assert auto_add_collection_id in r.json()["collectionIds"]
def test_workflow_crawl_auto_added_subsequent_runs(
@@ -59,7 +59,7 @@ def test_workflow_crawl_auto_added_subsequent_runs(
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert auto_add_collection_id in r.json()["collections"]
assert auto_add_collection_id in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{auto_add_collection_id}",


@@ -835,7 +835,7 @@ ${this.crawl?.description}
.authState=${this.authState!}
.crawlConfig=${{
...this.crawl,
autoAddCollections: this.crawl.collections,
autoAddCollections: this.crawl.collectionIds,
}}
hideTags
></btrix-config-details>


@@ -127,7 +127,7 @@ export type Crawl = CrawlConfig & {
firstSeed: string;
seedCount: number;
stopping: boolean;
collections: string[];
collectionIds: string[];
type?: "crawl" | "upload" | null;
};