diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 73e1cc34..7e3d6bfb 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -52,11 +52,12 @@ class BaseCrawlOps:
     # pylint: disable=duplicate-code, too-many-arguments, too-many-locals
 
-    def __init__(self, mdb, users, crawl_configs, crawl_manager):
+    def __init__(self, mdb, users, crawl_configs, crawl_manager, colls):
         self.crawls = mdb["crawls"]
         self.crawl_configs = crawl_configs
         self.crawl_manager = crawl_manager
         self.user_manager = users
+        self.colls = colls
 
         self.presign_duration_seconds = (
             int(os.environ.get("PRESIGN_DURATION_MINUTES", 60)) * 60
         )
@@ -85,10 +86,11 @@ class BaseCrawlOps:
         return res
 
     async def _files_to_resources(self, files, org, crawlid):
-        if files:
-            crawl_files = [CrawlFile(**data) for data in files]
+        if not files:
+            return []
 
-            return await self._resolve_signed_urls(crawl_files, org, crawlid)
+        crawl_files = [CrawlFile(**data) for data in files]
+        return await self._resolve_signed_urls(crawl_files, org, crawlid)
 
     async def get_crawl(
         self,
@@ -105,6 +107,11 @@ class BaseCrawlOps:
             res.get("files"), org, crawlid
         )
 
+        if res.get("collectionIds"):
+            res["collections"] = await self.colls.get_collection_names(
+                res.get("collectionIds")
+            )
+
         del res["files"]
         del res["errors"]
 
@@ -300,7 +307,7 @@
         """Add crawls to collection."""
         for crawl_id in crawl_ids:
             crawl_raw = await self.get_crawl_raw(crawl_id, org)
-            crawl_collections = crawl_raw.get("collections")
+            crawl_collections = crawl_raw.get("collectionIds")
             if crawl_collections and crawl_id in crawl_collections:
                 raise HTTPException(
                     status_code=400, detail="crawl_already_in_collection"
@@ -308,7 +315,7 @@
 
             await self.crawls.find_one_and_update(
                 {"_id": crawl_id},
-                {"$push": {"collections": collection_id}},
+                {"$push": {"collectionIds": collection_id}},
             )
 
     async def remove_from_collection(
@@ -318,17 +325,17 @@
         for crawl_id in crawl_ids:
             await self.crawls.find_one_and_update(
                 {"_id": crawl_id},
-                {"$pull": {"collections": collection_id}},
+                {"$pull": {"collectionIds": collection_id}},
             )
 
     async def remove_collection_from_all_crawls(self, collection_id: uuid.UUID):
         """Remove collection id from all crawls it's currently in."""
         await self.crawls.update_many(
-            {"collections": collection_id},
-            {"$pull": {"collections": collection_id}},
+            {"collectionIds": collection_id},
+            {"$pull": {"collectionIds": collection_id}},
         )
 
-    # pylint: disable=too-many-branches, invalid-name
+    # pylint: disable=too-many-branches, invalid-name, too-many-statements
     async def list_all_base_crawls(
         self,
         org: Optional[Organization] = None,
@@ -393,7 +400,7 @@
             aggregate.extend([{"$match": {"description": description}}])
 
         if collection_id:
-            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
+            aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
 
         if sort_by:
             if sort_by not in ("started", "finished", "fileSize"):
@@ -498,12 +505,12 @@
 
 # ============================================================================
 def init_base_crawls_api(
-    app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep
+    app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
 ):
     """base crawls api"""
     # pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals
 
-    ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager)
+    ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager, colls)
 
     org_viewer_dep = orgs.org_viewer_dep
     org_crawl_dep = orgs.org_crawl_dep
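Review note (not part of the patch): with the enrichment above, `replay.json` responses keep the raw `collectionIds` list and gain a resolved `collections` list of id/name pairs. A rough sketch of the resulting payload shape, with made-up values:

```python
# Illustrative only -- approximate replay.json shape after this change;
# IDs and names below are invented for the example.
payload = {
    "id": "manual-20230701000000-e4ae9a15",
    "state": "complete",
    "collectionIds": ["f4b9c1d2-8a3e-4f5b-9c6d-7e8f9a0b1c2d"],
    "collections": [
        {"id": "f4b9c1d2-8a3e-4f5b-9c6d-7e8f9a0b1c2d", "name": "My Collection"}
    ],
    "resources": [],  # presigned WACZ URLs, resolved separately
}

# Every resolved collection entry refers back to an id in collectionIds.
assert all(c["id"] in payload["collectionIds"] for c in payload["collections"])
```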
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index 8a5927da..d7087f00 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -16,6 +16,7 @@ from .models import (
     Collection,
     CollIn,
     CollOut,
+    CollIdName,
     UpdateColl,
     AddRemoveCrawlList,
     CrawlOutWithResources,
@@ -33,14 +34,18 @@ class CollectionOps:
 
     # pylint: disable=too-many-arguments
-    def __init__(self, mdb, crawls, crawl_manager, orgs):
+    def __init__(self, mdb, crawl_manager, orgs):
         self.collections = mdb["collections"]
         self.crawls = mdb["crawls"]
+        self.crawl_ops = None
 
-        self.crawl_ops = crawls
         self.crawl_manager = crawl_manager
         self.orgs = orgs
 
+    def set_crawl_ops(self, ops):
+        """set crawl ops"""
+        self.crawl_ops = ops
+
     async def init_index(self):
         """init lookup index"""
         await self.collections.create_index(
@@ -253,6 +258,17 @@ class CollectionOps:
 
         return all_files
 
+    async def get_collection_names(self, uuids: List[uuid.UUID]):
+        """return list of id/name objects for the given collection ids"""
+        cursor = self.collections.find(
+            {"_id": {"$in": uuids}}, projection=["_id", "name"]
+        )
+        names = await cursor.to_list(length=1000)
+        names = [
+            CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
+        ]
+        return names
+
     async def get_collection_search_values(self, org: Organization):
         """Return list of collection names"""
         names = await self.collections.distinct("name", {"oid": org.id})
@@ -292,7 +308,7 @@ async def update_collection_counts_and_tags(
     total_size = 0
     tags = []
 
-    cursor = crawls.find({"collections": collection_id})
+    cursor = crawls.find({"collectionIds": collection_id})
     crawls = await cursor.to_list(length=10_000)
     for crawl in crawls:
         if crawl["state"] not in SUCCESSFUL_STATES:
@@ -325,8 +341,8 @@
 async def update_crawl_collections(collections, crawls, crawl_id: str):
     """Update counts and tags for all collections in crawl"""
     crawl = await crawls.find_one({"_id": crawl_id})
-    crawl_collections = crawl.get("collections")
-    for collection_id in crawl_collections:
+    crawl_coll_ids = crawl.get("collectionIds")
+    for collection_id in crawl_coll_ids:
         await update_collection_counts_and_tags(collections, crawls, collection_id)
 
 
@@ -340,18 +356,18 @@ async def add_successful_crawl_to_collections(
     if auto_add_collections:
         await crawls.find_one_and_update(
             {"_id": crawl_id},
-            {"$set": {"collections": auto_add_collections}},
+            {"$set": {"collectionIds": auto_add_collections}},
         )
         await update_crawl_collections(collections, crawls, crawl_id)
 
 
 # ============================================================================
 # pylint: disable=too-many-locals
-def init_collections_api(app, mdb, crawls, orgs, crawl_manager):
+def init_collections_api(app, mdb, orgs, crawl_manager):
     """init collections api"""
     # pylint: disable=invalid-name, unused-argument, too-many-arguments
 
-    colls = CollectionOps(mdb, crawls, crawl_manager, orgs)
+    colls = CollectionOps(mdb, crawl_manager, orgs)
 
     org_crawl_dep = orgs.org_crawl_dep
     org_viewer_dep = orgs.org_viewer_dep
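Review note (not part of the patch): `get_collection_names` is a plain `$in` lookup with a two-field projection. A standalone sketch of the equivalent query — the connection string and database name are placeholders, and `uuidRepresentation="standard"` is assumed so PyMongo 4+ can encode native UUIDs:

```python
# Minimal sketch of the $in + projection lookup, under the assumptions above.
import asyncio
import uuid
from typing import List

from motor.motor_asyncio import AsyncIOMotorClient


async def lookup_names(coll_ids: List[uuid.UUID]):
    client = AsyncIOMotorClient(
        "mongodb://localhost:27017", uuidRepresentation="standard"
    )
    mdb = client["btrix-example"]
    # $in matches any document whose _id appears in coll_ids; the projection
    # trims each result down to the two fields the API response needs.
    cursor = mdb["collections"].find(
        {"_id": {"$in": coll_ids}}, projection=["_id", "name"]
    )
    return await cursor.to_list(length=1_000)


if __name__ == "__main__":
    print(asyncio.run(lookup_names([uuid.uuid4()])))
```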
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 80db4f3f..d4db643d 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -22,7 +22,6 @@ from .storages import get_wacz_logs
 from .utils import dt_now, parse_jsonl_error_messages
 from .basecrawls import BaseCrawlOps
 from .models import (
-    CrawlFile,
     UpdateCrawl,
     DeleteCrawlList,
     CrawlConfig,
@@ -43,14 +42,15 @@ class CrawlOps(BaseCrawlOps):
     """Crawl Ops"""
 
     # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
-    def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs):
-        super().__init__(mdb, users, crawl_configs, crawl_manager)
+    def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs, colls):
+        super().__init__(mdb, users, crawl_configs, crawl_manager, colls)
         self.crawls = self.crawls
         self.crawl_configs = crawl_configs
         self.user_manager = users
         self.orgs = orgs
 
         self.crawl_configs.set_crawl_ops(self)
+        self.colls.set_crawl_ops(self)
 
     async def init_index(self):
         """init index for crawls db collection"""
@@ -146,7 +146,7 @@ class CrawlOps(BaseCrawlOps):
             aggregate.extend([{"$match": {"firstSeed": first_seed}}])
 
         if collection_id:
-            aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
+            aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
 
         if sort_by:
             if sort_by not in (
@@ -210,25 +210,6 @@ class CrawlOps(BaseCrawlOps):
 
         return crawls, total
 
-    # pylint: disable=arguments-differ
-    async def get_crawl(self, crawlid: str, org: Organization):
-        """Get data for single crawl"""
-
-        res = await self.get_crawl_raw(crawlid, org)
-
-        if res.get("files"):
-            files = [CrawlFile(**data) for data in res["files"]]
-
-            del res["files"]
-
-            res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
-
-        del res["errors"]
-
-        crawl = CrawlOutWithResources.from_dict(res)
-
-        return await self._resolve_crawl_refs(crawl, org)
-
     async def delete_crawls(
         self, org: Organization, delete_list: DeleteCrawlList, type_="crawl"
     ):
@@ -627,11 +608,13 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id):
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, too-many-statements
-def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
+def init_crawls_api(
+    app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
+):
     """API for crawl management, including crawl done callback"""
     # pylint: disable=invalid-name
 
-    ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
+    ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs, colls)
 
     org_viewer_dep = orgs.org_viewer_dep
     org_crawl_dep = orgs.org_crawl_dep
@@ -782,7 +765,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl(crawl_id, None)
+        return await ops.get_crawl(crawl_id, None, "crawl")
 
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/replay.json",
@@ -790,7 +773,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         response_model=CrawlOutWithResources,
     )
     async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl(crawl_id, org)
+        return await ops.get_crawl(crawl_id, org, "crawl")
 
     @app.get(
         "/orgs/all/crawls/{crawl_id}",
@@ -910,7 +893,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         logLevel: Optional[str] = None,
         context: Optional[str] = None,
     ):
-        crawl = await ops.get_crawl(crawl_id, org)
+        crawl = await ops.get_crawl(crawl_id, org, "crawl")
 
         log_levels = []
         contexts = []
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index be63c140..537096ca 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0013"
+CURR_DB_VERSION = "0014"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 9aa09639..ecb6bb23 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -100,6 +100,8 @@ def main():
         profiles,
     )
 
+    coll_ops = init_collections_api(app, mdb, org_ops, crawl_manager)
+
     init_base_crawls_api(
         app,
         mdb,
@@ -107,6 +109,7 @@ def main():
         crawl_manager,
         crawl_config_ops,
         org_ops,
+        coll_ops,
         current_active_user,
     )
 
@@ -117,6 +120,7 @@ def main():
         crawl_manager,
         crawl_config_ops,
         org_ops,
+        coll_ops,
         current_active_user,
     )
 
@@ -127,11 +131,10 @@ def main():
         crawl_manager,
         crawl_config_ops,
         org_ops,
+        coll_ops,
         current_active_user,
     )
 
-    coll_ops = init_collections_api(app, mdb, crawls, org_ops, crawl_manager)
-
     crawl_config_ops.set_coll_ops(coll_ops)
 
     # run only in first worker
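Review note (not part of the patch): the main.py reordering works because the circular dependency between collections and crawls is now broken in two phases — `CollectionOps` is constructed first without crawl ops, and `CrawlOps` injects the back-reference via `set_crawl_ops`. A toy sketch of the pattern, with hypothetical class names:

```python
# Two-phase wiring: construct one side first, inject the back-reference later.
class Colls:
    def __init__(self):
        self.crawl_ops = None  # filled in after construction

    def set_crawl_ops(self, ops):
        self.crawl_ops = ops


class Crawls:
    def __init__(self, colls):
        self.colls = colls
        colls.set_crawl_ops(self)  # close the cycle explicitly


colls = Colls()
crawls = Crawls(colls)
assert colls.crawl_ops is crawls
```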
diff --git a/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py
new file mode 100644
index 00000000..8243136c
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0014_to_collection_ids.py
@@ -0,0 +1,30 @@
+"""
+Migration 0014 - collections to collectionIds
+"""
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0014"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Rename crawl 'collections' field to 'collectionIds'
+        """
+        # pylint: disable=duplicate-code
+        crawls = self.mdb["crawls"]
+        try:
+            await crawls.update_many({}, {"$rename": {"collections": "collectionIds"}})
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error renaming crawl 'collections' to 'collectionIds': {err}",
+                flush=True,
+            )
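Review note (not part of the patch): `$rename` with an empty filter only rewrites documents that actually contain a `collections` field, so re-running migration 0014 is a no-op. A quick synchronous sketch of the same operation with pymongo, using a placeholder connection string and database name:

```python
# Sketch of the migration's core operation under the assumptions above.
# $rename skips documents without a "collections" field, which makes the
# migration idempotent: a second run reports zero modified documents.
from pymongo import MongoClient

crawls = MongoClient("mongodb://localhost:27017")["btrix-example"]["crawls"]
result = crawls.update_many({}, {"$rename": {"collections": "collectionIds"}})
print(f"renamed field on {result.modified_count} document(s)")
```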
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e82294c5..d58708e1 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -308,12 +308,20 @@ class BaseCrawl(BaseMongoModel):
 
     errors: Optional[List[str]] = []
 
-    collections: Optional[List[UUID4]] = []
+    collectionIds: Optional[List[UUID4]] = []
 
     fileSize: int = 0
     fileCount: int = 0
 
 
+# ============================================================================
+class CollIdName(BaseModel):
+    """Collection id and name object"""
+
+    id: UUID4
+    name: str
+
+
 # ============================================================================
 class CrawlOut(BaseMongoModel):
     """Crawl output model, shared across all crawl types"""
@@ -346,7 +354,7 @@ class CrawlOut(BaseMongoModel):
 
     errors: Optional[List[str]]
 
-    collections: Optional[List[UUID4]] = []
+    collectionIds: Optional[List[UUID4]] = []
 
     # automated crawl fields
     config: Optional[RawCrawlConfig]
@@ -365,6 +373,7 @@ class CrawlOutWithResources(CrawlOut):
     """Crawl output model including resources"""
 
     resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index d4313a45..f3ef2f17 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -146,7 +146,7 @@ class UploadOps(BaseCrawlOps):
             id=crawl_id,
             name=name or "New Upload @ " + str(now),
             description=description,
-            collections=collection_uuids,
+            collectionIds=collection_uuids,
             tags=tags,
             userid=user.id,
             oid=org.id,
@@ -227,11 +227,13 @@ class UploadFileReader(BufferedReader):
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name
-def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_dep):
+def init_uploads_api(
+    app, mdb, users, crawl_manager, crawl_configs, orgs, colls, user_dep
+):
     """uploads api"""
     # ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
 
-    ops = UploadOps(mdb, users, crawl_configs, crawl_manager)
+    ops = UploadOps(mdb, users, crawl_configs, crawl_manager, colls)
 
     org_viewer_dep = orgs.org_viewer_dep
     org_crawl_dep = orgs.org_crawl_dep
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index 460414b4..83c2705f 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -44,7 +44,8 @@ def test_create_collection(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _coll_id in r.json()["collections"]
+    assert _coll_id in r.json()["collectionIds"]
+    assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]
 
 
 def test_create_collection_taken_name(
@@ -185,7 +186,7 @@ def test_add_remove_crawl_from_collection(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _coll_id in r.json()["collections"]
+    assert _coll_id in r.json()["collectionIds"]
 
     # Remove crawls
     r = requests.post(
@@ -207,13 +208,13 @@ def test_add_remove_crawl_from_collection(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _coll_id not in r.json()["collections"]
+    assert _coll_id not in r.json()["collectionIds"]
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _coll_id not in r.json()["collections"]
+    assert _coll_id not in r.json()["collectionIds"]
 
     # Add crawls back for further tests
     r = requests.post(
@@ -350,7 +351,8 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _coll_id in r.json()["collections"]
+    assert _coll_id in r.json()["collectionIds"]
+    assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]
 
 
 def test_download_streaming_collection(crawler_auth_headers, default_org_id):
@@ -432,7 +434,7 @@ def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
         headers=crawler_auth_headers,
    )
-    assert _coll_id not in r.json()["collections"]
+    assert _coll_id not in r.json()["collectionIds"]
 
 
 def test_filter_sort_collections(
@@ -629,7 +631,7 @@ def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_i
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
         headers=crawler_auth_headers,
     )
-    assert _second_coll_id not in r.json()["collections"]
+    assert _second_coll_id not in r.json()["collectionIds"]
 
     # Make a new empty (no crawls) collection and delete it
     r = requests.post(
diff --git a/backend/test/test_filter_sort_results.py b/backend/test/test_filter_sort_results.py
index f01b5f3a..c770aced 100644
--- a/backend/test/test_filter_sort_results.py
+++ b/backend/test/test_filter_sort_results.py
@@ -200,7 +200,7 @@ def test_get_crawls_by_collection_id(
     )
     assert r.json()["total"] >= 1
     for crawl in r.json()["items"]:
-        assert collection_id in crawl["collections"]
+        assert collection_id in crawl["collectionIds"]
 
 
 def test_sort_crawls(
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index d3ccf8d9..0200884f 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -48,7 +48,7 @@ def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collecti
     assert found
     assert found["name"] == "My Upload"
     assert found["description"] == "Testing\nData"
-    assert found["collections"] == [uploads_collection_id]
+    assert found["collectionIds"] == [uploads_collection_id]
     assert sorted(found["tags"]) == ["one", "two"]
     assert "files" not in found
     assert "resources" not in found
@@ -61,7 +61,7 @@ def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collectio
     )
     assert r.status_code == 200
     result = r.json()
-    assert uploads_collection_id in result["collections"]
+    assert uploads_collection_id in result["collectionIds"]
     assert "files" not in result
     upload_dl_path = result["resources"][0]["path"]
     assert "test-" in result["resources"][0]["name"]
@@ -124,7 +124,7 @@ def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id)
 
     assert found
     assert found["name"] == "test2.wacz"
-    assert found["collections"] == [uploads_collection_id]
+    assert found["collectionIds"] == [uploads_collection_id]
     assert sorted(found["tags"]) == ["four", "three"]
 
     assert "files" not in res
@@ -170,7 +170,7 @@ def test_get_upload_replay_json(
     assert data
     assert data["id"] == upload_id
     assert data["name"] == "My Upload"
-    assert data["collections"] == [uploads_collection_id]
+    assert data["collectionIds"] == [uploads_collection_id]
     assert sorted(data["tags"]) == ["one", "two"]
     assert data["resources"]
     assert data["resources"][0]["path"]
@@ -193,7 +193,7 @@ def test_get_upload_replay_json_admin(
     assert data
     assert data["id"] == upload_id
     assert data["name"] == "My Upload"
-    assert data["collections"] == [uploads_collection_id]
+    assert data["collectionIds"] == [uploads_collection_id]
     assert sorted(data["tags"]) == ["one", "two"]
     assert data["resources"]
     assert data["resources"][0]["path"]
diff --git a/backend/test/test_workflow_auto_add_to_collection.py b/backend/test/test_workflow_auto_add_to_collection.py
index dc53e26b..bf3a16bc 100644
--- a/backend/test/test_workflow_auto_add_to_collection.py
+++ b/backend/test/test_workflow_auto_add_to_collection.py
@@ -16,7 +16,7 @@ def test_workflow_crawl_auto_added_to_collection(
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert auto_add_collection_id in r.json()["collections"]
+    assert auto_add_collection_id in r.json()["collectionIds"]
 
 
 def test_workflow_crawl_auto_added_subsequent_runs(
@@ -59,7 +59,7 @@ def test_workflow_crawl_auto_added_subsequent_runs(
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
-    assert auto_add_collection_id in r.json()["collections"]
+    assert auto_add_collection_id in r.json()["collectionIds"]
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{auto_add_collection_id}",
diff --git a/frontend/src/pages/org/crawl-detail.ts b/frontend/src/pages/org/crawl-detail.ts
index 7bb56105..01cea9c7 100644
--- a/frontend/src/pages/org/crawl-detail.ts
+++ b/frontend/src/pages/org/crawl-detail.ts
@@ -835,7 +835,7 @@ ${this.crawl?.description}
           .authState=${this.authState!}
           .crawlConfig=${{
             ...this.crawl,
-            autoAddCollections: this.crawl.collections,
+            autoAddCollections: this.crawl.collectionIds,
           }}
           hideTags
         >
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 77e39d87..a1e61419 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -127,7 +127,7 @@
 export type Crawl = CrawlConfig & {
   firstSeed: string;
   seedCount: number;
   stopping: boolean;
-  collections: string[];
+  collectionIds: string[];
   type?: "crawl" | "upload" | null;
 };