feat: implement 'collections' array with {name, id} for archived item details (#1098)

- rename 'collections' -> 'collectionIds', adding migration 0014
- populate the 'collections' array with {name, id} pairs only on the get_crawl() / single archived item path, not in aggregate/list methods
- remove Crawl.get_crawl(), redundant with the BaseCrawl.get_crawl() version
- ensure _files_to_resources returns an empty [] instead of None when there are no files (aligning BaseCrawl.get_crawl() behavior with Crawl.get_crawl())
- tests: update tests to use 'collectionIds' for the id list, add 'collections' {name, id} assertions
- frontend: change the Crawl object to use collectionIds instead of collections

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Anish Lakhwara, 2023-08-25 03:26:46 -04:00, committed by GitHub
parent 989ed2a8da
commit 8b16124675
14 changed files with 127 additions and 74 deletions
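
For reference, a minimal sketch of the single archived item response shape described in the bullets above. The field names ('collectionIds', 'collections', the {name, id} pair) come from the diff and tests below; the id and name values are placeholders only.

    # Illustrative excerpt of a GET .../crawls/{crawl_id}/replay.json response after this change
    single_item = {
        "id": "example-crawl-id",
        "collectionIds": [
            "11111111-2222-3333-4444-555555555555"  # stored list of collection UUIDs (renamed field)
        ],
        "collections": [
            # resolved {name, id} pairs, populated only on the single-item path
            {"name": "My Collection", "id": "11111111-2222-3333-4444-555555555555"}
        ],
    }
    # Aggregate/list endpoints return only "collectionIds"; "collections" is not populated there.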


@@ -52,11 +52,12 @@ class BaseCrawlOps:
# pylint: disable=duplicate-code, too-many-arguments, too-many-locals
def __init__(self, mdb, users, crawl_configs, crawl_manager):
def __init__(self, mdb, users, crawl_configs, crawl_manager, colls):
self.crawls = mdb["crawls"]
self.crawl_configs = crawl_configs
self.crawl_manager = crawl_manager
self.user_manager = users
self.colls = colls
self.presign_duration_seconds = (
int(os.environ.get("PRESIGN_DURATION_MINUTES", 60)) * 60
@@ -85,10 +86,11 @@ class BaseCrawlOps:
return res
async def _files_to_resources(self, files, org, crawlid):
if files:
crawl_files = [CrawlFile(**data) for data in files]
if not files:
return []
return await self._resolve_signed_urls(crawl_files, org, crawlid)
crawl_files = [CrawlFile(**data) for data in files]
return await self._resolve_signed_urls(crawl_files, org, crawlid)
async def get_crawl(
self,
@@ -105,6 +107,11 @@ class BaseCrawlOps:
res.get("files"), org, crawlid
)
if res.get("collectionIds"):
res["collections"] = await self.colls.get_collection_names(
res.get("collectionIds")
)
del res["files"]
del res["errors"]
@@ -300,7 +307,7 @@ class BaseCrawlOps:
"""Add crawls to collection."""
for crawl_id in crawl_ids:
crawl_raw = await self.get_crawl_raw(crawl_id, org)
crawl_collections = crawl_raw.get("collections")
crawl_collections = crawl_raw.get("collectionIds")
if crawl_collections and crawl_id in crawl_collections:
raise HTTPException(
status_code=400, detail="crawl_already_in_collection"
@@ -308,7 +315,7 @@
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{"$push": {"collections": collection_id}},
{"$push": {"collectionIds": collection_id}},
)
async def remove_from_collection(
@@ -318,17 +325,17 @@
for crawl_id in crawl_ids:
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{"$pull": {"collections": collection_id}},
{"$pull": {"collectionIds": collection_id}},
)
async def remove_collection_from_all_crawls(self, collection_id: uuid.UUID):
"""Remove collection id from all crawls it's currently in."""
await self.crawls.update_many(
{"collections": collection_id},
{"$pull": {"collections": collection_id}},
{"collectionIds": collection_id},
{"$pull": {"collectionIds": collection_id}},
)
# pylint: disable=too-many-branches, invalid-name
# pylint: disable=too-many-branches, invalid-name, too-many-statements
async def list_all_base_crawls(
self,
org: Optional[Organization] = None,
@@ -393,7 +400,7 @@ class BaseCrawlOps:
aggregate.extend([{"$match": {"description": description}}])
if collection_id:
aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
if sort_by:
if sort_by not in ("started", "finished", "fileSize"):
@@ -498,12 +505,12 @@ class BaseCrawlOps:
# ============================================================================
def init_base_crawls_api(
app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep
app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
):
"""base crawls api"""
# pylint: disable=invalid-name, duplicate-code, too-many-arguments, too-many-locals
ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager)
ops = BaseCrawlOps(mdb, users, crawl_config_ops, crawl_manager, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep


@@ -16,6 +16,7 @@ from .models import (
Collection,
CollIn,
CollOut,
CollIdName,
UpdateColl,
AddRemoveCrawlList,
CrawlOutWithResources,
@@ -33,14 +34,18 @@ class CollectionOps:
# pylint: disable=too-many-arguments
def __init__(self, mdb, crawls, crawl_manager, orgs):
def __init__(self, mdb, crawl_manager, orgs):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_ops = None
self.crawl_ops = crawls
self.crawl_manager = crawl_manager
self.orgs = orgs
def set_crawl_ops(self, ops):
"""set crawl ops"""
self.crawl_ops = ops
async def init_index(self):
"""init lookup index"""
await self.collections.create_index(
@@ -253,6 +258,18 @@ class CollectionOps:
return all_files
async def get_collection_names(self, uuids: List[uuid.UUID]):
"""return object of {_id, names} given list of collection ids"""
cursor = self.collections.find(
{"_id": {"$in": uuids}}, projection=["_id", "name"]
)
names = await cursor.to_list(length=1000)
names = [
CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
]
print("names", names)
return names
async def get_collection_search_values(self, org: Organization):
"""Return list of collection names"""
names = await self.collections.distinct("name", {"oid": org.id})
@@ -292,7 +309,7 @@ async def update_collection_counts_and_tags(
total_size = 0
tags = []
cursor = crawls.find({"collections": collection_id})
cursor = crawls.find({"collectionIds": collection_id})
crawls = await cursor.to_list(length=10_000)
for crawl in crawls:
if crawl["state"] not in SUCCESSFUL_STATES:
@@ -325,8 +342,8 @@ async def update_collection_counts_and_tags(
async def update_crawl_collections(collections, crawls, crawl_id: str):
"""Update counts and tags for all collections in crawl"""
crawl = await crawls.find_one({"_id": crawl_id})
crawl_collections = crawl.get("collections")
for collection_id in crawl_collections:
crawl_coll_ids = crawl.get("collectionIds")
for collection_id in crawl_coll_ids:
await update_collection_counts_and_tags(collections, crawls, collection_id)
@@ -340,18 +357,18 @@ async def add_successful_crawl_to_collections(
if auto_add_collections:
await crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": {"collections": auto_add_collections}},
{"$set": {"collectionIds": auto_add_collections}},
)
await update_crawl_collections(collections, crawls, crawl_id)
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(app, mdb, crawls, orgs, crawl_manager):
def init_collections_api(app, mdb, orgs, crawl_manager):
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments
colls = CollectionOps(mdb, crawls, crawl_manager, orgs)
colls = CollectionOps(mdb, crawl_manager, orgs)
org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep


@@ -22,7 +22,6 @@ from .storages import get_wacz_logs
from .utils import dt_now, parse_jsonl_error_messages
from .basecrawls import BaseCrawlOps
from .models import (
CrawlFile,
UpdateCrawl,
DeleteCrawlList,
CrawlConfig,
@@ -43,14 +42,15 @@ class CrawlOps(BaseCrawlOps):
"""Crawl Ops"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs):
super().__init__(mdb, users, crawl_configs, crawl_manager)
def __init__(self, mdb, users, crawl_manager, crawl_configs, orgs, colls):
super().__init__(mdb, users, crawl_configs, crawl_manager, colls)
self.crawls = self.crawls
self.crawl_configs = crawl_configs
self.user_manager = users
self.orgs = orgs
self.crawl_configs.set_crawl_ops(self)
self.colls.set_crawl_ops(self)
async def init_index(self):
"""init index for crawls db collection"""
@@ -146,7 +146,7 @@ class CrawlOps(BaseCrawlOps):
aggregate.extend([{"$match": {"firstSeed": first_seed}}])
if collection_id:
aggregate.extend([{"$match": {"collections": {"$in": [collection_id]}}}])
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
if sort_by:
if sort_by not in (
@@ -210,25 +210,6 @@ class CrawlOps(BaseCrawlOps):
return crawls, total
# pylint: disable=arguments-differ
async def get_crawl(self, crawlid: str, org: Organization):
"""Get data for single crawl"""
res = await self.get_crawl_raw(crawlid, org)
if res.get("files"):
files = [CrawlFile(**data) for data in res["files"]]
del res["files"]
res["resources"] = await self._resolve_signed_urls(files, org, crawlid)
del res["errors"]
crawl = CrawlOutWithResources.from_dict(res)
return await self._resolve_crawl_refs(crawl, org)
async def delete_crawls(
self, org: Organization, delete_list: DeleteCrawlList, type_="crawl"
):
@@ -627,11 +608,13 @@ async def recompute_crawl_file_count_and_size(crawls, crawl_id):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user_dep):
def init_crawls_api(
app, mdb, users, crawl_manager, crawl_config_ops, orgs, colls, user_dep
):
"""API for crawl management, including crawl done callback"""
# pylint: disable=invalid-name
ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep
@@ -782,7 +765,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
return await ops.get_crawl(crawl_id, None)
return await ops.get_crawl(crawl_id, None, "crawl")
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/replay.json",
@@ -790,7 +773,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
response_model=CrawlOutWithResources,
)
async def get_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl(crawl_id, org)
return await ops.get_crawl(crawl_id, org, "crawl")
@app.get(
"/orgs/all/crawls/{crawl_id}",
@@ -910,7 +893,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
logLevel: Optional[str] = None,
context: Optional[str] = None,
):
crawl = await ops.get_crawl(crawl_id, org)
crawl = await ops.get_crawl(crawl_id, org, "crawl")
log_levels = []
contexts = []


@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
from .migrations import BaseMigration
CURR_DB_VERSION = "0013"
CURR_DB_VERSION = "0014"
# ============================================================================


@@ -100,6 +100,8 @@ def main():
profiles,
)
coll_ops = init_collections_api(app, mdb, org_ops, crawl_manager)
init_base_crawls_api(
app,
mdb,
@@ -107,6 +109,7 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
@@ -117,6 +120,7 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
@@ -127,11 +131,10 @@ def main():
crawl_manager,
crawl_config_ops,
org_ops,
coll_ops,
current_active_user,
)
coll_ops = init_collections_api(app, mdb, crawls, org_ops, crawl_manager)
crawl_config_ops.set_coll_ops(coll_ops)
# run only in first worker


@@ -0,0 +1,30 @@
"""
Migration 0014 - collections to collectionIDs
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0014"
class Migration(BaseMigration):
"""Migration class."""
def __init__(self, mdb, migration_version=MIGRATION_VERSION):
super().__init__(mdb, migration_version)
async def migrate_up(self):
"""Perform migration up.
Rename crawl 'collections' field to 'collectionIds'
"""
# pylint: disable=duplicate-code
crawls = self.mdb["crawls"]
try:
await crawls.update_many({}, {"$rename": {"collections": "collectionIds"}})
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Error renaming crawl 'collections' to 'collectionIds': {err}",
flush=True,
)


@@ -308,12 +308,20 @@ class BaseCrawl(BaseMongoModel):
errors: Optional[List[str]] = []
collections: Optional[List[UUID4]] = []
collectionIds: Optional[List[UUID4]] = []
fileSize: int = 0
fileCount: int = 0
# ============================================================================
class CollIdName(BaseModel):
"""Collection id and name object"""
id: UUID4
name: str
# ============================================================================
class CrawlOut(BaseMongoModel):
"""Crawl output model, shared across all crawl types"""
@@ -346,7 +354,7 @@ class CrawlOut(BaseMongoModel):
errors: Optional[List[str]]
collections: Optional[List[UUID4]] = []
collectionIds: Optional[List[UUID4]] = []
# automated crawl fields
config: Optional[RawCrawlConfig]
@@ -365,6 +373,7 @@ class CrawlOutWithResources(CrawlOut):
"""Crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
collections: Optional[List[CollIdName]] = []
# ============================================================================


@@ -146,7 +146,7 @@ class UploadOps(BaseCrawlOps):
id=crawl_id,
name=name or "New Upload @ " + str(now),
description=description,
collections=collection_uuids,
collectionIds=collection_uuids,
tags=tags,
userid=user.id,
oid=org.id,
@@ -227,11 +227,13 @@ class UploadFileReader(BufferedReader):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name
def init_uploads_api(app, mdb, users, crawl_manager, crawl_configs, orgs, user_dep):
def init_uploads_api(
app, mdb, users, crawl_manager, crawl_configs, orgs, colls, user_dep
):
"""uploads api"""
# ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, orgs)
ops = UploadOps(mdb, users, crawl_configs, crawl_manager)
ops = UploadOps(mdb, users, crawl_configs, crawl_manager, colls)
org_viewer_dep = orgs.org_viewer_dep
org_crawl_dep = orgs.org_crawl_dep


@@ -44,7 +44,8 @@ def test_create_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": COLLECTION_NAME, "id": _coll_id}]
def test_create_collection_taken_name(
@@ -185,7 +186,7 @@ def test_add_remove_crawl_from_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
# Remove crawls
r = requests.post(
@@ -207,13 +208,13 @@ def test_add_remove_crawl_from_collection(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
# Add crawls back for further tests
r = requests.post(
@@ -350,7 +351,8 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id in r.json()["collections"]
assert _coll_id in r.json()["collectionIds"]
assert r.json()["collections"] == [{"name": UPDATED_NAME, "id": _coll_id}]
def test_download_streaming_collection(crawler_auth_headers, default_org_id):
@@ -432,7 +434,7 @@ def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json",
headers=crawler_auth_headers,
)
assert _coll_id not in r.json()["collections"]
assert _coll_id not in r.json()["collectionIds"]
def test_filter_sort_collections(
@@ -629,7 +631,7 @@ def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_i
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
headers=crawler_auth_headers,
)
assert _second_coll_id not in r.json()["collections"]
assert _second_coll_id not in r.json()["collectionIds"]
# Make a new empty (no crawls) collection and delete it
r = requests.post(


@@ -200,7 +200,7 @@ def test_get_crawls_by_collection_id(
)
assert r.json()["total"] >= 1
for crawl in r.json()["items"]:
assert collection_id in crawl["collections"]
assert collection_id in crawl["collectionIds"]
def test_sort_crawls(


@@ -48,7 +48,7 @@ def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collecti
assert found
assert found["name"] == "My Upload"
assert found["description"] == "Testing\nData"
assert found["collections"] == [uploads_collection_id]
assert found["collectionIds"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["one", "two"]
assert "files" not in found
assert "resources" not in found
@@ -61,7 +61,7 @@ def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collectio
)
assert r.status_code == 200
result = r.json()
assert uploads_collection_id in result["collections"]
assert uploads_collection_id in result["collectionIds"]
assert "files" not in result
upload_dl_path = result["resources"][0]["path"]
assert "test-" in result["resources"][0]["name"]
@@ -124,7 +124,7 @@ def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id)
assert found
assert found["name"] == "test2.wacz"
assert found["collections"] == [uploads_collection_id]
assert found["collectionIds"] == [uploads_collection_id]
assert sorted(found["tags"]) == ["four", "three"]
assert "files" not in res
@@ -170,7 +170,7 @@ def test_get_upload_replay_json(
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert data["collectionIds"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]
@@ -193,7 +193,7 @@ def test_get_upload_replay_json_admin(
assert data
assert data["id"] == upload_id
assert data["name"] == "My Upload"
assert data["collections"] == [uploads_collection_id]
assert data["collectionIds"] == [uploads_collection_id]
assert sorted(data["tags"]) == ["one", "two"]
assert data["resources"]
assert data["resources"][0]["path"]


@@ -16,7 +16,7 @@ def test_workflow_crawl_auto_added_to_collection(
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert auto_add_collection_id in r.json()["collections"]
assert auto_add_collection_id in r.json()["collectionIds"]
def test_workflow_crawl_auto_added_subsequent_runs(
@@ -59,7 +59,7 @@ def test_workflow_crawl_auto_added_subsequent_runs(
headers=crawler_auth_headers,
)
assert r.status_code == 200
assert auto_add_collection_id in r.json()["collections"]
assert auto_add_collection_id in r.json()["collectionIds"]
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/collections/{auto_add_collection_id}",


@@ -835,7 +835,7 @@ ${this.crawl?.description}
.authState=${this.authState!}
.crawlConfig=${{
...this.crawl,
autoAddCollections: this.crawl.collections,
autoAddCollections: this.crawl.collectionIds,
}}
hideTags
></btrix-config-details>


@@ -127,7 +127,7 @@ export type Crawl = CrawlConfig & {
firstSeed: string;
seedCount: number;
stopping: boolean;
collections: string[];
collectionIds: string[];
type?: "crawl" | "upload" | null;
};