Add totalSize to collections and make it sortable in list endpoint (#1001)
* Precompute collection.totalSize and make sortable
* Add migration to recompute collection data with totalSize
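For reference, the new sort option can be exercised from any HTTP client. A minimal sketch in the style of this commit's tests (assuming `API_PREFIX`, `default_org_id`, and `crawler_auth_headers` are set up as the test suite does):

    import requests

    # Fetch collections sorted by their precomputed total size, largest first.
    # API_PREFIX, default_org_id, and crawler_auth_headers are assumed to be
    # defined, as in the tests changed below.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
        headers=crawler_auth_headers,
    )
    r.raise_for_status()
    for coll in r.json()["items"]:
        print(coll["name"], coll["totalSize"])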
parent 75b011f951
commit fcd48b1831
@@ -182,7 +182,7 @@ class CollectionOps:
         aggregate = [{"$match": match_query}]

         if sort_by:
-            if sort_by not in ("modified", "name", "description"):
+            if sort_by not in ("modified", "name", "description", "totalSize"):
                 raise HTTPException(status_code=400, detail="invalid_sort_by")
             if sort_direction not in (1, -1):
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
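This hunk only widens the allowlist of sortable fields; the stage that actually applies the sort sits outside the hunk. A hedged sketch of how the validated values would typically feed the aggregation pipeline built above (illustrative, not verbatim from this commit):

    # After sort_by/sort_direction pass validation, append a $sort stage:
    if sort_by:
        aggregate.extend([{"$sort": {sort_by: sort_direction}}])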
@@ -267,6 +267,7 @@ async def update_collection_counts_and_tags(
     """Set current crawl info in config when crawl begins"""
     crawl_count = 0
     page_count = 0
+    total_size = 0
     tags = []

     cursor = crawls.find({"collections": collection_id})
@@ -275,6 +276,9 @@ async def update_collection_counts_and_tags(
         if crawl["state"] not in SUCCESSFUL_STATES:
             continue
         crawl_count += 1
+        files = crawl.get("files", [])
+        for file in files:
+            total_size += file.get("size", 0)
         if crawl.get("stats"):
             page_count += crawl.get("stats", {}).get("done", 0)
         if crawl.get("tags"):
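The added loop sums the sizes of every successful crawl's files, tolerating missing keys. For comparison, an equivalent, more compact form (a possible alternative, not what the commit uses):

    # Same accumulation as the three added lines above:
    total_size += sum(f.get("size", 0) for f in crawl.get("files", []))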
@@ -288,6 +292,7 @@ async def update_collection_counts_and_tags(
             "$set": {
                 "crawlCount": crawl_count,
                 "pageCount": page_count,
+                "totalSize": total_size,
                 "tags": sorted_tags,
             }
         },
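The `$set` document above is only a fragment; the enclosing update call is outside the hunk. In a motor-based codebase it plausibly reads like this (sketch; the filter shape and the call name are assumptions, not shown in this diff):

    # Hypothetical enclosing call that writes the recomputed counters:
    await colls.find_one_and_update(
        {"_id": collection_id},
        {
            "$set": {
                "crawlCount": crawl_count,
                "pageCount": page_count,
                "totalSize": total_size,
                "tags": sorted_tags,
            }
        },
    )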
@@ -15,7 +15,7 @@ from pymongo.errors import InvalidName
 from .migrations import BaseMigration


-CURR_DB_VERSION = "0009"
+CURR_DB_VERSION = "0010"


 # ============================================================================
@@ -0,0 +1,36 @@
+"""
+Migration 0010 - Precomputing collection total size
+"""
+from btrixcloud.colls import update_collection_counts_and_tags
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0010"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    def __init__(self, mdb, migration_version=MIGRATION_VERSION):
+        super().__init__(mdb, migration_version)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recompute collection data to include totalSize.
+        """
+        # pylint: disable=duplicate-code
+        colls = self.mdb["collections"]
+        crawls = self.mdb["crawls"]
+
+        colls_to_update = [res async for res in colls.find({})]
+        if not colls_to_update:
+            return
+
+        for coll in colls_to_update:
+            coll_id = coll["_id"]
+            try:
+                await update_collection_counts_and_tags(colls, crawls, coll_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(f"Unable to update collection {coll_id}: {err}", flush=True)
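The migration simply replays the shared recompute helper over every collection, so rerunning it is harmless. For illustration, a manual invocation might look like this (sketch: the connection URL and database name are placeholder assumptions; in practice the migration runner applies this module automatically):

    import asyncio
    from motor.motor_asyncio import AsyncIOMotorClient

    async def main():
        # "mongodb://localhost:27017" and "browsertrix" are placeholders
        mdb = AsyncIOMotorClient("mongodb://localhost:27017")["browsertrix"]
        await Migration(mdb).migrate_up()

    asyncio.run(main())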
@@ -463,6 +463,7 @@ class Collection(BaseMongoModel):

     crawlCount: Optional[int] = 0
     pageCount: Optional[int] = 0
+    totalSize: Optional[int] = 0

     # Sorted by count, descending
     tags: Optional[List[str]] = []
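Because the new field is Optional with a default of 0, collection documents written before the migration runs still deserialize cleanly. A self-contained sketch of just that behavior (a reduced stand-in model for illustration, not the real Collection class):

    from typing import Optional
    from pydantic import BaseModel

    class CollectionStats(BaseModel):
        # Mirrors the three counters on Collection; other fields omitted.
        crawlCount: Optional[int] = 0
        pageCount: Optional[int] = 0
        totalSize: Optional[int] = 0

    # A pre-migration document without totalSize still parses, defaulting to 0:
    print(CollectionStats(**{"crawlCount": 2, "pageCount": 10}).totalSize)  # -> 0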
@@ -98,6 +98,7 @@ def test_update_collection(
     assert data["description"] == DESCRIPTION
     assert data["crawlCount"] == 1
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     global modified
     modified = data["modified"]
     assert modified
@@ -172,6 +173,7 @@ def test_add_remove_crawl_from_collection(
     assert data["id"] == _coll_id
     assert data["crawlCount"] == 2
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"] >= modified
     assert data["tags"] == ["wr-test-2", "wr-test-1"]

@@ -193,6 +195,7 @@ def test_add_remove_crawl_from_collection(
     assert data["id"] == _coll_id
     assert data["crawlCount"] == 0
     assert data["pageCount"] == 0
+    assert data["totalSize"] == 0
     assert data["modified"] >= modified
     assert data.get("tags", []) == []

@@ -220,6 +223,7 @@ def test_add_remove_crawl_from_collection(
     assert data["id"] == _coll_id
     assert data["crawlCount"] == 2
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"] >= modified
     assert data["tags"] == ["wr-test-2", "wr-test-1"]

@@ -237,6 +241,7 @@ def test_get_collection(crawler_auth_headers, default_org_id):
     assert data["description"] == DESCRIPTION
     assert data["crawlCount"] == 2
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"] >= modified
     assert data["tags"] == ["wr-test-2", "wr-test-1"]

@@ -256,6 +261,7 @@ def test_get_collection_replay(
     assert data["description"] == DESCRIPTION
     assert data["crawlCount"] == 2
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"] >= modified
     assert data["tags"] == ["wr-test-2", "wr-test-1"]

@@ -292,6 +298,7 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
     assert data["id"] == _coll_id
     assert data["crawlCount"] == 3
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"]
     assert data["tags"] == ["wr-test-2", "wr-test-1"]

@@ -323,6 +330,7 @@ def test_list_collections(
     assert first_coll["description"] == DESCRIPTION
     assert first_coll["crawlCount"] == 3
     assert first_coll["pageCount"] > 0
+    assert first_coll["totalSize"] > 0
     assert first_coll["modified"]
     assert first_coll["tags"] == ["wr-test-2", "wr-test-1"]

@@ -333,6 +341,7 @@ def test_list_collections(
     assert second_coll.get("description") is None
     assert second_coll["crawlCount"] == 1
     assert second_coll["pageCount"] > 0
+    assert second_coll["totalSize"] > 0
     assert second_coll["modified"]
     assert second_coll["tags"] == ["wr-test-2"]

@@ -349,6 +358,7 @@ def test_remove_upload_from_collection(crawler_auth_headers, default_org_id):
     assert data["id"] == _coll_id
     assert data["crawlCount"] == 2
     assert data["pageCount"] > 0
+    assert data["totalSize"] > 0
     assert data["modified"] >= modified
     assert data.get("tags") == ["wr-test-2", "wr-test-1"]

@@ -499,6 +509,46 @@ def test_filter_sort_collections(
     items = data["items"]
     assert items[0]["modified"] >= items[1]["modified"]

+    # Test sorting by size, ascending
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+
+    items = data["items"]
+    assert items[0]["totalSize"] <= items[1]["totalSize"]
+
+    # Test sorting by size, descending
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=totalSize&sortDirection=-1",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] == 2
+
+    items = data["items"]
+    assert items[0]["totalSize"] >= items[1]["totalSize"]
+
+    # Invalid sort value
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=invalid",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_sort_by"
+
+    # Invalid sort_direction value
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections?sortBy=modified&sortDirection=0",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_sort_direction"
+

 def test_delete_collection(crawler_auth_headers, default_org_id, crawler_crawl_id):
     # Delete second collection