remove deleted collections from crawlconfigs (#2615)

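Simplified version of #2608: adds remove_collection_from_all_configs() to CrawlConfigOps, which pulls a deleted collection's id out of each crawl config's autoAddCollections list, and scopes both removal queries (crawls and crawl configs) to the org.
Updates tests to verify that the collection is removed from the crawl config on delete.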
This commit is contained in:
Ilya Kreymer 2025-05-20 18:38:40 -07:00 committed by GitHub
parent 86e35e358d
commit 8a713155ef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 51 additions and 7 deletions

View File

@ -626,11 +626,16 @@ class BaseCrawlOps:
{"$pull": {"collectionIds": collection_id}},
)
async def remove_collection_from_all_crawls(self, collection_id: UUID):
async def remove_collection_from_all_crawls(
self, collection_id: UUID, org: Organization
):
"""Remove collection id from all crawls it's currently in."""
await self.crawls.update_many(
{"collectionIds": collection_id},
{"$pull": {"collectionIds": collection_id}},
await asyncio.gather(
self.crawls.update_many(
{"oid": org.id, "collectionIds": collection_id},
{"$pull": {"collectionIds": collection_id}},
),
self.crawl_configs.remove_collection_from_all_configs(collection_id, org),
)
# pylint: disable=too-many-branches, invalid-name, too-many-statements

View File

@ -621,7 +621,7 @@ class CollectionOps:
async def delete_collection(self, coll_id: UUID, org: Organization):
"""Delete collection and remove from associated crawls."""
await self.crawl_ops.remove_collection_from_all_crawls(coll_id)
await self.crawl_ops.remove_collection_from_all_crawls(coll_id, org)
result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
if result.deleted_count < 1:

View File

@ -924,6 +924,15 @@ class CrawlConfigOps:
return crawl_config.config
async def remove_collection_from_all_configs(
    self, coll_id: UUID, org: Organization
):
    """Pull coll_id out of autoAddCollections on every matching crawl config.

    Only configs whose oid equals org.id and whose autoAddCollections
    currently contains coll_id are matched by the update.
    """
    matching_configs = {"oid": org.id, "autoAddCollections": coll_id}
    drop_collection = {"$pull": {"autoAddCollections": coll_id}}
    await self.crawl_configs.update_many(matching_configs, drop_collection)
async def get_crawl_config_tags(self, org):
"""get distinct tags from all crawl configs for this org"""
tags = await self.crawl_configs.distinct("tags", {"oid": org.id})

View File

@ -94,7 +94,7 @@ def test_create_collection(
assert data["defaultThumbnailName"] == default_thumbnail_name
assert data["allowPublicDownload"]
assert data["topPageHosts"] == [{'count': 3, 'host': 'webrecorder.net'}]
assert data["topPageHosts"] == [{"count": 3, "host": "webrecorder.net"}]
def test_create_public_collection(
@ -313,7 +313,7 @@ def test_add_remove_crawl_from_collection(
assert data["tags"] == ["wr-test-2", "wr-test-1"]
assert data["dateEarliest"]
assert data["dateLatest"]
assert data["topPageHosts"] == [{'count': 7, 'host': 'webrecorder.net'}]
assert data["topPageHosts"] == [{"count": 7, "host": "webrecorder.net"}]
# Verify it was added
r = requests.get(

View File

@ -68,3 +68,33 @@ def test_workflow_crawl_auto_added_subsequent_runs(
assert r.status_code == 200
new_crawl_count = r.json()["crawlCount"]
assert new_crawl_count == crawl_count + 1
def test_workflow_autoadd_collection_removed_on_delete(
    default_org_id, auto_add_config_id, crawler_auth_headers, auto_add_collection_id
):
    """Deleting a collection drops it from the config's autoAddCollections."""
    config_url = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{auto_add_config_id}"
    collection_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{auto_add_collection_id}"
    )

    # Before the delete, the config lists exactly this one collection.
    before = requests.get(config_url, headers=crawler_auth_headers)
    assert before.status_code == 200
    assert before.json()["autoAddCollections"] == [auto_add_collection_id]

    # Delete Collection
    deleted = requests.delete(collection_url, headers=crawler_auth_headers)
    assert deleted.status_code == 200
    assert deleted.json()["success"]

    # After the delete, the same config no longer references the collection.
    after = requests.get(config_url, headers=crawler_auth_headers)
    assert after.status_code == 200
    assert after.json()["autoAddCollections"] == []