From 04c8b504236379f20e93f0c7ab4c2a75fd86d6a8 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 22 Aug 2024 10:36:04 -0700 Subject: [PATCH] add crawling defaults on the Org to allow setting certain crawl workflow fields as defaults: (#2031) - add POST /orgs/{oid}/defaults/crawling API to update all defaults (defaults unset are cleared) - defaults returned as 'crawlingDefaults' object on Org, if set - fixes #2016 --------- Co-authored-by: Emma Segal-Grossman --- backend/btrixcloud/models.py | 28 ++++++++++++++++++++++++++++ backend/btrixcloud/orgs.py | 22 ++++++++++++++++++++++ backend/test/test_org.py | 18 ++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index c99a4667..2dfad3ed 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -496,6 +496,30 @@ class UpdateCrawlConfig(BaseModel): config: Optional[RawCrawlConfig] = None +# ============================================================================ +class CrawlConfigDefaults(BaseModel): + """Crawl Config Org Defaults""" + + crawlTimeout: Optional[int] = None + maxCrawlSize: Optional[int] = None + + pageLoadTimeout: Optional[int] = None + postLoadDelay: Optional[int] = None + behaviorTimeout: Optional[int] = None + pageExtraDelay: Optional[int] = None + + blockAds: Optional[bool] = None + + profileid: Optional[UUID] = None + crawlerChannel: Optional[str] = None + + lang: Optional[str] = None + + userAgent: Optional[str] = None + + exclude: Optional[List[str]] = None + + # ============================================================================ class CrawlConfigAddedResponse(BaseModel): """Response model for adding crawlconfigs""" @@ -1353,6 +1377,8 @@ class OrgOut(BaseMongoModel): subscription: Optional[Subscription] = None + crawlingDefaults: Optional[CrawlConfigDefaults] = None + # ============================================================================ class Organization(BaseMongoModel): @@ -1404,6
+1430,8 @@ class Organization(BaseMongoModel): subscription: Optional[Subscription] = None + crawlingDefaults: Optional[CrawlConfigDefaults] = None + def is_owner(self, user): """Check if user is owner""" return self._is_auth(user, UserRole.OWNER) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index cacb3d15..fec17667 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -55,6 +55,7 @@ from .models import ( PaginatedOrgOutResponse, CrawlConfig, Crawl, + CrawlConfigDefaults, UploadedCrawl, ConfigRevision, Profile, @@ -586,6 +587,17 @@ class OrgOps: ) return res is not None + async def update_crawling_defaults( + self, org: Organization, defaults: CrawlConfigDefaults + ): + """Update crawling defaults""" + res = await self.orgs.find_one_and_update( + {"_id": org.id}, + {"$set": {"crawlingDefaults": defaults.model_dump()}}, + return_document=ReturnDocument.AFTER, + ) + return res is not None + async def add_user_by_invite( self, invite: InvitePending, @@ -1535,6 +1547,16 @@ def init_orgs_api( return {"updated": True} + @router.post( + "/defaults/crawling", tags=["organizations"], response_model=UpdatedResponse + ) + async def update_crawling_defaults( + defaults: CrawlConfigDefaults, + org: Organization = Depends(org_owner_dep), + ): + await ops.update_crawling_defaults(org, defaults) + return {"updated": True} + @router.post( "/recalculate-storage", tags=["organizations"], response_model=SuccessResponse ) diff --git a/backend/test/test_org.py b/backend/test/test_org.py index 20823132..34bda297 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -56,6 +56,24 @@ def test_get_org_crawler(crawler_auth_headers, default_org_id): assert data.get("users") == {} +def test_update_org_crawling_defaults(admin_auth_headers, default_org_id): + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling", + headers=admin_auth_headers, + json={"maxCrawlSize": 200000, "lang": "fr"}, + ) + + assert 
r.status_code == 200 + assert r.json()["updated"] == True + + r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers) + + data = r.json() + assert data["crawlingDefaults"] + assert data["crawlingDefaults"]["maxCrawlSize"] == 200000 + assert data["crawlingDefaults"]["lang"] == "fr" + + def test_rename_org(admin_auth_headers, default_org_id): UPDATED_NAME = "updated org name" UPDATED_SLUG = "updated-org-name"