Add crawling defaults on the Org to allow setting certain crawl workflow fields as defaults (#2031)

- add POST /orgs/<id>/defaults/crawling API to update all defaults (defaults left unset are cleared; see the usage sketch below)
- defaults returned as 'crawlingDefaults' object on Org, if set
- fixes #2016
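
A minimal usage sketch (not part of this commit; the deployment URL, org id, and auth token below are placeholders):

import requests

API = "https://app.example.com/api"        # placeholder deployment URL
HEADERS = {"Authorization": "Bearer ..."}  # placeholder org-owner token
ORG_ID = "..."                             # placeholder org id

# Set two defaults; any default not included in the body is cleared.
r = requests.post(
    f"{API}/orgs/{ORG_ID}/defaults/crawling",
    headers=HEADERS,
    json={"maxCrawlSize": 200000, "lang": "fr"},
)
assert r.json()["updated"]

# The stored values are returned as 'crawlingDefaults' on the Org.
org = requests.get(f"{API}/orgs/{ORG_ID}", headers=HEADERS).json()
print(org["crawlingDefaults"]["lang"])  # "fr"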

---------

Co-authored-by: Emma Segal-Grossman <hi@emma.cafe>
Ilya Kreymer 2024-08-22 10:36:04 -07:00 committed by GitHub
parent 0e16d526c0
commit 04c8b50423
3 changed files with 68 additions and 0 deletions


@@ -496,6 +496,30 @@ class UpdateCrawlConfig(BaseModel):
    config: Optional[RawCrawlConfig] = None


# ============================================================================
class CrawlConfigDefaults(BaseModel):
    """Crawl Config Org Defaults"""

    crawlTimeout: Optional[int] = None
    maxCrawlSize: Optional[int] = None
    pageLoadTimeout: Optional[int] = None
    postLoadDelay: Optional[int] = None
    behaviorTimeout: Optional[int] = None
    pageExtraDelay: Optional[int] = None
    blockAds: Optional[bool] = None
    profileid: Optional[UUID] = None
    crawlerChannel: Optional[str] = None
    lang: Optional[str] = None
    userAgent: Optional[str] = None
    exclude: Optional[List[str]] = None


# ============================================================================
class CrawlConfigAddedResponse(BaseModel):
    """Response model for adding crawlconfigs"""

@@ -1353,6 +1377,8 @@ class OrgOut(BaseMongoModel):
    subscription: Optional[Subscription] = None

    crawlingDefaults: Optional[CrawlConfigDefaults] = None


# ============================================================================
class Organization(BaseMongoModel):

@@ -1404,6 +1430,8 @@ class Organization(BaseMongoModel):
    subscription: Optional[Subscription] = None

    crawlingDefaults: Optional[CrawlConfigDefaults] = None

    def is_owner(self, user):
        """Check if user is owner"""
        return self._is_auth(user, UserRole.OWNER)
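
Since every field on CrawlConfigDefaults is Optional with a None default, a partial request body validates cleanly and omitted fields serialize as None. A minimal sketch (not part of this commit) using a trimmed stand-in for the model, assuming pydantic v2:

from typing import Optional
from pydantic import BaseModel

class Defaults(BaseModel):
    # Trimmed stand-in for CrawlConfigDefaults, for illustration only
    crawlTimeout: Optional[int] = None
    maxCrawlSize: Optional[int] = None
    lang: Optional[str] = None

d = Defaults(maxCrawlSize=200000, lang="fr")
# Omitted fields stay None and model_dump() includes them, which is why
# storing the dump (as the org update below does) replaces any previously
# saved defaults rather than merging into them.
print(d.model_dump())
# {'crawlTimeout': None, 'maxCrawlSize': 200000, 'lang': 'fr'}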


@@ -55,6 +55,7 @@ from .models import (
    PaginatedOrgOutResponse,
    CrawlConfig,
    Crawl,
    CrawlConfigDefaults,
    UploadedCrawl,
    ConfigRevision,
    Profile,

@@ -586,6 +587,17 @@ class OrgOps:
        )
        return res is not None

    async def update_crawling_defaults(
        self, org: Organization, defaults: CrawlConfigDefaults
    ):
        """Update crawling defaults"""
        res = await self.orgs.find_one_and_update(
            {"_id": org.id},
            {"$set": {"crawlingDefaults": defaults.model_dump()}},
            return_document=ReturnDocument.AFTER,
        )
        return res is not None

    async def add_user_by_invite(
        self,
        invite: InvitePending,

@@ -1535,6 +1547,16 @@ def init_orgs_api(
        return {"updated": True}

    @router.post(
        "/defaults/crawling", tags=["organizations"], response_model=UpdatedResponse
    )
    async def update_crawling_defaults(
        defaults: CrawlConfigDefaults,
        org: Organization = Depends(org_owner_dep),
    ):
        await ops.update_crawling_defaults(org, defaults)
        return {"updated": True}

    @router.post(
        "/recalculate-storage", tags=["organizations"], response_model=SuccessResponse
    )


@@ -56,6 +56,24 @@ def test_get_org_crawler(crawler_auth_headers, default_org_id):
    assert data.get("users") == {}


def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
        headers=admin_auth_headers,
        json={"maxCrawlSize": 200000, "lang": "fr"},
    )
    assert r.status_code == 200
    assert r.json()["updated"] == True

    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
    data = r.json()
    assert data["crawlingDefaults"]
    assert data["crawlingDefaults"]["maxCrawlSize"] == 200000
    assert data["crawlingDefaults"]["lang"] == "fr"


def test_rename_org(admin_auth_headers, default_org_id):
    UPDATED_NAME = "updated org name"
    UPDATED_SLUG = "updated-org-name"
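
The test above covers only the initial set-and-read path. A hypothetical follow-up test (not part of this commit) could exercise the "defaults left unset are cleared" behavior noted in the commit message, since the endpoint stores the full dumped model and so replaces the whole crawlingDefaults object rather than merging:

def test_replace_org_crawling_defaults(admin_auth_headers, default_org_id):
    # Hypothetical: re-post with only one field and verify that the
    # previously set maxCrawlSize no longer carries a value.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
        headers=admin_auth_headers,
        json={"lang": "de"},
    )
    assert r.status_code == 200

    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
    data = r.json()
    assert data["crawlingDefaults"]["lang"] == "de"
    assert data["crawlingDefaults"].get("maxCrawlSize") is None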