Add and enforce readOnly field in Organization (#1886)
Fixes https://github.com/webrecorder/browsertrix/issues/1883
Backend work for https://github.com/webrecorder/browsertrix/issues/1876

- If readOnly is set to true, disallow crawls and QA analysis runs
- If readOnly is set to true, skip scheduled crawls
- Add endpoint to set `readOnly` with optional `readOnlyReason` (which is automatically reset to an empty string when `readOnly` is set to false), which can be displayed in a banner
- Operator: ensures cronjobs that are skipped due to internal logic (e.g. read-only mode) simply succeed right away and do not leave a k8s Job dangling

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
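As a rough client-side sketch of the behavior described above (base URL, org id, and token are placeholders; the endpoint path and payloads follow the diff and tests below):

    import requests

    API_PREFIX = "https://app.example.com/api"  # placeholder base URL
    org_id = "<org-uuid>"                       # placeholder org id
    headers = {"Authorization": "Bearer <superuser-token>"}  # superuser required

    # Enable read-only mode with a reason the frontend can show in a banner
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/read-only",
        headers=headers,
        json={"readOnly": True, "readOnlyReason": "Payment suspended"},
    )
    assert r.json()["updated"]

    # Disable read-only mode; readOnlyReason is automatically reset to ""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/read-only",
        headers=headers,
        json={"readOnly": False},
    )
    assert r.json()["updated"]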
parent 48dfa485e5
commit 9140dd75bc
backend/btrixcloud/crawlconfigs.py

@@ -219,6 +219,10 @@ class CrawlConfigOps:
         storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
         exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
 
+        if org.readOnly:
+            run_now = False
+            print(f"Org {org.id} set to read-only", flush=True)
+
         if storage_quota_reached:
             run_now = False
             print(f"Storage quota exceeded for org {org.id}", flush=True)
@@ -843,6 +847,9 @@ class CrawlConfigOps:
         except:
             await self.readd_configmap(crawlconfig, org)
 
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         if await self.org_ops.storage_quota_reached(org.id):
             raise HTTPException(status_code=403, detail="storage_quota_reached")
 
backend/btrixcloud/crawls.py

@@ -738,6 +738,10 @@ class CrawlOps(BaseCrawlOps):
 
         crawl = await self.get_crawl(crawl_id, org)
 
+        # ensure org execution is allowed
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         # can only QA finished crawls
         if not crawl.finished:
             raise HTTPException(status_code=400, detail="crawl_not_finished")
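From a client's perspective, the two guards above are distinguishable by status code and detail string; a minimal sketch (the QA-start route itself is not part of this diff, so the URL here is hypothetical):

    import requests

    # Hypothetical endpoint URL and token; the actual QA route is defined elsewhere.
    qa_start_url = "https://app.example.com/api/orgs/<org-id>/crawls/<crawl-id>/qa/start"
    headers = {"Authorization": "Bearer <token>"}

    r = requests.post(qa_start_url, headers=headers)
    if r.status_code == 403 and r.json().get("detail") == "org_set_to_read_only":
        print("org is read-only: QA analysis runs are disallowed")
    elif r.status_code == 400 and r.json().get("detail") == "crawl_not_finished":
        print("QA analysis can only run on finished crawls")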
backend/btrixcloud/models.py

@@ -971,6 +971,14 @@ class OrgQuotaUpdate(BaseModel):
     update: OrgQuotas
 
 
+# ============================================================================
+class OrgReadOnlyUpdate(BaseModel):
+    """Organization readonly update"""
+
+    readOnly: bool
+    readOnlyReason: Optional[str] = None
+
+
 # ============================================================================
 class OrgWebhookUrls(BaseModel):
     """Organization webhook URLs"""
@@ -1025,6 +1033,9 @@ class OrgOut(BaseMongoModel):
 
     webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
 
+    readOnly: Optional[bool]
+    readOnlyReason: Optional[str]
+
 
 # ============================================================================
 class Organization(BaseMongoModel):
@@ -1069,6 +1080,9 @@ class Organization(BaseMongoModel):
 
     origin: Optional[AnyHttpUrl] = None
 
+    readOnly: Optional[bool] = False
+    readOnlyReason: Optional[str] = None
+
     def is_owner(self, user):
         """Check if user is owner"""
         return self._is_auth(user, UserRole.OWNER)
backend/btrixcloud/operator/cronjobs.py

@@ -1,9 +1,10 @@
 """ Operator handler for crawl CronJobs """
 
 from uuid import UUID
+from typing import Optional
 import yaml
 
-from btrixcloud.utils import to_k8s_date
+from btrixcloud.utils import to_k8s_date, dt_now
 from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
 from .baseoperator import BaseOperator
@@ -38,6 +39,30 @@ class CronJobOperator(BaseOperator):
             ]
         }
 
+    def get_finished_response(
+        self, metadata: dict[str, str], set_status=True, finished: Optional[str] = None
+    ):
+        """get final response to indicate cronjob created job is finished"""
+
+        if not finished:
+            finished = to_k8s_date(dt_now())
+
+        status = None
+        # set status on decorated job to indicate that its finished
+        if set_status:
+            status = {
+                "succeeded": 1,
+                "startTime": metadata.get("creationTimestamp"),
+                "completionTime": finished,
+            }
+
+        return {
+            "attachments": [],
+            # set on job to match default behavior when job finishes
+            "annotations": {"finished": finished},
+            "status": status,
+        }
+
     async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
         """create crawljobs from a job object spawned by cronjob"""
 
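For illustration (not part of the diff): per the comments above, the returned status is set on the Job that the CronJob spawned, so a skipped run is immediately marked succeeded instead of lingering. A skipped-run response looks roughly like this, assuming to_k8s_date() produces RFC 3339 timestamps:

    # Approximate value of get_finished_response(metadata) for a skipped run.
    # Timestamps are example values; startTime comes from the Job's creationTimestamp.
    skipped_job_response = {
        "attachments": [],  # no CrawlJob attachment is created
        # same annotation that is set when a crawl job finishes normally
        "annotations": {"finished": "2024-05-28T00:00:00Z"},
        "status": {
            "succeeded": 1,  # marks the spawned k8s Job as complete right away
            "startTime": "2024-05-28T00:00:00Z",
            "completionTime": "2024-05-28T00:00:00Z",
        },
    }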
@@ -52,21 +77,14 @@ class CronJobOperator(BaseOperator):
             crawl_id, is_qa=False
         )
         if finished:
-            status = None
+            finished_str = to_k8s_date(finished)
+            set_status = False
             # mark job as completed
             if not data.object["status"].get("succeeded"):
                 print("Cron Job Complete!", finished)
-                status = {
-                    "succeeded": 1,
-                    "startTime": metadata.get("creationTimestamp"),
-                    "completionTime": to_k8s_date(finished),
-                }
+                set_status = True
 
-            return {
-                "attachments": [],
-                "annotations": {"finished": finished},
-                "status": status,
-            }
+            return self.get_finished_response(metadata, set_status, finished_str)
 
         configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
 
@@ -88,16 +106,22 @@ class CronJobOperator(BaseOperator):
             print(
                 f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
             )
-            return {"attachments": []}
+            return self.get_finished_response(metadata)
 
         # db create
         user = await self.user_ops.get_by_id(UUID(userid))
         if not user:
             print(f"error: missing user for id {userid}")
-            return {"attachments": []}
+            return self.get_finished_response(metadata)
 
         warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
 
+        if org.readOnly:
+            print(
+                f"org {org.id} set to read-only. skipping scheduled crawl for workflow {cid}"
+            )
+            return self.get_finished_response(metadata)
+
         await self.crawl_config_ops.add_new_crawl(
             crawl_id,
             crawlconfig,
backend/btrixcloud/orgs.py

@@ -25,6 +25,7 @@ from .models import (
     StorageRef,
     OrgQuotas,
     OrgQuotaUpdate,
+    OrgReadOnlyUpdate,
     OrgMetrics,
     OrgWebhookUrls,
     CreateOrg,
@@ -679,6 +680,16 @@ class OrgOps:
             slug_id_map[org["_id"]] = org["slug"]
         return slug_id_map
 
+    async def update_read_only(self, org: Organization, update: OrgReadOnlyUpdate):
+        """Set readOnly field for Organization"""
+        if update.readOnly is False:
+            # Set reason to empty string if readOnly is false
+            update.readOnlyReason = ""
+
+        query = update.dict(exclude_unset=True)
+
+        return await self.orgs.find_one_and_update({"_id": org.id}, {"$set": query})
+
 
 # ============================================================================
 # pylint: disable=too-many-statements
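A note on the exclude_unset usage above (my illustration; Pydantic v1 semantics, matching the .dict() call in the diff): assigning readOnlyReason explicitly marks the field as set, so it is included in the $set query and cleared in the database even when the client omitted it from the request:

    from typing import Optional
    from pydantic import BaseModel  # Pydantic v1 assumed, matching .dict()

    class OrgReadOnlyUpdate(BaseModel):
        """Mirror of the model added in this commit"""
        readOnly: bool
        readOnlyReason: Optional[str] = None

    # Client sent only {"readOnly": false}: readOnlyReason is unset and excluded
    update = OrgReadOnlyUpdate(readOnly=False)
    print(update.dict(exclude_unset=True))   # {'readOnly': False}

    # update_read_only() assigns the field, which marks it as set...
    update.readOnlyReason = ""
    print(update.dict(exclude_unset=True))   # {'readOnly': False, 'readOnlyReason': ''}
    # ...so the $set query clears the stored reason as well.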
@@ -817,6 +828,19 @@ def init_orgs_api(app, mdb, user_manager, invites, user_dep):
 
         return {"updated": True}
 
+    @router.post("/read-only", tags=["organizations"])
+    async def update_read_only(
+        update: OrgReadOnlyUpdate,
+        org: Organization = Depends(org_owner_dep),
+        user: User = Depends(user_dep),
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        await ops.update_read_only(org, update)
+
+        return {"updated": True}
+
     @router.post("/event-webhook-urls", tags=["organizations"])
     async def update_event_webhook_urls(
         urls: OrgWebhookUrls,
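Access control, seen from the client side (placeholders again; the detail string matches the superuser guard above):

    import requests

    API_PREFIX = "https://app.example.com/api"  # placeholder
    org_id = "<org-uuid>"                       # placeholder
    org_owner_headers = {"Authorization": "Bearer <non-superuser-token>"}  # placeholder

    # Org owners who are not superusers cannot toggle read-only mode
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/read-only",
        headers=org_owner_headers,
        json={"readOnly": True},
    )
    assert r.status_code == 403
    assert r.json()["detail"] == "Not Allowed"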
backend/test/test_org.py

@@ -486,3 +486,58 @@ def test_get_org_slug_lookup_non_superadmin(crawler_auth_headers):
     r = requests.get(f"{API_PREFIX}/orgs/slug-lookup", headers=crawler_auth_headers)
     assert r.status_code == 403
     assert r.json()["detail"] == "Not Allowed"
+
+
+def test_update_read_only(admin_auth_headers, default_org_id):
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] in (False, None)
+    assert data["readOnlyReason"] in (None, "")
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": True, "readOnlyReason": "Payment suspended"},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is True
+    assert data["readOnlyReason"] == "Payment suspended"
+
+    # Try to start crawls, should fail
+    crawl_data = {
+        "runNow": True,
+        "name": "Read Only Test Crawl",
+        "description": "Should not run now",
+        "tags": [],
+        "config": {
+            "seeds": [{"url": "https://webrecorder.net/", "depth": 1}],
+            "exclude": "community",
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    assert data["added"]
+    assert data["id"]
+    assert data["run_now_job"] is None
+
+    # Reset back to False, future crawls in tests should run fine
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": False},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is False
+    # Test that reason is unset when readOnly is set to false, even implicitly
+    assert data["readOnlyReason"] == ""