Add and enforce readOnly field in Organization (#1886)

Fixes https://github.com/webrecorder/browsertrix/issues/1883
Backend work for https://github.com/webrecorder/browsertrix/issues/1876

- If `readOnly` is set to true, disallow new crawls and QA analysis runs
- If `readOnly` is set to true, skip scheduled crawls
- Add endpoint to set `readOnly` with an optional `readOnlyReason` (which
is automatically reset to an empty string whenever `readOnly` is set to
false), so the reason can be displayed in a banner (see the usage sketch
below)
- Operator: ensure cronjobs that are skipped due to internal logic (e.g.
read-only mode) succeed right away and do not leave a k8s job dangling
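A minimal sketch of driving the new endpoint from Python, in the style of
the integration tests below (the API prefix, org id, and superuser token
are placeholders, not part of this change):

    import requests

    API_PREFIX = "https://app.example.com/api"  # hypothetical deployment URL
    org_id = "<org-uuid>"                       # placeholder org id
    admin_auth_headers = {"Authorization": "Bearer <superuser-token>"}

    # Put the org into read-only mode, with a reason for the UI banner
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/read-only",
        headers=admin_auth_headers,
        json={"readOnly": True, "readOnlyReason": "Payment suspended"},
    )
    assert r.json()["updated"]

    # Lift read-only mode; readOnlyReason is automatically reset to ""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/read-only",
        headers=admin_auth_headers,
        json={"readOnly": False},
    )
    assert r.json()["updated"]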

---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2024-06-25 22:30:53 -04:00, committed by GitHub
parent 48dfa485e5
commit 9140dd75bc
6 changed files with 142 additions and 14 deletions


@@ -219,6 +219,10 @@ class CrawlConfigOps:
         storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
         exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)

+        if org.readOnly:
+            run_now = False
+            print(f"Org {org.id} set to read-only", flush=True)
+
         if storage_quota_reached:
             run_now = False
             print(f"Storage quota exceeded for org {org.id}", flush=True)
@@ -843,6 +847,9 @@ class CrawlConfigOps:
         except:
             await self.readd_configmap(crawlconfig, org)

+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         if await self.org_ops.storage_quota_reached(org.id):
             raise HTTPException(status_code=403, detail="storage_quota_reached")


@@ -738,6 +738,10 @@ class CrawlOps(BaseCrawlOps):
         crawl = await self.get_crawl(crawl_id, org)

+        # ensure org execution is allowed
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         # can only QA finished crawls
         if not crawl.finished:
             raise HTTPException(status_code=400, detail="crawl_not_finished")


@@ -971,6 +971,14 @@ class OrgQuotaUpdate(BaseModel):
     update: OrgQuotas


+# ============================================================================
+class OrgReadOnlyUpdate(BaseModel):
+    """Organization readonly update"""
+
+    readOnly: bool
+    readOnlyReason: Optional[str] = None
+
+
 # ============================================================================
 class OrgWebhookUrls(BaseModel):
     """Organization webhook URLs"""
@@ -1025,6 +1033,9 @@ class OrgOut(BaseMongoModel):
     webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()

+    readOnly: Optional[bool]
+    readOnlyReason: Optional[str]
+

 # ============================================================================
 class Organization(BaseMongoModel):
@@ -1069,6 +1080,9 @@ class Organization(BaseMongoModel):
     origin: Optional[AnyHttpUrl] = None

+    readOnly: Optional[bool] = False
+    readOnlyReason: Optional[str] = None
+
     def is_owner(self, user):
         """Check if user is owner"""
         return self._is_auth(user, UserRole.OWNER)


@@ -1,9 +1,10 @@
 """ Operator handler for crawl CronJobs """
 from uuid import UUID
+from typing import Optional

 import yaml

-from btrixcloud.utils import to_k8s_date
+from btrixcloud.utils import to_k8s_date, dt_now

 from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
 from .baseoperator import BaseOperator
@@ -38,6 +39,30 @@ class CronJobOperator(BaseOperator):
             ]
         }

+    def get_finished_response(
+        self, metadata: dict[str, str], set_status=True, finished: Optional[str] = None
+    ):
+        """get final response to indicate cronjob created job is finished"""
+        if not finished:
+            finished = to_k8s_date(dt_now())
+
+        status = None
+        # set status on decorated job to indicate that its finished
+        if set_status:
+            status = {
+                "succeeded": 1,
+                "startTime": metadata.get("creationTimestamp"),
+                "completionTime": finished,
+            }
+
+        return {
+            "attachments": [],
+            # set on job to match default behavior when job finishes
+            "annotations": {"finished": finished},
+            "status": status,
+        }
+
     async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
         """create crawljobs from a job object spawned by cronjob"""
@@ -52,21 +77,14 @@ class CronJobOperator(BaseOperator):
             crawl_id, is_qa=False
         )

         if finished:
-            status = None
+            finished_str = to_k8s_date(finished)
+            set_status = False
             # mark job as completed
             if not data.object["status"].get("succeeded"):
                 print("Cron Job Complete!", finished)
-                status = {
-                    "succeeded": 1,
-                    "startTime": metadata.get("creationTimestamp"),
-                    "completionTime": to_k8s_date(finished),
-                }
+                set_status = True

-            return {
-                "attachments": [],
-                "annotations": {"finished": finished},
-                "status": status,
-            }
+            return self.get_finished_response(metadata, set_status, finished_str)

         configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
@@ -88,16 +106,22 @@ class CronJobOperator(BaseOperator):
             print(
                 f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
             )
-            return {"attachments": []}
+            return self.get_finished_response(metadata)

         # db create
         user = await self.user_ops.get_by_id(UUID(userid))
         if not user:
             print(f"error: missing user for id {userid}")
-            return {"attachments": []}
+            return self.get_finished_response(metadata)

         warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)

+        if org.readOnly:
+            print(
+                f"org {org.id} set to read-only. skipping scheduled crawl for workflow {cid}"
+            )
+            return self.get_finished_response(metadata)
+
         await self.crawl_config_ops.add_new_crawl(
             crawl_id,
             crawlconfig,
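
For illustration, a sketch of the shape get_finished_response() returns when
a scheduled run is skipped (timestamps here are hypothetical examples;
Metacontroller copies the status onto the decorated k8s Job, so the job
completes immediately instead of dangling):

    # sketch of the decorator sync response for a skipped cronjob run
    {
        "attachments": [],  # no CrawlJob attached, so nothing is created
        "annotations": {"finished": "2024-06-26T02:30:53Z"},  # example value
        "status": {
            "succeeded": 1,
            "startTime": "2024-06-26T02:30:00Z",  # job's creationTimestamp
            "completionTime": "2024-06-26T02:30:53Z",
        },
    }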


@@ -25,6 +25,7 @@ from .models import (
     StorageRef,
     OrgQuotas,
     OrgQuotaUpdate,
+    OrgReadOnlyUpdate,
     OrgMetrics,
     OrgWebhookUrls,
     CreateOrg,
@@ -679,6 +680,16 @@ class OrgOps:
             slug_id_map[org["_id"]] = org["slug"]
         return slug_id_map

+    async def update_read_only(self, org: Organization, update: OrgReadOnlyUpdate):
+        """Set readOnly field for Organization"""
+        if update.readOnly is False:
+            # Set reason to empty string if readOnly is false
+            update.readOnlyReason = ""
+
+        query = update.dict(exclude_unset=True)
+
+        return await self.orgs.find_one_and_update({"_id": org.id}, {"$set": query})
+
 # ============================================================================
 # pylint: disable=too-many-statements
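
A subtlety in update_read_only() above: assigning update.readOnlyReason = ""
marks the field as set in Pydantic, so dict(exclude_unset=True) still
includes it and the $set clears any stored reason even when the request body
omits readOnlyReason (the test at the end of this commit checks exactly
this). A standalone sketch of that Pydantic behavior:

    from typing import Optional
    from pydantic import BaseModel

    class OrgReadOnlyUpdate(BaseModel):
        readOnly: bool
        readOnlyReason: Optional[str] = None

    update = OrgReadOnlyUpdate(readOnly=False)  # client omits the reason
    print(update.dict(exclude_unset=True))      # {'readOnly': False}

    update.readOnlyReason = ""                  # assignment marks field as set
    print(update.dict(exclude_unset=True))      # now includes 'readOnlyReason': ''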
@@ -817,6 +828,19 @@ def init_orgs_api(app, mdb, user_manager, invites, user_dep):
         return {"updated": True}

+    @router.post("/read-only", tags=["organizations"])
+    async def update_read_only(
+        update: OrgReadOnlyUpdate,
+        org: Organization = Depends(org_owner_dep),
+        user: User = Depends(user_dep),
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        await ops.update_read_only(org, update)
+
+        return {"updated": True}
+
     @router.post("/event-webhook-urls", tags=["organizations"])
     async def update_event_webhook_urls(
         urls: OrgWebhookUrls,


@@ -486,3 +486,58 @@ def test_get_org_slug_lookup_non_superadmin(crawler_auth_headers):
     r = requests.get(f"{API_PREFIX}/orgs/slug-lookup", headers=crawler_auth_headers)
     assert r.status_code == 403
     assert r.json()["detail"] == "Not Allowed"
+
+
+def test_update_read_only(admin_auth_headers, default_org_id):
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] in (False, None)
+    assert data["readOnlyReason"] in (None, "")
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": True, "readOnlyReason": "Payment suspended"},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is True
+    assert data["readOnlyReason"] == "Payment suspended"
+
+    # Try to start crawls, should fail
+    crawl_data = {
+        "runNow": True,
+        "name": "Read Only Test Crawl",
+        "description": "Should not run now",
+        "tags": [],
+        "config": {
+            "seeds": [{"url": "https://webrecorder.net/", "depth": 1}],
+            "exclude": "community",
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+    assert data["added"]
+    assert data["id"]
+    assert data["run_now_job"] is None
+
+    # Reset back to False, future crawls in tests should run fine
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": False},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is False
+    # Test that reason is unset when readOnly is set to false, even implicitly
+    assert data["readOnlyReason"] == ""