Add and enforce readOnly field in Organization (#1886)

Fixes https://github.com/webrecorder/browsertrix/issues/1883
Backend work for https://github.com/webrecorder/browsertrix/issues/1876

- If readOnly is set to true, disallow crawls and QA analysis runs
- If readOnly is set to true, skip scheduled crawls
- Add endpoint to set `readOnly` with an optional `readOnlyReason` (automatically
reset to an empty string when `readOnly` is set to false), which can be
displayed in a banner; see the usage sketch after this list
- Operator: ensure cronjobs that are skipped due to internal logic (e.g. read-only mode) succeed right away and do not leave a dangling k8s job.
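
For reference, a minimal sketch of driving the new endpoint from a script, mirroring the test added below. The base URL, org id, and superuser token are placeholders, not part of this change:

```python
import requests

# Placeholder values -- substitute a real deployment URL, org id, and superuser token
API_PREFIX = "https://btrix.example.com/api"
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <superuser-jwt>"}

# Enable read-only mode; the reason can be shown in a frontend banner
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/read-only",
    headers=HEADERS,
    json={"readOnly": True, "readOnlyReason": "Payment suspended"},
)
assert r.json()["updated"]

# Disable read-only mode; readOnlyReason is reset to "" server-side
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/read-only",
    headers=HEADERS,
    json={"readOnly": False},
)
assert r.json()["updated"]
```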

---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh authored on 2024-06-25 22:30:53 -04:00, committed by GitHub
parent 48dfa485e5
commit 9140dd75bc
6 changed files with 142 additions and 14 deletions


@@ -219,6 +219,10 @@ class CrawlConfigOps:
         storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
         exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
 
+        if org.readOnly:
+            run_now = False
+            print(f"Org {org.id} set to read-only", flush=True)
+
         if storage_quota_reached:
             run_now = False
             print(f"Storage quota exceeded for org {org.id}", flush=True)
@@ -843,6 +847,9 @@ class CrawlConfigOps:
         except:
             await self.readd_configmap(crawlconfig, org)
 
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         if await self.org_ops.storage_quota_reached(org.id):
             raise HTTPException(status_code=403, detail="storage_quota_reached")


@@ -738,6 +738,10 @@ class CrawlOps(BaseCrawlOps):
         crawl = await self.get_crawl(crawl_id, org)
 
+        # ensure org execution is allowed
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
         # can only QA finished crawls
         if not crawl.finished:
             raise HTTPException(status_code=400, detail="crawl_not_finished")


@@ -971,6 +971,14 @@ class OrgQuotaUpdate(BaseModel):
     update: OrgQuotas
 
 
+# ============================================================================
+class OrgReadOnlyUpdate(BaseModel):
+    """Organization readonly update"""
+
+    readOnly: bool
+    readOnlyReason: Optional[str] = None
+
+
 # ============================================================================
 class OrgWebhookUrls(BaseModel):
     """Organization webhook URLs"""
@@ -1025,6 +1033,9 @@ class OrgOut(BaseMongoModel):
     webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
 
+    readOnly: Optional[bool]
+    readOnlyReason: Optional[str]
+
 
 # ============================================================================
 class Organization(BaseMongoModel):
@@ -1069,6 +1080,9 @@
     origin: Optional[AnyHttpUrl] = None
 
+    readOnly: Optional[bool] = False
+    readOnlyReason: Optional[str] = None
+
     def is_owner(self, user):
         """Check if user is owner"""
         return self._is_auth(user, UserRole.OWNER)


@@ -1,9 +1,10 @@
 """ Operator handler for crawl CronJobs """
 
 from uuid import UUID
+from typing import Optional
 import yaml
 
-from btrixcloud.utils import to_k8s_date
+from btrixcloud.utils import to_k8s_date, dt_now
 
 from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
 from .baseoperator import BaseOperator
@@ -38,6 +39,30 @@ class CronJobOperator(BaseOperator):
             ]
         }
 
+    def get_finished_response(
+        self, metadata: dict[str, str], set_status=True, finished: Optional[str] = None
+    ):
+        """get final response to indicate cronjob created job is finished"""
+
+        if not finished:
+            finished = to_k8s_date(dt_now())
+
+        status = None
+        # set status on decorated job to indicate that its finished
+        if set_status:
+            status = {
+                "succeeded": 1,
+                "startTime": metadata.get("creationTimestamp"),
+                "completionTime": finished,
+            }
+
+        return {
+            "attachments": [],
+            # set on job to match default behavior when job finishes
+            "annotations": {"finished": finished},
+            "status": status,
+        }
+
     async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
         """create crawljobs from a job object spawned by cronjob"""
@@ -52,21 +77,14 @@ class CronJobOperator(BaseOperator):
             crawl_id, is_qa=False
         )
         if finished:
-            status = None
+            finished_str = to_k8s_date(finished)
+            set_status = False
             # mark job as completed
             if not data.object["status"].get("succeeded"):
                 print("Cron Job Complete!", finished)
-                status = {
-                    "succeeded": 1,
-                    "startTime": metadata.get("creationTimestamp"),
-                    "completionTime": to_k8s_date(finished),
-                }
+                set_status = True
 
-            return {
-                "attachments": [],
-                "annotations": {"finished": finished},
-                "status": status,
-            }
+            return self.get_finished_response(metadata, set_status, finished_str)
 
         configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
@@ -88,16 +106,22 @@ class CronJobOperator(BaseOperator):
             print(
                 f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
             )
-            return {"attachments": []}
+            return self.get_finished_response(metadata)
 
         # db create
         user = await self.user_ops.get_by_id(UUID(userid))
         if not user:
             print(f"error: missing user for id {userid}")
-            return {"attachments": []}
+            return self.get_finished_response(metadata)
 
         warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
 
+        if org.readOnly:
+            print(
+                f"org {org.id} set to read-only. skipping scheduled crawl for workflow {cid}"
+            )
+            return self.get_finished_response(metadata)
+
         await self.crawl_config_ops.add_new_crawl(
             crawl_id,
             crawlconfig,


@@ -25,6 +25,7 @@ from .models import (
     StorageRef,
     OrgQuotas,
     OrgQuotaUpdate,
+    OrgReadOnlyUpdate,
     OrgMetrics,
     OrgWebhookUrls,
     CreateOrg,
@@ -679,6 +680,16 @@ class OrgOps:
             slug_id_map[org["_id"]] = org["slug"]
         return slug_id_map
 
+    async def update_read_only(self, org: Organization, update: OrgReadOnlyUpdate):
+        """Set readOnly field for Organization"""
+        if update.readOnly is False:
+            # Set reason to empty string if readOnly is false
+            update.readOnlyReason = ""
+
+        query = update.dict(exclude_unset=True)
+
+        return await self.orgs.find_one_and_update({"_id": org.id}, {"$set": query})
+
 
 # ============================================================================
 # pylint: disable=too-many-statements
@@ -817,6 +828,19 @@ def init_orgs_api(app, mdb, user_manager, invites, user_dep):
         return {"updated": True}
 
+    @router.post("/read-only", tags=["organizations"])
+    async def update_read_only(
+        update: OrgReadOnlyUpdate,
+        org: Organization = Depends(org_owner_dep),
+        user: User = Depends(user_dep),
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        await ops.update_read_only(org, update)
+
+        return {"updated": True}
+
     @router.post("/event-webhook-urls", tags=["organizations"])
     async def update_event_webhook_urls(
         urls: OrgWebhookUrls,


@@ -486,3 +486,58 @@ def test_get_org_slug_lookup_non_superadmin(crawler_auth_headers):
     r = requests.get(f"{API_PREFIX}/orgs/slug-lookup", headers=crawler_auth_headers)
     assert r.status_code == 403
     assert r.json()["detail"] == "Not Allowed"
+
+
+def test_update_read_only(admin_auth_headers, default_org_id):
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] in (False, None)
+    assert data["readOnlyReason"] in (None, "")
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": True, "readOnlyReason": "Payment suspended"},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is True
+    assert data["readOnlyReason"] == "Payment suspended"
+
+    # Try to start crawls, should fail
+    crawl_data = {
+        "runNow": True,
+        "name": "Read Only Test Crawl",
+        "description": "Should not run now",
+        "tags": [],
+        "config": {
+            "seeds": [{"url": "https://webrecorder.net/", "depth": 1}],
+            "exclude": "community",
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+    assert data["added"]
+    assert data["id"]
+    assert data["run_now_job"] is None
+
+    # Reset back to False, future crawls in tests should run fine
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
+        headers=admin_auth_headers,
+        json={"readOnly": False},
+    )
+    assert r.json()["updated"]
+
+    r = requests.get(f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers)
+    data = r.json()
+    assert data["readOnly"] is False
+
+    # Test that reason is unset when readOnly is set to false, even implicitly
+    assert data["readOnlyReason"] == ""