diff --git a/backend/btrixcloud/background_jobs.py b/backend/btrixcloud/background_jobs.py
index b8cd420e..59204ed3 100644
--- a/backend/btrixcloud/background_jobs.py
+++ b/backend/btrixcloud/background_jobs.py
@@ -197,7 +197,8 @@ class BackgroundJobOps:
             job_id = await self.create_delete_replica_job(
                 org, file, object_id, object_type, replica_ref
             )
-            ids.append(job_id)
+            if job_id:
+                ids.append(job_id)
 
         return {"added": True, "ids": ids}
 
@@ -209,17 +210,17 @@ class BackgroundJobOps:
         object_type: str,
         replica_ref: StorageRef,
         existing_job_id: Optional[str] = None,
-    ) -> str:
+    ) -> Optional[str]:
         """Create a job to delete one replica of a given file"""
-        replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
-        replica_endpoint, bucket_suffix = self.strip_bucket(
-            replica_storage.endpoint_url
-        )
-        replica_file_path = bucket_suffix + file.filename
-
-        job_type = BgJobType.DELETE_REPLICA.value
-
         try:
+            replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
+            replica_endpoint, bucket_suffix = self.strip_bucket(
+                replica_storage.endpoint_url
+            )
+            replica_file_path = bucket_suffix + file.filename
+
+            job_type = BgJobType.DELETE_REPLICA.value
+
             job_id = await self.crawl_manager.run_replica_job(
                 oid=str(org.id),
                 job_type=job_type,
@@ -262,11 +263,13 @@ class BackgroundJobOps:
 
             return job_id
 
+        # pylint: disable=broad-exception-caught
         except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(
-                status_code=400, detail=f"Error starting background job: {exc}"
+            print(
+                "warning: replica deletion job could not be started "
+                + f"for {object_type} {file}: {exc}"
             )
+            return None
 
     async def job_finished(
         self,
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 5c354828..5a2b87d2 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -162,7 +162,7 @@ def main():
 
     init_uploads_api(*base_crawl_init)
 
-    org_ops.set_base_crawl_ops(base_crawl_ops)
+    org_ops.set_ops(base_crawl_ops, profiles, coll_ops)
 
     user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
 
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index 47b35742..45904192 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -54,6 +54,7 @@ from .models import (
     Collection,
     OrgOutExport,
     PageWithAllQA,
+    DeleteCrawlList,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import slug_from_name, validate_slug, JSONSerializer
@@ -61,15 +62,20 @@ from .utils import slug_from_name, validate_slug, JSONSerializer
 
 if TYPE_CHECKING:
     from .invites import InviteOps
     from .basecrawls import BaseCrawlOps
+    from .colls import CollectionOps
+    from .profiles import ProfileOps
     from .users import UserManager
 else:
-    InviteOps = BaseCrawlOps = UserManager = object
+    InviteOps = BaseCrawlOps = ProfileOps = CollectionOps = UserManager = object
 
 DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization")
 
 MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
 
+# number of items to delete at a time
+DEL_ITEMS = 1000
+
 
 # ============================================================================
 # pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals
@@ -91,6 +97,7 @@ class OrgOps:
         self.users_db = mdb["users"]
         self.pages_db = mdb["pages"]
         self.version_db = mdb["version"]
+        self.invites_db = mdb["invites"]
 
         self.router = None
         self.org_viewer_dep = None
@@ -104,9 +111,17 @@ class OrgOps:
         self.user_manager = user_manager
         self.register_to_org_id = os.environ.get("REGISTER_TO_ORG_ID")
 
-    def set_base_crawl_ops(self, base_crawl_ops: BaseCrawlOps) -> None:
+    def set_ops(
+        self,
+        base_crawl_ops: BaseCrawlOps,
+        profile_ops: ProfileOps,
+        coll_ops: CollectionOps,
+    ) -> None:
         """Set base crawl ops"""
+        # pylint: disable=attribute-defined-outside-init
         self.base_crawl_ops = base_crawl_ops
+        self.profile_ops = profile_ops
+        self.coll_ops = coll_ops
 
     def set_default_primary_storage(self, storage: StorageRef):
         """set default primary storage"""
@@ -1058,6 +1073,59 @@ class OrgOps:
         collection = json_stream.to_standard_types(collection)
         await self.colls_db.insert_one(Collection.from_dict(collection).to_dict())
 
+    async def delete_org_and_data(self, org: Organization, user_manager: UserManager):
+        """Delete org and all of its associated data."""
+        # Delete archived items
+        cursor = self.crawls_db.find({"oid": org.id}, projection=["_id"])
+        items = await cursor.to_list(length=DEL_ITEMS)
+        while items:
+            item_ids = [item["_id"] for item in items]
+
+            await self.base_crawl_ops.delete_crawls_all_types(
+                delete_list=DeleteCrawlList(crawl_ids=item_ids), org=org
+            )
+
+            items = await cursor.to_list(length=DEL_ITEMS)
+
+        # Delete workflows and revisions
+        cursor = self.crawl_configs_db.find({"oid": org.id}, projection=["_id"])
+        workflows = await cursor.to_list(length=DEL_ITEMS)
+        while workflows:
+            workflow_ids = [workflow["_id"] for workflow in workflows]
+            await self.configs_revs_db.delete_many({"cid": {"$in": workflow_ids}})
+
+            workflows = await cursor.to_list(length=DEL_ITEMS)
+
+        await self.crawl_configs_db.delete_many({"oid": org.id})
+
+        # Delete profiles
+        async for profile in self.profiles_db.find({"oid": org.id}, projection=["_id"]):
+            await self.profile_ops.delete_profile(profile["_id"], org)
+
+        # Delete collections
+        async for coll in self.colls_db.find({"oid": org.id}, projection=["_id"]):
+            await self.coll_ops.delete_collection(coll["_id"], org)
+
+        # Delete users that only belong to this org
+        for org_user_id in org.users.keys():
+            user = await user_manager.get_by_id(UUID(org_user_id))
+            if not user:
+                continue
+            orgs, total_orgs = await self.get_orgs_for_user(user)
+            if total_orgs == 1:
+                first_org = orgs[0]
+                if first_org.id != org.id:
+                    continue
+                await self.users_db.delete_one({"id": user.id})
+
+        # Delete invites
+        await self.invites_db.delete_many({"oid": org.id})
+
+        # Delete org
+        await self.orgs.delete_one({"_id": org.id})
+
+        return {"deleted": True}
+
 
 # ============================================================================
 # pylint: disable=too-many-statements, too-many-arguments
@@ -1206,6 +1274,15 @@ def init_orgs_api(
         org_out.execMinutesQuotaReached = await ops.exec_mins_quota_reached(org.id)
         return org_out
 
+    @router.delete("", tags=["organizations"])
+    async def delete_org(
+        org: Organization = Depends(org_dep), user: User = Depends(user_dep)
+    ):
+        if not user.is_superuser:
+            raise HTTPException(status_code=403, detail="Not Allowed")
+
+        return await ops.delete_org_and_data(org, user_manager)
+
     @router.post("/rename", tags=["organizations"])
     async def rename_org(
         rename: RenameOrg,
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 4395ddb3..c91dd138 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -1028,5 +1028,5 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
         if count + 1 == MAX_ATTEMPTS:
             assert False
 
-        time.sleep(5)
+        time.sleep(10)
         count += 1
diff --git a/backend/test/test_z_org_import_export.py b/backend/test/test_y_org_import_export.py
similarity index 100%
rename from backend/test/test_z_org_import_export.py
rename to backend/test/test_y_org_import_export.py
diff --git a/backend/test/test_z_delete_org.py b/backend/test/test_z_delete_org.py
new file mode 100644
index 00000000..4a6d1da3
--- /dev/null
+++ b/backend/test/test_z_delete_org.py
@@ -0,0 +1,42 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_delete_org_non_superadmin(crawler_auth_headers, default_org_id):
+    # Assert that non-superadmin can't delete org
+    r = requests.delete(
+        f"{API_PREFIX}/orgs/{default_org_id}", headers=crawler_auth_headers
+    )
+    assert r.status_code == 403
+    assert r.json()["detail"] == "Not Allowed"
+
+
+def test_delete_org_superadmin(admin_auth_headers, default_org_id):
+    # Track items in org to ensure they're deleted later (we may want to expand
+    # this, but currently only have the ability to check items across all orgs)
+    item_ids = []
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls", headers=admin_auth_headers
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    for item in data["items"]:
+        item_ids.append(item["id"])
+
+    # Delete org and its data
+    r = requests.delete(
+        f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers
+    )
+    assert r.status_code == 200
+    assert r.json()["deleted"]
+
+    # Ensure items got deleted
+    for item_id in item_ids:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/all/all-crawls/{item_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        assert r.status_code == 404
diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml
index f9a952a8..dead7f3b 100644
--- a/chart/app-templates/crawler.yaml
+++ b/chart/app-templates/crawler.yaml
@@ -38,6 +38,7 @@ metadata:
   labels:
     crawl: {{ id }}
     role: crawler
+    network-policy: limit-crawler-egress
 
 spec:
   hostname: {{ name }}
diff --git a/chart/app-templates/profilebrowser.yaml b/chart/app-templates/profilebrowser.yaml
index 0f0c3d1b..8eda40c6 100644
--- a/chart/app-templates/profilebrowser.yaml
+++ b/chart/app-templates/profilebrowser.yaml
@@ -7,6 +7,7 @@ metadata:
   labels:
     browser: {{ id }}
     role: browser
+    network-policy: limit-crawler-egress
 
 spec:
   hostname: browser-{{ id }}
diff --git a/chart/templates/networkpolicies.yaml b/chart/templates/networkpolicies.yaml
new file mode 100644
index 00000000..4c605a0b
--- /dev/null
+++ b/chart/templates/networkpolicies.yaml
@@ -0,0 +1,101 @@
+{{- if .Values.crawler_enable_network_policy -}}
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: crawler-limit-egress
+  namespace: {{ .Values.crawler_namespace }}
+spec:
+  podSelector:
+    matchLabels:
+      network-policy: limit-crawler-egress
+  policyTypes:
+    - Egress
+  egress:
+    {{- if .Values.crawler_network_policy_egress | default false -}}
+    {{- .Values.crawler_network_policy_egress | toYaml | nindent 4 -}}
+    {{- else }}
+    # allow WWW
+    - to:
+        - ipBlock:
+            cidr: 0.0.0.0/0
+            except: # Exclude traffic to Kubernetes service IPs and pods
+              - 10.0.0.0/8
+              - 172.16.0.0/12
+              - 192.168.0.0/16
+
+    # allow frontend access for QA runs
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Release.Namespace }}
+          podSelector:
+            matchLabels:
+              role: frontend
+
+      ports:
+        - port: 80
+          protocol: TCP
+
+    # allow DNS
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: kube-system
+          podSelector:
+            matchLabels:
+              k8s-app: kube-dns
+      ports:
+        - port: 53
+          protocol: UDP
+        - port: 53
+          protocol: TCP
+
+
+    # allow other redis
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Values.crawler_namespace }}
+          podSelector:
+            matchLabels:
+              role: redis
+
+      ports:
+        - port: 6379
+          protocol: TCP
+
+
+    {{ if .Values.minio_local }}
+    # allow minio
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Release.Namespace }}
+          podSelector:
+            matchLabels:
+              app: local-minio
+
+      ports:
+        - port: 9000
+          protocol: TCP
+
+    {{- end -}}
+
+
+    {{ if .Values.signer.enabled }}
+    # allow auth signer
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ .Release.Namespace }}
+          podSelector:
+            matchLabels:
+              app: auth-signer
+
+      ports:
+        - port: {{ .Values.signer.port | default "5053" }}
+          protocol: TCP
+
+    {{- end -}}
+    {{- end -}}
+{{- end -}}
diff --git a/chart/values.yaml b/chart/values.yaml
index f99fcaf7..45f210fa 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -298,6 +298,12 @@ crawler_liveness_port: 6065
 
 # crawler_fsgroup: 201400007
 
+# optional: enable/disable crawler network policy
+crawler_enable_network_policy: true
+
+# optional: replace the default crawler egress policy with your own
+# see chart/templates/networkpolicies.yaml for an example
+# crawler_network_policy_egress: {}
 
 # time to wait for graceful stop
 grace_period: 1000
diff --git a/frontend/00-browsertrix-nginx-init.sh b/frontend/00-browsertrix-nginx-init.sh
index afb5ba16..17d51867 100755
--- a/frontend/00-browsertrix-nginx-init.sh
+++ b/frontend/00-browsertrix-nginx-init.sh
@@ -13,6 +13,6 @@ else
 fi
 
 mkdir -p /etc/nginx/resolvers/
-echo resolver $(awk 'BEGIN{ORS=" "} $1=="nameserver" {print $2}' /etc/resolv.conf) valid=10s ipv6=off";" > /etc/nginx/resolvers/resolvers.conf
+echo resolver $(grep -oP '(?<=nameserver\s)[^\s]+' /etc/resolv.conf | awk '{ if ($1 ~ /:/) { printf "[" $1 "] "; } else { printf $1 " "; } }') valid=10s ipv6=off";" > /etc/nginx/resolvers/resolvers.conf
 
 cat /etc/nginx/resolvers/resolvers.conf
diff --git a/frontend/src/__mocks__/color.js b/frontend/src/__mocks__/color.js
deleted file mode 100644
index 4a73c149..00000000
--- a/frontend/src/__mocks__/color.js
+++ /dev/null
@@ -1,6 +0,0 @@
-import { stub } from "sinon";
-
-export default stub(() => ({
-  lighten: () => {},
-  darken: () => {},
-}));
diff --git a/frontend/src/__mocks__/slugify.js b/frontend/src/__mocks__/slugify.js
deleted file mode 100644
index 96ffa472..00000000
--- a/frontend/src/__mocks__/slugify.js
+++ /dev/null
@@ -1,3 +0,0 @@
-export default function slugify(value) {
-  return value;
-}
diff --git a/frontend/src/pages/invite/ui/org-form.test.ts b/frontend/src/pages/invite/ui/org-form.test.ts
index 45a30c75..11e88e6a 100644
--- a/frontend/src/pages/invite/ui/org-form.test.ts
+++ b/frontend/src/pages/invite/ui/org-form.test.ts
@@ -1,5 +1,5 @@
 import { expect, fixture, oneEvent } from "@open-wc/testing";
-import type { SlInput } from "@shoelace-style/shoelace";
+import { serialize, type SlInput } from "@shoelace-style/shoelace";
 import { html } from "lit/static-html.js";
 import { restore, stub } from "sinon";
 
@@ -65,15 +65,23 @@ describe("btrix-org-form", () => {
 
     const form = el.shadowRoot!.querySelector("form")!;
 
-    form
-      .querySelector('sl-input[name="orgName"]')
-      ?.setAttribute("value", "Fake Org Name");
-    form
-      .querySelector('sl-input[name="orgSlug"]')
-      ?.setAttribute("value", "fake-org-name");
+    const orgName = form.querySelector('sl-input[name="orgName"]')!;
+    const orgSlug = form.querySelector('sl-input[name="orgSlug"]')!;
+
+    orgName.setAttribute("value", "Fake Org Name");
+    orgSlug.setAttribute("value", "fake-org-name");
+
+    await orgName.updateComplete;
+    await orgSlug.updateComplete;
 
     const listener = oneEvent(form, "submit");
 
+    // HACK Not completely sure why this works, but without calling `serialize`
+    // the form will not be serialized in `org-form`.
+    // Maybe due the implementation with `Reflect`?
+    // https://github.com/shoelace-style/shoelace/blob/0aecf6959986817d9315df90c898da55a8a64290/src/utilities/form.ts#L12
+    serialize(form);
+
     form.requestSubmit();
 
     await el.updateComplete;
diff --git a/frontend/src/pages/org/archived-item-detail/ui/qa.ts b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
index c1f6ee80..5638ae3d 100644
--- a/frontend/src/pages/org/archived-item-detail/ui/qa.ts
+++ b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
@@ -168,6 +168,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
   }
 
   render() {
+    const fileCount = this.crawl?.filePageCount || 0;
+    const errorCount = this.crawl?.errorPageCount || 0;
+    const doneCount = this.crawl?.stats?.done
+      ? parseInt(this.crawl.stats.done)
+      : 0;
+    const htmlCount = doneCount - fileCount - errorCount;
+
     return html`
@@ -261,9 +268,40 @@ export class ArchivedItemDetailQA extends TailwindElement {
-          ${when(this.mostRecentNonFailedQARun && this.qaRuns, (qaRuns) =>
-            this.renderAnalysis(qaRuns),
-          )}
+
+
+ ${msg("Crawl Results")} +
+ + + +
+
+
+

+ ${htmlCount} ${msg( + "HTML Pages", + )} +

+

+ ${fileCount} ${msg( + "Non-HTML Files Captured As Pages", + )} +

+

+ ${errorCount} ${msg( + "Failed Pages", + )} +

+
+ ${when(this.mostRecentNonFailedQARun && this.qaRuns, (qaRuns) => + this.renderAnalysis(qaRuns), + )} +

@@ -482,132 +520,128 @@ export class ArchivedItemDetailQA extends TailwindElement { } return html` - -
-
- ${msg("Page Match Analysis")} - ${when(this.qaRuns, (qaRuns) => { - const finishedQARuns = qaRuns.filter(({ state }) => - finishedCrawlStates.includes(state), - ); - const latestFinishedSelected = - this.qaRunId === finishedQARuns[0]?.id; +
+
+ ${msg("HTML Page Match Analysis")} + ${when(this.qaRuns, (qaRuns) => { + const finishedQARuns = qaRuns.filter(({ state }) => + finishedCrawlStates.includes(state), + ); + const latestFinishedSelected = + this.qaRunId === finishedQARuns[0]?.id; - const finishedAndRunningQARuns = qaRuns.filter( - ({ state }) => - finishedCrawlStates.includes(state) || - QA_RUNNING_STATES.includes(state), - ); - const mostRecentSelected = - this.qaRunId === finishedAndRunningQARuns[0]?.id; + const finishedAndRunningQARuns = qaRuns.filter( + ({ state }) => + finishedCrawlStates.includes(state) || + QA_RUNNING_STATES.includes(state), + ); + const mostRecentSelected = + this.qaRunId === finishedAndRunningQARuns[0]?.id; - return html` -
- + + - - ${mostRecentSelected - ? msg("Current") - : latestFinishedSelected - ? msg("Last Finished") - : msg("Outdated")} - - - ) => - (this.qaRunId = e.detail.item.id)} - > -
- `; - })} -
-
-
- ${qaRun.state === "starting" - ? msg("Analysis starting") - : `${formatNumber(qaRun.stats.done)}/${formatNumber(qaRun.stats.found)} - ${pluralOf("pages", qaRun.stats.found)} ${msg("analyzed")}`} -
- - - - -
+ ${mostRecentSelected + ? msg("Current") + : latestFinishedSelected + ? msg("Last Finished") + : msg("Outdated")} + + + ) => + (this.qaRunId = e.detail.item.id)} + > +
+ `; + })}
-
- - - - ${msg("Statistic")} - - - ${msg("Chart")} - - - - - - ${msg("Screenshots")} - - - ${this.qaStats.value - ? this.renderMeter( - qaRun.stats.found, - this.qaStats.value.screenshotMatch, - isRunning, - ) - : this.renderMeter()} - - - - - ${msg("Text")} - - - ${this.qaStats.value - ? this.renderMeter( - qaRun.stats.found, - this.qaStats.value.textMatch, - isRunning, - ) - : this.renderMeter()} - - - - -
-
-
- ${qaStatsThresholds.map( - (threshold) => html` -
-
- ${threshold.lowerBoundary} -
-
${threshold.label}
-
- `, +
+
+ ${qaRun.state === "starting" + ? msg("Analysis starting") + : `${formatNumber(qaRun.stats.done)}/${formatNumber(qaRun.stats.found)} + ${pluralOf("pages", qaRun.stats.found)} ${msg("analyzed")}`} +
+ + -
- + > + + +
+

+
+ + + + ${msg("Statistic")} + + ${msg("Chart")} + + + + + ${msg("Screenshots")} + + + ${this.qaStats.value + ? this.renderMeter( + qaRun.stats.found, + this.qaStats.value.screenshotMatch, + isRunning, + ) + : this.renderMeter()} + + + + + ${msg("Text")} + + + ${this.qaStats.value + ? this.renderMeter( + qaRun.stats.found, + this.qaStats.value.textMatch, + isRunning, + ) + : this.renderMeter()} + + + + +
+
+
+ ${qaStatsThresholds.map( + (threshold) => html` +
+
+ ${threshold.lowerBoundary} +
+
${threshold.label}
+
+ `, + )} +
+
     `;
   }
 
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 6e2a8c05..ec605894 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -175,6 +175,8 @@ type ArchivedItemBase = {
   activeQAStats: { done: number; found: number } | null;
   lastQAState: CrawlState | null;
   lastQAStarted: string | null;
+  filePageCount?: number;
+  errorPageCount?: number;
 };
 
 export type Crawl = ArchivedItemBase &
diff --git a/frontend/web-test-runner.config.mjs b/frontend/web-test-runner.config.mjs
index 455f2fe7..904bce07 100644
--- a/frontend/web-test-runner.config.mjs
+++ b/frontend/web-test-runner.config.mjs
@@ -66,12 +66,6 @@ export default {
         "@shoelace-style/shoelace/dist/themes/light.css": fileURLToPath(
           new URL("./src/__mocks__/_empty.js", import.meta.url),
         ),
-        color: fileURLToPath(
-          new URL("./src/__mocks__/color.js", import.meta.url),
-        ),
-        slugify: fileURLToPath(
-          new URL("./src/__mocks__/slugify.js", import.meta.url),
-        ),
       },
     },
   },