Merge branch 'main' into frontend-org-manage-readonly

sua yoo 2024-07-08 11:20:30 -07:00 committed by GitHub
commit c97900ec2b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 424 additions and 164 deletions

View File

@@ -197,7 +197,8 @@ class BackgroundJobOps:
job_id = await self.create_delete_replica_job(
org, file, object_id, object_type, replica_ref
)
ids.append(job_id)
if job_id:
ids.append(job_id)
return {"added": True, "ids": ids}
@@ -209,17 +210,17 @@ class BackgroundJobOps:
object_type: str,
replica_ref: StorageRef,
existing_job_id: Optional[str] = None,
) -> str:
) -> Optional[str]:
"""Create a job to delete one replica of a given file"""
replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
replica_endpoint, bucket_suffix = self.strip_bucket(
replica_storage.endpoint_url
)
replica_file_path = bucket_suffix + file.filename
job_type = BgJobType.DELETE_REPLICA.value
try:
replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
replica_endpoint, bucket_suffix = self.strip_bucket(
replica_storage.endpoint_url
)
replica_file_path = bucket_suffix + file.filename
job_type = BgJobType.DELETE_REPLICA.value
job_id = await self.crawl_manager.run_replica_job(
oid=str(org.id),
job_type=job_type,
@@ -262,11 +263,13 @@ class BackgroundJobOps:
return job_id
# pylint: disable=broad-exception-caught
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail=f"Error starting background job: {exc}"
print(
"warning: replica deletion job could not be started "
+ f"for {object_type} {file}: {exc}"
)
return None
async def job_finished(
self,

View File

@@ -162,7 +162,7 @@ def main():
init_uploads_api(*base_crawl_init)
org_ops.set_base_crawl_ops(base_crawl_ops)
org_ops.set_ops(base_crawl_ops, profiles, coll_ops)
user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)

View File

@@ -54,6 +54,7 @@ from .models import (
Collection,
OrgOutExport,
PageWithAllQA,
DeleteCrawlList,
)
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import slug_from_name, validate_slug, JSONSerializer
@@ -61,15 +62,20 @@ from .utils import slug_from_name, validate_slug, JSONSerializer
if TYPE_CHECKING:
from .invites import InviteOps
from .basecrawls import BaseCrawlOps
from .colls import CollectionOps
from .profiles import ProfileOps
from .users import UserManager
else:
InviteOps = BaseCrawlOps = UserManager = object
InviteOps = BaseCrawlOps = ProfileOps = CollectionOps = UserManager = object
DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization")
MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
# number of items to delete at a time
DEL_ITEMS = 1000
# ============================================================================
# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals
@@ -91,6 +97,7 @@ class OrgOps:
self.users_db = mdb["users"]
self.pages_db = mdb["pages"]
self.version_db = mdb["version"]
self.invites_db = mdb["invites"]
self.router = None
self.org_viewer_dep = None
@@ -104,9 +111,17 @@ class OrgOps:
self.user_manager = user_manager
self.register_to_org_id = os.environ.get("REGISTER_TO_ORG_ID")
def set_base_crawl_ops(self, base_crawl_ops: BaseCrawlOps) -> None:
def set_ops(
self,
base_crawl_ops: BaseCrawlOps,
profile_ops: ProfileOps,
coll_ops: CollectionOps,
) -> None:
"""Set base crawl ops"""
# pylint: disable=attribute-defined-outside-init
self.base_crawl_ops = base_crawl_ops
self.profile_ops = profile_ops
self.coll_ops = coll_ops
def set_default_primary_storage(self, storage: StorageRef):
"""set default primary storage"""
@@ -1058,6 +1073,59 @@ class OrgOps:
collection = json_stream.to_standard_types(collection)
await self.colls_db.insert_one(Collection.from_dict(collection).to_dict())
async def delete_org_and_data(self, org: Organization, user_manager: UserManager):
"""Delete org and all of its associated data."""
# Delete archived items
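# delete in DEL_ITEMS-sized batches: each to_list() call on the open cursor
# returns the next batch, so the loop ends once the cursor is exhausted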
cursor = self.crawls_db.find({"oid": org.id}, projection=["_id"])
items = await cursor.to_list(length=DEL_ITEMS)
while items:
item_ids = [item["_id"] for item in items]
await self.base_crawl_ops.delete_crawls_all_types(
delete_list=DeleteCrawlList(crawl_ids=item_ids), org=org
)
items = await cursor.to_list(length=DEL_ITEMS)
# Delete workflows and revisions
cursor = self.crawl_configs_db.find({"oid": org.id}, projection=["_id"])
workflows = await cursor.to_list(length=DEL_ITEMS)
while workflows:
workflow_ids = [workflow["_id"] for workflow in workflows]
await self.configs_revs_db.delete_many({"cid": {"$in": workflow_ids}})
workflows = await cursor.to_list(length=DEL_ITEMS)
await self.crawl_configs_db.delete_many({"oid": org.id})
# Delete profiles
async for profile in self.profiles_db.find({"oid": org.id}, projection=["_id"]):
await self.profile_ops.delete_profile(profile["_id"], org)
# Delete collections
async for coll in self.colls_db.find({"oid": org.id}, projection=["_id"]):
await self.coll_ops.delete_collection(coll["_id"], org)
# Delete users that only belong to this org
for org_user_id in org.users.keys():
user = await user_manager.get_by_id(UUID(org_user_id))
if not user:
continue
orgs, total_orgs = await self.get_orgs_for_user(user)
if total_orgs == 1:
first_org = orgs[0]
if first_org.id != org.id:
continue
await self.users_db.delete_one({"id": user.id})
# Delete invites
await self.invites_db.delete_many({"oid": org.id})
# Delete org
await self.orgs.delete_one({"_id": org.id})
return {"deleted": True}
# ============================================================================
# pylint: disable=too-many-statements, too-many-arguments
@@ -1206,6 +1274,15 @@ def init_orgs_api(
org_out.execMinutesQuotaReached = await ops.exec_mins_quota_reached(org.id)
return org_out
@router.delete("", tags=["organizations"])
async def delete_org(
org: Organization = Depends(org_dep), user: User = Depends(user_dep)
):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
return await ops.delete_org_and_data(org, user_manager)
@router.post("/rename", tags=["organizations"])
async def rename_org(
rename: RenameOrg,

View File

@@ -1028,5 +1028,5 @@ def test_delete_form_upload_and_crawls_from_all_crawls(
if count + 1 == MAX_ATTEMPTS:
assert False
time.sleep(5)
time.sleep(10)
count += 1

View File

@@ -0,0 +1,42 @@
import requests
from .conftest import API_PREFIX
def test_delete_org_non_superadmin(crawler_auth_headers, default_org_id):
# Assert that non-superadmin can't delete org
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}", headers=crawler_auth_headers
)
assert r.status_code == 403
assert r.json()["detail"] == "Not Allowed"
def test_delete_org_superadmin(admin_auth_headers, default_org_id):
# Track items in org to ensure they're deleted later (we may want to expand
# this, but currently only have the ability to check items across all orgs)
item_ids = []
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/all-crawls", headers=admin_auth_headers
)
assert r.status_code == 200
data = r.json()
assert data["total"] > 0
for item in data["items"]:
item_ids.append(item["id"])
# Delete org and its data
r = requests.delete(
f"{API_PREFIX}/orgs/{default_org_id}", headers=admin_auth_headers
)
assert r.status_code == 200
assert r.json()["deleted"]
# Ensure items got deleted
for item_id in item_ids:
r = requests.get(
f"{API_PREFIX}/orgs/all/all-crawls/{item_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 404

View File

@@ -38,6 +38,7 @@ metadata:
labels:
crawl: {{ id }}
role: crawler
network-policy: limit-crawler-egress
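# matched by the podSelector of the crawler egress NetworkPolicy (networkpolicies.yaml)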
spec:
hostname: {{ name }}

View File

@@ -7,6 +7,7 @@ metadata:
labels:
browser: {{ id }}
role: browser
network-policy: limit-crawler-egress
spec:
hostname: browser-{{ id }}

View File

@@ -0,0 +1,101 @@
{{- if .Values.crawler_enable_network_policy -}}
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: crawler-limit-egress
namespace: {{ .Values.crawler_namespace }}
spec:
podSelector:
matchLabels:
network-policy: limit-crawler-egress
policyTypes:
- Egress
egress:
{{- if .Values.crawler_network_policy_egress | default false -}}
{{- .Values.crawler_network_policy_egress | toYaml | nindent 4 -}}
{{- else }}
# allow WWW
- to:
- ipBlock:
cidr: 0.0.0.0/0
except: # Exclude traffic to Kubernetes service IPs and pods
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
# allow frontend access for QA runs
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
podSelector:
matchLabels:
role: frontend
ports:
- port: 80
protocol: TCP
# allow DNS
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: UDP
- port: 53
protocol: TCP
# allow other redis
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Values.crawler_namespace }}
podSelector:
matchLabels:
role: redis
ports:
- port: 6379
protocol: TCP
{{ if .Values.minio_local }}
# allow minio
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
podSelector:
matchLabels:
app: local-minio
ports:
- port: 9000
protocol: TCP
{{- end -}}
{{ if .Values.signer.enabled }}
# allow auth signer
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: {{ .Release.Namespace }}
podSelector:
matchLabels:
app: auth-signer
ports:
- port: {{ .Values.signer.port | default "5053" }}
protocol: TCP
{{- end -}}
{{- end -}}
{{- end -}}

View File

@@ -298,6 +298,12 @@ crawler_liveness_port: 6065
# crawler_fsgroup: 201400007
# optional: enable/disable crawler network policy
crawler_enable_network_policy: true
# optional: replace the default crawler egress policy with your own
# see chart/templates/networkpolicies.yaml for an example
# crawler_network_policy_egress: {}
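# for example (a minimal sketch with hypothetical values; adjust CIDRs and ports
# to your cluster, and note this list replaces the default egress rules entirely):
# crawler_network_policy_egress:
#   - to:
#       - ipBlock:
#           cidr: 0.0.0.0/0
#           except:
#             - 169.254.169.254/32
#   - to:
#       - namespaceSelector:
#           matchLabels:
#             kubernetes.io/metadata.name: kube-system
#     ports:
#       - port: 53
#         protocol: UDP
#       - port: 53
#         protocol: TCP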
# time to wait for graceful stop
grace_period: 1000

View File

@@ -13,6 +13,6 @@ else
fi
mkdir -p /etc/nginx/resolvers/
echo resolver $(awk 'BEGIN{ORS=" "} $1=="nameserver" {print $2}' /etc/resolv.conf) valid=10s ipv6=off";" > /etc/nginx/resolvers/resolvers.conf
echo resolver $(grep -oP '(?<=nameserver\s)[^\s]+' /etc/resolv.conf | awk '{ if ($1 ~ /:/) { printf "[" $1 "] "; } else { printf $1 " "; } }') valid=10s ipv6=off";" > /etc/nginx/resolvers/resolvers.conf
cat /etc/nginx/resolvers/resolvers.conf
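# for illustration (hypothetical nameservers): with "nameserver 127.0.0.11" and
# "nameserver fd00::1" in /etc/resolv.conf, the generated file would contain:
#   resolver 127.0.0.11 [fd00::1] valid=10s ipv6=off;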

View File

@@ -1,6 +0,0 @@
import { stub } from "sinon";
export default stub(() => ({
lighten: () => {},
darken: () => {},
}));

View File

@@ -1,3 +0,0 @@
export default function slugify(value) {
return value;
}

View File

@@ -1,5 +1,5 @@
import { expect, fixture, oneEvent } from "@open-wc/testing";
import type { SlInput } from "@shoelace-style/shoelace";
import { serialize, type SlInput } from "@shoelace-style/shoelace";
import { html } from "lit/static-html.js";
import { restore, stub } from "sinon";
@@ -65,15 +65,23 @@ describe("btrix-org-form", () => {
const form = el.shadowRoot!.querySelector<HTMLFormElement>("form")!;
form
.querySelector('sl-input[name="orgName"]')
?.setAttribute("value", "Fake Org Name");
form
.querySelector('sl-input[name="orgSlug"]')
?.setAttribute("value", "fake-org-name");
const orgName = form.querySelector<SlInput>('sl-input[name="orgName"]')!;
const orgSlug = form.querySelector<SlInput>('sl-input[name="orgSlug"]')!;
orgName.setAttribute("value", "Fake Org Name");
orgSlug.setAttribute("value", "fake-org-name");
await orgName.updateComplete;
await orgSlug.updateComplete;
const listener = oneEvent(form, "submit");
// HACK Not completely sure why this works, but without calling `serialize`
// the form will not be serialized in `org-form`.
// Maybe due to the implementation with `Reflect`?
// https://github.com/shoelace-style/shoelace/blob/0aecf6959986817d9315df90c898da55a8a64290/src/utilities/form.ts#L12
serialize(form);
form.requestSubmit();
await el.updateComplete;

View File

@@ -168,6 +168,13 @@ export class ArchivedItemDetailQA extends TailwindElement {
}
render() {
const fileCount = this.crawl?.filePageCount || 0;
const errorCount = this.crawl?.errorPageCount || 0;
const doneCount = this.crawl?.stats?.done
? parseInt(this.crawl.stats.done)
: 0;
const htmlCount = doneCount - fileCount - errorCount;
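// e.g. (hypothetical numbers) stats.done = "120", filePageCount = 15,
// errorPageCount = 5 => htmlCount = 100 successfully crawled HTML pages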
return html`
<div class="mb-5 rounded-lg border p-2">
<btrix-desc-list horizontal>
@@ -261,9 +268,40 @@ export class ArchivedItemDetailQA extends TailwindElement {
<sl-divider></sl-divider>
<btrix-tab-group-panel name="pages" class="block">
${when(this.mostRecentNonFailedQARun && this.qaRuns, (qaRuns) =>
this.renderAnalysis(qaRuns),
)}
<btrix-card class="gap-y-1">
<div slot="title" class="flex flex-wrap justify-between">
${msg("Crawl Results")}
<div class="text-neutral-500">
<sl-tooltip
content=${msg(
"Non-HTML files captured as pages are known good files that the crawler found as clickable links on a page and don't need to be analyzed. Failed pages did not respond when the crawler tried to visit them.",
)}
>
<sl-icon class="text-base" name="info-circle"></sl-icon>
</sl-tooltip>
</div>
</div>
<div>
<p>
<span class="text-primary">${htmlCount}</span> ${msg(
"HTML Pages",
)}
</p>
<p>
<span class="text-neutral-600">${fileCount}</span> ${msg(
"Non-HTML Files Captured As Pages",
)}
</p>
<p>
<span class="text-danger">${errorCount}</span> ${msg(
"Failed Pages",
)}
</p>
</div>
${when(this.mostRecentNonFailedQARun && this.qaRuns, (qaRuns) =>
this.renderAnalysis(qaRuns),
)}
</btrix-card>
<div>
<h4 class="mb-2 mt-4 text-lg leading-8">
@@ -482,132 +520,128 @@ export class ArchivedItemDetailQA extends TailwindElement {
}
return html`
<btrix-card>
<div slot="title" class="flex flex-wrap justify-between">
<div class="flex flex-wrap items-center gap-x-3 gap-y-1">
${msg("Page Match Analysis")}
${when(this.qaRuns, (qaRuns) => {
const finishedQARuns = qaRuns.filter(({ state }) =>
finishedCrawlStates.includes(state),
);
const latestFinishedSelected =
this.qaRunId === finishedQARuns[0]?.id;
<div
class="mb-3 mt-6 flex flex-wrap justify-between border-b pb-3 text-base font-semibold leading-none"
>
<div class="flex flex-wrap items-center gap-x-3">
${msg("HTML Page Match Analysis")}
${when(this.qaRuns, (qaRuns) => {
const finishedQARuns = qaRuns.filter(({ state }) =>
finishedCrawlStates.includes(state),
);
const latestFinishedSelected =
this.qaRunId === finishedQARuns[0]?.id;
const finishedAndRunningQARuns = qaRuns.filter(
({ state }) =>
finishedCrawlStates.includes(state) ||
QA_RUNNING_STATES.includes(state),
);
const mostRecentSelected =
this.qaRunId === finishedAndRunningQARuns[0]?.id;
const finishedAndRunningQARuns = qaRuns.filter(
({ state }) =>
finishedCrawlStates.includes(state) ||
QA_RUNNING_STATES.includes(state),
);
const mostRecentSelected =
this.qaRunId === finishedAndRunningQARuns[0]?.id;
return html`
<div>
<sl-tooltip
content=${mostRecentSelected
? msg("Youre viewing the latest analysis run results.")
: msg(
"Youre viewing results from an older analysis run.",
)}
return html`
<div>
<sl-tooltip
content=${mostRecentSelected
? msg("Youre viewing the latest analysis run results.")
: msg("Youre viewing results from an older analysis run.")}
>
<sl-tag
size="small"
variant=${mostRecentSelected ? "success" : "warning"}
>
<sl-tag
size="small"
variant=${mostRecentSelected ? "success" : "warning"}
>
${mostRecentSelected
? msg("Current")
: latestFinishedSelected
? msg("Last Finished")
: msg("Outdated")}
</sl-tag>
</sl-tooltip>
<btrix-qa-run-dropdown
.items=${finishedAndRunningQARuns}
selectedId=${this.qaRunId || ""}
@btrix-select=${(e: CustomEvent<SelectDetail>) =>
(this.qaRunId = e.detail.item.id)}
></btrix-qa-run-dropdown>
</div>
`;
})}
</div>
<div class="flex items-center gap-2 text-neutral-500">
<div class="text-sm font-normal">
${qaRun.state === "starting"
? msg("Analysis starting")
: `${formatNumber(qaRun.stats.done)}/${formatNumber(qaRun.stats.found)}
${pluralOf("pages", qaRun.stats.found)} ${msg("analyzed")}`}
</div>
<sl-tooltip
content=${msg(
"Match analysis compares pages during a crawl against their replay during an analysis run. A good match indicates that the crawl is probably good, whereas severe inconsistencies may indicate a bad crawl.",
)}
>
<sl-icon class="text-base" name="info-circle"></sl-icon>
</sl-tooltip>
</div>
${mostRecentSelected
? msg("Current")
: latestFinishedSelected
? msg("Last Finished")
: msg("Outdated")}
</sl-tag>
</sl-tooltip>
<btrix-qa-run-dropdown
.items=${finishedAndRunningQARuns}
selectedId=${this.qaRunId || ""}
@btrix-select=${(e: CustomEvent<SelectDetail>) =>
(this.qaRunId = e.detail.item.id)}
></btrix-qa-run-dropdown>
</div>
`;
})}
</div>
<figure>
<btrix-table class="grid-cols-[min-content_1fr]">
<btrix-table-head class="sr-only">
<btrix-table-header-cell>
${msg("Statistic")}
</btrix-table-header-cell>
<btrix-table-header-cell>
${msg("Chart")}
</btrix-table-header-cell>
</btrix-table-head>
<btrix-table-body>
<btrix-table-row>
<btrix-table-cell class="font-medium">
${msg("Screenshots")}
</btrix-table-cell>
<btrix-table-cell class="p-0">
${this.qaStats.value
? this.renderMeter(
qaRun.stats.found,
this.qaStats.value.screenshotMatch,
isRunning,
)
: this.renderMeter()}
</btrix-table-cell>
</btrix-table-row>
<btrix-table-row>
<btrix-table-cell class="font-medium">
${msg("Text")}
</btrix-table-cell>
<btrix-table-cell class="p-0">
${this.qaStats.value
? this.renderMeter(
qaRun.stats.found,
this.qaStats.value.textMatch,
isRunning,
)
: this.renderMeter()}
</btrix-table-cell>
</btrix-table-row>
</btrix-table-body>
</btrix-table>
</figure>
<figcaption slot="footer" class="mt-2">
<dl class="flex flex-wrap items-center justify-end gap-4">
${qaStatsThresholds.map(
(threshold) => html`
<div class="flex items-center gap-2">
<dt
class="size-4 flex-shrink-0 rounded"
style="background-color: ${threshold.cssColor}"
>
<span class="sr-only">${threshold.lowerBoundary}</span>
</dt>
<dd>${threshold.label}</dd>
</div>
`,
<div class="flex items-center gap-2 text-neutral-500">
<div class="text-sm font-normal">
${qaRun.state === "starting"
? msg("Analysis starting")
: `${formatNumber(qaRun.stats.done)}/${formatNumber(qaRun.stats.found)}
${pluralOf("pages", qaRun.stats.found)} ${msg("analyzed")}`}
</div>
<sl-tooltip
content=${msg(
"Match analysis compares pages during a crawl against their replay during an analysis run. A good match indicates that the crawl is probably good, whereas severe inconsistencies may indicate a bad crawl.",
)}
</dl>
</figcaption>
</btrix-card>
>
<sl-icon class="text-base" name="info-circle"></sl-icon>
</sl-tooltip>
</div>
</div>
<figure>
<btrix-table class="grid-cols-[min-content_1fr]">
<btrix-table-head class="sr-only">
<btrix-table-header-cell>
${msg("Statistic")}
</btrix-table-header-cell>
<btrix-table-header-cell> ${msg("Chart")} </btrix-table-header-cell>
</btrix-table-head>
<btrix-table-body>
<btrix-table-row>
<btrix-table-cell class="font-medium">
${msg("Screenshots")}
</btrix-table-cell>
<btrix-table-cell class="p-0">
${this.qaStats.value
? this.renderMeter(
qaRun.stats.found,
this.qaStats.value.screenshotMatch,
isRunning,
)
: this.renderMeter()}
</btrix-table-cell>
</btrix-table-row>
<btrix-table-row>
<btrix-table-cell class="font-medium">
${msg("Text")}
</btrix-table-cell>
<btrix-table-cell class="p-0">
${this.qaStats.value
? this.renderMeter(
qaRun.stats.found,
this.qaStats.value.textMatch,
isRunning,
)
: this.renderMeter()}
</btrix-table-cell>
</btrix-table-row>
</btrix-table-body>
</btrix-table>
</figure>
<figcaption slot="footer" class="mt-2">
<dl class="flex flex-wrap items-center justify-end gap-4">
${qaStatsThresholds.map(
(threshold) => html`
<div class="flex items-center gap-2">
<dt
class="size-4 flex-shrink-0 rounded"
style="background-color: ${threshold.cssColor}"
>
<span class="sr-only">${threshold.lowerBoundary}</span>
</dt>
<dd>${threshold.label}</dd>
</div>
`,
)}
</dl>
</figcaption>
`;
}

View File

@@ -175,6 +175,8 @@ type ArchivedItemBase = {
activeQAStats: { done: number; found: number } | null;
lastQAState: CrawlState | null;
lastQAStarted: string | null;
filePageCount?: number;
errorPageCount?: number;
};
export type Crawl = ArchivedItemBase &

View File

@@ -66,12 +66,6 @@ export default {
"@shoelace-style/shoelace/dist/themes/light.css": fileURLToPath(
new URL("./src/__mocks__/_empty.js", import.meta.url),
),
color: fileURLToPath(
new URL("./src/__mocks__/color.js", import.meta.url),
),
slugify: fileURLToPath(
new URL("./src/__mocks__/slugify.js", import.meta.url),
),
},
},
},