"""
Crawl-related models and types
"""
from datetime import datetime
from enum import Enum, IntEnum
from uuid import UUID
import os
from typing import Optional, List, Dict, Union, Literal, Any, get_args
from pydantic import (
BaseModel,
conint,
Field,
HttpUrl,
AnyHttpUrl,
EmailStr,
ConstrainedStr,
)
# from fastapi_users import models as fastapi_users_models
from .db import BaseMongoModel
from .utils import dt_now
# crawl scale for constraint
MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
# pylint: disable=invalid-name, too-many-lines
# ============================================================================
class UserRole(IntEnum):
"""User role"""
VIEWER = 10
CRAWLER = 20
OWNER = 40
SUPERADMIN = 100
# ============================================================================
### INVITES ###
# ============================================================================
class InvitePending(BaseMongoModel):
"""An invite for a new user, with an email and invite token as id"""
id: UUID
created: datetime
tokenHash: str
inviterEmail: str
fromSuperuser: Optional[bool]
oid: Optional[UUID]
role: UserRole = UserRole.VIEWER
email: Optional[str]
# set if existing user
userid: Optional[UUID]
# ============================================================================
class InviteOut(BaseModel):
"""Single invite output model"""
created: datetime
inviterEmail: str
inviterName: str
oid: Optional[UUID]
orgName: Optional[str]
orgSlug: Optional[str]
role: UserRole = UserRole.VIEWER
email: Optional[str]
firstOrgAdmin: Optional[bool] = None
# ============================================================================
class InviteRequest(BaseModel):
"""Request to invite another user"""
email: str
# ============================================================================
class InviteToOrgRequest(InviteRequest):
"""Request to invite another user to an organization"""
role: UserRole
# ============================================================================
class AddToOrgRequest(InviteRequest):
"""Request to add a new user to an organization directly"""
role: UserRole
password: str
name: str
# ============================================================================
class InviteAddedResponse(BaseModel):
"""Response for API endpoints that add resource and return id and name"""
added: bool
id: UUID
invited: str
token: UUID
# ============================================================================
### MAIN USER MODEL ###
# ============================================================================
class User(BaseModel):
"""
User Model
"""
id: UUID
name: str = ""
email: EmailStr
is_superuser: bool = False
is_verified: bool = False
hashed_password: str
def dict(self, *a, **kw):
"""ensure invites / hashed_password never serialize, just in case"""
exclude = kw.get("exclude") or set()
exclude.add("invites")
exclude.add("hashed_password")
return super().dict(*a, **kw)
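# Illustrative sketch (not part of the model): the dict() override above
# guarantees sensitive fields never serialize, e.g. (assuming uuid4 from uuid):
#
#   user = User(id=uuid4(), email="a@example.com", hashed_password="secret")
#   "hashed_password" in user.dict()  # -> False, always excluded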
# ============================================================================
class FailedLogin(BaseMongoModel):
"""
Failed login model
"""
    # use default_factory so each record gets the current time,
    # not the time the module was imported
    attempted: datetime = Field(default_factory=dt_now)
email: str
    # Consecutive failed logins, reset to 0 on successful login or after a
    # password reset. If count reaches 5 within the hour before this object
    # expires and is deleted, the user cannot log in until they reset their
    # password.
count: int = 1
# ============================================================================
class UserOrgInfoOut(BaseModel):
"""org per user"""
id: UUID
name: str
slug: str
default: bool
role: UserRole
# ============================================================================
class UserOut(BaseModel):
"""Output User model"""
id: UUID
name: str = ""
email: EmailStr
is_superuser: bool = False
is_verified: bool = False
orgs: List[UserOrgInfoOut]
# ============================================================================
### CRAWL STATES
# ============================================================================
TYPE_RUNNING_STATES = Literal[
"running", "pending-wait", "generate-wacz", "uploading-wacz"
]
RUNNING_STATES = get_args(TYPE_RUNNING_STATES)
TYPE_STARTING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
STARTING_STATES = get_args(TYPE_STARTING_STATES)
TYPE_FAILED_STATES = Literal["canceled", "failed", "skipped_quota_reached"]
FAILED_STATES = get_args(TYPE_FAILED_STATES)
TYPE_SUCCESSFUL_STATES = Literal["complete", "stopped_by_user", "stopped_quota_reached"]
SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)
TYPE_RUNNING_AND_STARTING_STATES = Literal[TYPE_STARTING_STATES, TYPE_RUNNING_STATES]
RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES]
RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES]
TYPE_NON_RUNNING_STATES = Literal[TYPE_FAILED_STATES, TYPE_SUCCESSFUL_STATES]
NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]
TYPE_ALL_CRAWL_STATES = Literal[
TYPE_RUNNING_AND_STARTING_STATES, TYPE_NON_RUNNING_STATES
]
ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES]
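# Illustrative note: each TYPE_* Literal has a matching runtime list via
# get_args(), so model fields get static checking while handlers can do
# simple membership tests, e.g.:
#
#   def is_active(state: str) -> bool:
#       return state in RUNNING_AND_STARTING_STATES
#
#   is_active("running")   # -> True
#   is_active("complete")  # -> False (a successful, non-running state)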
# ============================================================================
### CRAWL CONFIGS ###
# ============================================================================
class JobType(str, Enum):
"""Job Types"""
URL_LIST = "url-list"
SEED_CRAWL = "seed-crawl"
CUSTOM = "custom"
# ============================================================================
class ScopeType(str, Enum):
"""Crawl scope type"""
PAGE = "page"
PAGE_SPA = "page-spa"
PREFIX = "prefix"
HOST = "host"
DOMAIN = "domain"
ANY = "any"
CUSTOM = "custom"
# ============================================================================
class EmptyStr(ConstrainedStr):
"""empty string only"""
min_length = 0
max_length = 0
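# Illustrative note: EmptyStr validates only the empty string (min and max
# length are both 0). It is used below in UpdateCrawlConfig.profileid so
# clients can explicitly clear a value, e.g. (sketch):
#
#   class _Example(BaseModel):
#       value: EmptyStr
#
#   _Example(value="")     # ok
#   _Example(value="abc")  # raises ValidationError: max_length is 0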
# ============================================================================
class Seed(BaseModel):
"""Crawl seed"""
url: HttpUrl
scopeType: Optional[ScopeType] = None
include: Union[str, List[str], None] = None
exclude: Union[str, List[str], None] = None
sitemap: Union[bool, HttpUrl, None] = None
allowHash: Optional[bool] = None
depth: Optional[int] = None
extraHops: Optional[int] = None
# ============================================================================
class RawCrawlConfig(BaseModel):
"""Base Crawl Config"""
seeds: Optional[List[Seed]]
scopeType: Optional[ScopeType] = ScopeType.PREFIX
include: Union[str, List[str], None] = None
exclude: Union[str, List[str], None] = None
depth: Optional[int] = -1
limit: Optional[int] = 0
extraHops: Optional[int] = 0
lang: Optional[str]
blockAds: Optional[bool] = False
behaviorTimeout: Optional[int]
pageLoadTimeout: Optional[int]
pageExtraDelay: Optional[int] = 0
postLoadDelay: Optional[int] = 0
workers: Optional[int] = None
headless: Optional[bool] = None
generateWACZ: Optional[bool] = None
combineWARC: Optional[bool] = None
useSitemap: Optional[bool] = False
failOnFailedSeed: Optional[bool] = False
logging: Optional[str] = None
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
userAgent: Optional[str] = None
# ============================================================================
class CrawlConfigIn(BaseModel):
"""CrawlConfig input model, submitted via API"""
schedule: Optional[str] = ""
runNow: bool = False
config: RawCrawlConfig
name: str
description: Optional[str]
jobType: Optional[JobType] = JobType.CUSTOM
profileid: Optional[UUID] = None
crawlerChannel: str = "default"
autoAddCollections: Optional[List[UUID]] = []
tags: Optional[List[str]] = []
crawlTimeout: int = 0
maxCrawlSize: int = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
crawlFilenameTemplate: Optional[str] = None
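# Illustrative sketch of a minimal CrawlConfigIn payload (values are
# placeholders; the filename template syntax is the crawler's, shown here
# only as an assumed example):
#
#   CrawlConfigIn(
#       name="Example crawl",
#       runNow=True,
#       config=RawCrawlConfig(
#           seeds=[Seed(url="https://example.com/")],
#           scopeType=ScopeType.PREFIX,
#       ),
#       crawlFilenameTemplate="@ts-@id.wacz",
#   )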
# ============================================================================
class ConfigRevision(BaseMongoModel):
"""Crawl Config Revision"""
cid: UUID
schedule: Optional[str] = ""
config: RawCrawlConfig
profileid: Optional[UUID]
crawlerChannel: Optional[str]
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
modified: datetime
modifiedBy: Optional[UUID]
rev: int = 0
# ============================================================================
class CrawlConfigCore(BaseMongoModel):
"""Core data shared between crawls and crawlconfigs"""
schedule: Optional[str] = ""
jobType: Optional[JobType] = JobType.CUSTOM
config: Optional[RawCrawlConfig]
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
oid: UUID
profileid: Optional[UUID]
crawlerChannel: Optional[str] = None
# ============================================================================
class CrawlConfigAdditional(BaseModel):
"""Additional fields shared by CrawlConfig and CrawlConfigOut."""
name: Optional[str]
description: Optional[str]
created: datetime
createdBy: Optional[UUID]
modified: Optional[datetime]
modifiedBy: Optional[UUID]
autoAddCollections: Optional[List[UUID]] = []
inactive: Optional[bool] = False
rev: int = 0
crawlAttemptCount: Optional[int] = 0
crawlCount: Optional[int] = 0
crawlSuccessfulCount: Optional[int] = 0
totalSize: Optional[int] = 0
lastCrawlId: Optional[str]
lastCrawlStartTime: Optional[datetime]
lastStartedBy: Optional[UUID]
lastCrawlTime: Optional[datetime]
lastCrawlState: Optional[str]
lastCrawlSize: Optional[int]
lastRun: Optional[datetime]
isCrawlRunning: Optional[bool] = False
crawlFilenameTemplate: Optional[str] = None
# ============================================================================
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
"""Schedulable config"""
id: UUID
config: RawCrawlConfig
createdByName: Optional[str]
modifiedByName: Optional[str]
lastStartedByName: Optional[str]
def get_raw_config(self):
"""serialize config for browsertrix-crawler"""
return self.config.dict(exclude_unset=True, exclude_none=True)
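# Illustrative note: exclude_unset + exclude_none above means the crawler
# receives only explicitly configured values, e.g. (sketch):
#
#   cfg = RawCrawlConfig(seeds=[Seed(url="https://example.com/")], limit=10)
#   cfg.dict(exclude_unset=True, exclude_none=True)
#   # -> roughly {"seeds": [{"url": "https://example.com/"}], "limit": 10};
#   # unset defaults such as scopeType and behaviors are omitted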
# ============================================================================
class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
"""Crawl Config Output"""
id: UUID
lastCrawlStopping: Optional[bool] = False
profileName: Optional[str]
firstSeed: Optional[str]
seedCount: int = 0
createdByName: Optional[str]
modifiedByName: Optional[str]
lastStartedByName: Optional[str]
# ============================================================================
class CrawlConfigProfileOut(BaseMongoModel):
"""Crawl Config basic info for profiles"""
name: str
firstSeed: str
seedCount: int
# ============================================================================
class UpdateCrawlConfig(BaseModel):
"""Update crawl config name, crawl schedule, or tags"""
# metadata: not revision tracked
name: Optional[str] = None
tags: Optional[List[str]] = None
description: Optional[str] = None
autoAddCollections: Optional[List[UUID]] = None
runNow: bool = False
# crawl data: revision tracked
schedule: Optional[str] = None
profileid: Union[UUID, EmptyStr, None] = None
crawlerChannel: Optional[str] = None
crawlTimeout: Optional[int] = None
maxCrawlSize: Optional[int] = None
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = None # type: ignore
crawlFilenameTemplate: Optional[str] = None
config: Optional[RawCrawlConfig] = None
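# Illustrative note on the three-state profileid field above (the update
# handler is assumed to treat EmptyStr as an explicit clear; the model
# itself only distinguishes the types):
#
#   UpdateCrawlConfig(profileid=some_uuid)  # UUID: switch to that profile
#   UpdateCrawlConfig(profileid="")         # EmptyStr: detach any profile
#   UpdateCrawlConfig()                     # None: leave profile unchanged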
# ============================================================================
class CrawlConfigAddedResponse(BaseModel):
"""Response model for adding crawlconfigs"""
added: bool
id: str
run_now_job: Optional[str]
storageQuotaReached: bool
execMinutesQuotaReached: bool
# ============================================================================
class CrawlConfigTags(BaseModel):
"""Response model for crawlconfig tags"""
tags: List[str]
# ============================================================================
class CrawlConfigSearchValues(BaseModel):
"""Response model for adding crawlconfigs"""
names: List[str]
descriptions: List[str]
firstSeeds: List[AnyHttpUrl]
workflowIds: List[UUID]
# ============================================================================
class CrawlConfigUpdateResponse(BaseModel):
"""Response model for updating crawlconfigs"""
updated: bool
settings_changed: bool
metadata_changed: bool
storageQuotaReached: Optional[bool]
execMinutesQuotaReached: Optional[bool]
started: Optional[str]
# ============================================================================
class CrawlConfigDeletedResponse(BaseModel):
"""Response model for deleting crawlconfigs"""
success: bool
status: str
# ============================================================================
### CRAWLER VERSIONS ###
# ============================================================================
class CrawlerChannel(BaseModel):
"""Crawler version available to use in workflows"""
id: str
image: str
# ============================================================================
class CrawlerChannels(BaseModel):
"""List of CrawlerChannel instances for API"""
channels: List[CrawlerChannel] = []
# ============================================================================
### BASE CRAWLS ###
# ============================================================================
class StorageRef(BaseModel):
"""Reference to actual storage"""
name: str
custom: Optional[bool]
def __init__(self, *args, **kwargs):
if args:
            if args[0].startswith("cs-"):
                # strip the 3-char "cs-" prefix so str() round-trips cleanly
                super().__init__(name=args[0][3:], custom=True)
else:
super().__init__(name=args[0], custom=False)
else:
super().__init__(**kwargs)
def __str__(self):
if not self.custom:
return self.name
return "cs-" + self.name
def get_storage_secret_name(self, oid: str) -> str:
"""get k8s secret name for this storage and oid"""
if not self.custom:
return "storage-" + self.name
return f"storage-cs-{self.name}-{oid[:12]}"
def get_storage_extra_path(self, oid: str) -> str:
"""return extra path added to the endpoint
using oid for default storages, no extra path for custom"""
if not self.custom:
return oid + "/"
return ""
# ============================================================================
class BaseFile(BaseModel):
"""Base model for crawl and profile files"""
filename: str
hash: str
size: int
storage: StorageRef
replicas: Optional[List[StorageRef]] = []
# ============================================================================
class CrawlFile(BaseFile):
"""file from a crawl"""
presignedUrl: Optional[str]
expireAt: Optional[datetime]
crc32: int = 0
# ============================================================================
class CrawlFileOut(BaseModel):
"""output for file from a crawl (conformance to Data Resource Spec)"""
name: str
path: str
hash: str
crc32: int = 0
size: int
crawlId: Optional[str]
numReplicas: int = 0
expireAt: Optional[str]
# ============================================================================
class CrawlStats(BaseModel):
"""Crawl Stats for pages and size"""
found: int = 0
done: int = 0
size: int = 0
# ============================================================================
class CoreCrawlable(BaseModel):
# pylint: disable=too-few-public-methods
"""Core properties for crawlable run (crawl or qa run)"""
id: str
userid: UUID
userName: Optional[str]
started: datetime
finished: Optional[datetime] = None
state: str
crawlExecSeconds: int = 0
image: Optional[str]
stats: Optional[CrawlStats] = CrawlStats()
files: List[CrawlFile] = []
fileSize: int = 0
fileCount: int = 0
errors: Optional[List[str]] = []
# ============================================================================
class BaseCrawl(CoreCrawlable, BaseMongoModel):
"""Base Crawl object (representing crawls, uploads and manual sessions)"""
type: str
oid: UUID
cid: Optional[UUID] = None
name: Optional[str] = ""
description: Optional[str] = ""
tags: Optional[List[str]] = []
collectionIds: Optional[List[UUID]] = []
reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore
# ============================================================================
class CollIdName(BaseModel):
"""Collection id and name object"""
id: UUID
name: str
# ============================================================================
class CrawlOut(BaseMongoModel):
"""Crawl output model, shared across all crawl types"""
# pylint: disable=duplicate-code
type: Optional[str]
id: str
userid: UUID
userName: Optional[str]
oid: UUID
profileid: Optional[UUID]
name: Optional[str]
description: Optional[str]
started: datetime
finished: Optional[datetime]
state: str
stats: Optional[CrawlStats]
fileSize: int = 0
fileCount: int = 0
tags: Optional[List[str]] = []
errors: Optional[List[str]] = []
collectionIds: Optional[List[UUID]] = []
crawlExecSeconds: int = 0
qaCrawlExecSeconds: int = 0
# automated crawl fields
config: Optional[RawCrawlConfig]
cid: Optional[UUID]
firstSeed: Optional[str]
seedCount: Optional[int]
profileName: Optional[str]
stopping: Optional[bool]
manual: Optional[bool]
cid_rev: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] # type: ignore
storageQuotaReached: Optional[bool]
execMinutesQuotaReached: Optional[bool]
crawlerChannel: str = "default"
image: Optional[str]
reviewStatus: Optional[conint(ge=1, le=5)] = None # type: ignore
qaRunCount: int = 0
activeQAStats: Optional[CrawlStats]
lastQAState: Optional[str]
lastQAStarted: Optional[datetime]
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlOutWithResources(CrawlOut):
"""Crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
collections: Optional[List[CollIdName]] = []
# ============================================================================
class UpdateCrawl(BaseModel):
"""Update crawl"""
name: Optional[str]
description: Optional[str]
tags: Optional[List[str]]
collectionIds: Optional[List[UUID]]
reviewStatus: Optional[conint(ge=1, le=5)] # type: ignore
# ============================================================================
class DeleteCrawlList(BaseModel):
"""delete crawl list POST body"""
crawl_ids: List[str]
# ============================================================================
class DeleteQARunList(BaseModel):
"""delete qa run list POST body"""
qa_run_ids: List[str]
# ============================================================================
class CrawlSearchValuesResponse(BaseModel):
"""Response model for crawl search values"""
names: List[str]
descriptions: List[str]
firstSeeds: List[AnyHttpUrl]
# ============================================================================
class CrawlQueueUrl(BaseModel):
"""Model for item in crawl queue"""
seedId: int
url: AnyHttpUrl
depth: int
extraHops: int
ts: int
pageid: Optional[str]
# ============================================================================
class CrawlQueueResponse(BaseModel):
"""Response model for GET crawl queue"""
total: int
results: List[CrawlQueueUrl]
matched: List[CrawlQueueUrl]
# ============================================================================
class MatchCrawlQueueResponse(BaseModel):
"""Response model for match crawl queue"""
total: int
matched: List[CrawlQueueUrl]
nextOffset: int
# ============================================================================
### AUTOMATED CRAWLS ###
# ============================================================================
class CrawlScale(BaseModel):
"""scale the crawl to N parallel containers"""
scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1 # type: ignore
# ============================================================================
class QARun(CoreCrawlable, BaseModel):
"""Subdocument to track QA runs for given crawl"""
# ============================================================================
class QARunWithResources(QARun):
"""QA crawl output model including resources"""
resources: Optional[List[CrawlFileOut]] = []
# ============================================================================
class QARunOut(BaseModel):
"""QA Run Output"""
id: str
userName: Optional[str]
started: datetime
finished: Optional[datetime] = None
state: str
crawlExecSeconds: int = 0
stats: CrawlStats = CrawlStats()
# ============================================================================
class QARunBucketStats(BaseModel):
"""Model for per-bucket aggregate stats results"""
lowerBoundary: str
count: int
# ============================================================================
class QARunAggregateStatsOut(BaseModel):
"""QA Run aggregate stats out"""
screenshotMatch: List[QARunBucketStats]
textMatch: List[QARunBucketStats]
# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
"""Store State of a Crawl (Finished or Running)"""
type: Literal["crawl"] = "crawl"
cid: UUID
config: RawCrawlConfig
cid_rev: int = 0
# schedule: Optional[str]
manual: Optional[bool]
stopping: Optional[bool] = False
qaCrawlExecSeconds: int = 0
qa: Optional[QARun] = None
qaFinished: Optional[Dict[str, QARun]] = {}
filePageCount: Optional[int] = 0
errorPageCount: Optional[int] = 0
# ============================================================================
class CrawlCompleteIn(BaseModel):
"""Completed Crawl Webhook POST message"""
id: str
user: str
filename: str
size: int
hash: str
crc32: int = 0
completed: Optional[bool] = True
# ============================================================================
class CrawlScaleResponse(BaseModel):
"""Response model for modifying crawl scale"""
scaled: int
# ============================================================================
### UPLOADED CRAWLS ###
# ============================================================================
class UploadedCrawl(BaseCrawl):
"""Store State of a Crawl Upload"""
type: Literal["upload"] = "upload"
# ============================================================================
class UpdateUpload(UpdateCrawl):
"""Update modal that also includes name"""
# ============================================================================
### COLLECTIONS ###
# ============================================================================
class Collection(BaseMongoModel):
"""Org collection structure"""
name: str = Field(..., min_length=1)
oid: UUID
description: Optional[str]
modified: Optional[datetime]
crawlCount: Optional[int] = 0
pageCount: Optional[int] = 0
totalSize: Optional[int] = 0
# Sorted by count, descending
tags: Optional[List[str]] = []
isPublic: Optional[bool] = False
# ============================================================================
class CollIn(BaseModel):
"""Collection Passed in By User"""
name: str = Field(..., min_length=1)
description: Optional[str]
crawlIds: Optional[List[str]] = []
isPublic: Optional[bool] = False
# ============================================================================
class CollOut(Collection):
"""Collection output model with annotations."""
resources: List[CrawlFileOut] = []
# ============================================================================
class UpdateColl(BaseModel):
"""Update collection"""
name: Optional[str]
description: Optional[str]
isPublic: Optional[bool]
# ============================================================================
class AddRemoveCrawlList(BaseModel):
"""Collections to add or remove from collection"""
crawlIds: List[str] = []
# ============================================================================
class CollectionSearchValuesResponse(BaseModel):
"""Response model for collections search values"""
names: List[str]
# ============================================================================
### ORGS ###
# ============================================================================
class UpdateRole(InviteToOrgRequest):
"""Update existing role for user"""
# ============================================================================
class RemoveFromOrg(InviteRequest):
"""Remove this user from org"""
# ============================================================================
class RemovePendingInvite(InviteRequest):
"""Delete pending invite to org by email"""
# ============================================================================
class RenameOrg(BaseModel):
"""Rename an existing org"""
name: str
slug: Optional[str] = None
# ============================================================================
class OrgStorageRefs(BaseModel):
"""Input model for setting primary storage + optional replicas"""
storage: StorageRef
storageReplicas: List[StorageRef] = []
# ============================================================================
class S3StorageIn(BaseModel):
"""Custom S3 Storage input model"""
type: Literal["s3"] = "s3"
name: str
access_key: str
secret_key: str
endpoint_url: str
bucket: str
access_endpoint_url: Optional[str]
region: str = ""
# ============================================================================
class S3Storage(BaseModel):
"""S3 Storage Model"""
type: Literal["s3"] = "s3"
endpoint_url: str
endpoint_no_bucket_url: str
access_key: str
secret_key: str
access_endpoint_url: str
region: str = ""
use_access_for_presign: bool = True
# ============================================================================
# Subscriptions
# ============================================================================
PAUSED_PAYMENT_FAILED = "paused_payment_failed"
ACTIVE = "active"
REASON_PAUSED = "subscriptionPaused"
REASON_CANCELED = "subscriptionCanceled"
# ============================================================================
class OrgQuotas(BaseModel):
"""Organization quotas (settable by superadmin)"""
maxConcurrentCrawls: Optional[int] = 0
maxPagesPerCrawl: Optional[int] = 0
storageQuota: Optional[int] = 0
maxExecMinutesPerMonth: Optional[int] = 0
extraExecMinutes: Optional[int] = 0
giftedExecMinutes: Optional[int] = 0
# ============================================================================
class SubscriptionEventOut(BaseModel):
"""Fields to add to output models for subscription events"""
oid: UUID
timestamp: datetime
# ============================================================================
class SubscriptionCreate(BaseModel):
"""create new subscription"""
subId: str
status: str
planId: str
firstAdminInviteEmail: str
quotas: Optional[OrgQuotas] = None
# ============================================================================
class SubscriptionCreateOut(SubscriptionCreate, SubscriptionEventOut):
"""Output model for subscription creation event"""
type: Literal["create"] = "create"
# ============================================================================
class SubscriptionImport(BaseModel):
"""import subscription to existing org"""
subId: str
status: str
planId: str
oid: UUID
# ============================================================================
class SubscriptionImportOut(SubscriptionImport, SubscriptionEventOut):
"""Output model for subscription import event"""
type: Literal["import"] = "import"
# ============================================================================
class SubscriptionUpdate(BaseModel):
"""update subscription data"""
subId: str
status: str
planId: str
futureCancelDate: Optional[datetime]
# ============================================================================
class SubscriptionUpdateOut(SubscriptionUpdate, SubscriptionEventOut):
"""Output model for subscription update event"""
type: Literal["update"] = "update"
# ============================================================================
class SubscriptionCancel(BaseModel):
"""cancel subscription"""
subId: str
# ============================================================================
class SubscriptionCancelOut(SubscriptionCancel, SubscriptionEventOut):
"""Output model for subscription cancellation event"""
type: Literal["cancel"] = "cancel"
# ============================================================================
class SubscriptionPortalUrlRequest(BaseModel):
"""Request for subscription update pull"""
subId: str
planId: str
# ============================================================================
class SubscriptionPortalUrlResponse(BaseModel):
"""Response for subscription update pull"""
portalUrl: str = ""
# ============================================================================
class Subscription(BaseModel):
"""subscription data"""
subId: str
status: str
planId: str
futureCancelDate: Optional[datetime] = None
readOnlyOnCancel: bool = False
# ============================================================================
class SubscriptionCanceledResponse(BaseModel):
"""Response model for subscription cancel"""
deleted: bool
canceled: bool
# ============================================================================
# ORGS
# ============================================================================
class OrgReadOnlyOnCancel(BaseModel):
"""Make org readOnly on subscription cancellation instead of deleting"""
readOnlyOnCancel: bool
# ============================================================================
class OrgCreate(BaseModel):
"""Create a new org"""
name: str
slug: Optional[str] = None
# ============================================================================
class OrgQuotaUpdate(BaseModel):
"""Organization quota update (to track changes over time)"""
modified: datetime
update: OrgQuotas
# ============================================================================
class OrgReadOnlyUpdate(BaseModel):
"""Organization readonly update"""
readOnly: bool
readOnlyReason: Optional[str] = None
# ============================================================================
class OrgWebhookUrls(BaseModel):
"""Organization webhook URLs"""
crawlStarted: Optional[AnyHttpUrl] = None
crawlFinished: Optional[AnyHttpUrl] = None
crawlDeleted: Optional[AnyHttpUrl] = None
uploadFinished: Optional[AnyHttpUrl] = None
uploadDeleted: Optional[AnyHttpUrl] = None
addedToCollection: Optional[AnyHttpUrl] = None
removedFromCollection: Optional[AnyHttpUrl] = None
collectionDeleted: Optional[AnyHttpUrl] = None
# ============================================================================
class OrgOut(BaseMongoModel):
"""Organization API output model"""
id: UUID
name: str
slug: str
users: Optional[Dict[str, Any]]
created: Optional[datetime]
default: bool = False
bytesStored: int
bytesStoredCrawls: int
bytesStoredUploads: int
bytesStoredProfiles: int
origin: Optional[AnyHttpUrl] = None
storageQuotaReached: Optional[bool]
execMinutesQuotaReached: Optional[bool]
# total usage and exec time
usage: Optional[Dict[str, int]]
crawlExecSeconds: Dict[str, int] = {}
# qa only usage + exec time
qaUsage: Optional[Dict[str, int]] = {}
qaCrawlExecSeconds: Dict[str, int] = {}
# exec time limits
monthlyExecSeconds: Dict[str, int] = {}
extraExecSeconds: Dict[str, int] = {}
giftedExecSeconds: Dict[str, int] = {}
extraExecSecondsAvailable: int = 0
giftedExecSecondsAvailable: int = 0
quotas: OrgQuotas = OrgQuotas()
quotaUpdates: Optional[List[OrgQuotaUpdate]] = []
webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
readOnly: Optional[bool]
readOnlyReason: Optional[str]
subscription: Optional[Subscription]
# ============================================================================
class Organization(BaseMongoModel):
"""Organization Base Model"""
id: UUID
name: str
slug: str
users: Dict[str, UserRole] = {}
created: Optional[datetime]
default: bool = False
storage: StorageRef
storageReplicas: List[StorageRef] = []
customStorages: Dict[str, S3Storage] = {}
bytesStored: int = 0
bytesStoredCrawls: int = 0
bytesStoredUploads: int = 0
bytesStoredProfiles: int = 0
# total usage + exec time
usage: Dict[str, int] = {}
crawlExecSeconds: Dict[str, int] = {}
# qa only usage + exec time
qaUsage: Dict[str, int] = {}
qaCrawlExecSeconds: Dict[str, int] = {}
# exec time limits
monthlyExecSeconds: Dict[str, int] = {}
extraExecSeconds: Dict[str, int] = {}
giftedExecSeconds: Dict[str, int] = {}
extraExecSecondsAvailable: int = 0
giftedExecSecondsAvailable: int = 0
quotas: OrgQuotas = OrgQuotas()
quotaUpdates: Optional[List[OrgQuotaUpdate]] = []
webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
origin: Optional[AnyHttpUrl] = None
readOnly: Optional[bool] = False
readOnlyReason: Optional[str] = None
subscription: Optional[Subscription] = None
def is_owner(self, user):
"""Check if user is owner"""
return self._is_auth(user, UserRole.OWNER)
def is_crawler(self, user):
"""Check if user can crawl (write)"""
return self._is_auth(user, UserRole.CRAWLER)
def is_viewer(self, user):
"""Check if user can view (read)"""
return self._is_auth(user, UserRole.VIEWER)
def _is_auth(self, user, value):
"""Check if user has at least specified permission level"""
if user.is_superuser:
return True
res = self.users.get(str(user.id))
if not res:
return False
return res >= value
async def serialize_for_user(self, user: User, user_manager) -> OrgOut:
"""Serialize result based on current user access"""
exclude = {"storage"}
if not self.is_owner(user):
exclude.add("users")
if not self.is_crawler(user):
exclude.add("usage")
exclude.add("crawlExecSeconds")
result = self.to_dict(
exclude_unset=True,
exclude_none=True,
exclude=exclude,
)
if self.is_owner(user):
result["users"] = {}
keys = list(self.users.keys())
user_list = await user_manager.get_user_names_by_ids(keys)
for org_user in user_list:
id_ = str(org_user["id"])
role = self.users.get(id_)
if not role:
continue
result["users"][id_] = {
"role": role,
"name": org_user.get("name", ""),
"email": org_user.get("email", ""),
}
return OrgOut.from_dict(result)
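# Illustrative note: since UserRole is an IntEnum, _is_auth() compares role
# levels numerically, so higher roles imply all lower ones, e.g.:
#
#   org.users = {str(user.id): UserRole.OWNER}
#   org.is_crawler(user)  # -> True (OWNER = 40 >= CRAWLER = 20)
#   org.is_viewer(user)   # -> True (OWNER = 40 >= VIEWER = 10)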
# ============================================================================
class OrgOutExport(Organization):
"""Org out for export"""
# Additional field so export contains user names and emails
userDetails: Optional[List[Dict[str, Union[str, int, UUID]]]]
async def serialize_for_export(self, user_manager):
"""Serialize result with users for org export"""
result = self.to_dict()
user_details = []
keys = list(self.users.keys())
user_list = await user_manager.get_user_names_by_ids(keys)
for org_user in user_list:
id_ = str(org_user["id"])
role = self.users.get(id_)
if not role:
continue
user_details.append(
{
"id": id_,
"role": role.value,
"name": org_user.get("name", ""),
"email": org_user.get("email", ""),
}
)
result["userDetails"] = user_details
return self.from_dict(result)
# ============================================================================
class OrgMetrics(BaseModel):
"""Organization API metrics model"""
storageUsedBytes: int
storageUsedCrawls: int
storageUsedUploads: int
storageUsedProfiles: int
storageQuotaBytes: int
archivedItemCount: int
crawlCount: int
uploadCount: int
pageCount: int
profileCount: int
workflowsRunningCount: int
maxConcurrentCrawls: int
workflowsQueuedCount: int
collectionsCount: int
publicCollectionsCount: int
# ============================================================================
class OrgImportExportData(BaseModel):
"""Model for org import/export data"""
dbVersion: str
org: Dict[str, Any]
profiles: List[Dict[str, Any]]
workflows: List[Dict[str, Any]]
workflowRevisions: List[Dict[str, Any]]
items: List[Dict[str, Any]]
pages: List[Dict[str, Any]]
collections: List[Dict[str, Any]]
# ============================================================================
class OrgImportExport(BaseModel):
"""Model for org import/export"""
data: OrgImportExportData
# ============================================================================
class OrgInviteResponse(BaseModel):
"""Model for org invite response"""
invited: str
token: UUID
# ============================================================================
class OrgAcceptInviteResponse(BaseModel):
"""Model for org invite response"""
added: bool
org: OrgOut
# ============================================================================
class OrgDeleteInviteResponse(BaseModel):
"""Model for org invite response"""
removed: bool
count: int
# ============================================================================
class OrgSlugsResponse(BaseModel):
"""Model for org slugs response"""
slugs: List[str]
# ============================================================================
class OrgImportResponse(BaseModel):
"""Model for org import response"""
imported: bool
# ============================================================================
### PAGINATION ###
# ============================================================================
class PaginatedResponse(BaseModel):
"""Paginated response model"""
total: int
page: int
pageSize: int
# ============================================================================
### PROFILES ###
# ============================================================================
class ProfileFile(BaseFile):
"""file for storing profile data"""
# ============================================================================
class Profile(BaseMongoModel):
"""Browser profile"""
name: str
description: Optional[str] = ""
userid: UUID
oid: UUID
origins: List[str]
resource: Optional[ProfileFile]
created: Optional[datetime]
createdBy: Optional[UUID] = None
createdByName: Optional[str] = None
modified: Optional[datetime] = None
modifiedBy: Optional[UUID] = None
modifiedByName: Optional[str] = None
baseid: Optional[UUID] = None
crawlerChannel: Optional[str]
# ============================================================================
class ProfileWithCrawlConfigs(Profile):
"""Profile with list of crawlconfigs using this profile"""
crawlconfigs: List[CrawlConfigProfileOut] = []
# ============================================================================
class UrlIn(BaseModel):
"""Request to set url"""
url: HttpUrl
# ============================================================================
class ProfileLaunchBrowserIn(UrlIn):
"""Request to launch new browser for creating profile"""
profileId: Optional[UUID] = None
crawlerChannel: str = "default"
# ============================================================================
class BrowserId(BaseModel):
"""Profile id on newly created profile"""
browserid: str
# ============================================================================
class ProfileCreate(BaseModel):
"""Create new profile for browser id"""
browserid: str
name: str
description: Optional[str] = ""
crawlerChannel: str = "default"
# ============================================================================
class ProfileUpdate(BaseModel):
"""Update existing profile with new browser profile or metadata only"""
browserid: Optional[str] = ""
name: str
description: Optional[str] = ""
# ============================================================================
class ProfilePingResponse(BaseModel):
"""Response model for pinging profile"""
success: bool
origins: List[AnyHttpUrl]
# ============================================================================
class ProfileBrowserGetUrlResponse(BaseModel):
"""Response model for profile get URL endpoint"""
path: str
password: str
oid: UUID
auth_bearer: str
scale: float
url: AnyHttpUrl
# ============================================================================
### USERS ###
# ============================================================================
class UserCreate(BaseModel):
"""
User Creation Model exposed to API
"""
email: EmailStr
password: str
name: Optional[str] = ""
inviteToken: Optional[UUID] = None
# ============================================================================
class UserUpdateEmailName(BaseModel):
"""
Update email and/or name
"""
email: Optional[EmailStr] = None
name: Optional[str] = None
# ============================================================================
class UserUpdatePassword(BaseModel):
"""
Update password, requires current password to reset
"""
email: EmailStr
password: str
newPassword: str
# ============================================================================
### WEBHOOKS ###
# ============================================================================
class WebhookNotificationBody(BaseModel):
"""Base POST body model for webhook notifications"""
# Store as str, not UUID, to make JSON-serializable
orgId: str
# ============================================================================
class WebhookEventType(str, Enum):
"""Webhook Event Types"""
CRAWL_STARTED = "crawlStarted"
CRAWL_FINISHED = "crawlFinished"
CRAWL_DELETED = "crawlDeleted"
UPLOAD_FINISHED = "uploadFinished"
UPLOAD_DELETED = "uploadDeleted"
ADDED_TO_COLLECTION = "addedToCollection"
REMOVED_FROM_COLLECTION = "removedFromCollection"
COLLECTION_DELETED = "collectionDeleted"
# ============================================================================
class BaseCollectionItemBody(WebhookNotificationBody):
"""Webhook notification base POST body for collection changes"""
collectionId: str
itemIds: List[str]
downloadUrl: str
# ============================================================================
class CollectionItemAddedBody(BaseCollectionItemBody):
"""Webhook notification POST body for collection additions"""
event: Literal[WebhookEventType.ADDED_TO_COLLECTION] = (
WebhookEventType.ADDED_TO_COLLECTION
)
# ============================================================================
class CollectionItemRemovedBody(BaseCollectionItemBody):
"""Webhook notification POST body for collection removals"""
event: Literal[WebhookEventType.REMOVED_FROM_COLLECTION] = (
WebhookEventType.REMOVED_FROM_COLLECTION
)
# ============================================================================
class CollectionDeletedBody(WebhookNotificationBody):
"""Webhook notification base POST body for collection changes"""
event: Literal[WebhookEventType.COLLECTION_DELETED] = (
WebhookEventType.COLLECTION_DELETED
)
collectionId: str
# ============================================================================
class BaseArchivedItemBody(WebhookNotificationBody):
"""Webhook notification POST body for when archived item is started or finished"""
itemId: str
# ============================================================================
class BaseArchivedItemFinishedBody(BaseArchivedItemBody):
"""Webhook notification POST body for when archived item is finished"""
resources: List[CrawlFileOut]
state: str
# ============================================================================
class CrawlStartedBody(BaseArchivedItemBody):
"""Webhook notification POST body for when crawl starts"""
scheduled: bool = False
event: Literal[WebhookEventType.CRAWL_STARTED] = WebhookEventType.CRAWL_STARTED
# ============================================================================
class CrawlFinishedBody(BaseArchivedItemFinishedBody):
"""Webhook notification POST body for when crawl finishes"""
event: Literal[WebhookEventType.CRAWL_FINISHED] = WebhookEventType.CRAWL_FINISHED
# ============================================================================
class CrawlDeletedBody(BaseArchivedItemBody):
"""Webhook notification POST body for when crawl is deleted"""
event: Literal[WebhookEventType.CRAWL_DELETED] = WebhookEventType.CRAWL_DELETED
# ============================================================================
class UploadFinishedBody(BaseArchivedItemFinishedBody):
"""Webhook notification POST body for when upload finishes"""
event: Literal[WebhookEventType.UPLOAD_FINISHED] = WebhookEventType.UPLOAD_FINISHED
# ============================================================================
class UploadDeletedBody(BaseArchivedItemBody):
"""Webhook notification POST body for when upload finishes"""
event: Literal[WebhookEventType.UPLOAD_DELETED] = WebhookEventType.UPLOAD_DELETED
# ============================================================================
class WebhookNotification(BaseMongoModel):
"""Base POST body model for webhook notifications"""
event: WebhookEventType
oid: UUID
body: Union[
CrawlStartedBody,
CrawlFinishedBody,
CrawlDeletedBody,
UploadFinishedBody,
UploadDeletedBody,
CollectionItemAddedBody,
CollectionItemRemovedBody,
CollectionDeletedBody,
]
success: bool = False
attempts: int = 0
created: datetime
lastAttempted: Optional[datetime] = None
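# Illustrative note: each body model pins its event field with a Literal,
# so the body Union resolves to the right model during validation, e.g.
# (sketch using pydantic's parse_obj_as):
#
#   from pydantic import parse_obj_as
#   parse_obj_as(
#       Union[CrawlStartedBody, CrawlDeletedBody],
#       {"orgId": "o", "itemId": "i", "event": "crawlDeleted"},
#   )  # -> CrawlDeletedBody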
# ============================================================================
### BACKGROUND JOBS ###
# ============================================================================
class BgJobType(str, Enum):
"""Background Job Types"""
CREATE_REPLICA = "create-replica"
DELETE_REPLICA = "delete-replica"
# ============================================================================
class BackgroundJob(BaseMongoModel):
"""Model for tracking background jobs"""
id: str
type: BgJobType
oid: UUID
success: Optional[bool] = None
started: datetime
finished: Optional[datetime] = None
previousAttempts: Optional[List[Dict[str, Optional[datetime]]]] = None
# ============================================================================
class CreateReplicaJob(BackgroundJob):
"""Model for tracking create of replica jobs"""
type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA
file_path: str
object_type: str
object_id: str
replica_storage: StorageRef
# ============================================================================
class DeleteReplicaJob(BackgroundJob):
"""Model for tracking deletion of replica jobs"""
type: Literal[BgJobType.DELETE_REPLICA] = BgJobType.DELETE_REPLICA
file_path: str
object_type: str
object_id: str
replica_storage: StorageRef
# ============================================================================
class AnyJob(BaseModel):
"""Union of all job types, for response model"""
__root__: Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob]
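# Illustrative note: AnyJob is a pydantic v1 custom root type; the Union is
# tried left to right, so the specific replica job models are listed before
# the generic BackgroundJob fallback, e.g.:
#
#   AnyJob.parse_obj(job_doc).__root__  # -> CreateReplicaJob, DeleteReplicaJob,
#                                       #    or BackgroundJob as a last resort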
# ============================================================================
### PAGES ###
# ============================================================================
class PageReviewUpdate(BaseModel):
"""Update model for page manual review/approval"""
approved: Optional[bool] = None
# ============================================================================
class PageNoteIn(BaseModel):
"""Input model for adding page notes"""
text: str
# ============================================================================
class PageNoteEdit(BaseModel):
"""Input model for editing page notes"""
id: UUID
text: str
# ============================================================================
class PageNoteDelete(BaseModel):
"""Delete model for page notes"""
delete_list: List[UUID] = []
# ============================================================================
class PageNote(BaseModel):
"""Model for page notes, tracking user and time"""
id: UUID
text: str
    # default_factory so each note is timestamped at creation time
    created: datetime = Field(default_factory=dt_now)
userid: UUID
userName: str
# ============================================================================
class PageQACompare(BaseModel):
"""Model for updating pages from QA run"""
screenshotMatch: Optional[float] = None
textMatch: Optional[float] = None
resourceCounts: Optional[Dict[str, int]]
# ============================================================================
class Page(BaseMongoModel):
"""Core page data, no QA"""
id: UUID
oid: UUID
crawl_id: str
# core page data
url: AnyHttpUrl
title: Optional[str] = None
ts: Optional[datetime] = None
loadState: Optional[int] = None
status: Optional[int] = None
mime: Optional[str] = None
# manual review
userid: Optional[UUID] = None
modified: Optional[datetime] = None
approved: Optional[bool] = None
notes: List[PageNote] = []
isFile: Optional[bool] = False
isError: Optional[bool] = False
def compute_page_type(self):
"""sets self.isFile or self.isError flags"""
self.isFile = False
self.isError = False
if self.loadState == 2:
# pylint: disable=unsupported-membership-test
if self.mime and "html" not in self.mime:
self.isFile = True
elif self.title is None and self.status == 200:
self.isFile = True
elif self.loadState == 0:
self.isError = True
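# Illustrative sketch of compute_page_type(), assuming the loadState
# convention implied above (0 = load failed, 2 = loaded):
#
#   page = Page(id=uuid4(), oid=uuid4(), crawl_id="c1",
#               url="https://example.com/doc.pdf",
#               loadState=2, mime="application/pdf")
#   page.compute_page_type()
#   page.isFile   # -> True: loaded, but mime is not HTML
#   page.isError  # -> False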
# ============================================================================
class PageWithAllQA(Page):
"""Model for core page data + qa"""
# automated heuristics, keyed by QA run id
qa: Optional[Dict[str, PageQACompare]] = {}
# ============================================================================
class PageOut(Page):
"""Model for pages output, no QA"""
status: Optional[int] = 200
# ============================================================================
class PageOutWithSingleQA(Page):
"""Page out with single QA entry"""
qa: Optional[PageQACompare] = None
# ============================================================================
class PageNoteAddedResponse(BaseModel):
"""Model for response to adding page"""
added: bool
data: PageNote
# ============================================================================
class PageNoteUpdatedResponse(BaseModel):
"""Model for response to updating page"""
updated: bool
data: PageNote
# ============================================================================
### GENERIC RESPONSE MODELS ###
# ============================================================================
class UpdatedResponse(BaseModel):
"""Response for update API endpoints"""
updated: bool
# ============================================================================
class SuccessResponse(BaseModel):
"""Response for API endpoints that return success"""
success: bool
# ============================================================================
class SuccessResponseStorageQuota(SuccessResponse):
"""Response for API endpoints that return success and storageQuotaReached"""
storageQuotaReached: bool
# ============================================================================
class StartedResponse(BaseModel):
"""Response for API endpoints that start crawls"""
started: str
# ============================================================================
class AddedResponse(BaseModel):
"""Response for API endpoints that return added"""
added: bool
# ============================================================================
class AddedResponseId(AddedResponse):
"""Response for API endpoints that return added + id"""
id: UUID
# ============================================================================
class AddedResponseName(AddedResponse):
"""Response for API endpoints that add resources and return name"""
name: str
# ============================================================================
class AddedResponseIdQuota(AddedResponse):
"""Response for API endpoints that return str id and storageQuotaReached"""
id: str
storageQuotaReached: bool
# ============================================================================
class AddedResponseIdName(AddedResponse):
"""Response for API endpoints that add resource and return id and name"""
id: UUID
name: str
# ============================================================================
class DeletedResponse(BaseModel):
"""Response for delete API endpoints"""
deleted: bool
# ============================================================================
class DeletedResponseQuota(DeletedResponse):
"""Response for delete API endpoints"""
storageQuotaReached: bool
# ============================================================================
class DeletedCountResponse(BaseModel):
"""Response for delete API endpoints that return count"""
deleted: int
# ============================================================================
class RemovedResponse(BaseModel):
"""Response for API endpoints for removing resources"""
removed: bool
# ============================================================================
class EmptyResponse(BaseModel):
"""Response for API endpoints that return nothing"""
# ============================================================================
### SPECIFIC PAGINATED RESPONSE MODELS ###
# ============================================================================
class PaginatedBackgroundJobResponse(PaginatedResponse):
"""Response model for paginated background jobs"""
items: List[Union[CreateReplicaJob, DeleteReplicaJob]]
# ============================================================================
class PaginatedCrawlOutResponse(PaginatedResponse):
"""Response model for paginated crawls"""
items: List[Union[CrawlOut, CrawlOutWithResources]]
# ============================================================================
class PaginatedCollOutResponse(PaginatedResponse):
"""Response model for paginated collections"""
items: List[CollOut]
# ============================================================================
class PaginatedCrawlConfigOutResponse(PaginatedResponse):
"""Response model for paginated crawlconfigs"""
items: List[CrawlConfigOut]
# ============================================================================
class PaginatedSeedResponse(PaginatedResponse):
"""Response model for paginated seeds"""
items: List[Seed]
# ============================================================================
class PaginatedConfigRevisionResponse(PaginatedResponse):
"""Response model for paginated crawlconfig revisions"""
items: List[ConfigRevision]
# ============================================================================
class PaginatedOrgOutResponse(PaginatedResponse):
"""Response model for paginated orgs"""
items: List[OrgOut]
# ============================================================================
class PaginatedInvitePendingResponse(PaginatedResponse):
"""Response model for paginated orgs"""
items: List[InviteOut]
# ============================================================================
class PaginatedPageOutResponse(PaginatedResponse):
"""Response model for paginated pages"""
items: List[PageOut]
# ============================================================================
class PaginatedPageOutWithQAResponse(PaginatedResponse):
"""Response model for paginated pages with single QA info"""
items: List[PageOutWithSingleQA]
# ============================================================================
class PaginatedProfileResponse(PaginatedResponse):
"""Response model for paginated profiles"""
items: List[Profile]
# ============================================================================
class PaginatedSubscriptionEventResponse(PaginatedResponse):
"""Response model for paginated subscription events"""
items: List[
Union[
SubscriptionCreateOut,
SubscriptionUpdateOut,
SubscriptionCancelOut,
SubscriptionImportOut,
]
]
# ============================================================================
class PaginatedWebhookNotificationResponse(PaginatedResponse):
"""Response model for paginated webhook notifications"""
items: List[WebhookNotification]