* Initial implementation of org metrics (This can eventually be sped up significantly by precomputing the values and storing them in the db.)
* Rename storageQuota to storageQuotaBytes to be consistent
* Update tests to include metrics
"""
|
|
Crawl-related models and types
|
|
"""
|
|
|
|
from datetime import datetime
|
|
from enum import Enum, IntEnum
|
|
import os
|
|
|
|
from typing import Optional, List, Dict, Union, Literal, Any
|
|
from pydantic import BaseModel, UUID4, conint, Field, HttpUrl, AnyHttpUrl, EmailStr
|
|
from fastapi_users import models as fastapi_users_models
|
|
|
|
from .db import BaseMongoModel
|
|
|
|
# crawl scale for constraint
|
|
MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
|
|
|
|
|
|
# pylint: disable=invalid-name, too-many-lines
|
|
# ============================================================================
|
|
|
|
### MAIN USER MODEL ###
|
|
|
|
|
|
# ============================================================================
|
|
class User(fastapi_users_models.BaseUser):
|
|
"""
|
|
Base User Model
|
|
"""
|
|
|
|
name: Optional[str] = ""
|
|
|
|
|
|
# ============================================================================
|
|
|
|
### CRAWL CONFIGS ###
|
|
|
|
|
|
# ============================================================================
|
|
class JobType(str, Enum):
|
|
"""Job Types"""
|
|
|
|
URL_LIST = "url-list"
|
|
SEED_CRAWL = "seed-crawl"
|
|
CUSTOM = "custom"
|
|
|
|
|
|
# ============================================================================
|
|
class ScopeType(str, Enum):
|
|
"""Crawl scope type"""
|
|
|
|
PAGE = "page"
|
|
PAGE_SPA = "page-spa"
|
|
PREFIX = "prefix"
|
|
HOST = "host"
|
|
DOMAIN = "domain"
|
|
ANY = "any"
|
|
CUSTOM = "custom"
|
|
|
|
|
|
# ============================================================================
|
|
class Seed(BaseModel):
|
|
"""Crawl seed"""
|
|
|
|
url: HttpUrl
|
|
scopeType: Optional[ScopeType]
|
|
|
|
include: Union[str, List[str], None]
|
|
exclude: Union[str, List[str], None]
|
|
sitemap: Union[bool, HttpUrl, None]
|
|
allowHash: Optional[bool]
|
|
depth: Optional[int]
|
|
extraHops: Optional[int]
|
|
|
|
|
|
# ============================================================================
|
|
class RawCrawlConfig(BaseModel):
|
|
"""Base Crawl Config"""
|
|
|
|
seeds: List[Seed]
|
|
|
|
scopeType: Optional[ScopeType] = ScopeType.PREFIX
|
|
|
|
include: Union[str, List[str], None] = None
|
|
exclude: Union[str, List[str], None] = None
|
|
|
|
depth: Optional[int] = -1
|
|
limit: Optional[int] = 0
|
|
extraHops: Optional[int] = 0
|
|
|
|
lang: Optional[str]
|
|
blockAds: Optional[bool] = False
|
|
|
|
behaviorTimeout: Optional[int]
|
|
pageLoadTimeout: Optional[int]
|
|
pageExtraDelay: Optional[int] = 0
|
|
|
|
workers: Optional[int]
|
|
|
|
headless: Optional[bool]
|
|
|
|
generateWACZ: Optional[bool]
|
|
combineWARC: Optional[bool]
|
|
|
|
useSitemap: Optional[bool] = False
|
|
|
|
logging: Optional[str]
|
|
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
|
|
|
|
|
|
# ============================================================================
|
|
class CrawlConfigIn(BaseModel):
|
|
"""CrawlConfig input model, submitted via API"""
|
|
|
|
schedule: Optional[str] = ""
|
|
runNow: Optional[bool] = False
|
|
|
|
config: RawCrawlConfig
|
|
|
|
name: str
|
|
|
|
description: Optional[str]
|
|
|
|
jobType: Optional[JobType] = JobType.CUSTOM
|
|
|
|
profileid: Optional[str]
|
|
|
|
autoAddCollections: Optional[List[UUID4]] = []
|
|
tags: Optional[List[str]] = []
|
|
|
|
crawlTimeout: int = 0
|
|
maxCrawlSize: int = 0
|
|
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
|
|
|
|
crawlFilenameTemplate: Optional[str]
|
|
|
|
|
|
# ============================================================================
|
|
class ConfigRevision(BaseMongoModel):
|
|
"""Crawl Config Revision"""
|
|
|
|
cid: UUID4
|
|
|
|
schedule: Optional[str] = ""
|
|
|
|
config: RawCrawlConfig
|
|
|
|
profileid: Optional[UUID4]
|
|
|
|
crawlTimeout: Optional[int] = 0
|
|
maxCrawlSize: Optional[int] = 0
|
|
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
|
|
|
|
modified: datetime
|
|
modifiedBy: Optional[UUID4]
|
|
|
|
rev: int = 0
|
|
|
|
|
|
# ============================================================================
|
|
class CrawlConfigCore(BaseMongoModel):
|
|
"""Core data shared between crawls and crawlconfigs"""
|
|
|
|
schedule: Optional[str] = ""
|
|
|
|
jobType: Optional[JobType] = JobType.CUSTOM
|
|
config: Optional[RawCrawlConfig]
|
|
|
|
tags: Optional[List[str]] = []
|
|
|
|
crawlTimeout: Optional[int] = 0
|
|
maxCrawlSize: Optional[int] = 0
|
|
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1 # type: ignore
|
|
|
|
oid: UUID4
|
|
|
|
profileid: Optional[UUID4]
|
|
|
|
|
|
# ============================================================================
|
|
class CrawlConfigAdditional(BaseModel):
|
|
"""Additional fields shared by CrawlConfig and CrawlConfigOut."""
|
|
|
|
name: Optional[str]
|
|
description: Optional[str]
|
|
|
|
created: datetime
|
|
createdBy: Optional[UUID4]
|
|
|
|
modified: Optional[datetime]
|
|
modifiedBy: Optional[UUID4]
|
|
|
|
autoAddCollections: Optional[List[UUID4]] = []
|
|
|
|
inactive: Optional[bool] = False
|
|
|
|
rev: int = 0
|
|
|
|
crawlAttemptCount: Optional[int] = 0
|
|
crawlCount: Optional[int] = 0
|
|
crawlSuccessfulCount: Optional[int] = 0
|
|
|
|
totalSize: Optional[int] = 0
|
|
|
|
lastCrawlId: Optional[str]
|
|
lastCrawlStartTime: Optional[datetime]
|
|
lastStartedBy: Optional[UUID4]
|
|
lastCrawlTime: Optional[datetime]
|
|
lastCrawlState: Optional[str]
|
|
lastCrawlSize: Optional[int]
|
|
|
|
lastRun: Optional[datetime]
|
|
|
|
isCrawlRunning: Optional[bool] = False
|
|
|
|
|
|
# ============================================================================
|
|
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
|
|
"""Schedulable config"""
|
|
|
|
id: UUID4
|
|
|
|
config: RawCrawlConfig
|
|
|
|
def get_raw_config(self):
|
|
"""serialize config for browsertrix-crawler"""
|
|
return self.config.dict(exclude_unset=True, exclude_none=True)
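

# Illustrative note, not in the original module: because get_raw_config()
# serializes with exclude_unset=True and exclude_none=True, only options that
# were explicitly set on the config are passed to browsertrix-crawler. For
# example, a config constructed as
# RawCrawlConfig(seeds=[Seed(url="https://example.com/")]) would serialize to
# {"seeds": [{"url": "https://example.com/"}]}, with defaulted and None-valued
# fields omitted.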


# ============================================================================
class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
    """Crawl Config Output"""

    lastCrawlStopping: Optional[bool] = False

    profileName: Optional[str]

    createdByName: Optional[str]
    modifiedByName: Optional[str]
    lastStartedByName: Optional[str]

    firstSeed: Optional[str]
    seedCount: int = 0


# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
    """Crawl Config id and name output only"""

    name: str


# ============================================================================
class UpdateCrawlConfig(BaseModel):
    """Update crawl config name, crawl schedule, or tags"""

    # metadata: not revision tracked
    name: Optional[str] = None
    tags: Optional[List[str]] = None
    description: Optional[str] = None
    autoAddCollections: Optional[List[UUID4]] = None
    runNow: bool = False

    # crawl data: revision tracked
    schedule: Optional[str] = None
    profileid: Optional[str] = None
    crawlTimeout: Optional[int] = None
    maxCrawlSize: Optional[int] = None
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = None  # type: ignore
    crawlFilenameTemplate: Optional[str] = None
    config: Optional[RawCrawlConfig] = None


# ============================================================================

### BASE CRAWLS ###


# ============================================================================
class CrawlFile(BaseModel):
    """file from a crawl"""

    filename: str
    hash: str
    size: int
    def_storage_name: Optional[str]

    presignedUrl: Optional[str]
    expireAt: Optional[datetime]


# ============================================================================
class CrawlFileOut(BaseModel):
    """output for file from a crawl (conformance to Data Resource Spec)"""

    name: str
    path: str
    hash: str
    size: int
    crawlId: Optional[str]


# ============================================================================
class BaseCrawl(BaseMongoModel):
    """Base Crawl object (representing crawls, uploads and manual sessions)"""

    id: str

    userid: UUID4
    oid: UUID4

    started: datetime
    finished: Optional[datetime] = None

    name: Optional[str] = ""

    state: str

    stats: Optional[Dict[str, int]] = None

    files: Optional[List[CrawlFile]] = []

    description: Optional[str] = ""

    errors: Optional[List[str]] = []

    collectionIds: Optional[List[UUID4]] = []

    fileSize: int = 0
    fileCount: int = 0


# ============================================================================
class CollIdName(BaseModel):
    """Collection id and name object"""

    id: UUID4
    name: str


# ============================================================================
class CrawlOut(BaseMongoModel):
    """Crawl output model, shared across all crawl types"""

    # pylint: disable=duplicate-code

    type: Optional[str]

    id: str

    userid: UUID4
    oid: UUID4

    userName: Optional[str]

    name: Optional[str]
    description: Optional[str]

    started: datetime
    finished: Optional[datetime]

    state: str

    stats: Optional[Dict[str, int]]

    fileSize: int = 0
    fileCount: int = 0

    tags: Optional[List[str]] = []

    errors: Optional[List[str]]

    collectionIds: Optional[List[UUID4]] = []

    # automated crawl fields
    config: Optional[RawCrawlConfig]
    cid: Optional[UUID4]
    firstSeed: Optional[str]
    seedCount: Optional[int]
    profileName: Optional[str]
    stopping: Optional[bool]
    manual: Optional[bool]
    cid_rev: Optional[int]

    storageQuotaReached: Optional[bool]


# ============================================================================
class CrawlOutWithResources(CrawlOut):
    """Crawl output model including resources"""

    resources: Optional[List[CrawlFileOut]] = []
    collections: Optional[List[CollIdName]] = []


# ============================================================================
class UpdateCrawl(BaseModel):
    """Update crawl"""

    name: Optional[str]
    description: Optional[str]
    tags: Optional[List[str]]
    collectionIds: Optional[List[UUID4]]


# ============================================================================
class DeleteCrawlList(BaseModel):
    """delete crawl list POST body"""

    crawl_ids: List[str]


# ============================================================================

### AUTOMATED CRAWLS ###


# ============================================================================
class CrawlScale(BaseModel):
    """scale the crawl to N parallel containers"""

    scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1  # type: ignore


# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
    """Store State of a Crawl (Finished or Running)"""

    type: str = Field("crawl", const=True)

    cid: UUID4

    config: RawCrawlConfig

    cid_rev: int = 0

    # schedule: Optional[str]
    manual: Optional[bool]

    stopping: Optional[bool] = False


# ============================================================================
class CrawlCompleteIn(BaseModel):
    """Completed Crawl Webhook POST message"""

    id: str

    user: str

    filename: str
    size: int
    hash: str

    completed: Optional[bool] = True


# ============================================================================

### UPLOADED CRAWLS ###


# ============================================================================
class UploadedCrawl(BaseCrawl):
    """Store State of a Crawl Upload"""

    type: str = Field("upload", const=True)

    tags: Optional[List[str]] = []


# ============================================================================
class UpdateUpload(UpdateCrawl):
    """Update model that also includes name"""


# ============================================================================

### COLLECTIONS ###


# ============================================================================
class Collection(BaseMongoModel):
    """Org collection structure"""

    name: str = Field(..., min_length=1)
    oid: UUID4
    description: Optional[str]
    modified: Optional[datetime]

    crawlCount: Optional[int] = 0
    pageCount: Optional[int] = 0
    totalSize: Optional[int] = 0

    # Sorted by count, descending
    tags: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollIn(BaseModel):
    """Collection Passed in By User"""

    name: str = Field(..., min_length=1)
    description: Optional[str]
    crawlIds: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollOut(Collection):
    """Collection output model with annotations."""

    resources: Optional[List[CrawlFileOut]] = []


# ============================================================================
class UpdateColl(BaseModel):
    """Update collection"""

    name: Optional[str]
    description: Optional[str]
    isPublic: Optional[bool]


# ============================================================================
class AddRemoveCrawlList(BaseModel):
    """Crawls to add to or remove from collection"""

    crawlIds: Optional[List[str]] = []


# ============================================================================

### INVITES ###


# ============================================================================
class UserRole(IntEnum):
    """User role"""

    VIEWER = 10
    CRAWLER = 20
    OWNER = 40
    SUPERADMIN = 100


# ============================================================================
class InvitePending(BaseMongoModel):
    """An invite for a new user, with an email and invite token as id"""

    created: datetime
    inviterEmail: str
    oid: Optional[UUID4]
    role: Optional[UserRole] = UserRole.VIEWER
    email: Optional[str]


# ============================================================================
class InviteRequest(BaseModel):
    """Request to invite another user"""

    email: str


# ============================================================================
class InviteToOrgRequest(InviteRequest):
    """Request to invite another user to an organization"""

    role: UserRole


# ============================================================================
class AddToOrgRequest(InviteRequest):
    """Request to add a new user to an organization directly"""

    role: UserRole
    password: str
    name: str


# ============================================================================

### ORGS ###


# ============================================================================
class UpdateRole(InviteToOrgRequest):
    """Update existing role for user"""


# ============================================================================
class RemoveFromOrg(InviteRequest):
    """Remove this user from org"""


# ============================================================================
class RemovePendingInvite(InviteRequest):
    """Delete pending invite to org by email"""


# ============================================================================
class RenameOrg(BaseModel):
    """Request to rename an organization"""

    name: str


# ============================================================================
class DefaultStorage(BaseModel):
    """Storage reference"""

    type: Literal["default"] = "default"
    name: str
    path: str = ""


# ============================================================================
class S3Storage(BaseModel):
    """S3 Storage Model"""

    type: Literal["s3"] = "s3"

    endpoint_url: str
    access_key: str
    secret_key: str
    access_endpoint_url: Optional[str]
    region: Optional[str] = ""
    use_access_for_presign: Optional[bool] = True


# ============================================================================
class OrgQuotas(BaseModel):
    """Organization quotas (settable by superadmin)"""

    maxConcurrentCrawls: Optional[int] = 0
    maxPagesPerCrawl: Optional[int] = 0
    storageQuota: Optional[int] = 0


# ============================================================================
class OrgWebhookUrls(BaseModel):
    """Organization webhook URLs"""

    crawlStarted: Optional[AnyHttpUrl] = None
    crawlFinished: Optional[AnyHttpUrl] = None
    uploadFinished: Optional[AnyHttpUrl] = None
    addedToCollection: Optional[AnyHttpUrl] = None
    removedFromCollection: Optional[AnyHttpUrl] = None


# ============================================================================
class Organization(BaseMongoModel):
    """Organization Base Model"""

    id: UUID4

    name: str

    users: Dict[str, UserRole]

    storage: Union[S3Storage, DefaultStorage]

    usage: Dict[str, int] = {}

    bytesStored: int = 0

    default: bool = False

    quotas: Optional[OrgQuotas] = OrgQuotas()

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()

    origin: Optional[AnyHttpUrl] = None

    def is_owner(self, user):
        """Check if user is owner"""
        return self._is_auth(user, UserRole.OWNER)

    def is_crawler(self, user):
        """Check if user can crawl (write)"""
        return self._is_auth(user, UserRole.CRAWLER)

    def is_viewer(self, user):
        """Check if user can view (read)"""
        return self._is_auth(user, UserRole.VIEWER)

    def _is_auth(self, user, value):
        """Check if user has at least specified permission level"""
        if user.is_superuser:
            return True

        res = self.users.get(str(user.id))
        if not res:
            return False

        return res >= value

    async def serialize_for_user(self, user: User, user_manager):
        """Serialize result based on current user access"""

        exclude = {"storage"}

        if not self.is_owner(user):
            exclude.add("users")

        if not self.is_crawler(user):
            exclude.add("usage")

        result = self.to_dict(
            exclude_unset=True,
            exclude_none=True,
            exclude=exclude,
        )

        if self.is_owner(user):
            keys = list(result["users"].keys())
            user_list = await user_manager.get_user_names_by_ids(keys)

            for org_user in user_list:
                id_ = str(org_user["id"])
                role = result["users"].get(id_)
                if not role:
                    continue

                result["users"][id_] = {
                    "role": role,
                    "name": org_user.get("name", ""),
                    "email": org_user.get("email", ""),
                }

        return OrgOut.from_dict(result)
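

# Illustrative note, not in the original module: UserRole is an IntEnum, so
# _is_auth() reduces to an integer comparison. For a user stored in
# Organization.users as UserRole.CRAWLER (20):
#
#   org.is_viewer(user)   # True:  20 >= UserRole.VIEWER (10)
#   org.is_crawler(user)  # True:  20 >= UserRole.CRAWLER (20)
#   org.is_owner(user)    # False: 20 <  UserRole.OWNER (40)
#
# serialize_for_user() applies the same checks, excluding "users" from the
# output for non-owners and "usage" for non-crawlers.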


# ============================================================================
class OrgOut(BaseMongoModel):
    """Organization API output model"""

    id: UUID4
    name: str
    users: Optional[Dict[str, Any]]
    usage: Optional[Dict[str, int]]
    default: bool = False
    bytesStored: int
    origin: Optional[AnyHttpUrl]

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
    quotas: Optional[OrgQuotas] = OrgQuotas()


# ============================================================================
class OrgMetrics(BaseModel):
    """Organization API metrics model"""

    storageUsedBytes: int
    storageUsedGB: float
    storageQuotaBytes: int
    storageQuotaGB: float
    archivedItemCount: int
    crawlCount: int
    uploadCount: int
    pageCount: int
    profileCount: int
    workflowsRunningCount: int
    maxConcurrentCrawls: int
    workflowsQueuedCount: int
    collectionsCount: int
    publicCollectionsCount: int
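

# Illustrative sketch, not in the original module: each *GB field mirrors its
# *Bytes counterpart. A hypothetical conversion helper (the name and rounding
# are assumptions, not the actual implementation):
#
#   def bytes_to_gb(size_bytes: int) -> float:
#       # treating 1 GB as 10**9 bytes; the real code may round differently
#       return round(size_bytes / 10**9, 2)
#
# Per the commit note, these metrics are currently computed per-request and
# could later be precomputed and stored in the db.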


# ============================================================================

### PAGINATION ###


# ============================================================================
class PaginatedResponse(BaseModel):
    """Paginated response model"""

    items: List[Any]
    total: int
    page: int
    pageSize: int
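

# Illustrative note, not in the original module: a hypothetical list endpoint
# would populate this model from a query result and its paging parameters,
# e.g. PaginatedResponse(items=crawls, total=total, page=2, pageSize=25),
# where "crawls" and "total" come from the backing db query.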


# ============================================================================

### PROFILES ###


# ============================================================================
class ProfileFile(BaseModel):
    """file from a browser profile"""

    filename: str
    hash: str
    size: int
    def_storage_name: Optional[str] = ""


# ============================================================================
class Profile(BaseMongoModel):
    """Browser profile"""

    name: str
    description: Optional[str] = ""

    userid: UUID4
    oid: UUID4

    origins: List[str]
    resource: Optional[ProfileFile]

    created: Optional[datetime]
    baseid: Optional[UUID4] = None


# ============================================================================
class ProfileWithCrawlConfigs(Profile):
    """Profile with list of crawlconfigs using this profile"""

    crawlconfigs: List[CrawlConfigIdNameOut] = []


# ============================================================================
class UrlIn(BaseModel):
    """Request to set url"""

    url: HttpUrl


# ============================================================================
class ProfileLaunchBrowserIn(UrlIn):
    """Request to launch new browser for creating profile"""

    profileId: Optional[UUID4]


# ============================================================================
class BrowserId(BaseModel):
    """Browser id of newly created profile browser"""

    browserid: str


# ============================================================================
class ProfileCreate(BaseModel):
    """Create new profile for browser id"""

    browserid: str
    name: str
    description: Optional[str] = ""


# ============================================================================
class ProfileUpdate(BaseModel):
    """Update existing profile with new browser profile or metadata only"""

    browserid: Optional[str] = ""
    name: str
    description: Optional[str] = ""


# ============================================================================

### USERS ###


# ============================================================================
# use custom model as model.BaseUserCreate includes is_* field
class UserCreateIn(fastapi_users_models.CreateUpdateDictModel):
    """
    User Creation Model exposed to API
    """

    email: EmailStr
    password: str

    name: Optional[str] = ""

    inviteToken: Optional[UUID4]

    newOrg: bool
    newOrgName: Optional[str] = ""


# ============================================================================
class UserCreate(fastapi_users_models.BaseUserCreate):
    """
    User Creation Model
    """

    name: Optional[str] = ""

    inviteToken: Optional[UUID4] = None

    newOrg: bool
    newOrgName: Optional[str] = ""


# ============================================================================
class UserUpdate(User, fastapi_users_models.CreateUpdateDictModel):
    """
    User Update Model
    """

    password: Optional[str]
    email: EmailStr


# ============================================================================
class UserDB(User, fastapi_users_models.BaseUserDB):
    """
    User in DB Model
    """

    invites: Dict[str, InvitePending] = {}


# ============================================================================

### WEBHOOKS ###


# ============================================================================
class WebhookNotificationBody(BaseModel):
    """Base POST body model for webhook notifications"""

    downloadUrls: Optional[List] = None

    # Store as str, not UUID, to make JSON-serializable
    orgId: str


# ============================================================================
class WebhookEventType(str, Enum):
    """Webhook Event Types"""

    CRAWL_STARTED = "crawlStarted"
    CRAWL_FINISHED = "crawlFinished"
    UPLOAD_FINISHED = "uploadFinished"

    ADDED_TO_COLLECTION = "addedToCollection"
    REMOVED_FROM_COLLECTION = "removedFromCollection"


# ============================================================================
class BaseCollectionItemBody(WebhookNotificationBody):
    """Webhook notification base POST body for collection changes"""

    collectionId: str
    itemIds: List[str]


# ============================================================================
class CollectionItemAddedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection additions"""

    event: str = Field(WebhookEventType.ADDED_TO_COLLECTION, const=True)


# ============================================================================
class CollectionItemRemovedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection removals"""

    event: str = Field(WebhookEventType.REMOVED_FROM_COLLECTION, const=True)


# ============================================================================
class BaseArchivedItemBody(WebhookNotificationBody):
    """Webhook notification POST body for when archived item is started or finished"""

    itemId: str


# ============================================================================
class CrawlStartedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl starts"""

    scheduled: bool = False
    event: str = Field(WebhookEventType.CRAWL_STARTED, const=True)


# ============================================================================
class CrawlFinishedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl finishes"""

    event: str = Field(WebhookEventType.CRAWL_FINISHED, const=True)
    state: str


# ============================================================================
class UploadFinishedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when upload finishes"""

    event: str = Field(WebhookEventType.UPLOAD_FINISHED, const=True)
    state: str


# ============================================================================
class WebhookNotification(BaseMongoModel):
    """Webhook notification record, tracking POST body and delivery state"""

    event: WebhookEventType
    oid: UUID4
    body: Union[
        CrawlStartedBody,
        CrawlFinishedBody,
        UploadFinishedBody,
        CollectionItemAddedBody,
        CollectionItemRemovedBody,
    ]
    success: bool = False
    attempts: int = 0
    created: datetime
    lastAttempted: Optional[datetime] = None
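

# Illustrative note, not in the original module: body is a Union, and each
# member model pins its "event" field with const=True, which keeps the stored
# event and body consistent. A hypothetical record for a finished crawl
# ("org" and "crawl_id" are assumed to exist):
#
#   WebhookNotification(
#       id=uuid4(),
#       event=WebhookEventType.CRAWL_FINISHED,
#       oid=org.id,
#       body=CrawlFinishedBody(
#           orgId=str(org.id), itemId=crawl_id, state="complete"
#       ),
#       created=datetime.utcnow(),
#   )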