- Adds two new crawl finished states, stopped_by_user and stopped_quota_reached.
- Tracks other possible 'stop reasons' in the operator, though not making them distinct states for now.
- Updates frontend with 'Stopped by User' and 'Stopped: Time Quota Reached', shown with the same icon as the current partial_complete.
- Adds migration of partial_complete to either stopped_by_user or complete (no historical quota data available).
- Addresses an edge case in scaling: if a crawl never scaled (no redis entry, no pod), automatically scale down.
- Addresses an edge case in status: if a crawl is somehow 'canceled' but not deleted, immediately delete the crawl object and begin finalizing.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
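A minimal sketch of the migration described above (the collection handle and the selection criterion are assumptions, not the actual migration code): with no historical quota data, `partial_complete` crawls that were stopped manually (stopping flag set) become `stopped_by_user`, and the rest become `complete`:

```python
# hypothetical migration sketch -- "crawls" is an assumed Motor collection
async def migrate_partial_complete(crawls):
    # manually-stopped crawls have the "stopping" flag set
    await crawls.update_many(
        {"state": "partial_complete", "stopping": True},
        {"$set": {"state": "stopped_by_user"}},
    )
    # remaining partial_complete crawls are treated as complete
    await crawls.update_many(
        {"state": "partial_complete"},
        {"$set": {"state": "complete"}},
    )
```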
| """
 | |
| Crawl-related models and types
 | |
| """
 | |
| 
 | |
| from datetime import datetime
 | |
| from enum import Enum, IntEnum
 | |
| from uuid import UUID
 | |
| import os
 | |
| 
 | |
| from typing import Optional, List, Dict, Union, Literal, Any
 | |
| from pydantic import (
 | |
|     BaseModel,
 | |
|     conint,
 | |
|     Field,
 | |
|     HttpUrl,
 | |
|     AnyHttpUrl,
 | |
|     EmailStr,
 | |
|     ConstrainedStr,
 | |
| )
 | |
| 
 | |
| # from fastapi_users import models as fastapi_users_models
 | |
| 
 | |
| from .db import BaseMongoModel
 | |
| 
 | |
| # crawl scale for constraint
 | |
| MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
 | |
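# Example (illustrative): a deployment can raise the ceiling by setting
# MAX_CRAWL_SCALE=5 in the environment, which widens every
# conint(ge=1, le=MAX_CRAWL_SCALE) constraint declared below.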


# pylint: disable=invalid-name, too-many-lines
# ============================================================================
class UserRole(IntEnum):
    """User role"""

    VIEWER = 10
    CRAWLER = 20
    OWNER = 40
    SUPERADMIN = 100


# ============================================================================

### INVITES ###


# ============================================================================
class InvitePending(BaseMongoModel):
    """An invite for a new user, with an email and invite token as id"""

    created: datetime
    inviterEmail: str
    oid: Optional[UUID]
    role: Optional[UserRole] = UserRole.VIEWER
    email: Optional[str]


# ============================================================================
class InviteRequest(BaseModel):
    """Request to invite another user"""

    email: str


# ============================================================================
class InviteToOrgRequest(InviteRequest):
    """Request to invite another user to an organization"""

    role: UserRole


# ============================================================================
class AddToOrgRequest(InviteRequest):
    """Request to add a new user to an organization directly"""

    role: UserRole
    password: str
    name: str


# ============================================================================

### MAIN USER MODEL ###


# ============================================================================
class User(BaseModel):
    """
    User Model
    """

    id: UUID

    name: str = ""
    email: EmailStr
    is_superuser: bool = False
    is_verified: bool = False

    invites: Dict[str, InvitePending] = {}
    hashed_password: str

    def dict(self, *a, **kw):
        """ensure invites / hashed_password never serialize, just in case"""
        exclude = kw.get("exclude") or set()
        exclude.add("invites")
        exclude.add("hashed_password")
        # pass the set back via kwargs so the additions apply even when
        # the caller provided no exclude set of their own
        kw["exclude"] = exclude
        return super().dict(*a, **kw)
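
    # Example (illustrative): with the exclusions above,
    #   user.dict() never contains "invites" or "hashed_password"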


# ============================================================================
class FailedLogin(BaseMongoModel):
    """
    Failed login model
    """

    # default_factory sets the timestamp per instance, not once at import
    attempted: datetime = Field(default_factory=datetime.now)
    email: str

    # Consecutive failed logins, reset to 0 on successful login or after
    # password is reset. On failed_logins >= 5 within the hour before this
    # object is deleted, the user is unable to log in until they reset their
    # password.
    count: int = 1


# ============================================================================
class UserOrgInfoOut(BaseModel):
    """org per user"""

    id: UUID

    name: str
    slug: str
    default: bool
    role: UserRole


# ============================================================================
class UserOut(BaseModel):
    """Output User model"""

    id: UUID

    name: str = ""
    email: EmailStr
    is_superuser: bool = False
    is_verified: bool = False

    orgs: List[UserOrgInfoOut]


# ============================================================================

### CRAWL STATES

# ============================================================================
RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"]

STARTING_STATES = ["starting", "waiting_capacity", "waiting_org_limit"]

FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]

SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"]

RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES]

RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES]

NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]

ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES]
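
# Example (illustrative): these lists are used for membership checks and
# Mongo queries over crawl state, e.g.:
#   {"state": {"$in": RUNNING_AND_STARTING_STATES}}
#   crawl.state in SUCCESSFUL_STATES  # includes the new stopped_* states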


# ============================================================================

### CRAWL CONFIGS ###


# ============================================================================
class JobType(str, Enum):
    """Job Types"""

    URL_LIST = "url-list"
    SEED_CRAWL = "seed-crawl"
    CUSTOM = "custom"


# ============================================================================
class ScopeType(str, Enum):
    """Crawl scope type"""

    PAGE = "page"
    PAGE_SPA = "page-spa"
    PREFIX = "prefix"
    HOST = "host"
    DOMAIN = "domain"
    ANY = "any"
    CUSTOM = "custom"


# ============================================================================
class EmptyStr(ConstrainedStr):
    """empty string only"""

    min_length = 0
    max_length = 0


# ============================================================================
class Seed(BaseModel):
    """Crawl seed"""

    url: HttpUrl
    scopeType: Optional[ScopeType] = None

    include: Union[str, List[str], None] = None
    exclude: Union[str, List[str], None] = None
    sitemap: Union[bool, HttpUrl, None] = None
    allowHash: Optional[bool] = None
    depth: Optional[int] = None
    extraHops: Optional[int] = None


# ============================================================================
class RawCrawlConfig(BaseModel):
    """Base Crawl Config"""

    seeds: Optional[List[Seed]]

    scopeType: Optional[ScopeType] = ScopeType.PREFIX

    include: Union[str, List[str], None] = None
    exclude: Union[str, List[str], None] = None

    depth: Optional[int] = -1
    limit: Optional[int] = 0
    extraHops: Optional[int] = 0

    lang: Optional[str]
    blockAds: Optional[bool] = False

    behaviorTimeout: Optional[int]
    pageLoadTimeout: Optional[int]
    pageExtraDelay: Optional[int] = 0

    workers: Optional[int] = None

    headless: Optional[bool] = None

    generateWACZ: Optional[bool] = None
    combineWARC: Optional[bool] = None

    useSitemap: Optional[bool] = False
    failOnFailedSeed: Optional[bool] = False

    logging: Optional[str] = None
    behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
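
    # Example (illustrative): a minimal prefix-scoped config for one seed:
    #   RawCrawlConfig(
    #       seeds=[Seed(url="https://example.com/")],
    #       limit=100,
    #   )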


# ============================================================================
class CrawlConfigIn(BaseModel):
    """CrawlConfig input model, submitted via API"""

    schedule: Optional[str] = ""
    runNow: bool = False

    config: RawCrawlConfig

    name: str

    description: Optional[str]

    jobType: Optional[JobType] = JobType.CUSTOM

    profileid: Union[UUID, EmptyStr, None]

    autoAddCollections: Optional[List[UUID]] = []
    tags: Optional[List[str]] = []

    crawlTimeout: int = 0
    maxCrawlSize: int = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    crawlFilenameTemplate: Optional[str] = None


# ============================================================================
class ConfigRevision(BaseMongoModel):
    """Crawl Config Revision"""

    cid: UUID

    schedule: Optional[str] = ""

    config: RawCrawlConfig

    profileid: Optional[UUID]

    crawlTimeout: Optional[int] = 0
    maxCrawlSize: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    modified: datetime
    modifiedBy: Optional[UUID]

    rev: int = 0


# ============================================================================
class CrawlConfigCore(BaseMongoModel):
    """Core data shared between crawls and crawlconfigs"""

    schedule: Optional[str] = ""

    jobType: Optional[JobType] = JobType.CUSTOM
    config: Optional[RawCrawlConfig]

    tags: Optional[List[str]] = []

    crawlTimeout: Optional[int] = 0
    maxCrawlSize: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    oid: UUID

    profileid: Optional[UUID]


# ============================================================================
class CrawlConfigAdditional(BaseModel):
    """Additional fields shared by CrawlConfig and CrawlConfigOut."""

    name: Optional[str]
    description: Optional[str]

    created: datetime
    createdBy: Optional[UUID]

    modified: Optional[datetime]
    modifiedBy: Optional[UUID]

    autoAddCollections: Optional[List[UUID]] = []

    inactive: Optional[bool] = False

    rev: int = 0

    crawlAttemptCount: Optional[int] = 0
    crawlCount: Optional[int] = 0
    crawlSuccessfulCount: Optional[int] = 0

    totalSize: Optional[int] = 0

    lastCrawlId: Optional[str]
    lastCrawlStartTime: Optional[datetime]
    lastStartedBy: Optional[UUID]
    lastCrawlTime: Optional[datetime]
    lastCrawlState: Optional[str]
    lastCrawlSize: Optional[int]

    lastRun: Optional[datetime]

    isCrawlRunning: Optional[bool] = False


# ============================================================================
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
    """Schedulable config"""

    id: UUID

    config: RawCrawlConfig
    createdByName: Optional[str]
    modifiedByName: Optional[str]
    lastStartedByName: Optional[str]

    def get_raw_config(self):
        """serialize config for browsertrix-crawler"""
        return self.config.dict(exclude_unset=True, exclude_none=True)
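
    # Example (illustrative): exclude_unset / exclude_none means only fields
    # the user actually set reach browsertrix-crawler, e.g.:
    #   {"seeds": [...], "limit": 100}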


# ============================================================================
class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
    """Crawl Config Output"""

    lastCrawlStopping: Optional[bool] = False
    profileName: Optional[str]
    firstSeed: Optional[str]
    seedCount: int = 0

    createdByName: Optional[str]
    modifiedByName: Optional[str]
    lastStartedByName: Optional[str]


# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
    """Crawl Config id and name output only"""

    name: str


# ============================================================================
class UpdateCrawlConfig(BaseModel):
    """Update crawl config name, crawl schedule, or tags"""

    # metadata: not revision tracked
    name: Optional[str] = None
    tags: Optional[List[str]] = None
    description: Optional[str] = None
    autoAddCollections: Optional[List[UUID]] = None
    runNow: bool = False

    # crawl data: revision tracked
    schedule: Optional[str] = None
    profileid: Union[UUID, EmptyStr, None] = None
    crawlTimeout: Optional[int] = None
    maxCrawlSize: Optional[int] = None
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = None  # type: ignore
    crawlFilenameTemplate: Optional[str] = None
    config: Optional[RawCrawlConfig] = None


# ============================================================================

### BASE CRAWLS ###


# ============================================================================
class StorageRef(BaseModel):
    """Reference to actual storage"""

    name: str
    custom: Optional[bool]

    def __init__(self, *args, **kwargs):
        if args:
            if args[0].startswith("cs-"):
                # strip the 3-char "cs-" prefix to recover the storage name
                super().__init__(name=args[0][3:], custom=True)
            else:
                super().__init__(name=args[0], custom=False)
        else:
            super().__init__(**kwargs)

    def __str__(self):
        if not self.custom:
            return self.name
        return "cs-" + self.name

    def get_storage_secret_name(self, oid: str) -> str:
        """get k8s secret name for this storage and oid"""
        if not self.custom:
            return "storage-" + self.name
        return f"storage-cs-{self.name}-{oid[:12]}"

    def get_storage_extra_path(self, oid: str) -> str:
        """return extra path added to the endpoint
        using oid for default storages, no extra path for custom"""
        if not self.custom:
            return oid + "/"
        return ""
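
    # Example (illustrative): the "cs-" prefix round-trips:
    #   str(StorageRef("cs-backup")) == "cs-backup"  # custom=True, name="backup"
    #   str(StorageRef("default")) == "default"      # custom=False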


# ============================================================================
class BaseFile(BaseModel):
    """Base model for crawl and profile files"""

    filename: str
    hash: str
    size: int
    storage: StorageRef

    replicas: Optional[List[StorageRef]] = []


# ============================================================================
class CrawlFile(BaseFile):
    """file from a crawl"""

    presignedUrl: Optional[str]
    expireAt: Optional[datetime]
    crc32: int = 0


# ============================================================================
class CrawlFileOut(BaseModel):
    """output for file from a crawl (conformance to Data Resource Spec)"""

    name: str
    path: str
    hash: str
    crc32: int = 0
    size: int

    crawlId: Optional[str]
    numReplicas: int = 0
    expireAt: Optional[str]


# ============================================================================
class BaseCrawl(BaseMongoModel):
    """Base Crawl object (representing crawls, uploads and manual sessions)"""

    id: str

    type: str

    userid: UUID
    userName: Optional[str]
    oid: UUID

    started: datetime
    finished: Optional[datetime] = None

    name: Optional[str] = ""

    state: str

    stats: Optional[Dict[str, int]] = None

    files: Optional[List[CrawlFile]] = []

    description: Optional[str] = ""

    errors: Optional[List[str]] = []

    collectionIds: Optional[List[UUID]] = []

    fileSize: int = 0
    fileCount: int = 0


# ============================================================================
class CollIdName(BaseModel):
    """Collection id and name object"""

    id: UUID
    name: str


# ============================================================================
class CrawlOut(BaseMongoModel):
    """Crawl output model, shared across all crawl types"""

    # pylint: disable=duplicate-code

    type: Optional[str]

    id: str

    userid: UUID
    userName: Optional[str]
    oid: UUID

    name: Optional[str]
    description: Optional[str]

    started: datetime
    finished: Optional[datetime]

    state: str

    stats: Optional[Dict[str, int]]

    fileSize: int = 0
    fileCount: int = 0

    tags: Optional[List[str]] = []

    errors: Optional[List[str]] = []

    collectionIds: Optional[List[UUID]] = []

    crawlExecSeconds: int = 0

    # automated crawl fields
    config: Optional[RawCrawlConfig]
    cid: Optional[UUID]
    firstSeed: Optional[str]
    seedCount: Optional[int]
    profileName: Optional[str]
    stopping: Optional[bool]
    manual: Optional[bool]
    cid_rev: Optional[int]

    storageQuotaReached: Optional[bool]
    execMinutesQuotaReached: Optional[bool]


# ============================================================================
class CrawlOutWithResources(CrawlOut):
    """Crawl output model including resources"""

    resources: Optional[List[CrawlFileOut]] = []
    collections: Optional[List[CollIdName]] = []


# ============================================================================
class UpdateCrawl(BaseModel):
    """Update crawl"""

    name: Optional[str]
    description: Optional[str]
    tags: Optional[List[str]]
    collectionIds: Optional[List[UUID]]


# ============================================================================
class DeleteCrawlList(BaseModel):
    """delete crawl list POST body"""

    crawl_ids: List[str]


# ============================================================================

### AUTOMATED CRAWLS ###


# ============================================================================
class CrawlScale(BaseModel):
    """scale the crawl to N parallel containers"""

    scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1  # type: ignore


# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
    """Store State of a Crawl (Finished or Running)"""

    type: Literal["crawl"] = "crawl"

    cid: UUID

    config: RawCrawlConfig

    cid_rev: int = 0

    # schedule: Optional[str]
    manual: Optional[bool]

    stopping: Optional[bool] = False

    crawlExecSeconds: int = 0


# ============================================================================
class CrawlCompleteIn(BaseModel):
    """Completed Crawl Webhook POST message"""

    id: str

    user: str

    filename: str
    size: int
    hash: str
    crc32: int = 0

    completed: Optional[bool] = True


# ============================================================================

### UPLOADED CRAWLS ###


# ============================================================================
class UploadedCrawl(BaseCrawl):
    """Store State of a Crawl Upload"""

    type: Literal["upload"] = "upload"

    tags: Optional[List[str]] = []


# ============================================================================
class UpdateUpload(UpdateCrawl):
|     """Update modal that also includes name"""
 | |


# ============================================================================

### COLLECTIONS ###


# ============================================================================
class Collection(BaseMongoModel):
    """Org collection structure"""

    name: str = Field(..., min_length=1)
    oid: UUID
    description: Optional[str]
    modified: Optional[datetime]

    crawlCount: Optional[int] = 0
    pageCount: Optional[int] = 0
    totalSize: Optional[int] = 0

    # Sorted by count, descending
    tags: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollIn(BaseModel):
    """Collection Passed in By User"""

    name: str = Field(..., min_length=1)
    description: Optional[str]
    crawlIds: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollOut(Collection):
    """Collection output model with annotations."""

    resources: List[CrawlFileOut] = []


# ============================================================================
class UpdateColl(BaseModel):
    """Update collection"""

    name: Optional[str]
    description: Optional[str]
    isPublic: Optional[bool]


# ============================================================================
class AddRemoveCrawlList(BaseModel):
|     """Collections to add or remove from collection"""
 | |

    crawlIds: List[str] = []


# ============================================================================

### ORGS ###


# ============================================================================
class UpdateRole(InviteToOrgRequest):
    """Update existing role for user"""


# ============================================================================
class RemoveFromOrg(InviteRequest):
    """Remove this user from org"""


# ============================================================================
class RemovePendingInvite(InviteRequest):
    """Delete pending invite to org by email"""


# ============================================================================
class RenameOrg(BaseModel):
    """Rename an existing org"""

    name: str
    slug: Optional[str] = None


# ============================================================================
class CreateOrg(RenameOrg):
    """Create a new org"""


# ============================================================================
class OrgStorageRefs(BaseModel):
    """Input model for setting primary storage + optional replicas"""

    storage: StorageRef

    storageReplicas: List[StorageRef] = []


# ============================================================================
class S3StorageIn(BaseModel):
    """Custom S3 Storage input model"""

    type: Literal["s3"] = "s3"

    name: str

    access_key: str
    secret_key: str
    endpoint_url: str
    bucket: str
    access_endpoint_url: Optional[str]
    region: str = ""


# ============================================================================
class S3Storage(BaseModel):
    """S3 Storage Model"""

    type: Literal["s3"] = "s3"

    endpoint_url: str
    endpoint_no_bucket_url: str
    access_key: str
    secret_key: str
    access_endpoint_url: str
    region: str = ""
    use_access_for_presign: bool = True


# ============================================================================
class OrgQuotas(BaseModel):
    """Organization quotas (settable by superadmin)"""

    maxConcurrentCrawls: Optional[int] = 0
    maxPagesPerCrawl: Optional[int] = 0
    storageQuota: Optional[int] = 0
    maxExecMinutesPerMonth: Optional[int] = 0


# ============================================================================
class OrgWebhookUrls(BaseModel):
    """Organization webhook URLs"""

    crawlStarted: Optional[AnyHttpUrl] = None
    crawlFinished: Optional[AnyHttpUrl] = None
    crawlDeleted: Optional[AnyHttpUrl] = None
    uploadFinished: Optional[AnyHttpUrl] = None
    uploadDeleted: Optional[AnyHttpUrl] = None
    addedToCollection: Optional[AnyHttpUrl] = None
    removedFromCollection: Optional[AnyHttpUrl] = None
    collectionDeleted: Optional[AnyHttpUrl] = None


# ============================================================================
class OrgOut(BaseMongoModel):
    """Organization API output model"""

    id: UUID
    name: str
    slug: str
    users: Optional[Dict[str, Any]]
    usage: Optional[Dict[str, int]]
    crawlExecSeconds: Optional[Dict[str, int]]
    default: bool = False
    bytesStored: int
    bytesStoredCrawls: int
    bytesStoredUploads: int
    bytesStoredProfiles: int
    origin: Optional[AnyHttpUrl] = None

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
    quotas: Optional[OrgQuotas] = OrgQuotas()

    storageQuotaReached: Optional[bool]
    execMinutesQuotaReached: Optional[bool]


# ============================================================================
class Organization(BaseMongoModel):
    """Organization Base Model"""

    id: UUID

    name: str
    slug: str

    users: Dict[str, UserRole]

    storage: StorageRef

    storageReplicas: List[StorageRef] = []

    customStorages: Dict[str, S3Storage] = {}

    usage: Dict[str, int] = {}
    crawlExecSeconds: Dict[str, int] = {}

    bytesStored: int = 0
    bytesStoredCrawls: int = 0
    bytesStoredUploads: int = 0
    bytesStoredProfiles: int = 0

    default: bool = False

    quotas: Optional[OrgQuotas] = OrgQuotas()

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()

    origin: Optional[AnyHttpUrl] = None

    def is_owner(self, user):
        """Check if user is owner"""
        return self._is_auth(user, UserRole.OWNER)

    def is_crawler(self, user):
        """Check if user can crawl (write)"""
        return self._is_auth(user, UserRole.CRAWLER)

    def is_viewer(self, user):
        """Check if user can view (read)"""
        return self._is_auth(user, UserRole.VIEWER)

    def _is_auth(self, user, value):
        """Check if user has at least specified permission level"""
        if user.is_superuser:
            return True

        res = self.users.get(str(user.id))
        if not res:
            return False

        return res >= value

    async def serialize_for_user(self, user: User, user_manager) -> OrgOut:
        """Serialize result based on current user access"""

        exclude = {"storage"}

        if not self.is_owner(user):
            exclude.add("users")

        if not self.is_crawler(user):
            exclude.add("usage")
            exclude.add("crawlExecSeconds")

        result = self.to_dict(
            exclude_unset=True,
            exclude_none=True,
            exclude=exclude,
        )

        if self.is_owner(user):
            result["users"] = {}

            keys = list(self.users.keys())
            user_list = await user_manager.get_user_names_by_ids(keys)

            for org_user in user_list:
                id_ = str(org_user["id"])
                role = self.users.get(id_)
                if not role:
                    continue

                result["users"][id_] = {
                    "role": role,
                    "name": org_user.get("name", ""),
                    "email": org_user.get("email", ""),
                }

        return OrgOut.from_dict(result)
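
    # Example (illustrative): role checks rely on the UserRole IntEnum
    # ordering, so higher roles imply lower ones:
    #   org.is_viewer(owner_user)    # True  (OWNER >= VIEWER)
    #   org.is_crawler(viewer_user)  # False (VIEWER < CRAWLER)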


# ============================================================================
class OrgMetrics(BaseModel):
    """Organization API metrics model"""

    storageUsedBytes: int
    storageUsedCrawls: int
    storageUsedUploads: int
    storageUsedProfiles: int
    storageQuotaBytes: int
    archivedItemCount: int
    crawlCount: int
    uploadCount: int
    pageCount: int
    profileCount: int
    workflowsRunningCount: int
    maxConcurrentCrawls: int
    workflowsQueuedCount: int
    collectionsCount: int
    publicCollectionsCount: int


# ============================================================================

### PAGINATION ###


# ============================================================================
class PaginatedResponse(BaseModel):
    """Paginated response model"""

    items: List[Any]
    total: int
    page: int
    pageSize: int


# ============================================================================

### PROFILES ###


# ============================================================================
class ProfileFile(BaseFile):
    """file for storing profile data"""


# ============================================================================
class Profile(BaseMongoModel):
    """Browser profile"""

    name: str
    description: Optional[str] = ""

    userid: UUID
    oid: UUID

    origins: List[str]
    resource: Optional[ProfileFile]

    created: Optional[datetime]
    baseid: Optional[UUID] = None


# ============================================================================
class ProfileWithCrawlConfigs(Profile):
    """Profile with list of crawlconfigs using this profile"""

    crawlconfigs: List[CrawlConfigIdNameOut] = []


# ============================================================================
class UrlIn(BaseModel):
    """Request to set url"""

    url: HttpUrl


# ============================================================================
class ProfileLaunchBrowserIn(UrlIn):
    """Request to launch new browser for creating profile"""

    profileId: Optional[UUID] = None


# ============================================================================
class BrowserId(BaseModel):
    """Profile id on newly created profile"""

    browserid: str


# ============================================================================
class ProfileCreate(BaseModel):
    """Create new profile for browser id"""

    browserid: str
    name: str
    description: Optional[str] = ""


# ============================================================================
class ProfileUpdate(BaseModel):
    """Update existing profile with new browser profile or metadata only"""

    browserid: Optional[str] = ""
    name: str
    description: Optional[str] = ""


# ============================================================================

### USERS ###


# ============================================================================
class UserCreateIn(BaseModel):
    """
    User Creation Model exposed to API
    """

    email: EmailStr
    password: str

    name: Optional[str] = ""

    inviteToken: Optional[UUID] = None

    newOrg: bool
    newOrgName: Optional[str] = ""


# ============================================================================
class UserCreate(UserCreateIn):
    """
    User Creation Model
    """

    is_superuser: Optional[bool] = False
    is_verified: Optional[bool] = False


# ============================================================================
class UserUpdateEmailName(BaseModel):
    """
    Update email and/or name
    """

    email: Optional[EmailStr] = None
    name: Optional[str] = None


# ============================================================================
class UserUpdatePassword(BaseModel):
    """
    Update password, requires current password to reset
    """

    email: EmailStr
    password: str
    newPassword: str


# ============================================================================

### WEBHOOKS ###


# ============================================================================
class WebhookNotificationBody(BaseModel):
    """Base POST body model for webhook notifications"""

    # Store as str, not UUID, to make JSON-serializable
    orgId: str


# ============================================================================
class WebhookEventType(str, Enum):
    """Webhook Event Types"""

    CRAWL_STARTED = "crawlStarted"
    CRAWL_FINISHED = "crawlFinished"
    CRAWL_DELETED = "crawlDeleted"

    UPLOAD_FINISHED = "uploadFinished"
    UPLOAD_DELETED = "uploadDeleted"

    ADDED_TO_COLLECTION = "addedToCollection"
    REMOVED_FROM_COLLECTION = "removedFromCollection"
    COLLECTION_DELETED = "collectionDeleted"


# ============================================================================
class BaseCollectionItemBody(WebhookNotificationBody):
    """Webhook notification base POST body for collection changes"""

    collectionId: str
    itemIds: List[str]
    downloadUrl: str


# ============================================================================
class CollectionItemAddedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection additions"""

    event: Literal[
        WebhookEventType.ADDED_TO_COLLECTION
    ] = WebhookEventType.ADDED_TO_COLLECTION


# ============================================================================
class CollectionItemRemovedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection removals"""

    event: Literal[
        WebhookEventType.REMOVED_FROM_COLLECTION
    ] = WebhookEventType.REMOVED_FROM_COLLECTION


# ============================================================================
class CollectionDeletedBody(WebhookNotificationBody):
|     """Webhook notification base POST body for collection changes"""
 | |

    event: Literal[
        WebhookEventType.COLLECTION_DELETED
    ] = WebhookEventType.COLLECTION_DELETED
    collectionId: str


# ============================================================================
class BaseArchivedItemBody(WebhookNotificationBody):
    """Webhook notification POST body for when archived item is started or finished"""

    itemId: str


# ============================================================================
class BaseArchivedItemFinishedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when archived item is finished"""

    resources: List[CrawlFileOut]
    state: str


# ============================================================================
class CrawlStartedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl starts"""

    scheduled: bool = False
    event: Literal[WebhookEventType.CRAWL_STARTED] = WebhookEventType.CRAWL_STARTED


# ============================================================================
class CrawlFinishedBody(BaseArchivedItemFinishedBody):
    """Webhook notification POST body for when crawl finishes"""

    event: Literal[WebhookEventType.CRAWL_FINISHED] = WebhookEventType.CRAWL_FINISHED


# ============================================================================
class CrawlDeletedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl is deleted"""

    event: Literal[WebhookEventType.CRAWL_DELETED] = WebhookEventType.CRAWL_DELETED


# ============================================================================
class UploadFinishedBody(BaseArchivedItemFinishedBody):
    """Webhook notification POST body for when upload finishes"""

    event: Literal[WebhookEventType.UPLOAD_FINISHED] = WebhookEventType.UPLOAD_FINISHED


# ============================================================================
class UploadDeletedBody(BaseArchivedItemBody):
|     """Webhook notification POST body for when upload finishes"""
 | |

    event: Literal[WebhookEventType.UPLOAD_DELETED] = WebhookEventType.UPLOAD_DELETED


# ============================================================================
class WebhookNotification(BaseMongoModel):
|     """Base POST body model for webhook notifications"""
 | |

    event: WebhookEventType
    oid: UUID
    body: Union[
        CrawlStartedBody,
        CrawlFinishedBody,
        CrawlDeletedBody,
        UploadFinishedBody,
        UploadDeletedBody,
        CollectionItemAddedBody,
        CollectionItemRemovedBody,
        CollectionDeletedBody,
    ]
    success: bool = False
    attempts: int = 0
    created: datetime
    lastAttempted: Optional[datetime] = None
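
    # Example (illustrative): the body union member matches the event field,
    # e.g. for a finished crawl:
    #   WebhookNotification(
    #       id=..., event=WebhookEventType.CRAWL_FINISHED, oid=org.id,
    #       body=CrawlFinishedBody(orgId=str(org.id), itemId=crawl_id,
    #                              resources=[...], state="complete"),
    #       created=datetime.now(),
    #   )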


# ============================================================================

### BACKGROUND JOBS ###


# ============================================================================
class BgJobType(str, Enum):
    """Background Job Types"""

    CREATE_REPLICA = "create-replica"
    DELETE_REPLICA = "delete-replica"


# ============================================================================
class BackgroundJob(BaseMongoModel):
    """Model for tracking background jobs"""

    id: str
    type: BgJobType
    oid: UUID
    success: Optional[bool] = None
    started: datetime
    finished: Optional[datetime] = None

    previousAttempts: Optional[List[Dict[str, Optional[datetime]]]] = None


# ============================================================================
class CreateReplicaJob(BackgroundJob):
|     """Model for tracking create of replica jobs"""
 | |

    type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA
    file_path: str
    object_type: str
    object_id: str
    replica_storage: StorageRef


# ============================================================================
class DeleteReplicaJob(BackgroundJob):
    """Model for tracking deletion of replica jobs"""

    type: Literal[BgJobType.DELETE_REPLICA] = BgJobType.DELETE_REPLICA
    file_path: str
    object_type: str
    object_id: str
    replica_storage: StorageRef


# ============================================================================
class AnyJob(BaseModel):
    """Union of all job types, for response model"""

    __root__: Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob]