Fixes #1252

Supports a generic background job system, with two background jobs, CreateReplicaJob and DeleteReplicaJob:

- CreateReplicaJob runs on new crawls, uploads, and profiles, and updates the `replicas` array with info about the replica after the job succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. CreateReplicaJob sets secrets for primary storage + replica storage, while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized (deleted), which should happen immediately when the job is done, either because it succeeds or because the backoffLimit is reached (currently set to 3).
- `/jobs/` API lists all jobs using a paginated response, including filtering and sorting.
- `/jobs/<job id>` returns details for a particular job.
- Tests: nightly tests updated to check create + delete replica jobs for crawls as well as uploads, and to cover the job API endpoints.
- Tests: also fixes timeouts in nightly tests to avoid crawls finishing too quickly.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
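For orientation, a minimal client sketch of the two endpoints (the endpoint paths come from this PR; the base URL, auth handling, and query parameter names are illustrative assumptions, not the definitive API):

```python
import requests

api = "https://btrix.example.com/api"  # hypothetical deployment
auth = {"Authorization": "Bearer <access-token>"}  # placeholder token

# List background jobs; the response is paginated:
# {"items": [...], "total": N, "page": P, "pageSize": S}
jobs = requests.get(
    f"{api}/jobs/", headers=auth, params={"page": 1, "pageSize": 25}
).json()

# Fetch details for a single job by id
detail = requests.get(f"{api}/jobs/{jobs['items'][0]['id']}", headers=auth).json()
```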
"""
|
|
Crawl-related models and types
|
|
"""
|
|
|
|
from datetime import datetime
|
|
from enum import Enum, IntEnum
|
|
from uuid import UUID
|
|
import os
|
|
|
|
from typing import Optional, List, Dict, Union, Literal, Any
|
|
from pydantic import (
|
|
BaseModel,
|
|
conint,
|
|
Field,
|
|
HttpUrl,
|
|
AnyHttpUrl,
|
|
EmailStr,
|
|
ConstrainedStr,
|
|
)
|
|
|
|
# from fastapi_users import models as fastapi_users_models
|
|
|
|
from .db import BaseMongoModel
|
|
|
|
# crawl scale for constraint
|
|
MAX_CRAWL_SCALE = int(os.environ.get("MAX_CRAWL_SCALE", 3))
|
|
|
|
|
|
# pylint: disable=invalid-name, too-many-lines
|
|
# ============================================================================
|
|
class UserRole(IntEnum):
|
|
"""User role"""
|
|
|
|
VIEWER = 10
|
|
CRAWLER = 20
|
|
OWNER = 40
|
|
SUPERADMIN = 100
|
|
|
|
|
|
# ============================================================================
|
|
|
|
### INVITES ###
|
|
|
|
|
|
# ============================================================================
|
|
class InvitePending(BaseMongoModel):
|
|
"""An invite for a new user, with an email and invite token as id"""
|
|
|
|
created: datetime
|
|
inviterEmail: str
|
|
oid: Optional[UUID]
|
|
role: Optional[UserRole] = UserRole.VIEWER
|
|
email: Optional[str]
|
|
|
|
|
|
# ============================================================================
|
|
class InviteRequest(BaseModel):
|
|
"""Request to invite another user"""
|
|
|
|
email: str
|
|
|
|
|
|
# ============================================================================
|
|
class InviteToOrgRequest(InviteRequest):
|
|
"""Request to invite another user to an organization"""
|
|
|
|
role: UserRole
|
|
|
|
|
|
# ============================================================================
|
|
class AddToOrgRequest(InviteRequest):
|
|
"""Request to add a new user to an organization directly"""
|
|
|
|
role: UserRole
|
|
password: str
|
|
name: str
|
|
|
|
|
|
# ============================================================================
|
|
|
|
### MAIN USER MODEL ###
|
|
|
|
|
|
# ============================================================================
|
|
class User(BaseModel):
|
|
"""
|
|
User Model
|
|
"""
|
|
|
|
id: UUID
|
|
|
|
name: str = ""
|
|
email: EmailStr
|
|
is_superuser: bool = False
|
|
is_verified: bool = False
|
|
|
|
invites: Dict[str, InvitePending] = {}
|
|
hashed_password: str
|
|
|
|
    def dict(self, *a, **kw):
        """ensure invites / hashed_password never serialize, just in case"""
        exclude = kw.get("exclude") or set()
        exclude.add("invites")
        exclude.add("hashed_password")
        # write the set back so the exclusions apply even when no
        # "exclude" kwarg was passed in
        kw["exclude"] = exclude
        return super().dict(*a, **kw)


# ============================================================================
class FailedLogin(BaseMongoModel):
    """
    Failed login model
    """

    attempted: datetime = datetime.now()
    email: str

    # Consecutive failed logins, reset to 0 on successful login or after
    # password is reset. On failed_logins >= 5 within the hour before this
    # object is deleted, the user is unable to log in until they reset their
    # password.
    count: int = 1


# ============================================================================
class UserOrgInfoOut(BaseModel):
    """org per user"""

    id: UUID

    name: str
    slug: str
    default: bool
    role: UserRole


# ============================================================================
class UserOut(BaseModel):
    """Output User model"""

    id: UUID

    name: str = ""
    email: EmailStr
    is_superuser: bool = False
    is_verified: bool = False

    orgs: List[UserOrgInfoOut]


# ============================================================================

### CRAWL STATES

# ============================================================================
RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"]

STARTING_STATES = ["starting", "waiting_capacity", "waiting_org_limit"]

FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"]

SUCCESSFUL_STATES = ["complete", "partial_complete"]

RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES]

RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES]

NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]

ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES]


# ============================================================================

### CRAWL CONFIGS ###


# ============================================================================
class JobType(str, Enum):
    """Job Types"""

    URL_LIST = "url-list"
    SEED_CRAWL = "seed-crawl"
    CUSTOM = "custom"


# ============================================================================
class ScopeType(str, Enum):
    """Crawl scope type"""

    PAGE = "page"
    PAGE_SPA = "page-spa"
    PREFIX = "prefix"
    HOST = "host"
    DOMAIN = "domain"
    ANY = "any"
    CUSTOM = "custom"


# ============================================================================
class EmptyStr(ConstrainedStr):
    """empty string only"""

    min_length = 0
    max_length = 0


# ============================================================================
class Seed(BaseModel):
    """Crawl seed"""

    url: HttpUrl
    scopeType: Optional[ScopeType] = None

    include: Union[str, List[str], None] = None
    exclude: Union[str, List[str], None] = None
    sitemap: Union[bool, HttpUrl, None] = None
    allowHash: Optional[bool] = None
    depth: Optional[int] = None
    extraHops: Optional[int] = None


# ============================================================================
class RawCrawlConfig(BaseModel):
    """Base Crawl Config"""

    seeds: Optional[List[Seed]]

    scopeType: Optional[ScopeType] = ScopeType.PREFIX

    include: Union[str, List[str], None] = None
    exclude: Union[str, List[str], None] = None

    depth: Optional[int] = -1
    limit: Optional[int] = 0
    extraHops: Optional[int] = 0

    lang: Optional[str]
    blockAds: Optional[bool] = False

    behaviorTimeout: Optional[int]
    pageLoadTimeout: Optional[int]
    pageExtraDelay: Optional[int] = 0

    workers: Optional[int] = None

    headless: Optional[bool] = None

    generateWACZ: Optional[bool] = None
    combineWARC: Optional[bool] = None

    useSitemap: Optional[bool] = False
    failOnFailedSeed: Optional[bool] = False

    logging: Optional[str] = None
    behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
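
# Illustrative usage (hypothetical values, not used elsewhere in this module):
# a minimal config for a prefix-scoped crawl of one seed could look like
#
#   RawCrawlConfig(
#       seeds=[Seed(url="https://example.com/", depth=1)],
#       scopeType=ScopeType.PREFIX,
#       limit=100,
#   )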


# ============================================================================
class CrawlConfigIn(BaseModel):
    """CrawlConfig input model, submitted via API"""

    schedule: Optional[str] = ""
    runNow: bool = False

    config: RawCrawlConfig

    name: str

    description: Optional[str]

    jobType: Optional[JobType] = JobType.CUSTOM

    profileid: Union[UUID, EmptyStr, None]

    autoAddCollections: Optional[List[UUID]] = []
    tags: Optional[List[str]] = []

    crawlTimeout: int = 0
    maxCrawlSize: int = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    crawlFilenameTemplate: Optional[str] = None


# ============================================================================
class ConfigRevision(BaseMongoModel):
    """Crawl Config Revision"""

    cid: UUID

    schedule: Optional[str] = ""

    config: RawCrawlConfig

    profileid: Optional[UUID]

    crawlTimeout: Optional[int] = 0
    maxCrawlSize: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    modified: datetime
    modifiedBy: Optional[UUID]

    rev: int = 0


# ============================================================================
class CrawlConfigCore(BaseMongoModel):
    """Core data shared between crawls and crawlconfigs"""

    schedule: Optional[str] = ""

    jobType: Optional[JobType] = JobType.CUSTOM
    config: Optional[RawCrawlConfig]

    tags: Optional[List[str]] = []

    crawlTimeout: Optional[int] = 0
    maxCrawlSize: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1  # type: ignore

    oid: UUID

    profileid: Optional[UUID]


# ============================================================================
class CrawlConfigAdditional(BaseModel):
    """Additional fields shared by CrawlConfig and CrawlConfigOut."""

    name: Optional[str]
    description: Optional[str]

    created: datetime
    createdBy: Optional[UUID]

    modified: Optional[datetime]
    modifiedBy: Optional[UUID]

    autoAddCollections: Optional[List[UUID]] = []

    inactive: Optional[bool] = False

    rev: int = 0

    crawlAttemptCount: Optional[int] = 0
    crawlCount: Optional[int] = 0
    crawlSuccessfulCount: Optional[int] = 0

    totalSize: Optional[int] = 0

    lastCrawlId: Optional[str]
    lastCrawlStartTime: Optional[datetime]
    lastStartedBy: Optional[UUID]
    lastCrawlTime: Optional[datetime]
    lastCrawlState: Optional[str]
    lastCrawlSize: Optional[int]

    lastRun: Optional[datetime]

    isCrawlRunning: Optional[bool] = False


# ============================================================================
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
    """Schedulable config"""

    id: UUID

    config: RawCrawlConfig
    createdByName: Optional[str]
    modifiedByName: Optional[str]
    lastStartedByName: Optional[str]

    def get_raw_config(self):
        """serialize config for browsertrix-crawler"""
        return self.config.dict(exclude_unset=True, exclude_none=True)


# ============================================================================
class CrawlConfigOut(CrawlConfigCore, CrawlConfigAdditional):
    """Crawl Config Output"""

    lastCrawlStopping: Optional[bool] = False
    profileName: Optional[str]
    firstSeed: Optional[str]
    seedCount: int = 0

    createdByName: Optional[str]
    modifiedByName: Optional[str]
    lastStartedByName: Optional[str]


# ============================================================================
class CrawlConfigIdNameOut(BaseMongoModel):
    """Crawl Config id and name output only"""

    name: str


# ============================================================================
class UpdateCrawlConfig(BaseModel):
    """Update crawl config name, crawl schedule, or tags"""

    # metadata: not revision tracked
    name: Optional[str] = None
    tags: Optional[List[str]] = None
    description: Optional[str] = None
    autoAddCollections: Optional[List[UUID]] = None
    runNow: bool = False

    # crawl data: revision tracked
    schedule: Optional[str] = None
    profileid: Union[UUID, EmptyStr, None] = None
    crawlTimeout: Optional[int] = None
    maxCrawlSize: Optional[int] = None
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = None  # type: ignore
    crawlFilenameTemplate: Optional[str] = None
    config: Optional[RawCrawlConfig] = None


# ============================================================================

### BASE CRAWLS ###


# ============================================================================
class StorageRef(BaseModel):
    """Reference to actual storage"""

    name: str
    custom: Optional[bool]

    def __init__(self, *args, **kwargs):
        if args:
            if args[0].startswith("cs-"):
                # strip the full "cs-" prefix (3 chars) so __str__ round-trips
                super().__init__(name=args[0][3:], custom=True)
            else:
                super().__init__(name=args[0], custom=False)
        else:
            super().__init__(**kwargs)

    def __str__(self):
        if not self.custom:
            return self.name
        return "cs-" + self.name

    def get_storage_secret_name(self, oid: str) -> str:
        """get k8s secret name for this storage and oid"""
        if not self.custom:
            return "storage-" + self.name
        return f"storage-cs-{self.name}-{oid[:12]}"

    def get_storage_extra_path(self, oid: str) -> str:
        """return extra path added to the endpoint
        using oid for default storages, no extra path for custom"""
        if not self.custom:
            return oid + "/"
        return ""
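
# Illustrative usage (hypothetical names/ids, not used elsewhere in this
# module): StorageRef round-trips between its string form and parsed form,
# with the "cs-" prefix marking custom (per-org) storages:
#
#   ref = StorageRef("cs-my-bucket")     # name="my-bucket", custom=True
#   str(ref)                             # "cs-my-bucket"
#   ref.get_storage_secret_name("1a2b3c4d5e6f7890")
#   # -> "storage-cs-my-bucket-1a2b3c4d5e6f"
#   StorageRef("default").get_storage_extra_path("1a2b3c4d5e6f7890")
#   # -> "1a2b3c4d5e6f7890/"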


# ============================================================================
class BaseFile(BaseModel):
    """Base model for crawl and profile files"""

    filename: str
    hash: str
    size: int
    storage: StorageRef

    replicas: Optional[List[StorageRef]] = []


# ============================================================================
class CrawlFile(BaseFile):
    """file from a crawl"""

    presignedUrl: Optional[str]
    expireAt: Optional[datetime]
    crc32: int = 0


# ============================================================================
class CrawlFileOut(BaseModel):
    """output for file from a crawl (conformance to Data Resource Spec)"""

    name: str
    path: str
    hash: str
    crc32: int = 0
    size: int

    crawlId: Optional[str]
    numReplicas: int = 0
    expireAt: Optional[str]


# ============================================================================
class BaseCrawl(BaseMongoModel):
    """Base Crawl object (representing crawls, uploads and manual sessions)"""

    id: str

    type: str

    userid: UUID
    userName: Optional[str]
    oid: UUID

    started: datetime
    finished: Optional[datetime] = None

    name: Optional[str] = ""

    state: str

    stats: Optional[Dict[str, int]] = None

    files: Optional[List[CrawlFile]] = []

    description: Optional[str] = ""

    errors: Optional[List[str]] = []

    collectionIds: Optional[List[UUID]] = []

    fileSize: int = 0
    fileCount: int = 0


# ============================================================================
class CollIdName(BaseModel):
    """Collection id and name object"""

    id: UUID
    name: str


# ============================================================================
class CrawlOut(BaseMongoModel):
    """Crawl output model, shared across all crawl types"""

    # pylint: disable=duplicate-code

    type: Optional[str]

    id: str

    userid: UUID
    userName: Optional[str]
    oid: UUID

    name: Optional[str]
    description: Optional[str]

    started: datetime
    finished: Optional[datetime]

    state: str

    stats: Optional[Dict[str, int]]

    fileSize: int = 0
    fileCount: int = 0

    tags: Optional[List[str]] = []

    errors: Optional[List[str]] = []

    collectionIds: Optional[List[UUID]] = []

    crawlExecSeconds: int = 0

    # automated crawl fields
    config: Optional[RawCrawlConfig]
    cid: Optional[UUID]
    firstSeed: Optional[str]
    seedCount: Optional[int]
    profileName: Optional[str]
    stopping: Optional[bool]
    manual: Optional[bool]
    cid_rev: Optional[int]

    storageQuotaReached: Optional[bool]
    execMinutesQuotaReached: Optional[bool]


# ============================================================================
class CrawlOutWithResources(CrawlOut):
    """Crawl output model including resources"""

    resources: Optional[List[CrawlFileOut]] = []
    collections: Optional[List[CollIdName]] = []


# ============================================================================
class UpdateCrawl(BaseModel):
    """Update crawl"""

    name: Optional[str]
    description: Optional[str]
    tags: Optional[List[str]]
    collectionIds: Optional[List[UUID]]


# ============================================================================
class DeleteCrawlList(BaseModel):
    """delete crawl list POST body"""

    crawl_ids: List[str]


# ============================================================================

### AUTOMATED CRAWLS ###


# ============================================================================
class CrawlScale(BaseModel):
    """scale the crawl to N parallel containers"""

    scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1  # type: ignore


# ============================================================================
class Crawl(BaseCrawl, CrawlConfigCore):
    """Store State of a Crawl (Finished or Running)"""

    type: Literal["crawl"] = "crawl"

    cid: UUID

    config: RawCrawlConfig

    cid_rev: int = 0

    # schedule: Optional[str]
    manual: Optional[bool]

    stopping: Optional[bool] = False

    crawlExecSeconds: int = 0


# ============================================================================
class CrawlCompleteIn(BaseModel):
    """Completed Crawl Webhook POST message"""

    id: str

    user: str

    filename: str
    size: int
    hash: str
    crc32: int = 0

    completed: Optional[bool] = True


# ============================================================================

### UPLOADED CRAWLS ###


# ============================================================================
class UploadedCrawl(BaseCrawl):
    """Store State of a Crawl Upload"""

    type: Literal["upload"] = "upload"

    tags: Optional[List[str]] = []


# ============================================================================
class UpdateUpload(UpdateCrawl):
    """Update model that also includes name"""


# ============================================================================

### COLLECTIONS ###


# ============================================================================
class Collection(BaseMongoModel):
    """Org collection structure"""

    name: str = Field(..., min_length=1)
    oid: UUID
    description: Optional[str]
    modified: Optional[datetime]

    crawlCount: Optional[int] = 0
    pageCount: Optional[int] = 0
    totalSize: Optional[int] = 0

    # Sorted by count, descending
    tags: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollIn(BaseModel):
    """Collection Passed in By User"""

    name: str = Field(..., min_length=1)
    description: Optional[str]
    crawlIds: Optional[List[str]] = []

    isPublic: Optional[bool] = False


# ============================================================================
class CollOut(Collection):
    """Collection output model with annotations."""

    resources: List[CrawlFileOut] = []


# ============================================================================
class UpdateColl(BaseModel):
    """Update collection"""

    name: Optional[str]
    description: Optional[str]
    isPublic: Optional[bool]


# ============================================================================
class AddRemoveCrawlList(BaseModel):
    """Crawls to add to or remove from a collection"""

    crawlIds: List[str] = []


# ============================================================================

### ORGS ###


# ============================================================================
class UpdateRole(InviteToOrgRequest):
    """Update existing role for user"""


# ============================================================================
class RemoveFromOrg(InviteRequest):
    """Remove this user from org"""


# ============================================================================
class RemovePendingInvite(InviteRequest):
    """Delete pending invite to org by email"""


# ============================================================================
class RenameOrg(BaseModel):
    """Rename an existing org"""

    name: str
    slug: Optional[str] = None


# ============================================================================
class CreateOrg(RenameOrg):
    """Create a new org"""


# ============================================================================
class OrgStorageRefs(BaseModel):
    """Input model for setting primary storage + optional replicas"""

    storage: StorageRef

    storageReplicas: List[StorageRef] = []


# ============================================================================
class S3StorageIn(BaseModel):
    """Custom S3 Storage input model"""

    type: Literal["s3"] = "s3"

    name: str

    access_key: str
    secret_key: str
    endpoint_url: str
    bucket: str
    access_endpoint_url: Optional[str]
    region: str = ""


# ============================================================================
class S3Storage(BaseModel):
    """S3 Storage Model"""

    type: Literal["s3"] = "s3"

    endpoint_url: str
    endpoint_no_bucket_url: str
    access_key: str
    secret_key: str
    access_endpoint_url: str
    region: str = ""
    use_access_for_presign: bool = True


# ============================================================================
class OrgQuotas(BaseModel):
    """Organization quotas (settable by superadmin)"""

    maxConcurrentCrawls: Optional[int] = 0
    maxPagesPerCrawl: Optional[int] = 0
    storageQuota: Optional[int] = 0
    maxExecMinutesPerMonth: Optional[int] = 0


# ============================================================================
class OrgWebhookUrls(BaseModel):
    """Organization webhook URLs"""

    crawlStarted: Optional[AnyHttpUrl] = None
    crawlFinished: Optional[AnyHttpUrl] = None
    uploadFinished: Optional[AnyHttpUrl] = None
    addedToCollection: Optional[AnyHttpUrl] = None
    removedFromCollection: Optional[AnyHttpUrl] = None


# ============================================================================
class OrgOut(BaseMongoModel):
    """Organization API output model"""

    id: UUID
    name: str
    slug: str
    users: Optional[Dict[str, Any]]
    usage: Optional[Dict[str, int]]
    crawlExecSeconds: Optional[Dict[str, int]]
    default: bool = False
    bytesStored: int
    bytesStoredCrawls: int
    bytesStoredUploads: int
    bytesStoredProfiles: int
    origin: Optional[AnyHttpUrl] = None

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()
    quotas: Optional[OrgQuotas] = OrgQuotas()

    storageQuotaReached: Optional[bool]
    execMinutesQuotaReached: Optional[bool]


# ============================================================================
class Organization(BaseMongoModel):
    """Organization Base Model"""

    id: UUID

    name: str
    slug: str

    users: Dict[str, UserRole]

    storage: StorageRef

    storageReplicas: List[StorageRef] = []

    customStorages: Dict[str, S3Storage] = {}

    usage: Dict[str, int] = {}
    crawlExecSeconds: Dict[str, int] = {}

    bytesStored: int = 0
    bytesStoredCrawls: int = 0
    bytesStoredUploads: int = 0
    bytesStoredProfiles: int = 0

    default: bool = False

    quotas: Optional[OrgQuotas] = OrgQuotas()

    webhookUrls: Optional[OrgWebhookUrls] = OrgWebhookUrls()

    origin: Optional[AnyHttpUrl] = None

    def is_owner(self, user):
        """Check if user is owner"""
        return self._is_auth(user, UserRole.OWNER)

    def is_crawler(self, user):
        """Check if user can crawl (write)"""
        return self._is_auth(user, UserRole.CRAWLER)

    def is_viewer(self, user):
        """Check if user can view (read)"""
        return self._is_auth(user, UserRole.VIEWER)

    def _is_auth(self, user, value):
        """Check if user has at least specified permission level"""
        if user.is_superuser:
            return True

        res = self.users.get(str(user.id))
        if not res:
            return False

        return res >= value

    async def serialize_for_user(self, user: User, user_manager) -> OrgOut:
        """Serialize result based on current user access"""

        exclude = {"storage"}

        if not self.is_owner(user):
            exclude.add("users")

        if not self.is_crawler(user):
            exclude.add("usage")
            exclude.add("crawlExecSeconds")

        result = self.to_dict(
            exclude_unset=True,
            exclude_none=True,
            exclude=exclude,
        )

        if self.is_owner(user):
            result["users"] = {}

            keys = list(self.users.keys())
            user_list = await user_manager.get_user_names_by_ids(keys)

            for org_user in user_list:
                id_ = str(org_user["id"])
                role = self.users.get(id_)
                if not role:
                    continue

                result["users"][id_] = {
                    "role": role,
                    "name": org_user.get("name", ""),
                    "email": org_user.get("email", ""),
                }

        return OrgOut.from_dict(result)
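
# Illustrative usage (hypothetical user object): because UserRole is an
# IntEnum, the permission checks above reduce to integer comparison:
#
#   org.users[str(user.id)] = UserRole.OWNER
#   org.is_crawler(user)   # True: OWNER (40) >= CRAWLER (20)
#   org.is_viewer(user)    # True: OWNER (40) >= VIEWER (10)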


# ============================================================================
class OrgMetrics(BaseModel):
    """Organization API metrics model"""

    storageUsedBytes: int
    storageUsedCrawls: int
    storageUsedUploads: int
    storageUsedProfiles: int
    storageQuotaBytes: int
    archivedItemCount: int
    crawlCount: int
    uploadCount: int
    pageCount: int
    profileCount: int
    workflowsRunningCount: int
    maxConcurrentCrawls: int
    workflowsQueuedCount: int
    collectionsCount: int
    publicCollectionsCount: int


# ============================================================================

### PAGINATION ###


# ============================================================================
class PaginatedResponse(BaseModel):
    """Paginated response model"""

    items: List[Any]
    total: int
    page: int
    pageSize: int
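
# Illustrative shape (hypothetical values): list endpoints such as the new
# /jobs/ API wrap their results in this model, e.g.
#
#   {"items": [...], "total": 42, "page": 1, "pageSize": 25}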


# ============================================================================

### PROFILES ###


# ============================================================================
class ProfileFile(BaseFile):
    """file for storing profile data"""


# ============================================================================
class Profile(BaseMongoModel):
    """Browser profile"""

    name: str
    description: Optional[str] = ""

    userid: UUID
    oid: UUID

    origins: List[str]
    resource: Optional[ProfileFile]

    created: Optional[datetime]
    baseid: Optional[UUID] = None


# ============================================================================
class ProfileWithCrawlConfigs(Profile):
    """Profile with list of crawlconfigs using this profile"""

    crawlconfigs: List[CrawlConfigIdNameOut] = []


# ============================================================================
class UrlIn(BaseModel):
    """Request to set url"""

    url: HttpUrl


# ============================================================================
class ProfileLaunchBrowserIn(UrlIn):
    """Request to launch new browser for creating profile"""

    profileId: Optional[UUID] = None


# ============================================================================
class BrowserId(BaseModel):
    """Profile id on newly created profile"""

    browserid: str


# ============================================================================
class ProfileCreate(BaseModel):
    """Create new profile for browser id"""

    browserid: str
    name: str
    description: Optional[str] = ""


# ============================================================================
class ProfileUpdate(BaseModel):
    """Update existing profile with new browser profile or metadata only"""

    browserid: Optional[str] = ""
    name: str
    description: Optional[str] = ""


# ============================================================================

### USERS ###


# ============================================================================
class UserCreateIn(BaseModel):
    """
    User Creation Model exposed to API
    """

    email: EmailStr
    password: str

    name: Optional[str] = ""

    inviteToken: Optional[UUID] = None

    newOrg: bool
    newOrgName: Optional[str] = ""


# ============================================================================
class UserCreate(UserCreateIn):
    """
    User Creation Model
    """

    is_superuser: Optional[bool] = False
    is_verified: Optional[bool] = False


# ============================================================================
class UserUpdateEmailName(BaseModel):
    """
    Update email and/or name
    """

    email: Optional[EmailStr] = None
    name: Optional[str] = None


# ============================================================================
class UserUpdatePassword(BaseModel):
    """
    Update password, requires current password to reset
    """

    email: EmailStr
    password: str
    newPassword: str


# ============================================================================

### WEBHOOKS ###


# ============================================================================
class WebhookNotificationBody(BaseModel):
    """Base POST body model for webhook notifications"""

    downloadUrls: Optional[List] = None

    # Store as str, not UUID, to make JSON-serializable
    orgId: str


# ============================================================================
class WebhookEventType(str, Enum):
    """Webhook Event Types"""

    CRAWL_STARTED = "crawlStarted"
    CRAWL_FINISHED = "crawlFinished"
    UPLOAD_FINISHED = "uploadFinished"

    ADDED_TO_COLLECTION = "addedToCollection"
    REMOVED_FROM_COLLECTION = "removedFromCollection"


# ============================================================================
class BaseCollectionItemBody(WebhookNotificationBody):
    """Webhook notification base POST body for collection changes"""

    collectionId: str
    itemIds: List[str]


# ============================================================================
class CollectionItemAddedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection additions"""

    event: Literal[
        WebhookEventType.ADDED_TO_COLLECTION
    ] = WebhookEventType.ADDED_TO_COLLECTION


# ============================================================================
class CollectionItemRemovedBody(BaseCollectionItemBody):
    """Webhook notification POST body for collection removals"""

    event: Literal[
        WebhookEventType.REMOVED_FROM_COLLECTION
    ] = WebhookEventType.REMOVED_FROM_COLLECTION


# ============================================================================
class BaseArchivedItemBody(WebhookNotificationBody):
    """Webhook notification POST body for when archived item is started or finished"""

    itemId: str
    resources: Optional[List[CrawlFileOut]] = None


# ============================================================================
class CrawlStartedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl starts"""

    scheduled: bool = False
    event: Literal[WebhookEventType.CRAWL_STARTED] = WebhookEventType.CRAWL_STARTED


# ============================================================================
class CrawlFinishedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when crawl finishes"""

    event: Literal[WebhookEventType.CRAWL_FINISHED] = WebhookEventType.CRAWL_FINISHED
    state: str


# ============================================================================
class UploadFinishedBody(BaseArchivedItemBody):
    """Webhook notification POST body for when upload finishes"""

    event: Literal[WebhookEventType.UPLOAD_FINISHED] = WebhookEventType.UPLOAD_FINISHED
    state: str


# ============================================================================
class WebhookNotification(BaseMongoModel):
    """Stored webhook notification, tracking delivery status and attempts"""

    event: WebhookEventType
    oid: UUID
    body: Union[
        CrawlStartedBody,
        CrawlFinishedBody,
        UploadFinishedBody,
        CollectionItemAddedBody,
        CollectionItemRemovedBody,
    ]
    success: bool = False
    attempts: int = 0
    created: datetime
    lastAttempted: Optional[datetime] = None


# ============================================================================

### BACKGROUND JOBS ###


class BgJobType(str, Enum):
    """Background Job Types"""

    CREATE_REPLICA = "create-replica"
    DELETE_REPLICA = "delete-replica"


# ============================================================================
class BackgroundJob(BaseMongoModel):
    """Model for tracking background jobs"""

    id: str
    type: BgJobType
    oid: UUID
    success: Optional[bool] = None
    started: datetime
    finished: Optional[datetime] = None


# ============================================================================
class CreateReplicaJob(BackgroundJob):
    """Model for tracking replica creation jobs"""

    type: Literal[BgJobType.CREATE_REPLICA] = BgJobType.CREATE_REPLICA
    file_path: str
    object_type: str
    object_id: str
    replica_storage: StorageRef


# ============================================================================
class DeleteReplicaJob(BackgroundJob):
    """Model for tracking replica deletion jobs"""

    type: Literal[BgJobType.DELETE_REPLICA] = BgJobType.DELETE_REPLICA
    file_path: str
    object_type: str
    object_id: str
    replica_storage: StorageRef


# ============================================================================
class AnyJob(BaseModel):
    """Union of all job types, for response model"""

    __root__: Union[CreateReplicaJob, DeleteReplicaJob, BackgroundJob]
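
# Illustrative usage (hypothetical document values): AnyJob wraps a Union so
# the jobs API can return any job type from one response model. Pydantic
# tries the union members in order, and the Literal "type" fields keep the
# replica job models from matching each other, with BackgroundJob as the
# fallback:
#
#   AnyJob.parse_obj(
#       {
#           "id": "job-123",
#           "type": "create-replica",
#           "oid": "...",  # org UUID
#           "started": "2023-10-24T00:00:00Z",
#           "file_path": "path/to/file.wacz",
#           "object_type": "crawl",
#           "object_id": "crawl-123",
#           "replica_storage": {"name": "replica-0", "custom": False},
#       }
#   ).__root__  # validated as CreateReplicaJob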