browsertrix/backend/crawlconfigs.py
Ilya Kreymer 2f63c7dcf8
Profiles: Backend API + Nginx Devtools Proxy Support (#212)
* add profile creation, list endpoints at /archives/<aid>/profiles
* add profile browser creation, get, ping, commit, delete endpoints at /archives/<aid>/profiles/browser
* support creation of profile browser using browsertrix-crawler 'create-login-profile' in docker and k8s
* ensure profile browser expires after set time, k8s job or docker container automatically deleted on exit
* profile browser creation returns temporary browser id, or `{"detail": "waiting_for_browser"}` while waiting for browser container init
* nginx frontend: proxy /loadbrowser/ to port 9223 in browsertrix-crawler, connecting directly to chrome devtools
* profile api auth: use redis for auth
- store browserid->archiveid and browserid->browser ip mapping in redis
- browser apis: ensure profile browser is associated with specified archive
- browser ws: pass archiveid and browserid as ws query args; verify the browserid belongs to the specified archive and corresponds to the expected browser ip
* store profiles in /profiles/ directory in default storage, include profileid in profile tar.gz filename

* support profile in crawlconfig:
- add profileid to CrawlConfig, and profileName to CrawlConfigOut
- support resolving profile path via profileid, setting '--profile @{path/to/profile.tar.gz}' for crawler (assuming same storage for profile as output for now) in both docker and k8s setups
- docker: support out_filename, adding the previously missing custom WACZ output filename functionality
2022-04-13 19:36:06 -07:00


"""
Crawl Config API handling
"""
from typing import List, Union, Optional
from enum import Enum
import uuid
import asyncio
import re
from datetime import datetime
import pymongo
from pydantic import BaseModel, UUID4, conint
from fastapi import APIRouter, Depends, HTTPException
from users import User
from archives import Archive, MAX_CRAWL_SCALE
from db import BaseMongoModel
# ============================================================================
class ScopeType(str, Enum):
"""Crawl scope type"""
PAGE = "page"
PAGE_SPA = "page-spa"
PREFIX = "prefix"
HOST = "host"
DOMAIN = "domain"
ANY = "any"


# ============================================================================
class Seed(BaseModel):
    """Crawl seed"""

    url: str
    scopeType: Optional[ScopeType] = ScopeType.PREFIX
    include: Union[str, List[str], None]
    exclude: Union[str, List[str], None]
    sitemap: Union[bool, str, None]
    allowHash: Optional[bool]
    depth: Optional[int]


# ============================================================================
class RawCrawlConfig(BaseModel):
    """Base Crawl Config"""

    seeds: List[Union[str, Seed]]
    scopeType: Optional[ScopeType] = ScopeType.PREFIX
    include: Union[str, List[str], None]
    exclude: Union[str, List[str], None]
    depth: Optional[int] = -1
    limit: Optional[int] = 0
    extraHops: Optional[int] = 0
    behaviorTimeout: Optional[int]
    workers: Optional[int]
    headless: Optional[bool]
    generateWACZ: Optional[bool]
    combineWARC: Optional[bool]
    logging: Optional[str]
    behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"


# ============================================================================
class CrawlConfigIn(BaseModel):
    """CrawlConfig input model, submitted via API"""

    schedule: Optional[str] = ""
    runNow: Optional[bool] = False
    config: RawCrawlConfig
    name: Optional[str]
    profileid: Optional[str]
    colls: Optional[List[str]] = []
    crawlTimeout: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
    oldId: Optional[UUID4]
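
# Illustrative example (not from the source) of a CrawlConfigIn payload as it
# might be POSTed to the crawlconfigs endpoint; the archive id comes from the
# URL path, and all values below are made up:
#
#   {
#     "name": "example crawl",
#     "runNow": true,
#     "schedule": "0 0 * * *",
#     "profileid": "<profile-uuid>",
#     "config": {
#       "seeds": ["https://example.com/"],
#       "scopeType": "prefix",
#       "limit": 100
#     }
#   }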


# ============================================================================
class CrawlConfig(BaseMongoModel):
    """Schedulable config"""

    schedule: Optional[str] = ""
    config: RawCrawlConfig
    name: Optional[str]
    created: Optional[datetime]
    colls: Optional[List[str]] = []
    crawlTimeout: Optional[int] = 0
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
    aid: UUID4
    userid: UUID4
    profileid: Optional[UUID4]
    crawlCount: Optional[int] = 0
    lastCrawlId: Optional[str]
    lastCrawlTime: Optional[datetime]
    lastCrawlState: Optional[str]
    newId: Optional[UUID4]
    oldId: Optional[UUID4]
    inactive: Optional[bool] = False

    def get_raw_config(self):
        """ serialize config for browsertrix-crawler """
        return self.config.dict(exclude_unset=True, exclude_none=True)
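
    # Note: dict(exclude_unset=True, exclude_none=True) drops fields the user
    # never set, so only explicitly provided values are serialized into the
    # config handed to browsertrix-crawler; everything else is left to the
    # crawler's own defaults.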


# ============================================================================
class CrawlConfigOut(CrawlConfig):
    """Crawl Config Output, includes currCrawlId of running crawl"""

    currCrawlId: Optional[str]
    profileName: Optional[str]
    userName: Optional[str]


# ============================================================================
class CrawlConfigsResponse(BaseModel):
    """ model for crawl configs response """

    crawlConfigs: List[CrawlConfigOut]


# ============================================================================
class UpdateCrawlConfig(BaseModel):
    """ Update crawl config name or crawl schedule """

    name: Optional[str]
    schedule: Optional[str]
    scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]


# ============================================================================
# pylint: disable=too-many-instance-attributes,too-many-arguments
class CrawlConfigOps:
    """Crawl Config Operations"""

    def __init__(
        self, dbclient, mdb, user_manager, archive_ops, crawl_manager, profiles
    ):
        self.dbclient = dbclient
        self.crawl_configs = mdb["crawl_configs"]
        self.user_manager = user_manager
        self.archive_ops = archive_ops
        self.crawl_manager = crawl_manager
        self.profiles = profiles

        self.router = APIRouter(
            prefix="/crawlconfigs",
            tags=["crawlconfigs"],
            responses={404: {"description": "Not found"}},
        )

        self.coll_ops = None

        self._file_rx = re.compile("\\W+")

        asyncio.create_task(self.init_index())

    async def init_index(self):
        """ init index for crawls db """
        await self.crawl_configs.create_index(
            [("aid", pymongo.HASHED), ("inactive", pymongo.ASCENDING)]
        )

    def set_coll_ops(self, coll_ops):
        """ set collection ops """
        self.coll_ops = coll_ops

    def sanitize(self, string=""):
        """ sanitize string for use in wacz filename"""
        return self._file_rx.sub("-", string.lower())
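
    # e.g. sanitize("My Crawl Config") -> "my-crawl-config": runs of non-word
    # characters are collapsed to a single "-" to form a safe filename component.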

    async def add_crawl_config(
        self, config: CrawlConfigIn, archive: Archive, user: User
    ):
        """Add new crawl config"""
        data = config.dict()
        data["aid"] = archive.id
        data["userid"] = user.id
        data["_id"] = uuid.uuid4()
        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)

        profile_filename = None
        if config.profileid:
            profile_filename = await self.profiles.get_profile_storage_path(
                uuid.UUID(config.profileid), archive
            )
            if not profile_filename:
                raise HTTPException(status_code=400, detail="invalid_profile_id")

        if config.colls:
            data["colls"] = await self.coll_ops.find_collections(
                archive.id, config.colls
            )

        old_id = data.get("oldId")

        if old_id:
            old_config = await self.get_crawl_config(old_id, archive)
            async with await self.dbclient.start_session() as sesh:
                async with sesh.start_transaction():
                    if (
                        await self.make_inactive_or_delete(old_config, data["_id"])
                        == "deleted"
                    ):
                        data["oldId"] = old_config.oldId

                    result = await self.crawl_configs.insert_one(data)
        else:
            result = await self.crawl_configs.insert_one(data)

        crawlconfig = CrawlConfig.from_dict(data)

        # pylint: disable=line-too-long
        out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"

        new_name = await self.crawl_manager.add_crawl_config(
            crawlconfig=crawlconfig,
            storage=archive.storage,
            run_now=config.runNow,
            out_filename=out_filename,
            profile_filename=profile_filename,
        )
        return result, new_name
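
    # Note on the oldId flow above: when a config is re-submitted with oldId set,
    # the previous config is deactivated (or deleted outright if it never ran)
    # in the same transaction as the insert, and if it was deleted, the new
    # document inherits the older config's oldId link.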

    async def update_crawl_config(self, cid: uuid.UUID, update: UpdateCrawlConfig):
        """ Update name, scale and/or schedule for an existing crawl config """

        # set update query
        query = update.dict(
            exclude_unset=True, exclude_defaults=True, exclude_none=True
        )

        if len(query) == 0:
            raise HTTPException(status_code=400, detail="no_update_data")

        # update schedule in crawl manager first
        if update.schedule is not None or update.scale is not None:
            try:
                await self.crawl_manager.update_crawl_schedule_or_scale(
                    str(cid), update.schedule, update.scale
                )
            except Exception:
                # pylint: disable=raise-missing-from
                raise HTTPException(
                    status_code=404, detail=f"Crawl Config '{cid}' not found"
                )

        # update in db
        if not await self.crawl_configs.find_one_and_update(
            {"_id": cid, "inactive": {"$ne": True}}, {"$set": query}
        ):
            raise HTTPException(
                status_code=404, detail=f"Crawl Config '{cid}' not found"
            )

        return {"success": True}

    async def inc_crawls(
        self, cid: uuid.UUID, crawl_id: str, finished: datetime, state: str
    ):
        """ Increment Crawl Counter """
        await self.crawl_configs.find_one_and_update(
            {"_id": cid, "inactive": {"$ne": True}},
            {
                "$inc": {"crawlCount": 1},
                "$set": {
                    "lastCrawlId": crawl_id,
                    "lastCrawlTime": finished,
                    "lastCrawlState": state,
                },
            },
        )

    async def get_crawl_configs(self, archive: Archive):
        """Get all crawl configs for an archive"""
        cursor = self.crawl_configs.aggregate(
            [
                {"$match": {"aid": archive.id, "inactive": {"$ne": True}}},
                {
                    "$lookup": {
                        "from": "users",
                        "localField": "userid",
                        "foreignField": "id",
                        "as": "userName",
                    },
                },
                {"$set": {"userName": {"$arrayElemAt": ["$userName.name", 0]}}},
            ]
        )
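
        # the $lookup above returns matching users as an array; the $set stage
        # flattens it so userName is the single matching user's name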

        results = await cursor.to_list(length=1000)

        crawls = await self.crawl_manager.list_running_crawls(aid=archive.id)

        running = {}
        for crawl in crawls:
            running[crawl.cid] = crawl.id

        configs = []
        for res in results:
            config = CrawlConfigOut.from_dict(res)
            # pylint: disable=invalid-name
            config.currCrawlId = running.get(config.id)
            configs.append(config)

        return CrawlConfigsResponse(crawlConfigs=configs)

    async def get_running_crawl(self, crawlconfig: CrawlConfig):
        """ Return the id of currently running crawl for this config, if any """
        crawls = await self.crawl_manager.list_running_crawls(cid=crawlconfig.id)
        if len(crawls) == 1:
            return crawls[0].id

        return None

    async def get_crawl_config_out(self, cid: uuid.UUID, archive: Archive):
        """Return CrawlConfigOut, including the id of the currently running crawl,
        if any; inactive crawl configs are also returned"""
        crawlconfig = await self.get_crawl_config(
            cid, archive, active_only=False, config_cls=CrawlConfigOut
        )
        if not crawlconfig:
            raise HTTPException(
                status_code=404, detail=f"Crawl Config '{cid}' not found"
            )

        if not crawlconfig.inactive:
            crawlconfig.currCrawlId = await self.get_running_crawl(crawlconfig)

        user = await self.user_manager.get(crawlconfig.userid)
        # pylint: disable=invalid-name
        if user:
            crawlconfig.userName = user.name

        if crawlconfig.profileid:
            crawlconfig.profileName = await self.profiles.get_profile_name(
                crawlconfig.profileid, archive
            )

        return crawlconfig

    async def get_crawl_config(
        self,
        cid: uuid.UUID,
        archive: Optional[Archive],
        active_only: bool = True,
        config_cls=CrawlConfig,
    ):
        """Get a crawl config for the given archive by unique id"""
        query = {"_id": cid}
        if archive:
            query["aid"] = archive.id

        if active_only:
            query["inactive"] = {"$ne": True}

        res = await self.crawl_configs.find_one(query)
        return config_cls.from_dict(res)

    async def make_inactive_or_delete(
        self, crawlconfig: CrawlConfig, new_id: uuid.UUID = None
    ):
        """Make config inactive if crawls exist, otherwise delete it"""
        query = {"inactive": True}

        if new_id:
            crawlconfig.newId = query["newId"] = new_id

        if await self.get_running_crawl(crawlconfig):
            raise HTTPException(status_code=400, detail="crawl_running_cant_deactivate")

        # set to either "deleted" or "deactivated"
        status = None

        # if no crawls have been run, actually delete
        if not crawlconfig.crawlCount:
            result = await self.crawl_configs.delete_one(
                {"_id": crawlconfig.id, "aid": crawlconfig.aid}
            )

            if result.deleted_count != 1:
                raise HTTPException(status_code=404, detail="failed_to_delete")

            if crawlconfig.oldId:
                await self.crawl_configs.find_one_and_update(
                    {"_id": crawlconfig.oldId}, {"$set": query}
                )

            status = "deleted"

        else:
            if not await self.crawl_configs.find_one_and_update(
                {"_id": crawlconfig.id, "inactive": {"$ne": True}},
                {"$set": query},
            ):
                raise HTTPException(status_code=404, detail="failed_to_deactivate")

            status = "deactivated"

        # delete from crawl manager, but not from db
        await self.crawl_manager.delete_crawl_config_by_id(crawlconfig.id)

        return status
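
    # The returned status ("deleted" or "deactivated") is surfaced to API callers
    # via do_make_inactive() and also drives the oldId handling in
    # add_crawl_config() when an existing config is replaced.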

    async def do_make_inactive(self, crawlconfig: CrawlConfig):
        """ perform make_inactive in a transaction """

        async with await self.dbclient.start_session() as sesh:
            async with sesh.start_transaction():
                status = await self.make_inactive_or_delete(crawlconfig)

        return {"success": True, "status": status}


# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
def init_crawl_config_api(
    dbclient, mdb, user_dep, user_manager, archive_ops, crawl_manager, profiles
):
    """Init /crawlconfigs api routes"""
    ops = CrawlConfigOps(
        dbclient, mdb, user_manager, archive_ops, crawl_manager, profiles
    )

    router = ops.router

    archive_crawl_dep = archive_ops.archive_crawl_dep

    @router.get("", response_model=CrawlConfigsResponse)
    async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
        return await ops.get_crawl_configs(archive)

    @router.get("/{cid}", response_model=CrawlConfigOut)
    async def get_crawl_config(cid: str, archive: Archive = Depends(archive_crawl_dep)):
        return await ops.get_crawl_config_out(uuid.UUID(cid), archive)

    @router.post("/")
    async def add_crawl_config(
        config: CrawlConfigIn,
        archive: Archive = Depends(archive_crawl_dep),
        user: User = Depends(user_dep),
    ):
        res, new_job_name = await ops.add_crawl_config(config, archive, user)
        return {"added": str(res.inserted_id), "run_now_job": new_job_name}

    @router.patch("/{cid}", dependencies=[Depends(archive_crawl_dep)])
    async def update_crawl_config(
        update: UpdateCrawlConfig,
        cid: str,
    ):
        return await ops.update_crawl_config(uuid.UUID(cid), update)

    @router.post("/{cid}/run")
    async def run_now(
        cid: str,
        archive: Archive = Depends(archive_crawl_dep),
        user: User = Depends(user_dep),
    ):
        crawlconfig = await ops.get_crawl_config(uuid.UUID(cid), archive)

        if not crawlconfig:
            raise HTTPException(
                status_code=404, detail=f"Crawl Config '{cid}' not found"
            )

        crawl_id = None
        try:
            crawl_id = await crawl_manager.run_crawl_config(cid, userid=str(user.id))
        except Exception as e:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=500, detail=f"Error starting crawl: {e}")

        return {"started": crawl_id}

    @router.delete("/{cid}")
    async def make_inactive(cid: str, archive: Archive = Depends(archive_crawl_dep)):
        crawlconfig = await ops.get_crawl_config(uuid.UUID(cid), archive)

        if not crawlconfig:
            raise HTTPException(
                status_code=404, detail=f"Crawl Config '{cid}' not found"
            )

        return await ops.do_make_inactive(crawlconfig)

    archive_ops.router.include_router(router)

    return ops
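
# Example wiring from the application entry point (illustrative sketch only;
# the actual dependency and variable names used in main.py may differ):
#
#   crawl_config_ops = init_crawl_config_api(
#       dbclient, mdb, current_active_user, user_manager,
#       archive_ops, crawl_manager, profiles,
#   )
#   crawl_config_ops.set_coll_ops(coll_ops)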