backend work:
- support {configname}-{username}-@ts-@hostsuffix.wacz as output filename, sanitize username and config name
- support returning 'starting' for crawl status if no ips or 0/0 pages found
- fix updating scale via POST crawlconfig update
- fix duplicate user error on superuser init
parent 4b2f89db91
commit e6467c3374
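
The filename change in a nutshell: a minimal, standalone sketch using the exact regex and template added in this commit. The config and user names below are made-up examples, and @ts / @hostsuffix are left as literal placeholders for the crawler to expand.

import re

# same pattern the commit compiles as self._file_rx
_file_rx = re.compile("\\W+")

def sanitize(string=""):
    """ sanitize string for use in wacz filename """
    return _file_rx.sub("-", string.lower())

# example inputs; real values come from crawlconfig.name and user.name
config_name = "My Crawl Config"
user_name = "Jane Doe"

out_filename = f"{sanitize(config_name)}-{sanitize(user_name)}-@ts-@hostsuffix.wacz"
print(out_filename)  # my-crawl-config-jane-doe-@ts-@hostsuffix.wacz
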
@@ -6,6 +6,7 @@ from typing import List, Union, Optional
 from enum import Enum
 import uuid
 import asyncio
+import re
 from datetime import datetime
 
 import pymongo
@@ -169,6 +170,7 @@ class CrawlConfigOps:
         )
 
         self.coll_ops = None
+        self._file_rx = re.compile("\\W+")
 
         asyncio.create_task(self.init_index())
 
@@ -182,6 +184,10 @@ class CrawlConfigOps:
         """ set collection ops """
         self.coll_ops = coll_ops
 
+    def sanitize(self, string=""):
+        """ sanitize string for use in wacz filename"""
+        return self._file_rx.sub("-", string.lower())
+
     async def add_crawl_config(
         self, config: CrawlConfigIn, archive: Archive, user: User
     ):
@@ -216,8 +222,14 @@ class CrawlConfigOps:
 
         crawlconfig = CrawlConfig.from_dict(data)
 
+        # pylint: disable=line-too-long
+        out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
+
         new_name = await self.crawl_manager.add_crawl_config(
-            crawlconfig=crawlconfig, storage=archive.storage, run_now=config.runNow
+            crawlconfig=crawlconfig,
+            storage=archive.storage,
+            run_now=config.runNow,
+            out_filename=out_filename,
         )
 
         return result, new_name
@@ -234,10 +246,10 @@ class CrawlConfigOps:
             raise HTTPException(status_code=400, detail="no_update_data")
 
         # update schedule in crawl manager first
-        if update.schedule is not None:
+        if update.schedule is not None or update.scale is not None:
             try:
-                await self.crawl_manager.update_crawl_schedule(
-                    str(cid), update.schedule
+                await self.crawl_manager.update_crawl_schedule_or_scale(
+                    str(cid), update.schedule, update.scale
                 )
             except Exception:
                 # pylint: disable=raise-missing-from
@@ -443,14 +455,6 @@ def init_crawl_config_api(
     ):
         return await ops.update_crawl_config(uuid.UUID(cid), update)
 
-    # depcreated: to remove in favor of general patch
-    @router.patch("/{cid}/schedule", dependencies=[Depends(archive_crawl_dep)])
-    async def update_crawl_schedule(
-        update: UpdateCrawlConfig,
-        cid: str,
-    ):
-        return await ops.update_crawl_config(uuid.UUID(cid), update)
-
     @router.post("/{cid}/run")
     async def run_now(
         cid: str,
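
To illustrate the scale fix above: a hypothetical, self-contained sketch of the dispatch gate. UpdateSketch is a stand-in for the real UpdateCrawlConfig model, whose full definition is not part of this diff.

from dataclasses import dataclass
from typing import Optional

# stand-in for UpdateCrawlConfig; field names are taken from this diff
@dataclass
class UpdateSketch:
    schedule: Optional[str] = None
    scale: Optional[int] = None

update = UpdateSketch(scale=2)  # schedule left unset

# before this commit the gate only checked update.schedule, so a scale-only
# update was silently dropped; now either field triggers the manager call:
#   await crawl_manager.update_crawl_schedule_or_scale(str(cid), update.schedule, update.scale)
if update.schedule is not None or update.scale is not None:
    print("dispatching schedule/scale update to the crawl manager")
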
@@ -422,6 +422,8 @@ class CrawlOps:
 
         for crawl, (done, total) in zip(crawl_list, pairwise(results)):
             crawl.stats = {"done": done, "found": total}
+            if total == 0 and done == 0 and crawl.state == "running":
+                crawl.state = "starting"
 
     async def cache_ips(self, crawl: CrawlOut):
         """ cache ips for ws auth check """
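
For illustration only, a hypothetical helper that merges the two independent 'starting' checks this commit adds (0/0 page stats in CrawlOps above, and no pod IPs to watch in K8SManager below) into a single function:

def effective_state(state, done, total, watch_ips):
    # a crawl reported as "running" that has found nothing and has no
    # pod IPs to watch yet is surfaced as "starting" instead
    if state == "running" and done == 0 and total == 0 and not watch_ips:
        return "starting"
    return state

assert effective_state("running", 0, 0, []) == "starting"
assert effective_state("running", 3, 10, ["10.0.0.5"]) == "running"
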
@@ -170,7 +170,8 @@ class DockerManager:
     async def update_archive_storage(self, aid, userid, storage):
         """ No storage kept for docker manager """
 
-    async def add_crawl_config(self, crawlconfig, storage, run_now):
+    # pylint: disable=unused-argument
+    async def add_crawl_config(self, crawlconfig, storage, run_now, out_filename):
         """ Add new crawl config """
         cid = str(crawlconfig.id)
         userid = str(crawlconfig.userid)
@@ -212,7 +213,7 @@ class DockerManager:
 
         return ""
 
-    async def update_crawl_schedule(self, cid, schedule):
+    async def update_crawl_schedule_or_scale(self, cid, schedule=None, scale=None):
         """ Update the schedule for existing crawl config """
 
         if schedule:

@@ -164,7 +164,7 @@ class K8SManager:
             name=archive_storage_name, namespace=self.namespace, body=crawl_secret
         )
 
-    async def add_crawl_config(self, crawlconfig, storage, run_now):
+    async def add_crawl_config(self, crawlconfig, storage, run_now, out_filename):
         """add new crawl as cron job, store crawl config in configmap"""
         cid = str(crawlconfig.id)
         userid = str(crawlconfig.userid)
@@ -209,6 +209,7 @@ class K8SManager:
             storage_path,
             labels,
             annotations,
+            out_filename,
             crawlconfig.crawlTimeout,
             crawlconfig.scale,
         )
@@ -242,8 +243,8 @@ class K8SManager:
 
         return ""
 
-    async def update_crawl_schedule(self, cid, schedule):
-        """ Update the schedule for existing crawl config """
+    async def update_crawl_schedule_or_scale(self, cid, schedule=None, scale=None):
+        """ Update the schedule or scale for existing crawl config """
 
         cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
             namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
@@ -254,16 +255,25 @@
 
         cron_job = cron_jobs.items[0]
 
-        real_schedule = schedule or DEFAULT_NO_SCHEDULE
+        updated = False
 
-        if real_schedule != cron_job.spec.schedule:
-            cron_job.spec.schedule = real_schedule
-            cron_job.spec.suspend = not schedule
+        if schedule is not None:
+            real_schedule = schedule or DEFAULT_NO_SCHEDULE
 
-            cron_job.spec.job_template.metadata.annotations[
-                "btrix.run.schedule"
-            ] = schedule
+            if real_schedule != cron_job.spec.schedule:
+                cron_job.spec.schedule = real_schedule
+                cron_job.spec.suspend = not schedule
+
+                cron_job.spec.job_template.metadata.annotations[
+                    "btrix.run.schedule"
+                ] = schedule
+                updated = True
+
+        if scale is not None:
+            cron_job.spec.job_template.spec.parallelism = scale
+            updated = True
 
+        if updated:
             await self.batch_beta_api.patch_namespaced_cron_job(
                 name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
             )
@@ -397,17 +407,17 @@ class K8SManager:
             if not status:
                 return None
 
-            crawl = self._make_crawl_for_job(job, status, False, CrawlOut)
-
             pods = await self.core_api.list_namespaced_pod(
                 namespace=self.namespace,
                 label_selector=f"job-name={name},btrix.archive={aid}",
            )
 
-            crawl.watchIPs = [
-                pod.status.pod_ip for pod in pods.items if pod.status.pod_ip
-            ]
-            return crawl
+            watch_ips = [pod.status.pod_ip for pod in pods.items if pod.status.pod_ip]
+
+            if status == "running" and not watch_ips:
+                status = "starting"
+
+            return self._make_crawl_for_job(job, status, False, CrawlOut, watch_ips)
 
         # pylint: disable=broad-except
         except Exception:
@@ -517,7 +527,9 @@ class K8SManager:
         return None
 
     # pylint: disable=no-self-use
-    def _make_crawl_for_job(self, job, state, finish_now=False, crawl_cls=Crawl):
+    def _make_crawl_for_job(
+        self, job, state, finish_now=False, crawl_cls=Crawl, watch_ips=None
+    ):
         """ Make a crawl object from a job"""
         return crawl_cls(
             id=job.metadata.name,
@@ -529,10 +541,11 @@ class K8SManager:
             # schedule=job.metadata.annotations.get("btrix.run.schedule", ""),
             manual=job.metadata.annotations.get("btrix.run.manual") == "1",
             started=job.status.start_time.replace(tzinfo=None),
+            watchIPs=watch_ips or [],
+            colls=json.loads(job.metadata.annotations.get("btrix.colls", [])),
             finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None)
             if finish_now
             else None,
-            colls=json.loads(job.metadata.annotations.get("btrix.colls", [])),
         )
 
     async def _delete_job(self, name):
@@ -669,6 +682,7 @@ class K8SManager:
         storage_path,
         labels,
         annotations,
+        out_filename,
         crawl_timeout,
         parallel,
     ):
@@ -731,7 +745,7 @@ class K8SManager:
                             {"name": "STORE_PATH", "value": storage_path},
                             {
                                 "name": "STORE_FILENAME",
-                                "value": "@ts-@hostname.wacz",
+                                "value": out_filename,
                             },
                         ],
                         "resources": resources,
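
Usage sketch for the combined schedule/scale update above, assuming an initialized K8SManager instance is available as crawl_manager. Either argument may be omitted; the CronJob is only patched when the updated flag was actually set.

async def bump_scale(crawl_manager, cid):
    # scale only: sets job_template.spec.parallelism, leaves schedule/suspend alone
    await crawl_manager.update_crawl_schedule_or_scale(cid, scale=3)

async def reschedule(crawl_manager, cid):
    # schedule only: also un-suspends the CronJob since a non-empty schedule is given
    await crawl_manager.update_crawl_schedule_or_scale(cid, schedule="30 2 * * *")
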
@@ -14,6 +14,8 @@ import passlib.pwd
 from fastapi import Request, Response, HTTPException, Depends, WebSocket
 from fastapi.security import OAuth2PasswordBearer
 
+from pymongo.errors import DuplicateKeyError
+
 from fastapi_users import FastAPIUsers, models, BaseUserManager
 from fastapi_users.manager import UserAlreadyExists
 from fastapi_users.authentication import (
@@ -174,7 +176,7 @@ class UserManager(BaseUserManager[UserCreate, UserDB]):
             print(f"Super user {email} created", flush=True)
             print(res, flush=True)
 
-        except UserAlreadyExists:
+        except (DuplicateKeyError, UserAlreadyExists):
             print(f"User {email} already exists", flush=True)
 
     async def on_after_register_custom(
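
Context for the DuplicateKeyError handling above: with a unique index on the user email, a repeated superuser init can surface as a raw DuplicateKeyError from Mongo rather than as fastapi-users' UserAlreadyExists. A minimal sketch of that failure mode (database and collection names are made up):

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient
from pymongo.errors import DuplicateKeyError

async def demo():
    users = AsyncIOMotorClient()["example_db"]["users"]
    await users.create_index("email", unique=True)

    await users.insert_one({"email": "admin@example.com"})
    try:
        # second init attempt: the unique index rejects the duplicate
        await users.insert_one({"email": "admin@example.com"})
    except DuplicateKeyError:
        print("User admin@example.com already exists", flush=True)

asyncio.run(demo())
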