Misc backend fixes (#133)

* misc backend fixes:
- fix uuid typing: roles list, user invites
- crawlconfig: fix created date setting, fix userName lookup
- docker: fix timezone for scheduler, fix running check
- remove prints
- fix get crawl stuck in 'stopping': check the finished list first, then the running list, since the k8s job may not have been deleted yet (see the sketch below)
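A minimal sketch of the lookup-order change in the last item, with placeholder parameters (crawls_coll stands in for the finished-crawls Mongo collection, crawl_manager for the k8s/docker job wrapper); the real change is in the crawl hunks further below:

from fastapi import HTTPException


async def get_crawl(crawls_coll, crawl_manager, crawlid: str, archive):
    # Check the finished-crawls collection first: a completed crawl's stored record
    # is authoritative even if its k8s job object has not been deleted yet.
    res = await crawls_coll.find_one({"_id": crawlid, "aid": archive.id})
    if res:
        return res

    # Only then consult the running-crawl list, so a finished crawl whose job is
    # still lingering is no longer reported as stuck in 'stopping'.
    crawl = await crawl_manager.get_running_crawl(crawlid, archive.id_str)
    if not crawl:
        raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
    return crawl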
Ilya Kreymer committed 2022-01-31 19:41:04 -08:00 (via GitHub)
commit 2b2e6fedfa, parent d7f58c964c
7 changed files with 34 additions and 22 deletions

Changed file 1 of 7

@@ -96,7 +96,7 @@ class Archive(BaseMongoModel):
         user_list = await user_manager.get_user_names_by_ids(keys)

         for archive_user in user_list:
-            id_ = archive_user["id"]
+            id_ = str(archive_user["id"])
             role = result["users"].get(id_)
             if not role:
                 continue
@@ -158,10 +158,10 @@ class ArchiveOps:
         return [Archive.from_dict(res) for res in results]

     async def get_archive_for_user_by_id(
-        self, aid: str, user: User, role: UserRole = UserRole.VIEWER
+        self, aid: uuid.UUID, user: User, role: UserRole = UserRole.VIEWER
     ):
         """Get an archive for user by unique id"""
-        query = {f"users.{user.id}": {"$gte": role.value}, "_id": uuid.UUID(aid)}
+        query = {f"users.{user.id}": {"$gte": role.value}, "_id": aid}
         res = await self.archives.find_one(query)
         return Archive.from_dict(res)
@@ -221,7 +221,7 @@ def init_archives_api(app, mdb, user_manager, invites, user_dep: User):
     ops = ArchiveOps(mdb, invites)

     async def archive_dep(aid: str, user: User = Depends(user_dep)):
-        archive = await ops.get_archive_for_user_by_id(aid, user)
+        archive = await ops.get_archive_for_user_by_id(uuid.UUID(aid), user)
         if not archive:
             raise HTTPException(status_code=404, detail=f"Archive '{aid}' not found")
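The archive hunks above keep aid as a string at the HTTP layer and convert it to uuid.UUID once, in the dependency. A standalone sketch of that boundary pattern (route, helper name, and the explicit 400 guard are illustrative additions, not part of this commit):

import uuid

from fastapi import Depends, FastAPI, HTTPException

app = FastAPI()


def archive_id(aid: str) -> uuid.UUID:
    # Convert the string path parameter to a typed UUID once, at the edge,
    # so downstream Mongo queries can use a real uuid.UUID in "_id".
    try:
        return uuid.UUID(aid)
    except ValueError:
        raise HTTPException(status_code=400, detail=f"Invalid archive id: {aid}")


@app.get("/archives/{aid}")
async def get_archive(aid: uuid.UUID = Depends(archive_id)):
    # Everything past this point always sees uuid.UUID, never a raw string.
    return {"id": str(aid)}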

Changed file 2 of 7

@@ -143,8 +143,9 @@ class UpdateSchedule(BaseModel):
 class CrawlOps:
     """Crawl Config Operations"""

-    def __init__(self, mdb, archive_ops, crawl_manager):
+    def __init__(self, mdb, user_manager, archive_ops, crawl_manager):
         self.crawl_configs = mdb["crawl_configs"]
+        self.user_manager = user_manager
         self.archive_ops = archive_ops
         self.crawl_manager = crawl_manager
@@ -168,6 +169,7 @@ class CrawlOps:
         data["aid"] = archive.id
         data["userid"] = user.id
         data["_id"] = uuid.uuid4()
+        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)

         if config.colls:
             data["colls"] = await self.coll_ops.find_collections(
@@ -176,8 +178,6 @@
         result = await self.crawl_configs.insert_one(data)

-        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
-
         crawlconfig = CrawlConfig.from_dict(data)

         new_name = await self.crawl_manager.add_crawl_config(
@@ -190,7 +190,7 @@
         """ Update schedule for existing crawl config"""

         if not await self.crawl_configs.find_one_and_update(
-            {"_id": cid}, {"$set": {"schedule": update.schedule}}
+            {"_id": uuid.UUID(cid)}, {"$set": {"schedule": update.schedule}}
         ):
             return False
@@ -248,6 +248,11 @@ class CrawlOps:
         if len(crawls) == 1:
             out.currCrawlId = crawls[0].id

+        user = await self.user_manager.get(crawlconfig.userid)
+        # pylint: disable=invalid-name
+        if user:
+            out.userName = user.name
+
         return out

     async def get_crawl_config(self, cid: uuid.UUID, archive: Archive):
@@ -270,9 +275,9 @@
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals
-def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
+def init_crawl_config_api(mdb, user_dep, user_manager, archive_ops, crawl_manager):
     """Init /crawlconfigs api routes"""

-    ops = CrawlOps(mdb, archive_ops, crawl_manager)
+    ops = CrawlOps(mdb, user_manager, archive_ops, crawl_manager)

     router = ops.router
@@ -342,7 +347,7 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
         crawl_id = None
         try:
-            crawl_id = await crawl_manager.run_crawl_config(cid, str(user.id))
+            crawl_id = await crawl_manager.run_crawl_config(cid, userid=str(user.id))
         except Exception as e:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {e}")
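One of the hunks above moves the created assignment ahead of insert_one(). The reason, sketched with a generic async collection (assuming the motor driver; the connection string and names are placeholders): only fields present in the dict at insert time are written to the database, so setting created after the insert only changed the in-memory copy.

import uuid
from datetime import datetime

from motor.motor_asyncio import AsyncIOMotorClient  # assumed async Mongo driver


async def add_crawl_config(data: dict):
    mdb = AsyncIOMotorClient("mongodb://localhost:27017")["btrix"]  # placeholder URL / db name
    data["_id"] = uuid.uuid4()
    # Must be set before insert_one() so the timestamp is persisted with the document;
    # the old ordering set it afterwards, leaving stored configs without a created field.
    data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
    await mdb["crawl_configs"].insert_one(data)
    return data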

Changed file 3 of 7

@@ -313,17 +313,15 @@ class CrawlOps:
     async def get_crawl(self, crawlid: str, archive: Archive):
         """ Get data for single crawl """
-        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id_str)
-        if crawl:
-            await self.get_redis_stats([crawl])
+        res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
+        if not res:
+            crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id_str)
+            if crawl:
+                await self.get_redis_stats([crawl])

         else:
-            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
-            if not res:
-                raise HTTPException(
-                    status_code=404, detail=f"Crawl not found: {crawlid}"
-                )
-
             files = [CrawlFile(**data) for data in res["files"]]
             del res["files"]
@@ -331,6 +329,11 @@ class CrawlOps:
             res["resources"] = await self._resolve_signed_urls(files, archive)

             crawl = CrawlOut.from_dict(res)

+        if not crawl:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl not found: {crawlid}"
+            )
+
         return await self._resolve_crawl_refs(crawl, archive)

     async def _resolve_crawl_refs(

Changed file 4 of 7

@@ -359,6 +359,9 @@ class DockerManager:
         try:
             container = await self.client.containers.get(crawl_id)

+            if container["State"]["Status"] != "running":
+                return None
+
             if aid and container["Config"]["Labels"]["btrix.archive"] != aid:
                 return None
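The docker hunk above adds a running-state guard before a container is treated as an active crawl. The same check, shown standalone with the synchronous docker-py client purely for illustration (the backend's own async client and method names differ):

import docker
from docker.errors import NotFound


def get_running_crawl(client: docker.DockerClient, crawl_id: str, aid: str):
    """Return the crawl container only if it is actually running and belongs to the archive."""
    try:
        container = client.containers.get(crawl_id)
    except NotFound:
        return None

    # A stopped or exited container still resolves by name until it is removed,
    # so an explicit status check is needed before reporting it as a running crawl.
    if container.status != "running":
        return None

    if aid and container.labels.get("btrix.archive") != aid:
        return None

    return container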

Changed file 5 of 7

@@ -66,6 +66,7 @@ def main():
     crawl_config_ops = init_crawl_config_api(
         mdb,
         current_active_user,
+        user_manager,
         archive_ops,
         crawl_manager,
     )

Changed file 6 of 7

@@ -6,6 +6,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
 from apscheduler.jobstores.mongodb import MongoDBJobStore
 from pymongo import MongoClient
+from pytz import utc

 from db import DATABASE_URL
@@ -28,7 +29,7 @@ def run_scheduler(event_q, trigger_q):
     print("Initializing Scheduler...", flush=True)

-    scheduler = BackgroundScheduler()
+    scheduler = BackgroundScheduler(timezone=utc)

     mongoclient = MongoClient(DATABASE_URL)
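The scheduler hunks above pin APScheduler to UTC. A minimal standalone example of the same configuration (the job body and crontab string are placeholders):

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from pytz import utc


def queue_scheduled_crawl():
    print("scheduled crawl triggered")  # placeholder job body


# Constructing the scheduler with an explicit UTC timezone makes cron triggers
# fire on the intended schedule regardless of the container's local timezone.
scheduler = BackgroundScheduler(timezone=utc)
scheduler.add_job(queue_scheduled_crawl, CronTrigger.from_crontab("0 3 * * *", timezone=utc))
scheduler.start()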

Changed file 7 of 7

@@ -107,5 +107,4 @@ async def get_presigned_url(archive, crawlfile, crawl_manager, duration=3600):
             "get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=duration
         )

-    print("presigned_url", presigned_url)
     return presigned_url