diff --git a/backend/archives.py b/backend/archives.py
index 79e18726..12cc507b 100644
--- a/backend/archives.py
+++ b/backend/archives.py
@@ -96,7 +96,7 @@ class Archive(BaseMongoModel):
         user_list = await user_manager.get_user_names_by_ids(keys)
 
         for archive_user in user_list:
-            id_ = archive_user["id"]
+            id_ = str(archive_user["id"])
             role = result["users"].get(id_)
             if not role:
                 continue
@@ -158,10 +158,10 @@ class ArchiveOps:
         return [Archive.from_dict(res) for res in results]
 
     async def get_archive_for_user_by_id(
-        self, aid: str, user: User, role: UserRole = UserRole.VIEWER
+        self, aid: uuid.UUID, user: User, role: UserRole = UserRole.VIEWER
     ):
         """Get an archive for user by unique id"""
-        query = {f"users.{user.id}": {"$gte": role.value}, "_id": uuid.UUID(aid)}
+        query = {f"users.{user.id}": {"$gte": role.value}, "_id": aid}
         res = await self.archives.find_one(query)
         return Archive.from_dict(res)
 
@@ -221,7 +221,7 @@ def init_archives_api(app, mdb, user_manager, invites, user_dep: User):
     ops = ArchiveOps(mdb, invites)
 
     async def archive_dep(aid: str, user: User = Depends(user_dep)):
-        archive = await ops.get_archive_for_user_by_id(aid, user)
+        archive = await ops.get_archive_for_user_by_id(uuid.UUID(aid), user)
         if not archive:
             raise HTTPException(status_code=404, detail=f"Archive '{aid}' not found")
 
diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py
index cee73abc..e4624963 100644
--- a/backend/crawlconfigs.py
+++ b/backend/crawlconfigs.py
@@ -143,8 +143,9 @@ class UpdateSchedule(BaseModel):
 class CrawlOps:
     """Crawl Config Operations"""
 
-    def __init__(self, mdb, archive_ops, crawl_manager):
+    def __init__(self, mdb, user_manager, archive_ops, crawl_manager):
         self.crawl_configs = mdb["crawl_configs"]
+        self.user_manager = user_manager
         self.archive_ops = archive_ops
         self.crawl_manager = crawl_manager
 
@@ -168,6 +169,7 @@ class CrawlOps:
         data["aid"] = archive.id
         data["userid"] = user.id
         data["_id"] = uuid.uuid4()
+        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
 
         if config.colls:
             data["colls"] = await self.coll_ops.find_collections(
@@ -176,8 +178,6 @@
 
         result = await self.crawl_configs.insert_one(data)
 
-        data["created"] = datetime.utcnow().replace(microsecond=0, tzinfo=None)
-
         crawlconfig = CrawlConfig.from_dict(data)
 
         new_name = await self.crawl_manager.add_crawl_config(
@@ -190,7 +190,7 @@
         """ Update schedule for existing crawl config"""
 
         if not await self.crawl_configs.find_one_and_update(
-            {"_id": cid}, {"$set": {"schedule": update.schedule}}
+            {"_id": uuid.UUID(cid)}, {"$set": {"schedule": update.schedule}}
         ):
             return False
 
@@ -248,6 +248,11 @@ class CrawlOps:
         if len(crawls) == 1:
             out.currCrawlId = crawls[0].id
 
+        user = await self.user_manager.get(crawlconfig.userid)
+        # pylint: disable=invalid-name
+        if user:
+            out.userName = user.name
+
         return out
 
     async def get_crawl_config(self, cid: uuid.UUID, archive: Archive):
@@ -270,9 +275,9 @@
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals
-def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
+def init_crawl_config_api(mdb, user_dep, user_manager, archive_ops, crawl_manager):
     """Init /crawlconfigs api routes"""
-    ops = CrawlOps(mdb, archive_ops, crawl_manager)
+    ops = CrawlOps(mdb, user_manager, archive_ops, crawl_manager)
 
     router = ops.router
 
@@ -342,7 +347,7 @@
 
         crawl_id = None
         try:
-            crawl_id = await crawl_manager.run_crawl_config(cid, str(user.id))
+            crawl_id = await crawl_manager.run_crawl_config(cid, userid=str(user.id))
         except Exception as e:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {e}")
diff --git a/backend/crawls.py b/backend/crawls.py
index 66e82535..ed186bb2 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -313,17 +313,15 @@ class CrawlOps:
 
     async def get_crawl(self, crawlid: str, archive: Archive):
         """ Get data for single crawl """
-        crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id_str)
-        if crawl:
-            await self.get_redis_stats([crawl])
+
+        res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
+
+        if not res:
+            crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id_str)
+            if crawl:
+                await self.get_redis_stats([crawl])
 
         else:
-            res = await self.crawls.find_one({"_id": crawlid, "aid": archive.id})
-            if not res:
-                raise HTTPException(
-                    status_code=404, detail=f"Crawl not found: {crawlid}"
-                )
-
             files = [CrawlFile(**data) for data in res["files"]]
 
             del res["files"]
@@ -331,6 +329,11 @@
             res["resources"] = await self._resolve_signed_urls(files, archive)
             crawl = CrawlOut.from_dict(res)
 
+        if not crawl:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl not found: {crawlid}"
+            )
+
         return await self._resolve_crawl_refs(crawl, archive)
 
     async def _resolve_crawl_refs(
diff --git a/backend/dockerman.py b/backend/dockerman.py
index 71f65583..4e17f6cc 100644
--- a/backend/dockerman.py
+++ b/backend/dockerman.py
@@ -359,6 +359,9 @@ class DockerManager:
         try:
             container = await self.client.containers.get(crawl_id)
 
+            if container["State"]["Status"] != "running":
+                return None
+
             if aid and container["Config"]["Labels"]["btrix.archive"] != aid:
                 return None
 
diff --git a/backend/main.py b/backend/main.py
index 2c4e4a49..5e68b634 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -66,6 +66,7 @@ def main():
     crawl_config_ops = init_crawl_config_api(
         mdb,
         current_active_user,
+        user_manager,
         archive_ops,
         crawl_manager,
     )
diff --git a/backend/scheduler.py b/backend/scheduler.py
index 773dddb6..6b777563 100644
--- a/backend/scheduler.py
+++ b/backend/scheduler.py
@@ -6,6 +6,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.cron import CronTrigger
 from apscheduler.jobstores.mongodb import MongoDBJobStore
 from pymongo import MongoClient
+from pytz import utc
 
 from db import DATABASE_URL
 
@@ -28,7 +29,7 @@
 
     print("Initializing Scheduler...", flush=True)
 
-    scheduler = BackgroundScheduler()
+    scheduler = BackgroundScheduler(timezone=utc)
 
     mongoclient = MongoClient(DATABASE_URL)
 
diff --git a/backend/storages.py b/backend/storages.py
index e62a0c21..1fa1d181 100644
--- a/backend/storages.py
+++ b/backend/storages.py
@@ -107,5 +107,4 @@ async def get_presigned_url(archive, crawlfile, crawl_manager, duration=3600):
             "get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=duration
         )
 
-    print("presigned_url", presigned_url)
     return presigned_url