diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py
index ac0ef0b5..0993b657 100644
--- a/backend/crawlconfigs.py
+++ b/backend/crawlconfigs.py
@@ -204,7 +204,11 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
     @router.get("")
     async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
         results = await ops.get_crawl_configs(archive)
-        return {"crawl_configs": [res.serialize() for res in results]}
+        return {
+            "crawl_configs": [
+                res.serialize(exclude={"archive", "runNow"}) for res in results
+            ]
+        }

     @router.get("/{cid}")
     async def get_crawl_config(crawl_config: CrawlConfig = Depends(crawls_dep)):
diff --git a/backend/crawls.py b/backend/crawls.py
index 54076f0d..4178de55 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -1,6 +1,7 @@
 """ Crawl API """

 import asyncio
+import json

 from typing import Optional, List, Dict
 from datetime import datetime
@@ -88,6 +89,7 @@ class CrawlOps:
         self.crawl_manager = crawl_manager
         self.crawl_configs = crawl_configs
         self.archives = archives
+        self.crawls_done_key = "crawls-done"

         self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
@@ -96,10 +98,29 @@

     async def init_redis(self, redis_url):
         """ init redis async """
-        self.redis = await aioredis.from_url(redis_url)
+        self.redis = await aioredis.from_url(
+            redis_url, encoding="utf-8", decode_responses=True
+        )
+
+        loop = asyncio.get_running_loop()
+        loop.create_task(self.run_crawl_complete_loop())
+
+    async def run_crawl_complete_loop(self):
+        """ Wait for any crawls done from redis queue """
+        while True:
+            try:
+                _, value = await self.redis.blpop(self.crawls_done_key, timeout=0)
+                value = json.loads(value)
+                await self.on_handle_crawl_complete(CrawlCompleteIn(**value))
+
+            # pylint: disable=broad-except
+            except Exception as exc:
+                print(f"Retrying crawls done loop: {exc}")
+                await asyncio.sleep(10)

     async def on_handle_crawl_complete(self, msg: CrawlCompleteIn):
         """ Handle completed crawl, add to crawls db collection, also update archive usage """
+        print(msg, flush=True)
         crawl, crawl_file = await self.crawl_manager.process_crawl_complete(msg)
         if not crawl:
             print("Not a valid crawl complete msg!", flush=True)
@@ -205,13 +226,6 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv

     archive_crawl_dep = archives.archive_crawl_dep

-    @app.post("/_crawls/done", tags=["_internal"])
-    async def crawl_done(msg: CrawlCompleteIn):
-        loop = asyncio.get_running_loop()
-        loop.create_task(ops.on_handle_crawl_complete(msg))
-
-        return {"success": True}
-
     @app.get("/archives/{aid}/crawls", tags=["crawls"])
     async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
         return await ops.list_crawls(archive.id)
diff --git a/backend/db.py b/backend/db.py
index 90a8670c..b0216a10 100644
--- a/backend/db.py
+++ b/backend/db.py
@@ -41,9 +41,11 @@ class BaseMongoModel(BaseModel):
         data["id"] = str(data.pop("_id"))
         return cls(**data)

-    def serialize(self):
+    def serialize(self, **opts):
         """convert Archive to dict"""
-        return self.dict(exclude_unset=True, exclude_defaults=True, exclude_none=True)
+        return self.dict(
+            exclude_unset=True, exclude_defaults=True, exclude_none=True, **opts
+        )

     def to_dict(self, **opts):
         """convert to dict for mongo"""
diff --git a/backend/dockerman.py b/backend/dockerman.py
index d45f997f..023ed7e6 100644
--- a/backend/dockerman.py
+++ b/backend/dockerman.py
@@ -19,7 +19,7 @@
 from scheduler import run_scheduler

 from archives import S3Storage
-from crawls import Crawl
+from crawls import Crawl, CrawlFile


 # ============================================================================
@@ -33,6 +33,11 @@ class DockerManager:
         self.crawler_image = os.environ["CRAWLER_IMAGE"]
         self.default_network = os.environ.get("CRAWLER_NETWORK", "btrix-cloud-net")

+        self.redis_url = os.environ["REDIS_URL"]
+        self.crawls_done_key = "crawls-done"
+
+        self.crawl_args = os.environ["CRAWL_ARGS"]
+
         self.archive_ops = archive_ops
         self.crawl_ops = None

@@ -44,7 +49,7 @@
                 name="default",
                 access_key=os.environ["STORE_ACCESS_KEY"],
                 secret_key=os.environ["STORE_SECRET_KEY"],
-                endpont_url=os.environ["STORE_ENDPOINT_URL"],
+                endpoint_url=os.environ["STORE_ENDPOINT_URL"],
             )
         }

@@ -137,7 +142,7 @@

         # pylint: disable=no-else-return
         if storage.type == "default":
-            return self.storages[storage], storage.path
+            return self.storages[storage.name], storage.path
         else:
             return storage, ""

@@ -169,7 +174,7 @@
             "btrix.user": userid,
             "btrix.archive": aid,
             "btrix.crawlconfig": cid,
-            "btrix.coll": crawlconfig.config.collection,
+            "btrix.tag.coll": crawlconfig.config.collection,
         }

         if crawlconfig.crawlTimeout:
@@ -186,13 +191,15 @@
             )

         if crawlconfig.runNow:
-            await self._run_crawl_now(
+            return await self._run_crawl_now(
                 storage,
                 storage_path,
                 labels,
                 volume,
             )

+        return ""
+
     async def update_crawl_schedule(self, cid, schedule):
         """ Update the schedule for existing crawl config """

@@ -272,10 +279,9 @@
             print(exc, flush=True)
             return None

-        container = await self._run_crawl_now(
+        return await self._run_crawl_now(
             storage, storage_path, labels, volume_name, schedule, manual
         )
-        return container["id"][:12]

     async def process_crawl_complete(self, crawlcomplete):
         """Validate that crawl is valid by checking that container exists and label matches
@@ -290,12 +296,15 @@
             container,
             "complete" if crawlcomplete.completed else "partial_complete",
             finish_now=True,
-            filename=crawlcomplete.filename,
-            size=crawlcomplete.size,
-            hashstr=crawlcomplete.hash,
         )

-        return crawl
+        crawl_file = CrawlFile(
+            filename=crawlcomplete.filename,
+            size=crawlcomplete.size,
+            hash=crawlcomplete.hash,
+        )
+
+        return crawl, crawl_file

     async def scale_crawl(self):  # job_name, aid, parallelism=1):
         """ Scale running crawl, currently only supported in k8s"""
@@ -394,7 +403,7 @@
             "--config",
             "/tmp/crawlconfig/crawl-config.json",
             "--redisStoreUrl",
-            "redis://redis:6379/0",
+            self.redis_url,
         ]

         if self.extra_crawl_params:
@@ -411,7 +420,8 @@
             f"STORE_ACCESS_KEY={storage.access_key}",
             f"STORE_SECRET_KEY={storage.secret_key}",
             f"STORE_PATH={storage_path}",
-            "WEBHOOK_URL=http://backend:8000/_crawls/done",
+            f"WEBHOOK_URL={self.redis_url}/{self.crawls_done_key}",
+            f"CRAWL_ARGS={self.crawl_args}",
         ]

         labels["btrix.run.schedule"] = schedule
@@ -429,7 +439,8 @@
             },
         }

-        return await self.client.containers.run(run_config)
+        container = await self.client.containers.run(run_config)
+        return container["id"]

     async def _list_running_containers(self, labels):
         results = await self.client.containers.list(
@@ -454,12 +465,15 @@
             await container.delete()

     # pylint: disable=no-self-use,too-many-arguments
-    def _make_crawl_for_container(
-        self, container, state, finish_now=False, filename=None, size=None, hashstr=None
-    ):
+    def _make_crawl_for_container(self, container, state, finish_now=False):
         """ Make a crawl object from a container data"""
         labels = container["Config"]["Labels"]
+
+        tags = {}
+        for name in labels:
+            if name.startswith("btrix.tag."):
+                tags[name[len("btrix.tag.") :]] = labels.get(name)
+
         return Crawl(
             id=container["Id"],
             state=state,
@@ -472,7 +486,5 @@
             finished=datetime.utcnow().replace(microsecond=0, tzinfo=None)
             if finish_now
             else None,
-            filename=filename,
-            size=size,
-            hash=hashstr,
+            tags=tags,
         )
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index 8f623ed5..1c4d8aa9 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -16,6 +16,7 @@ data:

   REDIS_URL: "{{ .Values.redis_url }}"

+  REDIS_CRAWLS_DONE_KEY: "crawls-done"

 ---
 apiVersion: v1
@@ -26,7 +27,8 @@ metadata:
 data:
   CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
-  WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  #WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  WEBHOOK_URL: "{{ .Values.redis_url }}/crawls-done"

   STORE_USER: ""

 ---
diff --git a/chart/values.yaml b/chart/values.yaml
index 50716c2f..f44fc463 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -65,7 +65,7 @@
 crawler_namespace: "crawlers"

 crawl_retries: 1

 # browsertrix-crawler args:
-crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037 --headless"
+crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"

diff --git a/configs/config.sample.env b/configs/config.sample.env
index f15f0f55..c29b45ec 100644
--- a/configs/config.sample.env
+++ b/configs/config.sample.env
@@ -14,6 +14,8 @@ STORE_SECRET_KEY=PASSW0RD

 MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000

+REDIS_URL=redis://redis/0
+
 # enable to send verification emails
 #EMAIL_SMTP_HOST=smtp.gmail.com
 #EMAIL_SMTP_PORT=587
@@ -23,3 +25,5 @@ MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000

 # Browsertrix Crawler image to use
 CRAWLER_IMAGE=webrecorder/browsertrix-crawler
+CRAWL_ARGS="--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
+