use redis based queue instead of url for crawl done webhook
update docker setup to support redis webhook, add consistent CRAWL_ARGS, additional fixes
parent 4ae4005d74
commit c38e0b7bf7
@@ -204,7 +204,11 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
     @router.get("")
     async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
         results = await ops.get_crawl_configs(archive)
-        return {"crawl_configs": [res.serialize() for res in results]}
+        return {
+            "crawl_configs": [
+                res.serialize(exclude={"archive", "runNow"}) for res in results
+            ]
+        }
 
     @router.get("/{cid}")
     async def get_crawl_config(crawl_config: CrawlConfig = Depends(crawls_dep)):
@@ -1,6 +1,7 @@
 """ Crawl API """
 
 import asyncio
+import json
 
 from typing import Optional, List, Dict
 from datetime import datetime
@@ -88,6 +89,7 @@ class CrawlOps:
         self.crawl_manager = crawl_manager
         self.crawl_configs = crawl_configs
         self.archives = archives
+        self.crawls_done_key = "crawls-done"
 
         self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
@@ -96,10 +98,29 @@ class CrawlOps:
 
     async def init_redis(self, redis_url):
         """ init redis async """
-        self.redis = await aioredis.from_url(redis_url)
+        self.redis = await aioredis.from_url(
+            redis_url, encoding="utf-8", decode_responses=True
+        )
+
+        loop = asyncio.get_running_loop()
+        loop.create_task(self.run_crawl_complete_loop())
+
+    async def run_crawl_complete_loop(self):
+        """ Wait for any crawls done from redis queue """
+        while True:
+            try:
+                _, value = await self.redis.blpop(self.crawls_done_key, timeout=0)
+                value = json.loads(value)
+                await self.on_handle_crawl_complete(CrawlCompleteIn(**value))
+
+            # pylint: disable=broad-except
+            except Exception as exc:
+                print(f"Retrying crawls done loop: {exc}")
+                await asyncio.sleep(10)
 
     async def on_handle_crawl_complete(self, msg: CrawlCompleteIn):
         """ Handle completed crawl, add to crawls db collection, also update archive usage """
+        print(msg, flush=True)
         crawl, crawl_file = await self.crawl_manager.process_crawl_complete(msg)
         if not crawl:
             print("Not a valid crawl complete msg!", flush=True)
@@ -205,13 +226,6 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
 
     archive_crawl_dep = archives.archive_crawl_dep
 
-    @app.post("/_crawls/done", tags=["_internal"])
-    async def crawl_done(msg: CrawlCompleteIn):
-        loop = asyncio.get_running_loop()
-        loop.create_task(ops.on_handle_crawl_complete(msg))
-
-        return {"success": True}
-
     @app.get("/archives/{aid}/crawls", tags=["crawls"])
     async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
         return await ops.list_crawls(archive.id)
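The new run_crawl_complete_loop() above blocks on BLPOP against the shared "crawls-done" list and expects each entry to be a JSON-encoded CrawlCompleteIn payload. Below is a minimal producer-side sketch for context only: this commit does not change the crawler itself, and the helper name, payload fields, and use of RPUSH are assumptions rather than the crawler's actual webhook implementation.

```python
# Hypothetical producer sketch (not part of this commit): enqueue a
# crawl-complete message onto the Redis list that run_crawl_complete_loop()
# drains with BLPOP. Payload fields are illustrative; the real crawler's
# message may differ.
import asyncio
import json

import aioredis


async def push_crawl_done(redis_url: str, msg: dict, key: str = "crawls-done"):
    """Append a JSON-encoded message to the shared 'crawls-done' list."""
    redis = await aioredis.from_url(
        redis_url, encoding="utf-8", decode_responses=True
    )
    # RPUSH pairs with the consumer's BLPOP to give FIFO ordering.
    await redis.rpush(key, json.dumps(msg))


if __name__ == "__main__":
    asyncio.run(
        push_crawl_done(
            "redis://redis/0",
            {
                "id": "example-container-id",
                "filename": "example.wacz",
                "size": 12345,
                "hash": "example-hash",
                "completed": True,
            },
        )
    )
```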
@@ -41,9 +41,11 @@ class BaseMongoModel(BaseModel):
         data["id"] = str(data.pop("_id"))
         return cls(**data)
 
-    def serialize(self):
+    def serialize(self, **opts):
         """convert Archive to dict"""
-        return self.dict(exclude_unset=True, exclude_defaults=True, exclude_none=True)
+        return self.dict(
+            exclude_unset=True, exclude_defaults=True, exclude_none=True, **opts
+        )
 
     def to_dict(self, **opts):
         """convert to dict for mongo"""
@@ -19,7 +19,7 @@ from scheduler import run_scheduler
 
 from archives import S3Storage
 
-from crawls import Crawl
+from crawls import Crawl, CrawlFile
 
 
 # ============================================================================
@@ -33,6 +33,11 @@ class DockerManager:
         self.crawler_image = os.environ["CRAWLER_IMAGE"]
         self.default_network = os.environ.get("CRAWLER_NETWORK", "btrix-cloud-net")
 
+        self.redis_url = os.environ["REDIS_URL"]
+        self.crawls_done_key = "crawls-done"
+
+        self.crawl_args = os.environ["CRAWL_ARGS"]
+
         self.archive_ops = archive_ops
         self.crawl_ops = None
 
@@ -44,7 +49,7 @@ class DockerManager:
                 name="default",
                 access_key=os.environ["STORE_ACCESS_KEY"],
                 secret_key=os.environ["STORE_SECRET_KEY"],
-                endpont_url=os.environ["STORE_ENDPOINT_URL"],
+                endpoint_url=os.environ["STORE_ENDPOINT_URL"],
             )
         }
 
@@ -137,7 +142,7 @@ class DockerManager:
 
         # pylint: disable=no-else-return
         if storage.type == "default":
-            return self.storages[storage], storage.path
+            return self.storages[storage.name], storage.path
         else:
             return storage, ""
 
@@ -169,7 +174,7 @@ class DockerManager:
             "btrix.user": userid,
             "btrix.archive": aid,
             "btrix.crawlconfig": cid,
-            "btrix.coll": crawlconfig.config.collection,
+            "btrix.tag.coll": crawlconfig.config.collection,
         }
 
         if crawlconfig.crawlTimeout:
@@ -186,13 +191,15 @@ class DockerManager:
             )
 
         if crawlconfig.runNow:
-            await self._run_crawl_now(
+            return await self._run_crawl_now(
                 storage,
                 storage_path,
                 labels,
                 volume,
             )
 
+        return ""
+
     async def update_crawl_schedule(self, cid, schedule):
         """ Update the schedule for existing crawl config """
 
@@ -272,10 +279,9 @@ class DockerManager:
             print(exc, flush=True)
             return None
 
-        container = await self._run_crawl_now(
+        return await self._run_crawl_now(
             storage, storage_path, labels, volume_name, schedule, manual
         )
-        return container["id"][:12]
 
     async def process_crawl_complete(self, crawlcomplete):
         """Validate that crawl is valid by checking that container exists and label matches
@@ -290,12 +296,15 @@ class DockerManager:
             container,
             "complete" if crawlcomplete.completed else "partial_complete",
             finish_now=True,
-            filename=crawlcomplete.filename,
-            size=crawlcomplete.size,
-            hashstr=crawlcomplete.hash,
         )
 
-        return crawl
+        crawl_file = CrawlFile(
+            filename=crawlcomplete.filename,
+            size=crawlcomplete.size,
+            hash=crawlcomplete.hash,
+        )
+
+        return crawl, crawl_file
 
     async def scale_crawl(self):  # job_name, aid, parallelism=1):
         """ Scale running crawl, currently only supported in k8s"""
@@ -394,7 +403,7 @@ class DockerManager:
             "--config",
             "/tmp/crawlconfig/crawl-config.json",
             "--redisStoreUrl",
-            "redis://redis:6379/0",
+            self.redis_url,
         ]
 
         if self.extra_crawl_params:
@@ -411,7 +420,8 @@ class DockerManager:
             f"STORE_ACCESS_KEY={storage.access_key}",
             f"STORE_SECRET_KEY={storage.secret_key}",
             f"STORE_PATH={storage_path}",
-            "WEBHOOK_URL=http://backend:8000/_crawls/done",
+            f"WEBHOOK_URL={self.redis_url}/{self.crawls_done_key}",
+            f"CRAWL_ARGS={self.crawl_args}",
         ]
 
         labels["btrix.run.schedule"] = schedule
@@ -429,7 +439,8 @@ class DockerManager:
             },
         }
 
-        return await self.client.containers.run(run_config)
+        container = await self.client.containers.run(run_config)
+        return container["id"]
 
     async def _list_running_containers(self, labels):
         results = await self.client.containers.list(
@@ -454,12 +465,15 @@ class DockerManager:
         await container.delete()
 
     # pylint: disable=no-self-use,too-many-arguments
-    def _make_crawl_for_container(
-        self, container, state, finish_now=False, filename=None, size=None, hashstr=None
-    ):
+    def _make_crawl_for_container(self, container, state, finish_now=False):
         """ Make a crawl object from a container data"""
         labels = container["Config"]["Labels"]
 
+        tags = {}
+        for name in labels:
+            if name.startswith("btrix.tag."):
+                tags[name[len("btrix.tag.") :]] = labels.get(name)
+
         return Crawl(
             id=container["Id"],
             state=state,
@@ -472,7 +486,5 @@ class DockerManager:
             finished=datetime.utcnow().replace(microsecond=0, tzinfo=None)
             if finish_now
             else None,
-            filename=filename,
-            size=size,
-            hash=hashstr,
+            tags=tags,
         )
@@ -16,6 +16,7 @@ data:
 
   REDIS_URL: "{{ .Values.redis_url }}"
 
+  REDIS_CRAWLS_DONE_KEY: "crawls-done"
 
 ---
 apiVersion: v1
@@ -26,7 +27,8 @@ metadata:
 
 data:
   CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
-  WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  #WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  WEBHOOK_URL: "{{ .Values.redis_url }}/crawls-done"
   STORE_USER: ""
 
 ---
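One detail in the config above: WEBHOOK_URL is now a redis:// URL with the queue key appended as the final path segment ("{{ .Values.redis_url }}/crawls-done"). The sketch below shows one way a consumer could split such a URL back into a connection URL and list key; the helper is hypothetical and not part of this commit, and how the crawler actually parses WEBHOOK_URL may differ.

```python
# Hypothetical helper (not from this commit): split a redis:// webhook URL of
# the form "redis://redis/0/crawls-done" into the connection URL and the name
# of the list used as the crawls-done queue.
from urllib.parse import urlparse


def split_redis_webhook_url(webhook_url: str):
    """Return (connection_url, list_key) for a redis:// webhook URL."""
    parsed = urlparse(webhook_url)
    base_path, _, key = parsed.path.rpartition("/")
    return f"{parsed.scheme}://{parsed.netloc}{base_path}", key


# Example:
#   split_redis_webhook_url("redis://redis/0/crawls-done")
#   -> ("redis://redis/0", "crawls-done")
```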
@@ -65,7 +65,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1
 
 # browsertrix-crawler args:
-crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037 --headless"
+crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
 
 
 
@@ -14,6 +14,8 @@ STORE_SECRET_KEY=PASSW0RD
 
 MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000
 
+REDIS_URL=redis://redis/0
+
 # enable to send verification emails
 #EMAIL_SMTP_HOST=smtp.gmail.com
 #EMAIL_SMTP_PORT=587
@@ -23,3 +25,5 @@ MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000
 # Browsertrix Crawler image to use
 CRAWLER_IMAGE=webrecorder/browsertrix-crawler
 
+CRAWL_ARGS="--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
+