use redis-based queue instead of URL for crawl-done webhook
update docker setup to support redis webhook, add consistent CRAWL_ARGS, additional fixes
parent 4ae4005d74
commit c38e0b7bf7
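In short: instead of the crawler POSTing to an HTTP webhook, its WEBHOOK_URL now points at a Redis list (`<REDIS_URL>/crawls-done`), and the backend consumes that list with a blocking BLPOP (see the new run_crawl_complete_loop in the diff below). Here is a minimal sketch of the round trip, matching the consumer side added in this commit; the producer half is an assumption, since the crawler's handling of a redis:// webhook URL is not part of this diff, and the sample message fields are just the ones this commit reads (filename, size, hash, completed):

import asyncio
import json

import aioredis  # same client and from_url(...) call the backend uses in this commit

REDIS_URL = "redis://redis/0"      # matches REDIS_URL in the docker .env below
CRAWLS_DONE_KEY = "crawls-done"    # matches crawls_done_key / REDIS_CRAWLS_DONE_KEY


async def fake_crawler_webhook(redis):
    """Assumed producer: push one JSON message per finished crawl onto the list."""
    msg = {
        "filename": "example.wacz",   # illustrative values only
        "size": 1024,
        "hash": "sha256:...",
        "completed": True,
    }
    # Whether the real crawler uses RPUSH or LPUSH is not shown in this commit.
    await redis.rpush(CRAWLS_DONE_KEY, json.dumps(msg))


async def consume_one(redis):
    """Mirrors CrawlOps.run_crawl_complete_loop(): block until a message arrives."""
    _, value = await redis.blpop(CRAWLS_DONE_KEY, timeout=0)
    return json.loads(value)


async def main():
    redis = await aioredis.from_url(
        REDIS_URL, encoding="utf-8", decode_responses=True
    )
    await fake_crawler_webhook(redis)
    print(await consume_one(redis))


asyncio.run(main())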
@@ -204,7 +204,11 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
     @router.get("")
     async def get_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
         results = await ops.get_crawl_configs(archive)
-        return {"crawl_configs": [res.serialize() for res in results]}
+        return {
+            "crawl_configs": [
+                res.serialize(exclude={"archive", "runNow"}) for res in results
+            ]
+        }
 
     @router.get("/{cid}")
     async def get_crawl_config(crawl_config: CrawlConfig = Depends(crawls_dep)):

@@ -1,6 +1,7 @@
 """ Crawl API """
 
 import asyncio
+import json
 
 from typing import Optional, List, Dict
 from datetime import datetime
@@ -88,6 +89,7 @@ class CrawlOps:
         self.crawl_manager = crawl_manager
         self.crawl_configs = crawl_configs
         self.archives = archives
+        self.crawls_done_key = "crawls-done"
 
         self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
@@ -96,10 +98,29 @@ class CrawlOps:
 
     async def init_redis(self, redis_url):
         """ init redis async """
-        self.redis = await aioredis.from_url(redis_url)
+        self.redis = await aioredis.from_url(
+            redis_url, encoding="utf-8", decode_responses=True
+        )
+
+        loop = asyncio.get_running_loop()
+        loop.create_task(self.run_crawl_complete_loop())
+
+    async def run_crawl_complete_loop(self):
+        """ Wait for any crawls done from redis queue """
+        while True:
+            try:
+                _, value = await self.redis.blpop(self.crawls_done_key, timeout=0)
+                value = json.loads(value)
+                await self.on_handle_crawl_complete(CrawlCompleteIn(**value))
+
+            # pylint: disable=broad-except
+            except Exception as exc:
+                print(f"Retrying crawls done loop: {exc}")
+                await asyncio.sleep(10)
 
     async def on_handle_crawl_complete(self, msg: CrawlCompleteIn):
         """ Handle completed crawl, add to crawls db collection, also update archive usage """
         print(msg, flush=True)
         crawl, crawl_file = await self.crawl_manager.process_crawl_complete(msg)
         if not crawl:
             print("Not a valid crawl complete msg!", flush=True)
@@ -205,13 +226,6 @@ def init_crawls_api(app, mdb, redis_url, crawl_manager, crawl_config_ops, archiv
 
     archive_crawl_dep = archives.archive_crawl_dep
 
-    @app.post("/_crawls/done", tags=["_internal"])
-    async def crawl_done(msg: CrawlCompleteIn):
-        loop = asyncio.get_running_loop()
-        loop.create_task(ops.on_handle_crawl_complete(msg))
-
-        return {"success": True}
-
     @app.get("/archives/{aid}/crawls", tags=["crawls"])
     async def list_crawls(archive: Archive = Depends(archive_crawl_dep)):
         return await ops.list_crawls(archive.id)

@@ -41,9 +41,11 @@ class BaseMongoModel(BaseModel):
         data["id"] = str(data.pop("_id"))
         return cls(**data)
 
-    def serialize(self):
+    def serialize(self, **opts):
         """convert Archive to dict"""
-        return self.dict(exclude_unset=True, exclude_defaults=True, exclude_none=True)
+        return self.dict(
+            exclude_unset=True, exclude_defaults=True, exclude_none=True, **opts
+        )
 
     def to_dict(self, **opts):
         """convert to dict for mongo"""

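The `**opts` pass-through is what the crawl config listing in the first hunk relies on; a one-line usage sketch (crawl_config here stands for any BaseMongoModel subclass instance, e.g. a CrawlConfig):

data = crawl_config.serialize(exclude={"archive", "runNow"})
# "exclude" is forwarded straight into pydantic's .dict(), so those two fields
# are dropped from this response only, not from the stored document.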
@@ -19,7 +19,7 @@ from scheduler import run_scheduler
 
 from archives import S3Storage
 
-from crawls import Crawl
+from crawls import Crawl, CrawlFile
 
 
 # ============================================================================
@@ -33,6 +33,11 @@ class DockerManager:
         self.crawler_image = os.environ["CRAWLER_IMAGE"]
         self.default_network = os.environ.get("CRAWLER_NETWORK", "btrix-cloud-net")
 
+        self.redis_url = os.environ["REDIS_URL"]
+        self.crawls_done_key = "crawls-done"
+
+        self.crawl_args = os.environ["CRAWL_ARGS"]
+
         self.archive_ops = archive_ops
         self.crawl_ops = None
 
@@ -44,7 +49,7 @@ class DockerManager:
                 name="default",
                 access_key=os.environ["STORE_ACCESS_KEY"],
                 secret_key=os.environ["STORE_SECRET_KEY"],
-                endpont_url=os.environ["STORE_ENDPOINT_URL"],
+                endpoint_url=os.environ["STORE_ENDPOINT_URL"],
             )
         }
 
@@ -137,7 +142,7 @@ class DockerManager:
 
         # pylint: disable=no-else-return
         if storage.type == "default":
-            return self.storages[storage], storage.path
+            return self.storages[storage.name], storage.path
         else:
            return storage, ""
 
@@ -169,7 +174,7 @@ class DockerManager:
            "btrix.user": userid,
            "btrix.archive": aid,
            "btrix.crawlconfig": cid,
-           "btrix.coll": crawlconfig.config.collection,
+           "btrix.tag.coll": crawlconfig.config.collection,
        }
 
        if crawlconfig.crawlTimeout:
@@ -186,13 +191,15 @@ class DockerManager:
            )
 
        if crawlconfig.runNow:
-           await self._run_crawl_now(
+           return await self._run_crawl_now(
                storage,
                storage_path,
                labels,
                volume,
            )
 
+       return ""
+
    async def update_crawl_schedule(self, cid, schedule):
        """ Update the schedule for existing crawl config """
 
@@ -272,10 +279,9 @@ class DockerManager:
            print(exc, flush=True)
            return None
 
-       container = await self._run_crawl_now(
+       return await self._run_crawl_now(
            storage, storage_path, labels, volume_name, schedule, manual
        )
-       return container["id"][:12]
 
    async def process_crawl_complete(self, crawlcomplete):
        """Validate that crawl is valid by checking that container exists and label matches
@@ -290,12 +296,15 @@ class DockerManager:
            container,
            "complete" if crawlcomplete.completed else "partial_complete",
            finish_now=True,
-           filename=crawlcomplete.filename,
-           size=crawlcomplete.size,
-           hashstr=crawlcomplete.hash,
        )
 
-       return crawl
+       crawl_file = CrawlFile(
+           filename=crawlcomplete.filename,
+           size=crawlcomplete.size,
+           hash=crawlcomplete.hash,
+       )
+
+       return crawl, crawl_file
 
    async def scale_crawl(self):  # job_name, aid, parallelism=1):
        """ Scale running crawl, currently only supported in k8s"""
@@ -394,7 +403,7 @@ class DockerManager:
            "--config",
            "/tmp/crawlconfig/crawl-config.json",
            "--redisStoreUrl",
-           "redis://redis:6379/0",
+           self.redis_url,
        ]
 
        if self.extra_crawl_params:
@@ -411,7 +420,8 @@ class DockerManager:
            f"STORE_ACCESS_KEY={storage.access_key}",
            f"STORE_SECRET_KEY={storage.secret_key}",
            f"STORE_PATH={storage_path}",
-           "WEBHOOK_URL=http://backend:8000/_crawls/done",
+           f"WEBHOOK_URL={self.redis_url}/{self.crawls_done_key}",
+           f"CRAWL_ARGS={self.crawl_args}",
        ]
 
        labels["btrix.run.schedule"] = schedule
@@ -429,7 +439,8 @@ class DockerManager:
            },
        }
 
-       return await self.client.containers.run(run_config)
+       container = await self.client.containers.run(run_config)
+       return container["id"]
 
    async def _list_running_containers(self, labels):
        results = await self.client.containers.list(
@@ -454,12 +465,15 @@ class DockerManager:
        await container.delete()
 
    # pylint: disable=no-self-use,too-many-arguments
-   def _make_crawl_for_container(
-       self, container, state, finish_now=False, filename=None, size=None, hashstr=None
-   ):
+   def _make_crawl_for_container(self, container, state, finish_now=False):
        """ Make a crawl object from a container data"""
        labels = container["Config"]["Labels"]
 
+       tags = {}
+       for name in labels:
+           if name.startswith("btrix.tag."):
+               tags[name[len("btrix.tag.") :]] = labels.get(name)
+
        return Crawl(
            id=container["Id"],
            state=state,
@@ -472,7 +486,5 @@ class DockerManager:
            finished=datetime.utcnow().replace(microsecond=0, tzinfo=None)
            if finish_now
            else None,
-           filename=filename,
-           size=size,
-           hash=hashstr,
+           tags=tags,
        )

@@ -16,6 +16,7 @@ data:
 
   REDIS_URL: "{{ .Values.redis_url }}"
 
+  REDIS_CRAWLS_DONE_KEY: "crawls-done"
 
 ---
 apiVersion: v1
@@ -26,7 +27,8 @@ metadata:
 
 data:
   CRAWL_ARGS: "{{ .Values.crawler_args }} --redisStoreUrl {{ .Values.redis_url }}"
-  WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  #WEBHOOK_URL: "http://browsertrix-cloud.default/_crawls/done"
+  WEBHOOK_URL: "{{ .Values.redis_url }}/crawls-done"
   STORE_USER: ""
 
 ---

@@ -65,7 +65,7 @@ crawler_namespace: "crawlers"
 crawl_retries: 1
 
 # browsertrix-crawler args:
-crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037 --headless"
+crawler_args: "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
 

@@ -14,6 +14,8 @@ STORE_SECRET_KEY=PASSW0RD
 
 MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000
 
+REDIS_URL=redis://redis/0
+
 # enable to send verification emails
 #EMAIL_SMTP_HOST=smtp.gmail.com
 #EMAIL_SMTP_PORT=587
@@ -23,3 +25,5 @@ MC_HOST_local=http://ADMIN:PASSW0RD@minio:9000
 # Browsertrix Crawler image to use
 CRAWLER_IMAGE=webrecorder/browsertrix-crawler
 
+CRAWL_ARGS="--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"
+
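For reference, with the REDIS_URL above, the crawler environment assembled by DockerManager (the env hunk at -411/+420) should resolve roughly as follows; a small sketch of the two f-strings, under the assumption that nothing else rewrites these values:

# f"WEBHOOK_URL={self.redis_url}/{self.crawls_done_key}" and
# f"CRAWL_ARGS={self.crawl_args}" from the DockerManager hunk, with the .env values above.
redis_url = "redis://redis/0"
crawls_done_key = "crawls-done"
crawl_args = "--timeout 90 --logging stats,behaviors,debug --generateWACZ --screencastPort 9037"

print(f"WEBHOOK_URL={redis_url}/{crawls_done_key}")  # -> WEBHOOK_URL=redis://redis/0/crawls-done
print(f"CRAWL_ARGS={crawl_args}")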