Add crawl errors incrementally during crawl (#1561)

Fixes #1558 

- Adds crawl errors to the database incrementally during the crawl rather than after the crawl completes (see the sketch below)
- Simplifies the crawl /errors API endpoint to always return errors from the database
Tessa Walsh 2024-02-29 12:16:34 -05:00 committed by GitHub
parent 804f755787
commit da19691184
2 changed files with 14 additions and 72 deletions
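
The incremental approach reuses the pattern the operator already applies to completed files and pages: on each sync, any new entries are popped from the crawl's "<crawl_id>:e" Redis list and pushed one at a time onto the crawl document's errors array in MongoDB. Below is a minimal standalone sketch of that pattern, assuming a redis.asyncio client and a Motor collection handle; the drain_crawl_errors helper, its parameters, and the client wiring are illustrative only and not part of this change.

from motor.motor_asyncio import AsyncIOMotorCollection
from redis.asyncio import Redis


async def drain_crawl_errors(
    redis_client: Redis, crawls: AsyncIOMotorCollection, crawl_id: str
) -> int:
    """Pop any new error lines for this crawl from Redis and append each
    one to the crawl document's "errors" array in MongoDB."""
    synced = 0
    # crawler writes error lines to the "<crawl_id>:e" list (errors_key = "e")
    crawl_error = await redis_client.lpop(f"{crawl_id}:e")
    while crawl_error:
        # $push appends a single error string per iteration
        await crawls.find_one_and_update(
            {"_id": crawl_id}, {"$push": {"errors": crawl_error}}
        )
        synced += 1
        crawl_error = await redis_client.lpop(f"{crawl_id}:e")
    return synced

Popping entries off the list as they are synced keeps each pass cheap: anything not yet popped simply stays in Redis for the next sync, and nothing needs to be exported in bulk after the crawl finishes.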


@@ -384,26 +384,6 @@ class CrawlOps(BaseCrawlOps):
         return {"total": total, "matched": matched, "nextOffset": next_offset}
 
-    async def get_errors_from_redis(
-        self, crawl_id: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1
-    ):
-        """Get crawl errors from Redis and optionally store in mongodb."""
-        # Zero-index page for query
-        page = page - 1
-        skip = page * page_size
-        upper_bound = skip + page_size - 1
-
-        async with self.get_redis(crawl_id) as redis:
-            try:
-                errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
-                total = await redis.llen(f"{crawl_id}:e")
-            except exceptions.ConnectionError:
-                # pylint: disable=raise-missing-from
-                raise HTTPException(status_code=503, detail="error_logs_not_available")
-
-        parsed_errors = parse_jsonl_error_messages(errors)
-        return parsed_errors, total
-
     async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add):
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""
@@ -470,10 +450,10 @@ class CrawlOps(BaseCrawlOps):
             return None, None
         return res.get("state"), res.get("finished")
 
-    async def add_crawl_errors(self, crawl_id, errors):
-        """add crawl errors from redis to mongodb errors field"""
+    async def add_crawl_error(self, crawl_id: str, error: str):
+        """add crawl error from redis to mongodb errors field"""
         await self.crawls.find_one_and_update(
-            {"_id": crawl_id}, {"$push": {"errors": {"$each": errors}}}
+            {"_id": crawl_id}, {"$push": {"errors": error}}
         )
 
     async def add_crawl_file(self, crawl_id, crawl_file, size):
@@ -931,15 +911,11 @@ def init_crawls_api(app, user_dep, *args):
         crawl_raw = await ops.get_crawl_raw(crawl_id, org)
         crawl = Crawl.from_dict(crawl_raw)
 
-        if crawl.finished:
-            skip = (page - 1) * pageSize
-            upper_bound = skip + pageSize
-            errors = crawl.errors[skip:upper_bound]
-            parsed_errors = parse_jsonl_error_messages(errors)
-            total = len(crawl.errors)
-            return paginated_format(parsed_errors, total, page, pageSize)
+        skip = (page - 1) * pageSize
+        upper_bound = skip + pageSize
 
-        errors, total = await ops.get_errors_from_redis(crawl_id, pageSize, page)
-        return paginated_format(errors, total, page, pageSize)
+        errors = crawl.errors[skip:upper_bound]
+        parsed_errors = parse_jsonl_error_messages(errors)
+        return paginated_format(parsed_errors, len(crawl.errors), page, pageSize)
 
     return ops


@@ -300,6 +300,7 @@ class BtrixOperator(K8sAPI):
         self.done_key = "crawls-done"
         self.pages_key = "pages"
+        self.errors_key = "e"
 
         self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)
@@ -999,7 +1000,6 @@ class BtrixOperator(K8sAPI):
         )
 
         file_done = await redis.lpop(self.done_key)
         while file_done:
             msg = json.loads(file_done)
             # add completed file
@@ -1011,12 +1011,16 @@ class BtrixOperator(K8sAPI):
             file_done = await redis.lpop(self.done_key)
 
         page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}")
         while page_crawled:
             page_dict = json.loads(page_crawled)
             await self.page_ops.add_page_to_db(page_dict, crawl.id, crawl.oid)
             page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}")
 
+        crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")
+        while crawl_error:
+            await self.crawl_ops.add_crawl_error(crawl.id, crawl_error)
+            crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}")
+
         # ensure filesAdded and filesAddedSize always set
         status.filesAdded = int(await redis.get("filesAdded") or 0)
         status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
@@ -1552,8 +1556,6 @@ class BtrixOperator(K8sAPI):
             crawl_id, oid, state
         )
 
-        await self.add_crawl_errors_to_db(crawl_id)
-
         # finally, delete job
         await self.delete_crawl_job(crawl_id)
@@ -1582,42 +1584,6 @@ class BtrixOperator(K8sAPI):
             if redis:
                 await redis.close()
 
-    async def add_crawl_errors_to_db(self, crawl_id, inc=100):
-        """Pull crawl errors from redis and write to mongo db"""
-        index = 0
-        redis = None
-        try:
-            redis_url = self.get_redis_url(crawl_id)
-            redis = await self._get_redis(redis_url)
-            if not redis:
-                return
-
-            # ensure this only runs once
-            if not await redis.setnx("errors-exported", "1"):
-                return
-
-            while True:
-                skip = index * inc
-                upper_bound = skip + inc - 1
-                errors = await redis.lrange(f"{crawl_id}:e", skip, upper_bound)
-                if not errors:
-                    break
-
-                await self.crawl_ops.add_crawl_errors(crawl_id, errors)
-
-                if len(errors) < inc:
-                    # If we have fewer than inc errors, we can assume this is the
-                    # last page of data to add.
-                    break
-
-                index += 1
-
-        # pylint: disable=bare-except
-        except:
-            # likely redis has already been deleted, so nothing to do
-            pass
-        finally:
-            if redis:
-                await redis.close()
-
     def get_cronjob_crawl_related(self, data: MCBaseRequest):
         """return configmap related to crawl"""
         labels = data.parent.get("metadata", {}).get("labels", {})