Ensure error and behavior logs are written to database in order (#2540)

Fixes #2539
This commit is contained in:
Tessa Walsh 2025-04-08 09:35:50 -04:00 committed by GitHub
parent 55bedcb0b7
commit 785fd85105
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -878,7 +878,7 @@ class CrawlOperator(BaseOperator):
if crawler_running: if crawler_running:
status.lastActiveTime = date_to_str(dt_now()) status.lastActiveTime = date_to_str(dt_now())
file_done = await redis.lpop(self.done_key) file_done = await redis.rpop(self.done_key)
while file_done: while file_done:
msg = json.loads(file_done) msg = json.loads(file_done)
# add completed file # add completed file
@ -887,9 +887,9 @@ class CrawlOperator(BaseOperator):
await redis.incr("filesAdded") await redis.incr("filesAdded")
# get next file done # get next file done
file_done = await redis.lpop(self.done_key) file_done = await redis.rpop(self.done_key)
page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") page_crawled = await redis.rpop(f"{crawl.id}:{self.pages_key}")
qa_run_id = crawl.id if crawl.is_qa else None qa_run_id = crawl.id if crawl.is_qa else None
while page_crawled: while page_crawled:
@ -897,21 +897,21 @@ class CrawlOperator(BaseOperator):
await self.page_ops.add_page_to_db( await self.page_ops.add_page_to_db(
page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid page_dict, crawl.db_crawl_id, qa_run_id, crawl.oid
) )
page_crawled = await redis.lpop(f"{crawl.id}:{self.pages_key}") page_crawled = await redis.rpop(f"{crawl.id}:{self.pages_key}")
crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") crawl_error = await redis.rpop(f"{crawl.id}:{self.errors_key}")
while crawl_error: while crawl_error:
await self.crawl_ops.add_crawl_error( await self.crawl_ops.add_crawl_error(
crawl.db_crawl_id, crawl.is_qa, crawl_error crawl.db_crawl_id, crawl.is_qa, crawl_error
) )
crawl_error = await redis.lpop(f"{crawl.id}:{self.errors_key}") crawl_error = await redis.rpop(f"{crawl.id}:{self.errors_key}")
behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}") behavior_log = await redis.rpop(f"{crawl.id}:{self.behavior_logs_key}")
while behavior_log: while behavior_log:
await self.crawl_ops.add_crawl_behavior_log( await self.crawl_ops.add_crawl_behavior_log(
crawl.db_crawl_id, behavior_log crawl.db_crawl_id, behavior_log
) )
behavior_log = await redis.lpop(f"{crawl.id}:{self.behavior_logs_key}") behavior_log = await redis.rpop(f"{crawl.id}:{self.behavior_logs_key}")
# ensure filesAdded and filesAddedSize always set # ensure filesAdded and filesAddedSize always set
status.filesAdded = int(await redis.get("filesAdded") or 0) status.filesAdded = int(await redis.get("filesAdded") or 0)