concurrent crawls: revert change in #2701, previous logic was already correct (#2726)

- change was unneeded (and made it worse), just add more comments
This commit is contained in:
Ilya Kreymer 2025-07-10 13:34:56 -07:00 committed by GitHub
parent f36fc91963
commit 23700cba2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -19,6 +19,7 @@ from btrixcloud.models import (
TYPE_NON_RUNNING_STATES,
TYPE_RUNNING_STATES,
TYPE_ALL_CRAWL_STATES,
NON_RUNNING_STATES,
RUNNING_STATES,
WAITING_STATES,
RUNNING_AND_STARTING_ONLY,
@ -664,7 +665,7 @@ class CrawlOperator(BaseOperator):
the following state transitions are supported:
from starting to org concurrent crawl limit and back:
- starting -> waiting_org_capacity -> starting
- starting -> waiting_org_limit -> starting
from starting to running:
- starting -> running
@ -756,22 +757,27 @@ class CrawlOperator(BaseOperator):
if not max_crawls:
return True
# if total crawls < concurrent, always allow, no need to check further
if len(data.related[CJS]) <= max_crawls:
return True
name = data.parent.get("metadata", {}).get("name")
active_crawls = 0
# assume crawls already sorted from oldest to newest
# (seems to be the case always)
i = 0
for crawl_sorted in data.related[CJS].values():
crawl_state = crawl_sorted.get("status", {}).get("state", "")
# don't count ourselves
if crawl_sorted.get("metadata", {}).get("name") == name:
# if crawl not running, don't count
if crawl_sorted.get("status", {}).get("state") in NON_RUNNING_STATES:
continue
if crawl_state in RUNNING_AND_WAITING_STATES:
active_crawls += 1
            # once we reach the current crawl: if the crawl quota has not been reached, allow the current crawl to run
if crawl_sorted.get("metadata").get("name") == name:
if i < max_crawls:
return True
if active_crawls <= max_crawls:
return True
break
i += 1
await self.set_state(
"waiting_org_limit", status, crawl, allowed_from=["starting"]