concurrent crawls: revert change in #2701, previous logic was already correct (#2726)

- change was unneeded (and made it worse), just add more comments
This commit is contained in:
Ilya Kreymer 2025-07-10 13:34:56 -07:00 committed by GitHub
parent f36fc91963
commit 23700cba2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -19,6 +19,7 @@ from btrixcloud.models import (
TYPE_NON_RUNNING_STATES,
TYPE_RUNNING_STATES,
TYPE_ALL_CRAWL_STATES,
NON_RUNNING_STATES,
RUNNING_STATES,
WAITING_STATES,
RUNNING_AND_STARTING_ONLY,
@ -664,7 +665,7 @@ class CrawlOperator(BaseOperator):
the following state transitions are supported:
from starting to org concurrent crawl limit and back:
- starting -> waiting_org_capacity -> starting
- starting -> waiting_org_limit -> starting
from starting to running:
- starting -> running
@ -756,22 +757,27 @@ class CrawlOperator(BaseOperator):
if not max_crawls:
return True
# if total crawls < concurrent, always allow, no need to check further
if len(data.related[CJS]) <= max_crawls:
return True
name = data.parent.get("metadata", {}).get("name")
active_crawls = 0
# assume crawls already sorted from oldest to newest
# (seems to be the case always)
i = 0
for crawl_sorted in data.related[CJS].values():
crawl_state = crawl_sorted.get("status", {}).get("state", "")
# don't count ourselves
if crawl_sorted.get("metadata", {}).get("name") == name:
# if crawl not running, don't count
if crawl_sorted.get("status", {}).get("state") in NON_RUNNING_STATES:
continue
if crawl_state in RUNNING_AND_WAITING_STATES:
active_crawls += 1
            # once we reach the current crawl: if the crawl quota has not been reached, allow the current crawl to run
if crawl_sorted.get("metadata").get("name") == name:
if i < max_crawls:
return True
if active_crawls <= max_crawls:
return True
break
i += 1
await self.set_state(
"waiting_org_limit", status, crawl, allowed_from=["starting"]