diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index b1181343..90eaa4ad 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -23,7 +23,7 @@ from .models import (
PaginatedCrawlOutResponse,
User,
StorageRef,
- RUNNING_AND_STARTING_STATES,
+ RUNNING_AND_WAITING_STATES,
SUCCESSFUL_STATES,
QARun,
UpdatedResponse,
@@ -272,7 +272,7 @@ class BaseCrawlOps:
{
"_id": crawl_id,
"type": "crawl",
- "state": {"$in": RUNNING_AND_STARTING_STATES},
+ "state": {"$in": RUNNING_AND_WAITING_STATES},
},
{"$set": data},
)
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 45da9fe5..354857f6 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -44,7 +44,7 @@ from .models import (
PaginatedCrawlOutResponse,
PaginatedSeedResponse,
PaginatedCrawlErrorResponse,
- RUNNING_AND_STARTING_STATES,
+ RUNNING_AND_WAITING_STATES,
SUCCESSFUL_STATES,
NON_RUNNING_STATES,
ALL_CRAWL_STATES,
@@ -165,7 +165,7 @@ class CrawlOps(BaseCrawlOps):
query["userid"] = userid
if running_only:
- query["state"] = {"$in": RUNNING_AND_STARTING_STATES}
+ query["state"] = {"$in": RUNNING_AND_WAITING_STATES}
# Override running_only if state list is explicitly passed
if state:
@@ -425,7 +425,7 @@ class CrawlOps(BaseCrawlOps):
state, _ = await self.get_crawl_state(crawl_id, False)
- if state not in RUNNING_AND_STARTING_STATES:
+ if state not in RUNNING_AND_WAITING_STATES:
raise HTTPException(status_code=400, detail="crawl_not_running")
total = 0
@@ -463,7 +463,7 @@ class CrawlOps(BaseCrawlOps):
limit <= next_offset < limit + step"""
state, _ = await self.get_crawl_state(crawl_id, False)
- if state not in RUNNING_AND_STARTING_STATES:
+ if state not in RUNNING_AND_WAITING_STATES:
raise HTTPException(status_code=400, detail="crawl_not_running")
total = 0
@@ -513,7 +513,7 @@ class CrawlOps(BaseCrawlOps):
crawl = await self.get_crawl(crawl_id, org)
- if crawl.state not in RUNNING_AND_STARTING_STATES:
+ if crawl.state not in RUNNING_AND_WAITING_STATES:
raise HTTPException(status_code=400, detail="crawl_not_running")
cid = crawl.cid
@@ -591,30 +591,36 @@ class CrawlOps(BaseCrawlOps):
"qaCrawlExecSeconds": exec_time,
"qa.crawlExecSeconds": exec_time,
}
+ field = "qa._lut"
else:
inc_update = {"crawlExecSeconds": exec_time}
+ field = "_lut"
res = await self.crawls.find_one_and_update(
{
"_id": crawl_id,
"type": "crawl",
- "_lut": {"$ne": last_updated_time},
+ field: {"$ne": last_updated_time},
},
{
"$inc": inc_update,
- "$set": {"_lut": last_updated_time},
+ "$set": {field: last_updated_time},
},
)
return res is not None
async def get_crawl_exec_last_update_time(
- self, crawl_id: str
+ self, crawl_id: str, is_qa: bool
) -> Optional[datetime]:
"""get crawl last updated time"""
+ field = "_lut" if not is_qa else "qa._lut"
res = await self.crawls.find_one(
- {"_id": crawl_id, "type": "crawl"}, projection=["_lut"]
+ {"_id": crawl_id, "type": "crawl"}, projection=[field]
)
- return res and res.get("_lut")
+ if not res:
+ return None
+
+ return res.get("qa", {}).get("_lut") if is_qa else res.get("_lut")
async def get_crawl_state(
self, crawl_id: str, is_qa: bool
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index b71cf6eb..4f839651 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -208,8 +208,8 @@ TYPE_RUNNING_STATES = Literal[
]
RUNNING_STATES = get_args(TYPE_RUNNING_STATES)
-TYPE_STARTING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
-STARTING_STATES = get_args(TYPE_STARTING_STATES)
+TYPE_WAITING_STATES = Literal["starting", "waiting_capacity", "waiting_org_limit"]
+WAITING_STATES = get_args(TYPE_WAITING_STATES)
TYPE_FAILED_STATES = Literal[
"canceled",
@@ -228,8 +228,8 @@ TYPE_SUCCESSFUL_STATES = Literal[
]
SUCCESSFUL_STATES = get_args(TYPE_SUCCESSFUL_STATES)
-TYPE_RUNNING_AND_STARTING_STATES = Literal[TYPE_STARTING_STATES, TYPE_RUNNING_STATES]
-RUNNING_AND_STARTING_STATES = [*STARTING_STATES, *RUNNING_STATES]
+TYPE_RUNNING_AND_WAITING_STATES = Literal[TYPE_WAITING_STATES, TYPE_RUNNING_STATES]
+RUNNING_AND_WAITING_STATES = [*WAITING_STATES, *RUNNING_STATES]
RUNNING_AND_STARTING_ONLY = ["starting", *RUNNING_STATES]
@@ -237,9 +237,9 @@ TYPE_NON_RUNNING_STATES = Literal[TYPE_FAILED_STATES, TYPE_SUCCESSFUL_STATES]
NON_RUNNING_STATES = [*FAILED_STATES, *SUCCESSFUL_STATES]
TYPE_ALL_CRAWL_STATES = Literal[
- TYPE_RUNNING_AND_STARTING_STATES, TYPE_NON_RUNNING_STATES
+ TYPE_RUNNING_AND_WAITING_STATES, TYPE_NON_RUNNING_STATES
]
-ALL_CRAWL_STATES = [*RUNNING_AND_STARTING_STATES, *NON_RUNNING_STATES]
+ALL_CRAWL_STATES = [*RUNNING_AND_WAITING_STATES, *NON_RUNNING_STATES]
# ============================================================================
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 81cd77ff..6e21b9a6 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -19,8 +19,9 @@ from btrixcloud.models import (
TYPE_ALL_CRAWL_STATES,
NON_RUNNING_STATES,
RUNNING_STATES,
+ WAITING_STATES,
RUNNING_AND_STARTING_ONLY,
- RUNNING_AND_STARTING_STATES,
+ RUNNING_AND_WAITING_STATES,
SUCCESSFUL_STATES,
FAILED_STATES,
CrawlStats,
@@ -119,6 +120,7 @@ class CrawlOperator(BaseOperator):
"""sync crawls"""
status = CrawlStatus(**data.parent.get("status", {}))
+ status.last_state = status.state
spec = data.parent.get("spec", {})
crawl_id = spec["id"]
@@ -250,11 +252,6 @@ class CrawlOperator(BaseOperator):
else:
status.scale = 1
- now = dt_now()
- await self.crawl_ops.inc_crawl_exec_time(
- crawl.db_crawl_id, crawl.is_qa, 0, now
- )
- status.lastUpdatedTime = to_k8s_date(now)
children = self._load_redis(params, status, data.children)
@@ -807,25 +804,13 @@ class CrawlOperator(BaseOperator):
status.resync_after = self.fast_retry_secs
return status
- # if true (state is set), also run webhook
- if await self.set_state(
+ # ensure running state is set
+ await self.set_state(
"running",
status,
crawl,
allowed_from=["starting", "waiting_capacity"],
- ):
- if not crawl.qa_source_crawl_id:
- self.run_task(
- self.event_webhook_ops.create_crawl_started_notification(
- crawl.id, crawl.oid, scheduled=crawl.scheduled
- )
- )
- else:
- self.run_task(
- self.event_webhook_ops.create_qa_analysis_started_notification(
- crawl.id, crawl.oid, crawl.qa_source_crawl_id
- )
- )
+ )
# update lastActiveTime if crawler is running
if crawler_running:
@@ -967,11 +952,33 @@ class CrawlOperator(BaseOperator):
"""inc exec time tracking"""
now = dt_now()
+ # don't count time crawl is not running
+ if status.state in WAITING_STATES:
+ # reset lastUpdatedTime if at least 2 consecutive updates of non-running state
+ if status.last_state in WAITING_STATES:
+ status.lastUpdatedTime = to_k8s_date(now)
+ return
+
update_start_time = await self.crawl_ops.get_crawl_exec_last_update_time(
- crawl.db_crawl_id
+ crawl.db_crawl_id, crawl.is_qa
)
if not update_start_time:
+ print("Crawl first started, webhooks called", now, crawl.id)
+ # call initial running webhook
+ if not crawl.qa_source_crawl_id:
+ self.run_task(
+ self.event_webhook_ops.create_crawl_started_notification(
+ crawl.id, crawl.oid, scheduled=crawl.scheduled
+ )
+ )
+ else:
+ self.run_task(
+ self.event_webhook_ops.create_qa_analysis_started_notification(
+ crawl.id, crawl.oid, crawl.qa_source_crawl_id
+ )
+ )
+
await self.crawl_ops.inc_crawl_exec_time(
crawl.db_crawl_id, crawl.is_qa, 0, now
)
@@ -1414,7 +1421,7 @@ class CrawlOperator(BaseOperator):
finished = dt_now()
- allowed_from = RUNNING_AND_STARTING_STATES
+ allowed_from = RUNNING_AND_WAITING_STATES
# if set_state returns false, already set to same status, return
if not await self.set_state(
diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py
index 92edaa34..bce5a1e6 100644
--- a/backend/btrixcloud/operator/models.py
+++ b/backend/btrixcloud/operator/models.py
@@ -223,3 +223,6 @@ class CrawlStatus(BaseModel):
# don't include in status, use by metacontroller
resync_after: Optional[int] = Field(default=None, exclude=True)
+
+ # last state
+ last_state: TYPE_ALL_CRAWL_STATES = Field(default="starting", exclude=True)
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index 45b7e157..0a88d055 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -27,7 +27,7 @@ from aiostream import stream
from .models import (
SUCCESSFUL_STATES,
RUNNING_STATES,
- STARTING_STATES,
+ WAITING_STATES,
BaseCrawl,
Organization,
StorageRef,
@@ -890,7 +890,7 @@ class OrgOps:
{"oid": org.id, "state": {"$in": RUNNING_STATES}}
)
workflows_queued_count = await self.crawls_db.count_documents(
- {"oid": org.id, "state": {"$in": STARTING_STATES}}
+ {"oid": org.id, "state": {"$in": WAITING_STATES}}
)
collections_count = await self.colls_db.count_documents({"oid": org.id})
public_collections_count = await self.colls_db.count_documents(
diff --git a/chart/templates/operators.yaml b/chart/templates/operators.yaml
index c3d8287f..7547ec42 100644
--- a/chart/templates/operators.yaml
+++ b/chart/templates/operators.yaml
@@ -5,7 +5,7 @@ metadata:
name: crawljobs-operator
spec:
generateSelector: false
- resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 30 }}
+ resyncPeriodSeconds: {{ .Values.operator_resync_seconds | default 10 }}
parentResource:
apiVersion: btrix.cloud/v1
resource: crawljobs
diff --git a/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts b/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
index 48d267fb..70612fb6 100644
--- a/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
+++ b/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
@@ -802,6 +802,7 @@ export class ArchivedItemDetail extends TailwindElement {
? html`${humanizeExecutionSeconds(
this.crawl!.crawlExecSeconds,
+ { displaySeconds: true },
)}`
: html`${msg("Pending")}`}