Crawl Timeout via elapsed time (#1338)

Fixes #1337 

The crawl timeout is now tracked via the `elapsedCrawlTime` field on the crawl
status. It is similar to the regular crawl execution time, but counts only the
longest-running pod when scale > 1; when scale == 1, the two values are equivalent.
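
For illustration only (not the operator code itself), the two counters differ in how per-pod durations are combined at each update; `pod_durations` below is a hypothetical example, not a real field:

```python
# Illustrative sketch: how crawlExecTime and elapsedCrawlTime diverge when scale > 1.
pod_durations = [60, 45]  # seconds each of two pods (scale == 2) ran since the last update

crawl_exec_time_increment = sum(pod_durations)     # 105 -- total running time across all pods
elapsed_crawl_time_increment = max(pod_durations)  # 60  -- longest single-pod running time
```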

The crawl is gracefully stopped once the elapsed execution time exceeds the
timeout. For better responsiveness, the crawl time accrued since the last
update interval is also added to the check.

Details:
- handle crawl timeout via elapsed crawl time (the longest running time of a
single pod) instead of an expire time (see the sketch below)
- include the current running time since the last update for better precision
- more accurately count the elapsed time the crawl is actually running
- store `elapsedCrawlTime` in addition to `crawlExecTime`, accumulating the
longest per-pod duration at each update interval
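
For reference, a minimal sketch of the new timeout check (simplified from the `is_crawl_stopping()` change in the diff below; the standalone function and parameter names here are illustrative, not part of the codebase):

```python
from datetime import datetime, timezone

def should_stop_for_timeout(
    elapsed_crawl_time: int, last_updated: datetime, timeout: int
) -> bool:
    """Return True if the crawl should gracefully stop due to its timeout (sketch)."""
    if not timeout:
        return False
    # add the running time since the last exec-time update for responsiveness;
    # last_updated is assumed to be a timezone-aware UTC datetime
    since_update = (datetime.now(timezone.utc) - last_updated).total_seconds()
    return (elapsed_crawl_time + since_update) > timeout
```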

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2023-11-06 16:32:58 -08:00 committed by GitHub
parent 5530ca92e1
commit b4fd5e6e94
4 changed files with 31 additions and 24 deletions

View File

@@ -2,8 +2,6 @@
 import os
 import traceback
-from datetime import timedelta
 import yaml
 from kubernetes_asyncio import client, config
@@ -18,7 +16,7 @@ from redis import asyncio as aioredis
 from fastapi import HTTPException
 from fastapi.templating import Jinja2Templates
-from .utils import get_templates_dir, dt_now, to_k8s_date
+from .utils import get_templates_dir, dt_now
 # ============================================================================
@@ -85,11 +83,6 @@ class K8sAPI:
         crawl_id=None,
     ):
         """load job template from yaml"""
-        if crawl_timeout:
-            crawl_expire_time = to_k8s_date(dt_now() + timedelta(seconds=crawl_timeout))
-        else:
-            crawl_expire_time = ""
         if not crawl_id:
             ts_now = dt_now().strftime("%Y%m%d%H%M%S")
             prefix = "manual" if manual else "sched"
@@ -101,7 +94,7 @@ class K8sAPI:
             "oid": oid,
             "userid": userid,
             "scale": scale,
-            "expire_time": crawl_expire_time or 0,
+            "timeout": crawl_timeout,
             "max_crawl_size": max_crawl_size or 0,
             "storage_name": str(storage),
             "manual": "1" if manual else "0",

View File

@@ -8,7 +8,6 @@ from typing import Optional, DefaultDict, TYPE_CHECKING
 from collections import defaultdict
-from datetime import datetime
 import json
 from uuid import UUID
 from fastapi import HTTPException
@@ -114,8 +113,8 @@ class CrawlSpec(BaseModel):
     started: str
     stopping: bool = False
     scheduled: bool = False
-    expire_time: Optional[datetime] = None
-    max_crawl_size: Optional[int] = None
+    timeout: int = 0
+    max_crawl_size: int = 0
 # ============================================================================
@@ -232,9 +231,15 @@ class CrawlStatus(BaseModel):
     restartTime: Optional[str]
     canceled: bool = False
-    # Execution Time -- updated on pod exits and at regular interval
+    # updated on pod exits and at regular interval
+    # Crawl Execution Time -- time all crawler pods have been running
+    # used to track resource usage and enforce execution minutes limit
     crawlExecTime: int = 0
+    # Elapsed Exec Time -- time crawl has been running in at least one pod
+    # used for crawl timeouts
+    elapsedCrawlTime: int = 0
     # last exec time update
     lastUpdatedTime: str = ""
@@ -440,7 +445,7 @@ class BtrixOperator(K8sAPI):
             scale=spec.get("scale", 1),
             started=data.parent["metadata"]["creationTimestamp"],
             stopping=spec.get("stopping", False),
-            expire_time=from_k8s_date(spec.get("expireTime")),
+            timeout=spec.get("timeout") or 0,
             max_crawl_size=int(spec.get("maxCrawlSize") or 0),
             scheduled=spec.get("manual") != "1",
         )
@@ -1081,6 +1086,7 @@ class BtrixOperator(K8sAPI):
             return
         exec_time = 0
+        max_duration = 0
         print(
             f"Exec Time Update: {reason}: {now} - {update_start_time} = {update_duration}"
         )
@@ -1131,11 +1137,13 @@ class BtrixOperator(K8sAPI):
                     f" - {name}: {pod_state}: {end_time} - {start_time} = {duration}"
                 )
                 exec_time += duration
+                max_duration = max(duration, max_duration)
         if exec_time:
             await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
             await self.org_ops.inc_org_time_stats(oid, exec_time, True)
             status.crawlExecTime += exec_time
+            status.elapsedCrawlTime += max_duration
             print(
                 f" Exec Time Total: {status.crawlExecTime}, Incremented By: {exec_time}",
@@ -1254,7 +1262,7 @@ class BtrixOperator(K8sAPI):
         return True
-    def is_crawl_stopping(self, crawl, size):
+    def is_crawl_stopping(self, crawl: CrawlSpec, status: CrawlStatus) -> bool:
         """return true if crawl should begin graceful stopping phase"""
         # if user requested stop, then enter stopping phase
@@ -1262,12 +1270,19 @@ class BtrixOperator(K8sAPI):
             print("Graceful Stop: User requested stop")
             return True
-        # check crawl expiry
-        if crawl.expire_time and dt_now() > crawl.expire_time:
-            print(f"Graceful Stop: Job duration expired at {crawl.expire_time}")
-            return True
+        # check timeout if timeout time exceeds elapsed time
+        if crawl.timeout:
+            elapsed = (
+                status.elapsedCrawlTime
+                + (dt_now() - from_k8s_date(status.lastUpdatedTime)).total_seconds()
+            )
+            if elapsed > crawl.timeout:
+                print(
+                    f"Graceful Stop: Crawl running time exceeded {crawl.timeout} second timeout"
+                )
+                return True
-        if crawl.max_crawl_size and size > crawl.max_crawl_size:
+        if crawl.max_crawl_size and status.size > crawl.max_crawl_size:
             print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
             return True
@@ -1311,7 +1326,7 @@ class BtrixOperator(K8sAPI):
             pod_info = status.podStatus[key]
             pod_info.used.storage = value
-        status.stopping = self.is_crawl_stopping(crawl, status.size)
+        status.stopping = self.is_crawl_stopping(crawl, status)
         # check exec time quotas and stop if reached limit
         if not status.stopping:

View File

@@ -12,6 +12,7 @@ from .utils import (
 curr_dir = os.path.dirname(os.path.realpath(__file__))
 def test_upload_stream(admin_auth_headers, default_org_id):
     with open(os.path.join(curr_dir, "..", "test", "data", "example.wacz"), "rb") as fh:
         r = requests.put(

View File

@@ -20,11 +20,9 @@ spec:
   oid: "{{ oid }}"
   scale: {{ scale }}
   maxCrawlSize: {{ max_crawl_size }}
+  timeout: {{ timeout }}
   manual: {{ manual }}
   ttlSecondsAfterFinished: 30
   storageName: "{{ storage_name }}"
-  {% if expire_time %}
-  expireTime: "{{ expire_time }}"
-  {% endif %}