Optimize single-page crawl workflows (#2656)

For single-page crawls:
- Always force a single browser to be used, ignoring the browser
windows / scale setting.
- Don't use custom PVC volumes for the crawler / redis pods, just use
emptyDir - there is no chance of the crawler being interrupted and
restarted on a different machine for a single page (see the sketch
after this list).
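
A minimal sketch of that volume switch, assuming jinja2 and illustrative
names (the real crawler / redis chart templates key off a 'no_pvc'
parameter, as shown in the diff further below):

# hypothetical, simplified rendering of the crawl-data volume choice
from jinja2 import Template

VOLUME_TEMPLATE = Template(
    """\
- name: crawl-data
{% if not no_pvc %}
  persistentVolumeClaim:
    claimName: {{ name }}
{% else %}
  emptyDir: {}
{% endif %}
"""
)

# Single-page crawl: skip the PVC, the pod gets a throwaway emptyDir
print(VOLUME_TEMPLATE.render(name="crawl-example-0", no_pvc=True))

# Multi-page crawl: keep the per-crawl PVC so data survives pod rescheduling
print(VOLUME_TEMPLATE.render(name="crawl-example-0", no_pvc=False))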

Adds an 'is_single_page' check to CrawlConfigOps: a config counts as a
single-page crawl if it has exactly one seed and either a page limit of 1
or a 'page' scope type with no extra hops (sketched below).
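
A rough standalone sketch of that check, assuming plain dicts shaped like
the API's RawCrawlConfig (seeds, limit, extraHops, scopeType) rather than
the backend's Pydantic models:

def is_single_page(config: dict) -> bool:
    """Return True if this config represents a single-page crawl."""
    seeds = config.get("seeds") or []
    if len(seeds) != 1:
        return False
    # an explicit page limit of 1 always means a single page
    if config.get("limit") == 1:
        return True
    # otherwise: 'page' scope with no extra hops, seed settings taking priority
    seed = seeds[0]
    extra_hops = seed.get("extraHops") or config.get("extraHops") or 0
    scope_type = seed.get("scopeType") or config.get("scopeType")
    return extra_hops == 0 and scope_type == "page"

assert is_single_page({"seeds": [{"url": "https://example.com/"}], "limit": 1})
assert is_single_page({"seeds": [{"url": "https://example.com/"}], "scopeType": "page"})
assert not is_single_page({"seeds": [{"url": "https://example.com/"}], "extraHops": 1})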

Fixes #2655
Ilya Kreymer 2025-06-10 19:13:57 +00:00 committed by GitHub
parent 86c4d326e9
commit 8ea16393c5
13 changed files with 100 additions and 20 deletions

View File

@@ -45,6 +45,7 @@ from .models import (
CrawlerProxy,
CrawlerProxies,
ValidateCustomBehavior,
RawCrawlConfig,
)
from .utils import (
dt_now,
@@ -223,15 +224,18 @@ class CrawlConfigOps:
) -> CrawlConfigAddedResponse:
"""Add new crawl config"""
# ensure crawlChannel is valid
if not self.get_channel_crawler_image(config_in.crawlerChannel):
raise HTTPException(status_code=404, detail="crawler_not_found")
# Overrides scale if set
if config_in.browserWindows is None:
config_in.browserWindows = browser_windows_from_scale(
cast(int, config_in.scale)
)
# ensure crawlChannel is valid
if not self.get_channel_crawler_image(config_in.crawlerChannel):
raise HTTPException(status_code=404, detail="crawler_not_found")
if self.is_single_page(config_in.config):
config_in.browserWindows = 1
profileid = None
if isinstance(config_in.profileid, UUID):
@@ -321,6 +325,19 @@ class CrawlConfigOps:
execMinutesQuotaReached=exec_mins_quota_reached,
)
def is_single_page(self, config: RawCrawlConfig):
"""return true if this config represents a single page crawl"""
if not config.seeds or len(config.seeds) != 1:
return False
if config.limit == 1:
return True
extra_hops = config.seeds[0].extraHops or config.extraHops
scope_type = config.seeds[0].scopeType or config.scopeType
return extra_hops == 0 and scope_type == "page"
def _validate_link_selectors(self, link_selectors: List[str]):
"""Validate link selectors
@@ -435,6 +452,10 @@ class CrawlConfigOps:
if update.config and update.config.lang:
validate_language_code(update.config.lang)
if update.config or update.browserWindows:
if self.is_single_page(update.config or orig_crawl_config.config):
update.browserWindows = 1
# indicates if any k8s crawl config settings changed
changed = False
changed = changed or (
@@ -1021,6 +1042,7 @@ class CrawlConfigOps:
warc_prefix=self.get_warc_prefix(org, crawlconfig),
storage_filename=storage_filename,
profile_filename=profile_filename or "",
is_single_page=self.is_single_page(crawlconfig.config),
)
await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
return crawl_id

View File

@@ -220,6 +220,7 @@ class CrawlManager(K8sAPI):
warc_prefix: str,
storage_filename: str,
profile_filename: str,
is_single_page: bool,
) -> str:
"""create new crawl job from config"""
cid = str(crawlconfig.id)
@@ -244,6 +245,7 @@
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
is_single_page=is_single_page,
)
async def reload_running_crawl_config(self, crawl_id: str):

View File

@@ -95,6 +95,7 @@ class K8sAPI:
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
is_single_page: bool = False,
):
"""load job template from yaml"""
if not crawl_id:
@@ -119,6 +120,7 @@
"profile_filename": profile_filename,
"qa_source": qa_source,
"proxy_id": proxy_id,
"is_single_page": "1" if is_single_page else "0",
}
data = self.templates.env.get_template("crawl_job.yaml").render(params)
@@ -142,6 +144,7 @@
profile_filename: str = "",
qa_source: str = "",
proxy_id: str = "",
is_single_page: bool = False,
) -> str:
"""load and init crawl job via k8s api"""
crawl_id, data = self.new_crawl_job_yaml(
@@ -161,6 +164,7 @@
profile_filename=profile_filename,
qa_source=qa_source,
proxy_id=proxy_id,
is_single_page=is_single_page,
)
# create job directly

View File

@@ -101,6 +101,8 @@ class CrawlOperator(BaseOperator):
paused_expires_delta: timedelta
num_browsers_per_pod: int
def __init__(self, *args):
super().__init__(*args)
@@ -125,6 +127,8 @@ class CrawlOperator(BaseOperator):
self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)
self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
def init_routes(self, app):
"""init routes for this operator"""
@@ -181,6 +185,7 @@ class CrawlOperator(BaseOperator):
max_crawl_size=int(spec.get("maxCrawlSize") or 0),
scheduled=spec.get("manual") != "1",
qa_source_crawl_id=spec.get("qaSourceCrawlId"),
is_single_page=spec.get("isSinglePage") == "1",
)
if crawl.qa_source_crawl_id:
@@ -301,7 +306,7 @@ class CrawlOperator(BaseOperator):
status.stopReason = stop_reason
await self.mark_finished(crawl, status, state)
children = self._load_redis(params, status, data.children)
children = self._load_redis(params, status, crawl, data.children)
storage_path = crawl.storage.get_storage_extra_path(oid)
storage_secret = crawl.storage.get_storage_secret_name(oid)
@@ -368,10 +373,8 @@
# crawl_scale is the number of pods to create
crawler_scale = scale_from_browser_windows(crawl.browser_windows)
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
for i in range(0, crawler_scale):
if status.pagesFound < i * browsers_per_pod:
if status.pagesFound < i * self.num_browsers_per_pod:
break
children.extend(
@@ -392,7 +395,7 @@
"resyncAfterSeconds": status.resync_after,
}
def _load_redis(self, params, status: CrawlStatus, children):
def _load_redis(self, params, status: CrawlStatus, crawl: CrawlSpec, children):
name = f"redis-{params['id']}"
has_pod = name in children[POD]
@@ -400,6 +403,8 @@
params["name"] = name
params["cpu"] = pod_info.newCpu or params.get("redis_cpu")
params["memory"] = pod_info.newMemory or params.get("redis_memory")
params["no_pvc"] = crawl.is_single_page
restart_reason = None
if has_pod:
restart_reason = pod_info.should_restart_pod()
@@ -870,7 +875,7 @@ class CrawlOperator(BaseOperator):
if redis_pod in pods:
# if has other pods, keep redis pod until they are removed
if len(pods) > 1:
new_children = self._load_redis(params, status, children)
new_children = self._load_redis(params, status, crawl, children)
await self.increment_pod_exec_time(pods, crawl, status)
# keep pvs until pods are removed

View File

@@ -140,6 +140,7 @@ class CronJobOperator(BaseOperator):
storage_filename=self.crawl_config_ops.default_filename_template,
profile_filename=profile_filename or "",
proxy_id=crawlconfig.proxyId or "",
is_single_page=self.crawl_config_ops.is_single_page(crawlconfig.config),
)
return MCDecoratorSyncResponse(attachments=list(yaml.safe_load_all(crawljob)))

View File

@@ -86,6 +86,7 @@ class CrawlSpec(BaseModel):
max_crawl_size: int = 0
qa_source_crawl_id: Optional[str] = ""
proxy_id: Optional[str] = None
is_single_page: bool = False
@property
def db_crawl_id(self) -> str:

View File

@@ -331,7 +331,7 @@ def sample_crawl_data():
return {
"runNow": False,
"name": "Test Crawl",
"config": {"seeds": [{"url": "https://example.com/"}]},
"config": {"seeds": [{"url": "https://example.com/"}], "extraHops": 1},
"tags": ["tag1", "tag2"],
}

View File

@@ -6,6 +6,7 @@ from .conftest import API_PREFIX
cid = None
cid_single_page = None
UPDATED_NAME = "Updated name"
UPDATED_DESCRIPTION = "Updated description"
UPDATED_TAGS = ["tag3", "tag4"]
@@ -67,6 +68,37 @@ def test_verify_default_browser_windows(
assert data["browserWindows"] == 2
def test_add_crawl_config_single_page(
crawler_auth_headers, default_org_id, sample_crawl_data
):
# Create crawl config
sample_crawl_data["config"]["limit"] = 1
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=sample_crawl_data,
)
assert r.status_code == 200
data = r.json()
global cid_single_page
cid_single_page = data["id"]
def test_verify_default_browser_windows_single_page(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid_single_page}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data.get("scale") is None
assert data["browserWindows"] == 1
def test_custom_browser_windows(
crawler_auth_headers, default_org_id, sample_crawl_data
):

View File

@@ -11,8 +11,8 @@ def test_get_config_by_created_by(crawler_auth_headers, default_org_id, crawler_
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?userid={crawler_userid}",
headers=crawler_auth_headers,
)
assert len(r.json()["items"]) == 7
assert r.json()["total"] == 7
assert len(r.json()["items"]) == 8
assert r.json()["total"] == 8
def test_get_config_by_modified_by(
@@ -23,8 +23,8 @@ def test_get_config_by_modified_by(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
headers=crawler_auth_headers,
)
assert len(r.json()["items"]) == 7
assert r.json()["total"] == 7
assert len(r.json()["items"]) == 8
assert r.json()["total"] == 8
def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
headers=crawler_auth_headers,
)
data = r.json()
assert data["total"] == 13
assert data["total"] == 14
items = data["items"]
assert len(items) == 13
assert len(items) == 14
last_created = None
for config in items:

View File

@@ -39,3 +39,5 @@ spec:
pausedAt: "{{ pausedAt }}"
isSinglePage: "{{ is_single_page }}"

View File

@@ -1,3 +1,4 @@
{% if not no_pvc %}
# -------
# PVC
# -------
@@ -23,7 +24,7 @@ spec:
storageClassName: {{ volume_storage_class }}
{% endif %}
{% endif %}
# -------
# CRAWLER
@@ -67,8 +68,12 @@ spec:
name: qa-replay-{{ qa_source_crawl_id }}
{% endif %}
- name: crawl-data
{% if not no_pvc %}
persistentVolumeClaim:
claimName: {{ name }}
{% else %}
emptyDir: {}
{% endif %}
{% if proxy_id %}
- name: proxies
secret:

View File

@@ -1,3 +1,4 @@
{% if not no_pvc %}
# -------
# PVC
# -------
@@ -22,6 +23,7 @@ spec:
{% if volume_storage_class %}
storageClassName: {{ volume_storage_class }}
{% endif %}
{% endif %}
# --------
# REDIS
@@ -51,8 +53,12 @@ spec:
path: redis.conf
- name: redis-data
{% if not no_pvc %}
persistentVolumeClaim:
claimName: {{ name }}
{% else %}
emptyDir: {}
{% endif %}
affinity:
nodeAffinity:

View File

@@ -277,11 +277,11 @@ crawler_memory_base: 1024Mi
# number of browser workers per crawler instances
crawler_browser_instances: 2
# number of browser workers per crawler instances for QA runs
# number of browser workers per QA pod to run for QA runs
# defaults to 'crawler_browser_instances' if not set
# qa_browser_instances: 2
qa_browser_instances: 1
# fixed scale (number of crawler pods) for QA runs
# fixed scale (number of QA pods) to run
qa_scale: 1
# this value is added to crawler_cpu_base, for each additional browser