Add max crawl size option to backend and frontend (#1045)

Backend:
- add 'maxCrawlSize' to models and crawljob spec
- add 'MAX_CRAWL_SIZE' to configmap
- add maxCrawlSize to new crawlconfig + update APIs
- operator: gracefully stop crawl if current size (from stats) exceeds maxCrawlSize (see the sketch after this list)
- tests: add max crawl size tests
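A condensed sketch of the operator's new stop decision (the full is_crawl_stopping() helper appears in the operator diff further down; crawl is the CrawlSpec and size is the current crawl size in bytes from Redis stats):

    from datetime import datetime

    def is_crawl_stopping(crawl, size: int) -> bool:
        # user requested stop
        if crawl.stopping:
            return True
        # time limit (crawlTimeout) reached
        if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
            return True
        # size limit (maxCrawlSize) exceeded
        if crawl.max_crawl_size and size > crawl.max_crawl_size:
            return True
        return False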

Frontend:
- Add Max Crawl Size text box to the Limits tab
- Users enter the max crawl size in GB; the value is converted to bytes before saving (conversion sketched below)
- Add BYTES_PER_GB constant for converting GB to bytes
- docs: add Crawl Size Limit to the user guide workflow setup section
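The user-facing GB value maps to the bytes stored in maxCrawlSize via BYTES_PER_GB = 1073741824 (2**30, so the limit is effectively GiB). The frontend does this conversion inline in TypeScript; the helper names below are illustrative only, shown in Python for brevity:

    BYTES_PER_GB = 1_073_741_824  # matches the frontend constant (2**30)

    def gb_to_bytes(size_gb: int) -> int:
        # user-entered GB limit -> bytes stored in maxCrawlSize
        return size_gb * BYTES_PER_GB

    def bytes_to_gb(size_bytes: int) -> int:
        # stored bytes -> whole GB for display (floored, as in the UI)
        return size_bytes // BYTES_PER_GB

    assert gb_to_bytes(5) == 5_368_709_120  # a 5 GB limit is stored as 5368709120 bytes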

Operator Refactor:
- use 'status.stopping' instead of 'crawl.stopping' to indicate the crawl is being stopped, as changing it later has no effect in the operator
- add is_crawl_stopping() to determine whether the crawl should be stopped, based on crawl.stopping or the size or time limit being reached
- crawlerjob status: store the raw byte size under 'size' and the human-readable size under 'sizeHuman' for clarity (sketched below)
- the size stat always exists (defaults to 0), so remove the unneeded conditional
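A minimal sketch of the refactored size handling in update_crawl_state() (update_size_status is an illustrative name; the real code does this inline, using the humanize package already imported by the operator):

    import humanize

    def update_size_status(status, stats: dict) -> None:
        # raw byte count (the size stat always exists, defaulting to 0)
        status.size = stats["size"]
        # human-readable string for the CRD status column, e.g. "5.2 MB"
        status.sizeHuman = humanize.naturalsize(status.size)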

Charts:
- subchart: update crawlerjob crd in btrix-crds to show status.stopping instead of spec.stopping
- subchart: show 'sizeHuman' property instead of 'size'
- bump subchart version to 0.1.1

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2023-08-27 01:00:37 -04:00 committed by GitHub
parent 2da6c1c905
commit e667fe2e97
22 changed files with 225 additions and 36 deletions

View File

@ -200,6 +200,9 @@ class CrawlConfigOps:
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "maxCrawlSize")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)
@ -267,7 +270,7 @@ class CrawlConfigOps:
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
# update in crawl manager if config, schedule, scale or crawlTimeout changed
# update in crawl manager if config, schedule, scale, maxCrawlSize or crawlTimeout changed
if changed:
crawlconfig = CrawlConfig.from_dict(result)
try:

View File

@ -103,7 +103,8 @@ class CrawlManager(K8sAPI):
STORAGE_NAME=storage_name,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0)
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0),
MAX_CRAWL_SIZE=str(crawlconfig.maxCrawlSize or 0)
# REV=str(crawlconfig.rev),
)
@ -128,6 +129,7 @@ class CrawlManager(K8sAPI):
crawlconfig.oid,
crawlconfig.scale,
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
)
@ -137,6 +139,7 @@ class CrawlManager(K8sAPI):
has_sched_update = update.schedule is not None
has_scale_update = update.scale is not None
has_timeout_update = update.crawlTimeout is not None
has_max_crawl_size_update = update.maxCrawlSize is not None
has_config_update = update.config is not None
if has_sched_update:
@ -147,6 +150,7 @@ class CrawlManager(K8sAPI):
or has_config_update
or has_timeout_update
or profile_filename
or has_max_crawl_size_update
):
await self._update_config_map(
crawlconfig,
@ -409,6 +413,9 @@ class CrawlManager(K8sAPI):
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
if update.maxCrawlSize is not None:
config_map.data["MAX_CRAWL_SIZE"] = str(update.maxCrawlSize)
if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate

View File

@ -538,6 +538,7 @@ async def add_new_crawl(
profileid=crawlconfig.profileid,
schedule=crawlconfig.schedule,
crawlTimeout=crawlconfig.crawlTimeout,
maxCrawlSize=crawlconfig.maxCrawlSize,
manual=manual,
started=started,
tags=crawlconfig.tags,

View File

@ -71,7 +71,7 @@ class K8sAPI:
# pylint: disable=too-many-arguments
async def new_crawl_job(
self, cid, userid, oid, scale=1, crawl_timeout=0, manual=True
self, cid, userid, oid, scale=1, crawl_timeout=0, max_crawl_size=0, manual=True
):
"""load job template from yaml"""
if crawl_timeout:
@ -90,6 +90,7 @@ class K8sAPI:
"userid": userid,
"scale": scale,
"expire_time": crawl_expire_time,
"max_crawl_size": max_crawl_size,
"manual": "1" if manual else "0",
}

View File

@ -128,6 +128,7 @@ class CrawlConfigIn(BaseModel):
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
crawlFilenameTemplate: Optional[str]
@ -146,6 +147,7 @@ class ConfigRevision(BaseMongoModel):
profileid: Optional[UUID4]
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
modified: datetime
@ -166,6 +168,7 @@ class CrawlConfigCore(BaseMongoModel):
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
oid: UUID4
@ -250,6 +253,7 @@ class UpdateCrawlConfig(BaseModel):
schedule: Optional[str]
profileid: Optional[str]
crawlTimeout: Optional[int]
maxCrawlSize: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
crawlFilenameTemplate: Optional[str]
config: Optional[RawCrawlConfig]

View File

@ -88,6 +88,7 @@ class CrawlSpec(BaseModel):
started: str
stopping: bool = False
expire_time: Optional[datetime] = None
max_crawl_size: Optional[int] = None
# ============================================================================
@ -97,7 +98,9 @@ class CrawlStatus(BaseModel):
state: str = "starting"
pagesFound: int = 0
pagesDone: int = 0
size: str = ""
size: int = 0
# human readable size string
sizeHuman: str = ""
scale: int = 1
filesAdded: int = 0
filesAddedSize: int = 0
@ -110,12 +113,11 @@ class CrawlStatus(BaseModel):
# ============================================================================
# pylint: disable=too-many-statements
# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches
# pylint: disable=too-many-instance-attributes,too-many-locals
class BtrixOperator(K8sAPI):
"""BtrixOperator Handler"""
# pylint: disable=too-many-instance-attributes,too-many-locals
def __init__(self):
super().__init__()
self.config_file = "/config/config.yaml"
@ -209,6 +211,7 @@ class BtrixOperator(K8sAPI):
started=data.parent["metadata"]["creationTimestamp"],
stopping=spec.get("stopping", False),
expire_time=from_k8s_date(spec.get("expireTime")),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
)
if status.state in ("starting", "waiting_org_limit"):
@ -226,7 +229,7 @@ class BtrixOperator(K8sAPI):
if has_crawl_children:
pods = data.related[POD]
status = await self.sync_crawl_state(redis_url, crawl, status, pods)
if crawl.stopping:
if status.stopping:
await self.check_if_finished(crawl, status)
if status.finished:
@ -618,31 +621,42 @@ class BtrixOperator(K8sAPI):
return True
# pylint: disable=too-many-branches
def is_crawl_stopping(self, crawl, size):
"""return true if crawl should begin graceful stopping phase"""
# if user requested stop, then enter stopping phase
if crawl.stopping:
print("Graceful Stop: User requested stop")
return True
# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
print(f"Graceful Stop: Job duration expired at {crawl.expire_time}")
return True
if crawl.max_crawl_size and size > crawl.max_crawl_size:
print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
return True
return False
async def update_crawl_state(self, redis, crawl, status):
"""update crawl state and check if crawl is now done"""
results = await redis.hvals(f"{crawl.id}:status")
stats = await get_redis_crawl_stats(redis, crawl.id)
# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
crawl.stopping = True
print(
"Job duration expired at {crawl.expire_time}, "
+ "gracefully stopping crawl"
)
if crawl.stopping:
print("Graceful Stop")
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")
# update status
status.pagesDone = stats["done"]
status.pagesFound = stats["found"]
if stats["size"] is not None:
status.size = humanize.naturalsize(stats["size"])
status.size = stats["size"]
status.sizeHuman = humanize.naturalsize(status.size)
status.stopping = self.is_crawl_stopping(crawl, status.size)
if status.stopping:
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")
# check if done / failed
status_count = {}
@ -669,7 +683,7 @@ class BtrixOperator(K8sAPI):
# check if all crawlers failed
elif status_count.get("failed", 0) >= crawl.scale:
# if stopping, and no pages finished, mark as canceled
if crawl.stopping and not status.pagesDone:
if status.stopping and not status.pagesDone:
state = "canceled"
else:
state = "failed"

View File

@ -18,6 +18,7 @@ spec:
cid: "{{ cid }}"
oid: "{{ oid }}"
scale: {{ scale }}
maxCrawlSize: {{ max_crawl_size }}
ttlSecondsAfterFinished: 30
{% if expire_time %}

View File

@ -196,6 +196,30 @@ def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl
assert data["crawlTimeout"] == 60
def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating max crawl size works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"maxCrawlSize": 4096},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["maxCrawlSize"] == 4096
def test_verify_delete_tags(crawler_auth_headers, default_org_id):
# Verify that deleting tags and name works as well
r = requests.patch(
@ -224,9 +248,9 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
assert data["total"] == 3
items = data["items"]
assert len(items) == 2
assert len(items) == 3
sorted_data = sorted(items, key=lambda revision: revision["rev"])
assert sorted_data[0]["config"]["scopeType"] == "prefix"

View File

@ -227,6 +227,30 @@ def timeout_crawl(admin_auth_headers, default_org_id):
return data["run_now_job"]
@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
# Start crawl
crawl_data = {
"runNow": True,
"name": "Crawl with 5 MB max crawl size limit",
# Note crawl will exceed this size, as crawl begins to gracefully
# shut down when operator notices this value has been exceeded.
"maxCrawlSize": 5242880,
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"scopeType": "domain",
"limit": 100,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
return data["run_now_job"]
@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {

View File

@ -0,0 +1,33 @@
import requests
import time
from .conftest import API_PREFIX
def test_max_crawl_size(admin_auth_headers, default_org_id, max_crawl_size_crawl_id):
# Verify that crawl has started
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] in (
"starting",
"running",
"generate-wacz",
"uploading-wacz",
"pending-wait",
)
# Wait some time to let crawl start, hit max size limit, and gracefully stop
time.sleep(240)
# Verify crawl was stopped
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] == "partial_complete"

View File

@ -4,9 +4,9 @@ dependencies:
version: 0.1.0
- name: btrix-crds
repository: file://./btrix-crds
version: 0.1.0
version: 0.1.1
- name: metacontroller-helm
repository: oci://ghcr.io/metacontroller
version: v4.10.1
digest: sha256:e40073e42a13c1765a9ddaf91c5cd93ccc4804bedf6954e3ba1ade8fb26cca7c
generated: "2023-04-22T01:08:16.572747-07:00"
digest: sha256:4b95cff1974baeaec17b87a0f2c41787bf58437d6235dbacc121195390f4910e
generated: "2023-08-24T21:19:12.893947-07:00"

View File

@ -20,7 +20,7 @@ dependencies:
condition: addons.admin.logging
repository: file://./admin/logging
- name: btrix-crds
version: 0.1.0
version: 0.1.1
repository: file://./btrix-crds
- name: metacontroller-helm
version: v4.10.1

View File

@ -7,9 +7,9 @@ icon: https://webrecorder.net/assets/icon.png
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
appVersion: 0.1.0
appVersion: 0.1.1

View File

@ -53,7 +53,7 @@ spec:
- name: Size
type: string
jsonPath: .status.size
jsonPath: .status.sizeHuman
description: Crawl Size
- name: Time Started
@ -68,7 +68,7 @@ spec:
- name: Stopping
type: boolean
jsonPath: .spec.stopping
jsonPath: .status.stopping
description: "if set, crawl is being stopped"
- name: Files Added

Binary file not shown.

Binary file not shown.

View File

@ -116,6 +116,10 @@ Adds a hard limit on the number of pages that will be crawled. The crawl will be
The crawl will be gracefully stopped after this set period of time.
### Crawl Size Limit
The crawl will be gracefully stopped after reaching this set size in GB.
### Crawler Instances
Increasing the amount of crawler instances will speed up crawls by using additional browser windows to capture more pages in parallel. This will also increase the amount of traffic sent to the website and may result in a higher chance of getting rate limited.

View File

@ -96,6 +96,41 @@ export class ConfigDetails extends LiteElement {
}
};
const renderSize = (valueBytes?: number | null, fallbackValue?: number) => {
const bytesPerGB = 1073741824;
// Eventually we will want to set this to the selected locale
const formatter = new Intl.NumberFormat(undefined, {
style: "unit",
unit: "gigabyte",
unitDisplay: "narrow",
});
if (valueBytes) {
const sizeGB = Math.floor(valueBytes / bytesPerGB);
return formatter.format(sizeGB);
}
if (typeof fallbackValue === "number") {
let value = "";
if (fallbackValue === Infinity) {
value = msg("Unlimited");
} else if (fallbackValue === 0) {
value = formatter.format(0);
} else {
const sizeGB = Math.floor(fallbackValue / bytesPerGB);
value = formatter.format(sizeGB);
}
return html`<span class="text-neutral-400"
>${value} ${msg("(default)")}</span
>`;
}
return html`<span class="text-neutral-400"
>${msg("Unlimited")} ${msg("(default)")}</span
>`;
};
return html`
<section id="crawler-settings" class="mb-8">
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
@ -168,6 +203,10 @@ export class ConfigDetails extends LiteElement {
msg("Crawl Time Limit"),
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
)}
${this.renderSetting(
msg("Crawl Size Limit"),
renderSize(crawlConfig?.maxCrawlSize, Infinity)
)}
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
</btrix-desc-list>
</section>

View File

@ -82,6 +82,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
maxCrawlSizeGB: number | null;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
exclusions: WorkflowParams["config"]["exclude"];
@ -153,6 +154,7 @@ const getDefaultFormState = (): FormState => ({
useSitemap: true,
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
maxCrawlSizeGB: 0,
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
@ -213,6 +215,7 @@ const DEFAULT_BEHAVIORS = [
"autofetch",
"siteSpecific",
];
const BYTES_PER_GB = 1073741824;
@localized()
export class CrawlConfigEditor extends LiteElement {
@ -490,6 +493,12 @@ export class CrawlConfigEditor extends LiteElement {
return fallback;
};
const bytesToGB = (value: any, fallback: number | null) => {
if (typeof value === "number" && value > 0)
return Math.floor(value / BYTES_PER_GB);
return fallback;
};
return {
primarySeedUrl: defaultFormState.primarySeedUrl,
urlList: defaultFormState.urlList,
@ -498,6 +507,10 @@ export class CrawlConfigEditor extends LiteElement {
this.initialWorkflow.crawlTimeout,
defaultFormState.crawlTimeoutMinutes
),
maxCrawlSizeGB: bytesToGB(
this.initialWorkflow.maxCrawlSize,
defaultFormState.maxCrawlSizeGB
),
behaviorTimeoutSeconds:
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
pageLoadTimeoutSeconds:
@ -1314,6 +1327,22 @@ https://archiveweb.page/images/${"logo.svg"}`}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified time limit.`)
)}
${this.renderFormCol(html`
<sl-input
name="maxCrawlSizeGB"
label=${msg("Crawl Size Limit")}
value=${this.formState.maxCrawlSizeGB || ""}
placeholder=${msg("Default: Unlimited")}
min="0"
type="number"
inputmode="numeric"
>
<span slot="suffix">${msg("GB")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified size limit.`)
)}
${this.renderFormCol(html`
<sl-radio-group
name="scale"
@ -2109,6 +2138,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
crawlTimeout: this.formState.crawlTimeoutMinutes
? this.formState.crawlTimeoutMinutes * 60
: null,
maxCrawlSize: this.formState.maxCrawlSizeGB
? this.formState.maxCrawlSizeGB * BYTES_PER_GB
: null,
tags: this.formState.tags,
autoAddCollections: this.formState.autoAddCollections,
config: {

View File

@ -27,6 +27,7 @@ const defaultValue = {
},
tags: [],
crawlTimeout: null,
maxCrawlSize: null,
jobType: undefined,
scale: 1,
autoAddCollections: [],

View File

@ -44,6 +44,7 @@ export type WorkflowParams = {
config: SeedConfig;
tags: string[];
crawlTimeout: number | null;
maxCrawlSize: number | null;
description: string | null;
autoAddCollections: string[];
};