feat: Apply saved workflow settings to current crawl (#2514)

Resolves https://github.com/webrecorder/browsertrix/issues/2366

## Changes

Allows users to update a currently running crawl with newly saved workflow settings.
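
For reference, a minimal sketch of how a client could exercise the new flag over the API (a sketch only: the endpoint path, field values, and `requests` usage are assumptions based on the existing crawlconfig update route, not part of this diff):

```python
import requests

API = "https://app.example.com/api"  # hypothetical deployment
oid, cid = "<org-id>", "<workflow-id>"  # placeholders
headers = {"Authorization": "Bearer <access-token>"}

# Save new settings while a crawl is running: runNow stays false, and
# updateRunning asks the backend to apply the changes to the live crawl.
resp = requests.patch(
    f"{API}/orgs/{oid}/crawlconfigs/{cid}",
    headers=headers,
    json={"scale": 2, "runNow": False, "updateRunning": True},
)
print(resp.json())  # e.g. {"updated": true, ..., "updatedRunning": true}
```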

## Manual testing

1. Log in as crawler
2. Start a crawl
3. Open the workflow editor. Verify that an "Update Crawl" button is shown
4. Click "Update Crawl". Verify that the running crawl is updated with the new settings

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
sua yoo authored on 2025-04-29 11:43:14 -07:00, committed by GitHub
commit 1fa43335c0 (parent c4a7ebce29)
6 changed files with 70 additions and 43 deletions


```diff
@@ -387,7 +387,7 @@ class CrawlConfigOps:
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
-    ) -> dict[str, bool | str]:
+    ) -> CrawlConfigUpdateResponse:
         # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""
@@ -455,11 +455,9 @@ class CrawlConfigOps:
         run_now = update.runNow

         if not changed and not metadata_changed and not run_now:
-            return {
-                "updated": True,
-                "settings_changed": changed,
-                "metadata_changed": metadata_changed,
-            }
+            return CrawlConfigUpdateResponse(
+                settings_changed=changed, metadata_changed=metadata_changed
+            )

         if changed:
             orig_dict = orig_crawl_config.dict(exclude_unset=True, exclude_none=True)
@@ -498,10 +496,11 @@ class CrawlConfigOps:
                 status_code=404, detail=f"Crawl Config '{cid}' not found"
             )

+        crawlconfig = CrawlConfig.from_dict(result)
+
         # update in crawl manager to change schedule
         if schedule_changed:
             try:
-                crawlconfig = CrawlConfig.from_dict(result)
                 await self.crawl_manager.update_scheduled_job(crawlconfig, str(user.id))
             except Exception as exc:
@@ -511,16 +510,24 @@ class CrawlConfigOps:
                 status_code=404, detail=f"Crawl Config '{cid}' not found"
             )

-        ret: dict[str, bool | str] = {
-            "updated": True,
-            "settings_changed": changed,
-            "metadata_changed": metadata_changed,
-            "storageQuotaReached": self.org_ops.storage_quota_reached(org),
-            "execMinutesQuotaReached": self.org_ops.exec_mins_quota_reached(org),
-        }
+        ret = CrawlConfigUpdateResponse(
+            settings_changed=changed,
+            metadata_changed=metadata_changed,
+            storageQuotaReached=self.org_ops.storage_quota_reached(org),
+            execMinutesQuotaReached=self.org_ops.exec_mins_quota_reached(org),
+        )

         if run_now:
             crawl_id = await self.run_now(cid, org, user)
-            ret["started"] = crawl_id
+            ret.started = crawl_id
+        elif update.updateRunning and changed:
+            running_crawl = await self.get_running_crawl(cid)
+            if running_crawl:
+                await self.crawl_manager.update_running_crawl_config(
+                    running_crawl.id, crawlconfig
+                )
+                ret.updatedRunning = True

         return ret

     async def update_usernames(self, userid: UUID, updated_name: str) -> None:
```
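
Note that `runNow` takes precedence over `updateRunning`: when both are set, a new crawl is started and the running-crawl branch is skipped. `updatedRunning` is reported only when the settings actually changed and a crawl is in fact running.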


```diff
@@ -249,6 +249,25 @@ class CrawlManager(K8sAPI):
             crawl_id, {"lastConfigUpdate": date_to_str(dt_now())}
         )

+    async def update_running_crawl_config(
+        self, crawl_id: str, crawlconfig: CrawlConfig
+    ):
+        """force update of config for running crawl"""
+        time_now = date_to_str(dt_now())
+        # pylint: disable=use-dict-literal
+        patch = dict(
+            crawlerChannel=crawlconfig.crawlerChannel,
+            scale=crawlconfig.scale,
+            timeout=crawlconfig.crawlTimeout,
+            maxCrawlSize=crawlconfig.maxCrawlSize,
+            proxyId=crawlconfig.proxyId or DEFAULT_PROXY_ID,
+            lastConfigUpdate=time_now,
+            restartTime=time_now,
+        )
+
+        return await self._patch_job(crawl_id, patch)
+
     async def create_qa_crawl_job(
         self,
         crawlconfig: CrawlConfig,
```
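
`_patch_job` itself is not shown in this diff; for orientation, a rough sketch of what such a merge-patch against the running crawl's CrawlJob custom resource could look like, assuming `kubernetes_asyncio` and illustrative CRD coordinates (the group, version, plural, and object naming scheme below are all assumptions):

```python
from kubernetes_asyncio import client, config


async def patch_crawljob(crawl_id: str, patch: dict, namespace: str = "crawlers"):
    """Apply spec fields to a running CrawlJob custom resource (sketch).

    Group/version/plural and the object name format are hypothetical;
    real code may also need to set the merge-patch content type explicitly.
    """
    await config.load_kube_config()
    api = client.CustomObjectsApi()
    return await api.patch_namespaced_custom_object(
        group="btrix.cloud",
        version="v1",
        namespace=namespace,
        plural="crawljobs",
        name=f"crawljob-{crawl_id}",
        body={"spec": patch},
    )
```

Setting `restartTime` in the patch is presumably what prompts the operator to roll the crawler pods so the updated settings take effect.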


```diff
@@ -506,6 +506,7 @@ class UpdateCrawlConfig(BaseModel):
     description: Optional[str] = None
     autoAddCollections: Optional[List[UUID]] = None
     runNow: bool = False
+    updateRunning: bool = False

     # crawl data: revision tracked
     schedule: Optional[str] = None
@@ -578,9 +579,10 @@ class CrawlConfigSearchValues(BaseModel):
 class CrawlConfigUpdateResponse(BaseModel):
     """Response model for updating crawlconfigs"""

-    updated: bool
+    updated: bool = True
     settings_changed: bool
     metadata_changed: bool
+    updatedRunning: bool = False

     storageQuotaReached: Optional[bool] = False
     execMinutesQuotaReached: Optional[bool] = False
```
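
With these defaults, a successful save that also updated a running crawl serializes to something like the following (values illustrative, shown as a Python dict; the `started` field, declared elsewhere on this model, is only set when `runNow` launches a new crawl):

```python
# Illustrative response body from the workflow update endpoint:
{
    "updated": True,
    "settings_changed": True,
    "metadata_changed": False,
    "updatedRunning": True,  # new in this PR
    "storageQuotaReached": False,
    "execMinutesQuotaReached": False,
}
```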


```diff
@@ -1305,8 +1305,7 @@ def test_custom_behavior_logs(
         if log["context"] == "behaviorScriptCustom":
             assert log["message"] in (
                 "test-stat",
-                "done!",
-                "Using Site-Specific Behavior: TestBehavior",
+                "In Test Behavior!",
             )
             if log["message"] in ("test-stat", "done!"):
                 assert log["details"]["behavior"] == "TestBehavior"
@@ -1314,7 +1313,7 @@ def test_custom_behavior_logs(
                 custom_log_line_count += 1

-    assert custom_log_line_count == 3
+    assert custom_log_line_count == 2

 def test_crawls_exclude_behavior_logs(
```


```diff
@@ -110,12 +110,12 @@ import {
   type WorkflowDefaults,
 } from "@/utils/workflow";

-type NewCrawlConfigParams = WorkflowParams & {
-  runNow: boolean;
+type CrawlConfigParams = WorkflowParams & {
   config: WorkflowParams["config"] & {
     seeds: Seed[];
   };
 };
+
+type WorkflowRunParams = { runNow: boolean; updateRunning?: boolean };

 const STEPS = SECTIONS;
 type StepName = (typeof STEPS)[number];
@@ -626,14 +626,14 @@ export class WorkflowEditor extends BtrixElement {
             type="button"
             ?disabled=${this.isSubmitting}
             ?loading=${this.isSubmitting}
-            @click=${this.save}
+            @click=${() => void this.save()}
           >
             ${msg("Save")}
           </sl-button>
         </sl-tooltip>
         <sl-tooltip
           content=${this.isCrawlRunning
-            ? msg("Crawl is already running")
+            ? msg("Save and apply settings to current crawl")
             : msg("Save and run with new settings")}
           ?disabled=${this.isCrawlRunning === null}
         >
@@ -641,13 +641,13 @@ export class WorkflowEditor extends BtrixElement {
             size="small"
             variant="primary"
             type="submit"
-            ?disabled=${isArchivingDisabled(this.org, true) ||
+            ?disabled=${(!this.isCrawlRunning &&
+              isArchivingDisabled(this.org, true)) ||
             this.isSubmitting ||
-            this.isCrawlRunning ||
             this.isCrawlRunning === null}
             ?loading=${this.isSubmitting || this.isCrawlRunning === null}
           >
-            ${msg(html`Run Crawl`)}
+            ${msg(this.isCrawlRunning ? "Update Crawl" : "Run Crawl")}
           </sl-button>
         </sl-tooltip>
       </footer>
@@ -2191,14 +2191,13 @@ https://archiveweb.page/images/${"logo.svg"}`}
   private async onSubmit(event: SubmitEvent) {
     event.preventDefault();

-    this.updateFormState({
-      runNow: true,
+    void this.save({
+      runNow: !this.isCrawlRunning,
+      updateRunning: Boolean(this.isCrawlRunning),
     });
-
-    void this.save();
   }

-  private async save() {
+  private async save(opts?: WorkflowRunParams) {
     if (!this.formElem) return;

     // TODO Move away from manual validation check
@@ -2235,7 +2234,15 @@ https://archiveweb.page/images/${"logo.svg"}`}
       return;
     }

-    const config = this.parseConfig();
+    const config: CrawlConfigParams & WorkflowRunParams = {
+      ...this.parseConfig(),
+      runNow: Boolean(opts?.runNow),
+    };
+
+    if (this.configId) {
+      config.updateRunning = Boolean(opts?.updateRunning);
+    }

     this.isSubmitting = true;

     try {
@@ -2412,15 +2419,14 @@ https://archiveweb.page/images/${"logo.svg"}`}
     }
   }

-  private parseConfig(): NewCrawlConfigParams {
-    const config: NewCrawlConfigParams = {
+  private parseConfig(): CrawlConfigParams {
+    const config: CrawlConfigParams = {
       // Job types are now merged into a single type
       jobType: "custom",
       name: this.formState.jobName || "",
       description: this.formState.description,
       scale: this.formState.scale,
       profileid: this.formState.browserProfile?.id || "",
-      runNow: this.formState.runNow,
       schedule: this.formState.scheduleType === "cron" ? this.utcSchedule : "",
       crawlTimeout: this.formState.crawlTimeoutMinutes * 60,
       maxCrawlSize: this.formState.maxCrawlSizeGB * BYTES_PER_GB,
@@ -2471,7 +2477,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
   }

   private parseUrlListConfig(): Pick<
-    NewCrawlConfigParams["config"],
+    CrawlConfigParams["config"],
     "seeds" | "scopeType" | "extraHops" | "useSitemap" | "failOnFailedSeed"
   > {
     const config = {
@@ -2489,7 +2495,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
   }

   private parseSeededConfig(): Pick<
-    NewCrawlConfigParams["config"],
+    CrawlConfigParams["config"],
     "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
```
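
Taken together, `onSubmit` now produces one of two payload shapes depending on whether a crawl is already running; expressed as plain data (a sketch, with `parsed_config` standing in for the output of `parseConfig()`):

```python
parsed_config = {"name": "my-workflow", "scale": 2}  # stand-in for parseConfig()

# No crawl running: save and start a new crawl.
payload_idle = {**parsed_config, "runNow": True}

# Crawl running: save, and apply the settings to the crawl in progress.
payload_running = {**parsed_config, "runNow": False, "updateRunning": True}
```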


```diff
@@ -81,7 +81,6 @@ export type FormState = {
     minute: number;
     period: "AM" | "PM";
   };
-  runNow: boolean;
   jobName: WorkflowParams["name"];
   browserProfile: Profile | null;
   tags: Tags;
@@ -139,7 +138,6 @@ export const getDefaultFormState = (): FormState => ({
     minute: 0,
     period: "AM",
   },
-  runNow: false,
   jobName: "",
   browserProfile: null,
   tags: [],
@@ -275,10 +273,6 @@ export function getInitialFormState(params: {
     lang: params.initialWorkflow.config.lang ?? defaultFormState.lang,
     scheduleType: defaultFormState.scheduleType,
     scheduleFrequency: defaultFormState.scheduleFrequency,
-    runNow:
-      params.org?.storageQuotaReached || params.org?.execMinutesQuotaReached
-        ? false
-        : defaultFormState.runNow,
     tags: params.initialWorkflow.tags,
     autoAddCollections: params.initialWorkflow.autoAddCollections,
     jobName: params.initialWorkflow.name || defaultFormState.jobName,
```