From ef93c5ad9079a87d1f60eed736a102ef5366c549 Mon Sep 17 00:00:00 2001 From: sua yoo Date: Tue, 20 May 2025 12:19:09 -0700 Subject: [PATCH] docs: Document latest crawl (#2613) Follows https://github.com/webrecorder/browsertrix/issues/2603 ## Changes - Updates documentation on "Latest Crawl" tab - Fixes extra fetch in workflow detail page - Reverts workflow detail labels from "Duration" back to "Run Duration" and "Pages" back to "Pages Crawled" --- frontend/docs/docs/user-guide/crawl-workflows.md | 2 +- frontend/docs/docs/user-guide/overview.md | 2 +- frontend/docs/docs/user-guide/running-crawl.md | 10 +++++++--- frontend/src/pages/org/workflow-detail.ts | 13 +++++++------ 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/frontend/docs/docs/user-guide/crawl-workflows.md b/frontend/docs/docs/user-guide/crawl-workflows.md index ecf039c0..d697d1b5 100644 --- a/frontend/docs/docs/user-guide/crawl-workflows.md +++ b/frontend/docs/docs/user-guide/crawl-workflows.md @@ -32,7 +32,7 @@ After deciding what type of crawl you'd like to run, you can begin to set up you Run a crawl workflow by clicking _Run Crawl_ in the actions menu of the workflow in the crawl workflow list, or by clicking the _Run Crawl_ button on the workflow's details page. -While crawling, the **Watch Crawl** section displays a list of queued URLs that will be visited, and streams the current state of the browser windows as they visit pages from the queue. You can [modify the crawl live](./running-crawl.md) by adding URL exclusions or changing the number of crawling instances. +While crawling, the **Latest Crawl** section streams the current state of the browser windows as they visit pages. You can [modify the crawl live](./running-crawl.md) by adding URL exclusions or changing the number of crawling instances. Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#crawl-scope-options). 
diff --git a/frontend/docs/docs/user-guide/overview.md b/frontend/docs/docs/user-guide/overview.md index 63b55e73..3f186d29 100644 --- a/frontend/docs/docs/user-guide/overview.md +++ b/frontend/docs/docs/user-guide/overview.md @@ -21,7 +21,7 @@ The crawling panel lists the number of currently running and waiting crawls, as For organizations with a set execution minute limit, the crawling panel displays a graph of how much execution time has been used and how much is currently remaining. Monthly execution time limits reset on the first of each month at 12:00 AM GMT. ??? Question "How is execution time calculated?" - Execution time is the total runtime of scaled by the [_Browser Windows_](workflow-setup.md/#browser-windows) setting increment value during a crawl. Like elapsed time, this is tracked as the crawl runs so changing the amount of _Browser Windows_ while a crawl is running may change the amount of execution time used in a given time period. + Execution time is the total runtime of a crawl scaled by the [_Browser Windows_](workflow-setup.md/#browser-windows) value during a crawl. Like elapsed time, this is tracked while the crawl runs. Changing the amount of _Browser Windows_ while a crawl is running may change the amount of execution time used in a given time period. ## Collections diff --git a/frontend/docs/docs/user-guide/running-crawl.md b/frontend/docs/docs/user-guide/running-crawl.md index 6c539654..68583be6 100644 --- a/frontend/docs/docs/user-guide/running-crawl.md +++ b/frontend/docs/docs/user-guide/running-crawl.md @@ -1,6 +1,6 @@ # Modifying Running Crawls -Running crawls can be modified from the crawl workflow **Watch Crawl** tab. You may want to modify a runnning crawl if you find that the workflow is crawling pages that you didn't intend to archive, or if you want a boost of speed. +Running crawls can be modified from the crawl workflow **Latest Crawl** tab. 
You may want to modify a running crawl if you find that the workflow is crawling pages that you didn't intend to archive, or if you want a boost of speed. ## Crawl Workflow Status @@ -15,17 +15,21 @@ A crawl workflow that is in progress can be in one of the following states: | :btrix-status-dot: Finishing Crawl | The workflow has finished crawling and data is being packaged into WACZ files.| | :btrix-status-dot: Uploading WACZ | WACZ files have been created and are being transferred to storage.| +## Watch Crawl + +You can watch the current state of the browser windows as the crawler visits pages in the **Watch** tab of **Latest Crawl**. A list of queued URLs is displayed below in the **Upcoming Pages** section. + ## Live Exclusion Editing While [exclusions](workflow-setup.md#exclude-pages) can be set before running a crawl workflow, sometimes while crawling the crawler may find new parts of the site that weren't previously known about and shouldn't be crawled, or get stuck browsing parts of a website that automatically generate URLs known as ["crawler traps"](https://en.wikipedia.org/wiki/Spider_trap). -If the crawl queue is filled with URLs that should not be crawled, use the _Edit Exclusions_ button on the Watch Crawl page to instruct the crawler what pages should be excluded from the queue. +If the crawl queue is filled with URLs that should not be crawled, use the _Edit Exclusions_ button in the **Watch** tab to instruct the crawler what pages should be excluded from the queue. Exclusions added while crawling are applied to the same exclusion table saved in the workflow's settings and will be used the next time the crawl workflow is run unless they are manually removed. ## Changing the Number of Browser Windows -Like exclusions, the number of [browser windows](workflow-setup.md#browser-windows) can also be adjusted while crawling. On the **Watch Crawl** tab, press the _Edit Browser Windows_ button, and set the desired value. 
+Like exclusions, the number of [browser windows](workflow-setup.md#browser-windows) can also be adjusted while crawling. On the **Watch** tab, press the **+/-** button next to the _Running in_ N _browser windows_ text and set the desired value. Unlike exclusions, this change will not be applied to future workflow runs. diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 6416ef97..cfb193c2 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -162,13 +162,14 @@ export class WorkflowDetail extends BtrixElement { ) { void this.fetchWorkflow(); void this.fetchSeeds(); + void this.fetchCrawls(); + } else if (changedProperties.has("workflowTab")) { + void this.fetchDataForTab(); } + if (changedProperties.has("isEditing") && this.isEditing) { this.stopPoll(); } - if (changedProperties.has("workflowTab")) { - void this.fetchDataForTab(); - } } private async fetchDataForTab() { @@ -829,7 +830,7 @@ export class WorkflowDetail extends BtrixElement { class="underline hover:no-underline" @click=${this.navigate.link} > - ${msg("Watch Running Crawl")} + ${msg("Watch Crawl")} `, @@ -1071,7 +1072,7 @@ export class WorkflowDetail extends BtrixElement { return html` - ${this.renderDetailItem(msg("Duration"), (workflow) => + ${this.renderDetailItem(msg("Run Duration"), (workflow) => this.lastCrawlStartTime ? this.localize.humanizeDuration( (workflow.lastCrawlTime && !workflow.isCrawlRunning @@ -1081,7 +1082,7 @@ export class WorkflowDetail extends BtrixElement { ) : skeleton, )} - ${this.renderDetailItem(msg("Pages"), pages)} + ${this.renderDetailItem(msg("Pages Crawled"), pages)} ${this.renderDetailItem(msg("Size"), (workflow) => this.localize.bytes(workflow.lastCrawlSize || 0, { unitDisplay: "narrow",