This PR adds a new checkbox to both page and seed crawl workflow types, which will fail the crawl if behaviors detect that the browser is not logged in for supported sites.

Changes include:
- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- A checkbox in the workflow editor and config details in the frontend (currently in the Scope section - I think it makes sense to have this option up front, but it is worth considering)
- User Guide documentation of the new option
- A new nightly test for the new workflow option and `failed_not_logged_in` state

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
435 lines
13 KiB
TypeScript
435 lines
13 KiB
TypeScript
/**
|
|
* TODO Move to utils/crawl-configs/
|
|
*/
|
|
import { msg, str } from "@lit/localize";
|
|
import { z } from "zod";
|
|
|
|
import { getAppSettings, type AppSettings } from "./app";
|
|
|
|
import type { Tags } from "@/components/ui/tag-input";
|
|
import type { UserGuideEventMap } from "@/index";
|
|
import {
|
|
Behavior,
|
|
ScopeType,
|
|
type Profile,
|
|
type Seed,
|
|
type SeedConfig,
|
|
type WorkflowParams,
|
|
} from "@/types/crawler";
|
|
import type { OrgData } from "@/types/org";
|
|
import {
|
|
WorkflowScopeType,
|
|
type NewWorkflowOnlyScopeType,
|
|
} from "@/types/workflow";
|
|
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
|
|
import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
|
|
import { getNextDate, getScheduleInterval } from "@/utils/cron";
|
|
import localize, { getDefaultLang } from "@/utils/localize";
|
|
|
|
/** Number of bytes in one (decimal) gigabyte; used to convert crawl size limits. */
export const BYTES_PER_GB = 1_000_000_000;

/** Default value for the form's `selectLinks` field. */
export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];

/** Default value for the form's `clickSelector` field. */
export const DEFAULT_AUTOCLICK_SELECTOR = "a";

/** File extension (without the dot) appended by `defaultSeedListFileName`. */
export const SEED_LIST_FILE_EXT = "txt";

/** 500,000 bytes. Presumably the size cap for a seed list entered as text; verify against callers. */
export const MAX_SEED_LIST_STRING_BYTES = 500 * 1_000;

/** 25,000,000 bytes. Presumably the size cap for an uploaded seed list file; verify against callers. */
export const MAX_SEED_LIST_FILE_BYTES = 25 * 1_000_000;
|
|
|
|
/**
 * Ordered list of workflow section identifiers.
 *
 * Each identifier has a matching user guide anchor in
 * `workflowTabToGuideHash`.
 */
export const SECTIONS = [
  "scope",
  "limits",
  "behaviors",
  "browserSettings",
  "scheduling",
  "metadata",
] as const;
|
|
/** Zod enum schema built from the values in `SECTIONS`. */
export const sectionsEnum = z.enum(SECTIONS);

/** Union of section identifier strings, inferred from `sectionsEnum`. */
export type SectionsEnum = z.infer<typeof sectionsEnum>;
|
|
|
|
/**
 * URL fragment identifiers within the `workflow-setup/` user guide page.
 *
 * `makeUserGuideEvent` appends one of these after `workflow-setup/#`.
 */
export enum GuideHash {
  Scope = "scope",
  Limits = "crawl-limits",
  Behaviors = "page-behavior",
  BrowserSettings = "browser-settings",
  Scheduling = "scheduling",
  Metadata = "metadata",
}
|
|
|
|
/**
 * Values for `FormState.seedListFormat`.
 *
 * `JSON` is the form default; `getInitialFormState` selects `File` when the
 * workflow config carries a `seedFileId`.
 */
export enum SeedListFormat {
  JSON = "json",
  File = "file",
}
|
|
|
|
/**
 * Lookup from a workflow section identifier to the user guide anchor that
 * documents it. The `Record` type requires an entry for every section.
 */
export const workflowTabToGuideHash: Record<SectionsEnum, GuideHash> = {
  scope: GuideHash.Scope,
  limits: GuideHash.Limits,
  behaviors: GuideHash.Behaviors,
  browserSettings: GuideHash.BrowserSettings,
  scheduling: GuideHash.Scheduling,
  metadata: GuideHash.Metadata,
};
|
|
|
|
/**
 * Create the `btrix-user-guide-show` event for a workflow section.
 *
 * @param section - Section whose user guide anchor should be shown.
 * @returns A bubbling, composed `CustomEvent` whose `detail.path` is
 *   `workflow-setup/#<anchor>`.
 */
export function makeUserGuideEvent(
  section: SectionsEnum,
): UserGuideEventMap["btrix-user-guide-show"] {
  type Detail = UserGuideEventMap["btrix-user-guide-show"]["detail"];

  // The lookup is total according to its type, but an unexpected runtime value
  // would yield `undefined`; fall back to the Scope anchor in that case.
  const mappedHash = workflowTabToGuideHash[section] as GuideHash | undefined;
  const anchor = mappedHash || GuideHash.Scope;

  const detail: Detail = {
    path: `workflow-setup/#${anchor}`,
  };

  return new CustomEvent<Detail>("btrix-user-guide-show", {
    detail,
    bubbles: true,
    composed: true,
  });
}
|
|
|
|
/**
 * Build the localized "Default: …" label for a default value.
 *
 * @param value - The default value to describe.
 * @returns "Default: Unlimited" for `Infinity`, a label with the
 *   locale-formatted number for any other number, a label with the
 *   interpolated value for any other truthy value, and an empty string for
 *   falsy non-number values.
 */
export function defaultLabel(value: unknown): string {
  if (typeof value === "number") {
    // Infinity is itself a number, so it is handled inside this branch.
    return value === Infinity
      ? msg("Default: Unlimited")
      : msg(str`Default: ${localize.number(value)}`);
  }

  return value ? msg(str`Default: ${value}`) : "";
}
|
|
|
|
/**
 * Generate a seed list file name stamped with the current UTC date and time.
 *
 * @returns `URL-List-<digits>.<SEED_LIST_FILE_EXT>`, where `<digits>` is the
 *   ISO timestamp up to (not including) the fractional seconds with every
 *   non-digit character removed.
 */
export function defaultSeedListFileName() {
  const isoTimestamp = new Date().toISOString();
  // Keep only the part before the fractional seconds.
  const [dateTime] = isoTimestamp.split(".");
  const digitsOnly = dateTime.replace(/[^0-9]/g, "");

  return `URL-List-${digitsOnly}.${SEED_LIST_FILE_EXT}`;
}
|
|
|
|
/**
 * Flat state object backing the workflow form.
 *
 * `getDefaultFormState` provides a value for every field, and
 * `getInitialFormState` derives the values from an existing workflow.
 */
export type FormState = {
  // --- Seeds & scope ---
  primarySeedUrl: string;
  /** URLs joined with newlines (see `getInitialFormState`). */
  urlList: string;
  seedListFormat: SeedListFormat;
  seedFileId: string | null;
  seedFile: File | null;
  /** Derived from whether the config specifies `extraHops`. */
  includeLinkedPages: boolean;
  useSitemap: boolean;
  failOnFailedSeed: boolean;
  failOnContentCheck: boolean;
  /** Unescaped `include` entries joined with newlines. */
  customIncludeUrlList: string;

  // --- Limits & timing ---
  /** Stored on the workflow in seconds (`crawlTimeout`); `0` is the default. */
  crawlTimeoutMinutes: number;
  behaviorTimeoutSeconds: number | null;
  pageLoadTimeoutSeconds: number | null;
  pageExtraDelaySeconds: number | null;
  postLoadDelaySeconds: number | null;
  /** Stored on the workflow in bytes (`maxCrawlSize`); `0` is the default. */
  maxCrawlSizeGB: number;
  maxScopeDepth: number | null;
  scopeType:
    | Exclude<ScopeType, ScopeType.Any>
    | (typeof NewWorkflowOnlyScopeType)[keyof typeof NewWorkflowOnlyScopeType];
  exclusions: WorkflowParams["config"]["exclude"];
  pageLimit: WorkflowParams["config"]["limit"];

  // --- Browser ---
  browserWindows: WorkflowParams["browserWindows"];
  blockAds: WorkflowParams["config"]["blockAds"];
  lang: WorkflowParams["config"]["lang"];

  // --- Scheduling ---
  scheduleType: "date" | "cron" | "none";
  scheduleFrequency: "daily" | "weekly" | "monthly" | "";
  scheduleDayOfMonth?: number;
  scheduleDayOfWeek?: number;
  /** 12-hour clock time. */
  scheduleTime?: {
    hour: number;
    minute: number;
    period: "AM" | "PM";
  };

  // --- Metadata ---
  jobName: WorkflowParams["name"];
  browserProfile: Profile | null;
  tags: Tags;
  autoAddCollections: string[];
  description: WorkflowParams["description"];

  // --- Behaviors ---
  autoscrollBehavior: boolean;
  autoclickBehavior: boolean;
  /** True when the workflow config lists any `customBehaviors`. */
  customBehavior: boolean;

  // --- Crawler settings ---
  userAgent: string | null;
  crawlerChannel: string;
  proxyId: string | null;
  selectLinks: string[];
  clickSelector: string;
};
|
|
|
|
/** Name of any field in `FormState`. */
export type FormStateField = keyof FormState;
|
|
|
|
/**
 * Default limits applied to workflows.
 *
 * The optional fields are only filled in by `getServerDefaults` when the app
 * settings report a value greater than zero.
 */
export type WorkflowDefaults = {
  behaviorTimeoutSeconds?: number;
  pageLoadTimeoutSeconds?: number;
  maxPagesPerCrawl?: number;
  maxBrowserWindows: number;
};
|
|
|
|
/**
 * Client-side defaults; `getServerDefaults` starts from a copy of this object
 * before applying values from the app settings.
 */
export const appDefaults: WorkflowDefaults = {
  maxBrowserWindows: DEFAULT_MAX_SCALE,
};
|
|
|
|
/**
 * Create a fresh form state populated with default values.
 *
 * A new object is built on every call so callers can mutate the result
 * freely. The schedule day fields are taken from the current local date.
 *
 * @returns A complete `FormState` with every field set.
 */
export const getDefaultFormState = (): FormState => {
  // Read the clock once so day-of-month and day-of-week always describe the
  // same instant, even when called right at midnight.
  const now = new Date();

  return {
    primarySeedUrl: "",
    urlList: "",
    seedListFormat: SeedListFormat.JSON,
    seedFileId: null,
    seedFile: null,
    includeLinkedPages: false,
    useSitemap: false,
    failOnFailedSeed: false,
    failOnContentCheck: false,
    customIncludeUrlList: "",
    crawlTimeoutMinutes: 0,
    maxCrawlSizeGB: 0,
    behaviorTimeoutSeconds: null,
    pageLoadTimeoutSeconds: null,
    pageExtraDelaySeconds: null,
    postLoadDelaySeconds: null,
    maxScopeDepth: null,
    scopeType: ScopeType.Page,
    exclusions: [],
    pageLimit: null,
    browserWindows: 2,
    blockAds: true,
    lang: getDefaultLang(),
    scheduleType: "none",
    scheduleFrequency: "weekly",
    scheduleDayOfMonth: now.getDate(),
    scheduleDayOfWeek: now.getDay(),
    // 12:00 AM
    scheduleTime: {
      hour: 12,
      minute: 0,
      period: "AM",
    },
    jobName: "",
    browserProfile: null,
    tags: [],
    autoAddCollections: [],
    description: null,
    autoscrollBehavior: true,
    autoclickBehavior: false,
    userAgent: null,
    crawlerChannel: "default",
    proxyId: null,
    // Copy the shared module-level array so an in-place edit of one form
    // state's list cannot alter the defaults seen by later callers.
    selectLinks: [...DEFAULT_SELECT_LINKS],
    clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
    customBehavior: false,
  };
};
|
|
|
|
/**
 * Reduce a list of seeds to their URLs.
 *
 * @param arr - Seeds, each either a URL string or an object with a `url`.
 * @returns The URL of every seed, in the same order.
 */
export const mapSeedToUrl = (arr: Seed[]) =>
  arr.map((entry) => {
    if (typeof entry === "string") return entry;
    return entry.url;
  });
|
|
|
|
/**
 * Derive the initial form state for the workflow form.
 *
 * Without `initialWorkflow` the plain defaults are returned. Otherwise the
 * result is layered, later layers winning:
 *   1. `getDefaultFormState()`
 *   2. values read directly from the workflow and its config
 *   3. scope- and schedule-specific overrides collected in `formState`
 *
 * @param params.initialSeeds - Seeds of the workflow. For non-page scopes the
 *   first seed is the primary seed and the rest populate `urlList`; for page
 *   scopes all seeds populate `urlList`.
 * @param params.initialWorkflow - Existing workflow to read values from.
 * @param params.configId - Not read by this function.
 * @param params.org - Not read by this function.
 * @returns A complete `FormState`.
 */
export function getInitialFormState(params: {
  configId?: string;
  initialSeeds?: Seed[];
  initialWorkflow?: WorkflowParams;
  org?: OrgData | null;
}): FormState {
  const defaultFormState = getDefaultFormState();
  const workflow = params.initialWorkflow;
  if (!workflow) return defaultFormState;

  // Overrides applied last. Only assign defined values here: the object is
  // spread over the result, and an own property holding `undefined` would
  // replace an already-resolved default.
  const formState: Partial<FormState> = {};
  const seedsConfig = workflow.config;
  let primarySeedConfig: SeedConfig | Seed = seedsConfig;

  if (!isPageScopeType(seedsConfig.scopeType)) {
    // An empty seed list yields `undefined` here and is skipped instead of
    // being dereferenced.
    const firstSeed = params.initialSeeds?.[0];
    if (typeof firstSeed === "string") {
      formState.primarySeedUrl = firstSeed;
    } else if (firstSeed) {
      primarySeedConfig = firstSeed;
      formState.primarySeedUrl = primarySeedConfig.url;
    }

    if (primarySeedConfig.include?.length) {
      formState.customIncludeUrlList = primarySeedConfig.include
        // Unescape regex
        .map(unescapeCustomPrefix)
        .join("\n");
      // if we have additional include URLs, set to "custom" scope here
      // to indicate 'Custom Page Prefix' option
      formState.scopeType = ScopeType.Custom;
    }

    const additionalSeeds = params.initialSeeds?.slice(1);
    if (additionalSeeds?.length) {
      formState.urlList = mapSeedToUrl(additionalSeeds).join("\n");
    }
  } else {
    if (seedsConfig.seedFileId) {
      formState.seedFileId = seedsConfig.seedFileId;
      formState.scopeType = WorkflowScopeType.PageList;
      formState.seedListFormat = SeedListFormat.File;
    } else if (params.initialSeeds?.length) {
      formState.scopeType =
        params.initialSeeds.length === 1
          ? WorkflowScopeType.Page
          : WorkflowScopeType.PageList;
      formState.urlList = mapSeedToUrl(params.initialSeeds).join("\n");
    }
  }

  if (workflow.schedule) {
    formState.scheduleType = "cron";
    formState.scheduleFrequency = getScheduleInterval(workflow.schedule);

    // Without a computable next run date, keep the default day/time fields.
    const nextDate = getNextDate(workflow.schedule);
    if (nextDate) {
      formState.scheduleDayOfMonth = nextDate.getDate();
      formState.scheduleDayOfWeek = nextDate.getDay();
      const hours = nextDate.getHours();
      formState.scheduleTime = {
        // Convert 24-hour to 12-hour clock; 0 and 12 both display as 12.
        hour: hours % 12 || 12,
        minute: nextDate.getMinutes(),
        period: hours > 11 ? "PM" : "AM",
      };
    }
  } else {
    formState.scheduleType = "none";
  }

  // Positive number of seconds -> minutes; anything else -> fallback.
  const secondsToMinutes = (value: unknown, fallback = 0) => {
    if (typeof value === "number" && value > 0) return value / 60;
    return fallback;
  };

  // Positive number of bytes -> whole gigabytes (rounded down); anything
  // else -> fallback.
  const bytesToGB = (value: unknown, fallback = 0) => {
    if (typeof value === "number" && value > 0)
      return Math.floor(value / BYTES_PER_GB);
    return fallback;
  };

  const enableCustomBehaviors = Boolean(seedsConfig.customBehaviors.length);

  return {
    ...defaultFormState,
    crawlTimeoutMinutes: secondsToMinutes(
      workflow.crawlTimeout,
      defaultFormState.crawlTimeoutMinutes,
    ),
    maxCrawlSizeGB: bytesToGB(
      workflow.maxCrawlSize,
      defaultFormState.maxCrawlSizeGB,
    ),
    behaviorTimeoutSeconds:
      seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
    pageLoadTimeoutSeconds:
      seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
    pageExtraDelaySeconds:
      seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
    postLoadDelaySeconds:
      seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds,
    maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth,
    browserWindows: workflow.browserWindows,
    blockAds: seedsConfig.blockAds,
    lang: seedsConfig.lang ?? defaultFormState.lang,
    tags: workflow.tags,
    autoAddCollections: workflow.autoAddCollections,
    jobName: workflow.name || defaultFormState.jobName,
    description: workflow.description,
    browserProfile: workflow.profileid
      ? ({ id: workflow.profileid } as Profile)
      : defaultFormState.browserProfile,
    scopeType: primarySeedConfig.scopeType as FormState["scopeType"],
    // Always keep one (empty) row when the exclusion list is empty.
    exclusions: seedsConfig.exclude?.length === 0 ? [""] : seedsConfig.exclude,
    includeLinkedPages: Boolean(
      primarySeedConfig.extraHops || seedsConfig.extraHops,
    ),
    // These three are resolved here for every scope type so that a value
    // missing from the config falls back to the default.
    useSitemap: seedsConfig.useSitemap ?? defaultFormState.useSitemap,
    failOnFailedSeed:
      seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
    failOnContentCheck:
      seedsConfig.failOnContentCheck ?? defaultFormState.failOnContentCheck,
    pageLimit: seedsConfig.limit ?? defaultFormState.pageLimit,
    // With no built-in behaviors configured, custom behaviors switch the
    // built-in ones off; otherwise the defaults apply.
    autoscrollBehavior: seedsConfig.behaviors
      ? seedsConfig.behaviors.includes(Behavior.AutoScroll)
      : enableCustomBehaviors
        ? false
        : defaultFormState.autoscrollBehavior,
    autoclickBehavior: seedsConfig.behaviors
      ? seedsConfig.behaviors.includes(Behavior.AutoClick)
      : enableCustomBehaviors
        ? false
        : defaultFormState.autoclickBehavior,
    customBehavior: enableCustomBehaviors,
    selectLinks: seedsConfig.selectLinks,
    clickSelector: seedsConfig.clickSelector,
    userAgent: seedsConfig.userAgent ?? defaultFormState.userAgent,
    crawlerChannel: workflow.crawlerChannel || defaultFormState.crawlerChannel,
    proxyId: workflow.proxyId || defaultFormState.proxyId,
    ...formState,
  };
}
|
|
|
|
/**
 * Resolve workflow defaults, preferring values from the app settings.
 *
 * Starts from a copy of `appDefaults` and overlays each setting that is
 * present: the timeouts and page limit when greater than zero, and
 * `maxBrowserWindows` when truthy. If loading the settings fails, the error
 * is logged at debug level and whatever was resolved so far is returned.
 *
 * @returns The resolved defaults; never rejects because of a settings error.
 */
export async function getServerDefaults(): Promise<WorkflowDefaults> {
  const resolved = { ...appDefaults };

  try {
    const settings = await getAppSettings();

    if (settings.defaultBehaviorTimeSeconds > 0) {
      resolved.behaviorTimeoutSeconds = settings.defaultBehaviorTimeSeconds;
    }
    if (settings.defaultPageLoadTimeSeconds > 0) {
      resolved.pageLoadTimeoutSeconds = settings.defaultPageLoadTimeSeconds;
    }
    if (settings.maxPagesPerCrawl > 0) {
      resolved.maxPagesPerCrawl = settings.maxPagesPerCrawl;
    }
    if (settings.maxBrowserWindows) {
      resolved.maxBrowserWindows = settings.maxBrowserWindows;
    }
  } catch (e) {
    // Best effort: fall through and return what we have.
    console.debug(e);
  }

  return resolved;
}
|
|
|
|
/**
 * Enumerate the selectable browser window counts.
 *
 * Yields every count from 1 up to (but not including)
 * `numBrowsersPerInstance`, followed by each multiple of
 * `numBrowsersPerInstance` up to and including `maxBrowserWindows`.
 * For example, 2 per instance with a maximum of 8 yields 1, 2, 4, 6, 8.
 *
 * @param settings - App settings, or `null` when they are not available.
 * @returns An iterable of window counts in ascending order. Yields only `1`
 *   when `settings` is `null` or `numBrowsersPerInstance` is not a positive
 *   number.
 */
export function* rangeBrowserWindows(
  settings: AppSettings | null,
): Iterable<number> {
  if (!settings) {
    yield 1;
    return;
  }

  const { numBrowsersPerInstance, maxBrowserWindows } = settings;

  // A zero or negative step would never move the second loop toward its
  // bound, making the generator infinite; treat it like missing settings.
  if (!(numBrowsersPerInstance > 0)) {
    yield 1;
    return;
  }

  // Counts below a full instance.
  for (let i = 1; i < numBrowsersPerInstance; i++) {
    yield i;
  }

  // Whole instances, up to the configured maximum.
  for (
    let i = numBrowsersPerInstance;
    i <= maxBrowserWindows;
    i += numBrowsersPerInstance
  ) {
    yield i;
  }
}
|