Add support for custom link selectors to backend (#2346)

Related to #2152 

This PR adds backend support for custom link selectors via `selectLinks`
on the crawl workflow config. Tests have been updated as well.

It also adds `selectLinks` to the frontend in a minimal, currently
hardcoded form that can serve as a basis for proper frontend support
moving forward.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
Tessa Walsh 2025-02-14 01:22:27 -05:00 committed by GitHub
parent 659e124168
commit 39d99e7c5d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 19 additions and 0 deletions

View File

@ -358,6 +358,8 @@ class RawCrawlConfig(BaseModel):
userAgent: Optional[str] = None
selectLinks: List[str] = ["a[href]->href"]
# ============================================================================
class CrawlConfigIn(BaseModel):

View File

@ -173,6 +173,17 @@ def test_update_config_invalid_exclude_regex(
assert r.json()["detail"] == "invalid_regex"
def test_verify_default_select_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Fetch the workflow identified by ``cid`` and check its ``selectLinks``.

    The request must succeed with HTTP 200, and ``config.selectLinks`` in the
    response must equal the single selector ``a[href]->href``.

    NOTE(review): presumably the workflow was created without an explicit
    ``selectLinks`` so this exercises the model default -- confirm against
    the fixture that sets ``cid``.
    """
    config_url = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/"
    response = requests.get(config_url, headers=crawler_auth_headers)

    assert response.status_code == 200

    workflow = response.json()
    assert workflow["config"]["selectLinks"] == ["a[href]->href"]
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@ -181,6 +192,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
}
},
)
@ -195,6 +207,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
data = r.json()
assert data["config"]["scopeType"] == "domain"
assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
def test_update_config_no_changes(
@ -207,6 +220,7 @@ def test_update_config_no_changes(
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"selectLinks": ["a[href]->href", "script[src]->src"],
}
},
)

View File

@ -2110,6 +2110,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
blockAds: this.formState.blockAds,
exclude: trimArray(this.formState.exclusions),
behaviors: this.setBehaviors(),
selectLinks: ["a[href]->href"],
},
crawlerChannel: this.formState.crawlerChannel || "default",
proxyId: this.formState.proxyId,

View File

@ -84,6 +84,7 @@ export class WorkflowsNew extends LiteElement {
useSitemap: false,
failOnFailedSeed: false,
userAgent: null,
selectLinks: ["a[href]->href"],
},
tags: [],
crawlTimeout: null,

View File

@ -36,6 +36,7 @@ export type SeedConfig = Expand<
failOnFailedSeed?: boolean;
depth?: number | null;
userAgent?: string | null;
selectLinks: string[];
}
>;