Add support for custom link selectors to backend (#2346)
Related to #2152. This PR adds backend support for custom link selectors via `selectLinks` on the crawl workflow config. Tests have been updated as well. It also adds `selectLinks` to the frontend in a minimal and, for now, hardcoded way that we can use as a basis for proper frontend support moving forward. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
659e124168
commit
39d99e7c5d
@ -358,6 +358,8 @@ class RawCrawlConfig(BaseModel):
|
||||
|
||||
userAgent: Optional[str] = None
|
||||
|
||||
selectLinks: List[str] = ["a[href]->href"]
|
||||
|
||||
|
||||
# ============================================================================
|
||||
class CrawlConfigIn(BaseModel):
|
||||
|
@ -173,6 +173,17 @@ def test_update_config_invalid_exclude_regex(
|
||||
assert r.json()["detail"] == "invalid_regex"
|
||||
|
||||
|
||||
def test_verify_default_select_links(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Check that a fetched crawl config reports the default link selector.

    Issues a GET for the crawl config identified by ``cid`` and asserts that
    the request succeeds and that ``config.selectLinks`` equals
    ``["a[href]->href"]``.
    """
    # NOTE(review): `cid` is not defined in this hunk — presumably a
    # module-level id set elsewhere in the test module; confirm.
    config_url = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/"
    response = requests.get(config_url, headers=crawler_auth_headers)
    assert response.status_code == 200

    select_links = response.json()["config"]["selectLinks"]
    assert select_links == ["a[href]->href"]
|
||||
|
||||
|
||||
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
|
||||
r = requests.patch(
|
||||
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
|
||||
@ -181,6 +192,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
|
||||
"config": {
|
||||
"seeds": [{"url": "https://example.com/"}],
|
||||
"scopeType": "domain",
|
||||
"selectLinks": ["a[href]->href", "script[src]->src"],
|
||||
}
|
||||
},
|
||||
)
|
||||
@ -195,6 +207,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
|
||||
data = r.json()
|
||||
|
||||
assert data["config"]["scopeType"] == "domain"
|
||||
assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
|
||||
|
||||
|
||||
def test_update_config_no_changes(
|
||||
@ -207,6 +220,7 @@ def test_update_config_no_changes(
|
||||
"config": {
|
||||
"seeds": [{"url": "https://example.com/"}],
|
||||
"scopeType": "domain",
|
||||
"selectLinks": ["a[href]->href", "script[src]->src"],
|
||||
}
|
||||
},
|
||||
)
|
||||
|
@ -2110,6 +2110,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
blockAds: this.formState.blockAds,
|
||||
exclude: trimArray(this.formState.exclusions),
|
||||
behaviors: this.setBehaviors(),
|
||||
selectLinks: ["a[href]->href"],
|
||||
},
|
||||
crawlerChannel: this.formState.crawlerChannel || "default",
|
||||
proxyId: this.formState.proxyId,
|
||||
|
@ -84,6 +84,7 @@ export class WorkflowsNew extends LiteElement {
|
||||
useSitemap: false,
|
||||
failOnFailedSeed: false,
|
||||
userAgent: null,
|
||||
selectLinks: ["a[href]->href"],
|
||||
},
|
||||
tags: [],
|
||||
crawlTimeout: null,
|
||||
|
@ -36,6 +36,7 @@ export type SeedConfig = Expand<
|
||||
failOnFailedSeed?: boolean;
|
||||
depth?: number | null;
|
||||
userAgent?: string | null;
|
||||
selectLinks: string[];
|
||||
}
|
||||
>;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user