Add support for custom link selectors to backend (#2346)
Related to #2152. This PR adds backend support for custom link selectors via `selectLinks` on the crawl workflow config. Tests have been updated as well. It also adds `selectLinks` to the frontend in a minimal and, for now, hardcoded way that we can use as a basis for proper frontend support moving forward. --------- Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
This commit is contained in:
parent
659e124168
commit
39d99e7c5d
@ -358,6 +358,8 @@ class RawCrawlConfig(BaseModel):
|
|||||||
|
|
||||||
userAgent: Optional[str] = None
|
userAgent: Optional[str] = None
|
||||||
|
|
||||||
|
selectLinks: List[str] = ["a[href]->href"]
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
class CrawlConfigIn(BaseModel):
|
class CrawlConfigIn(BaseModel):
|
||||||
|
@ -173,6 +173,17 @@ def test_update_config_invalid_exclude_regex(
|
|||||||
assert r.json()["detail"] == "invalid_regex"
|
assert r.json()["detail"] == "invalid_regex"
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_default_select_links(
|
||||||
|
crawler_auth_headers, default_org_id, sample_crawl_data
|
||||||
|
):
|
||||||
|
r = requests.get(
|
||||||
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
|
||||||
|
headers=crawler_auth_headers,
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
assert r.json()["config"]["selectLinks"] == ["a[href]->href"]
|
||||||
|
|
||||||
|
|
||||||
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
|
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
|
||||||
r = requests.patch(
|
r = requests.patch(
|
||||||
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
|
||||||
@ -181,6 +192,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
|
|||||||
"config": {
|
"config": {
|
||||||
"seeds": [{"url": "https://example.com/"}],
|
"seeds": [{"url": "https://example.com/"}],
|
||||||
"scopeType": "domain",
|
"scopeType": "domain",
|
||||||
|
"selectLinks": ["a[href]->href", "script[src]->src"],
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@ -195,6 +207,7 @@ def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_d
|
|||||||
data = r.json()
|
data = r.json()
|
||||||
|
|
||||||
assert data["config"]["scopeType"] == "domain"
|
assert data["config"]["scopeType"] == "domain"
|
||||||
|
assert data["config"]["selectLinks"] == ["a[href]->href", "script[src]->src"]
|
||||||
|
|
||||||
|
|
||||||
def test_update_config_no_changes(
|
def test_update_config_no_changes(
|
||||||
@ -207,6 +220,7 @@ def test_update_config_no_changes(
|
|||||||
"config": {
|
"config": {
|
||||||
"seeds": [{"url": "https://example.com/"}],
|
"seeds": [{"url": "https://example.com/"}],
|
||||||
"scopeType": "domain",
|
"scopeType": "domain",
|
||||||
|
"selectLinks": ["a[href]->href", "script[src]->src"],
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
@ -2110,6 +2110,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
blockAds: this.formState.blockAds,
|
blockAds: this.formState.blockAds,
|
||||||
exclude: trimArray(this.formState.exclusions),
|
exclude: trimArray(this.formState.exclusions),
|
||||||
behaviors: this.setBehaviors(),
|
behaviors: this.setBehaviors(),
|
||||||
|
selectLinks: ["a[href]->href"],
|
||||||
},
|
},
|
||||||
crawlerChannel: this.formState.crawlerChannel || "default",
|
crawlerChannel: this.formState.crawlerChannel || "default",
|
||||||
proxyId: this.formState.proxyId,
|
proxyId: this.formState.proxyId,
|
||||||
|
@ -84,6 +84,7 @@ export class WorkflowsNew extends LiteElement {
|
|||||||
useSitemap: false,
|
useSitemap: false,
|
||||||
failOnFailedSeed: false,
|
failOnFailedSeed: false,
|
||||||
userAgent: null,
|
userAgent: null,
|
||||||
|
selectLinks: ["a[href]->href"],
|
||||||
},
|
},
|
||||||
tags: [],
|
tags: [],
|
||||||
crawlTimeout: null,
|
crawlTimeout: null,
|
||||||
|
@ -36,6 +36,7 @@ export type SeedConfig = Expand<
|
|||||||
failOnFailedSeed?: boolean;
|
failOnFailedSeed?: boolean;
|
||||||
depth?: number | null;
|
depth?: number | null;
|
||||||
userAgent?: string | null;
|
userAgent?: string | null;
|
||||||
|
selectLinks: string[];
|
||||||
}
|
}
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user