exclude match api pagination: (#1214)
- Limit how many exclusion matches are returned at once. - Add options to specify 'offset' and 'limit', and return 'nextOffset' for further pagination. - Set the page limit to 1000 by default.
This commit is contained in:
parent
18b2c1abfc
commit
e6bccac953
@@ -409,8 +409,10 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
|
|
||||||
return {"total": total, "results": results, "matched": matched}
|
return {"total": total, "results": results, "matched": matched}
|
||||||
|
|
||||||
async def match_crawl_queue(self, crawl_id, regex):
|
async def match_crawl_queue(self, crawl_id, regex, offset=0, limit=1000):
|
||||||
"""get list of urls that match regex"""
|
"""get list of urls that match regex, starting at offset and at most
|
||||||
|
around 'limit'. (limit rounded to next step boundary, so
|
||||||
|
limit <= next_offset < limit + step)"""
|
||||||
total = 0
|
total = 0
|
||||||
matched = []
|
matched = []
|
||||||
step = 50
|
step = 50
|
||||||
@@ -427,7 +429,9 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
except re.error as exc:
|
except re.error as exc:
|
||||||
raise HTTPException(status_code=400, detail="invalid_regex") from exc
|
raise HTTPException(status_code=400, detail="invalid_regex") from exc
|
||||||
|
|
||||||
for count in range(0, total, step):
|
next_offset = -1
|
||||||
|
|
||||||
|
for count in range(offset, total, step):
|
||||||
results = await self._crawl_queue_range(
|
results = await self._crawl_queue_range(
|
||||||
redis, f"{crawl_id}:q", count, step
|
redis, f"{crawl_id}:q", count, step
|
||||||
)
|
)
|
||||||
@@ -436,7 +440,13 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
if regex.search(url):
|
if regex.search(url):
|
||||||
matched.append(url)
|
matched.append(url)
|
||||||
|
|
||||||
return {"total": total, "matched": matched}
|
# if the limit is exceeded, set nextOffset to the next step boundary
|
||||||
|
# and break
|
||||||
|
if len(matched) >= limit:
|
||||||
|
next_offset = count + step
|
||||||
|
break
|
||||||
|
|
||||||
|
return {"total": total, "matched": matched, "nextOffset": next_offset}
|
||||||
|
|
||||||
async def filter_crawl_queue(self, crawl_id, regex):
|
async def filter_crawl_queue(self, crawl_id, regex):
|
||||||
"""filter out urls that match regex"""
|
"""filter out urls that match regex"""
|
||||||
@@ -867,11 +877,15 @@ def init_crawls_api(
|
|||||||
tags=["crawls"],
|
tags=["crawls"],
|
||||||
)
|
)
|
||||||
async def match_crawl_queue(
|
async def match_crawl_queue(
|
||||||
crawl_id, regex: str, org: Organization = Depends(org_crawl_dep)
|
crawl_id,
|
||||||
|
regex: str,
|
||||||
|
offset: int = 0,
|
||||||
|
limit: int = 1000,
|
||||||
|
org: Organization = Depends(org_crawl_dep),
|
||||||
):
|
):
|
||||||
await ops.get_crawl_raw(crawl_id, org)
|
await ops.get_crawl_raw(crawl_id, org)
|
||||||
|
|
||||||
return await ops.match_crawl_queue(crawl_id, regex)
|
return await ops.match_crawl_queue(crawl_id, regex, offset, limit)
|
||||||
|
|
||||||
@app.post(
|
@app.post(
|
||||||
"/orgs/{oid}/crawls/{crawl_id}/exclusions",
|
"/orgs/{oid}/crawls/{crawl_id}/exclusions",
|
||||||
|
Loading…
Reference in New Issue
Block a user