Exclusion match API pagination (#1214)
- Limit how many exclusion matches are returned at once. - Add options to specify 'offset' and 'limit', and return 'nextOffset' for further pagination. - Set the page limit to 1000 by default.
This commit is contained in:
parent
18b2c1abfc
commit
e6bccac953
@ -409,8 +409,10 @@ class CrawlOps(BaseCrawlOps):
|
||||
|
||||
return {"total": total, "results": results, "matched": matched}
|
||||
|
||||
async def match_crawl_queue(self, crawl_id, regex):
|
||||
"""get list of urls that match regex"""
|
||||
async def match_crawl_queue(self, crawl_id, regex, offset=0, limit=1000):
|
||||
"""get list of urls that match regex, starting at offset and at most
|
||||
around 'limit'. (limit rounded to next step boundary, so
|
||||
limit <= next_offset < limit + step"""
|
||||
total = 0
|
||||
matched = []
|
||||
step = 50
|
||||
@ -427,7 +429,9 @@ class CrawlOps(BaseCrawlOps):
|
||||
except re.error as exc:
|
||||
raise HTTPException(status_code=400, detail="invalid_regex") from exc
|
||||
|
||||
for count in range(0, total, step):
|
||||
next_offset = -1
|
||||
|
||||
for count in range(offset, total, step):
|
||||
results = await self._crawl_queue_range(
|
||||
redis, f"{crawl_id}:q", count, step
|
||||
)
|
||||
@ -436,7 +440,13 @@ class CrawlOps(BaseCrawlOps):
|
||||
if regex.search(url):
|
||||
matched.append(url)
|
||||
|
||||
return {"total": total, "matched": matched}
|
||||
# if exceeded limit set nextOffset to next step boundary
|
||||
# and break
|
||||
if len(matched) >= limit:
|
||||
next_offset = count + step
|
||||
break
|
||||
|
||||
return {"total": total, "matched": matched, "nextOffset": next_offset}
|
||||
|
||||
async def filter_crawl_queue(self, crawl_id, regex):
|
||||
"""filter out urls that match regex"""
|
||||
@ -867,11 +877,15 @@ def init_crawls_api(
|
||||
tags=["crawls"],
|
||||
)
|
||||
async def match_crawl_queue(
|
||||
crawl_id, regex: str, org: Organization = Depends(org_crawl_dep)
|
||||
crawl_id,
|
||||
regex: str,
|
||||
offset: int = 0,
|
||||
limit: int = 1000,
|
||||
org: Organization = Depends(org_crawl_dep),
|
||||
):
|
||||
await ops.get_crawl_raw(crawl_id, org)
|
||||
|
||||
return await ops.match_crawl_queue(crawl_id, regex)
|
||||
return await ops.match_crawl_queue(crawl_id, regex, offset, limit)
|
||||
|
||||
@app.post(
|
||||
"/orgs/{oid}/crawls/{crawl_id}/exclusions",
|
||||
|
Loading…
Reference in New Issue
Block a user