exclude match api pagination (#1214)

- limit how many exclusion matches are returned at once
- add option to specify 'offset' and 'limit'; return 'nextOffset' for further pagination
- set page limit to 1000 by default
parent 18b2c1abfc
commit e6bccac953
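The new options ride along as query parameters on the existing exclusion-match endpoint. A minimal sketch of one request/response cycle, assuming a GET endpoint on a local deployment; the base URL, ids, token, and the queueMatchAll path are placeholder assumptions, not taken from this diff:

import requests

# All values below are illustrative placeholders; the endpoint path is an
# assumption based on context and is not shown in this diff.
base_url = "http://localhost:8000"
oid = "<org-id>"
crawl_id = "<crawl-id>"

resp = requests.get(
    f"{base_url}/orgs/{oid}/crawls/{crawl_id}/queueMatchAll",
    params={"regex": r"example\.com", "offset": 0, "limit": 1000},
    headers={"Authorization": "Bearer <token>"},
)
resp.raise_for_status()
data = resp.json()
# Expected response shape per this commit:
# {"total": <queue length>, "matched": [...urls...], "nextOffset": <int, -1 when done>}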
@@ -409,8 +409,10 @@ class CrawlOps(BaseCrawlOps):
 
         return {"total": total, "results": results, "matched": matched}
 
-    async def match_crawl_queue(self, crawl_id, regex):
-        """get list of urls that match regex"""
+    async def match_crawl_queue(self, crawl_id, regex, offset=0, limit=1000):
+        """get list of urls that match regex, starting at offset and
+        returning at most around 'limit' urls (limit is rounded up to the
+        next step boundary, so limit <= len(matched) < limit + step)"""
         total = 0
         matched = []
         step = 50
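The rounding claim in the docstring is easiest to see with toy numbers. A standalone sketch, not part of the commit: the loop only checks the limit after finishing a page of 'step' URLs, so the matched count can overshoot 'limit' by at most step - 1.

# Toy illustration of the step-boundary rounding described in the
# docstring above; standalone code, not part of the commit.
step = 50
limit = 1000

queue = [f"https://example.com/page/{i}" for i in range(5000)]
matched = []
next_offset = -1

for count in range(0, len(queue), step):
    # every URL "matches" in this toy, so each page adds exactly `step` urls
    matched.extend(queue[count : count + step])
    if len(matched) >= limit:
        next_offset = count + step
        break

assert limit <= len(matched) < limit + step
assert next_offset % step == 0   # nextOffset always lands on a step boundary
print(len(matched), next_offset)  # -> 1000 1000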
@@ -427,7 +429,9 @@ class CrawlOps(BaseCrawlOps):
             except re.error as exc:
                 raise HTTPException(status_code=400, detail="invalid_regex") from exc
 
-            for count in range(0, total, step):
+            next_offset = -1
+
+            for count in range(offset, total, step):
                 results = await self._crawl_queue_range(
                     redis, f"{crawl_id}:q", count, step
                 )
@@ -436,7 +440,13 @@ class CrawlOps(BaseCrawlOps):
                     if regex.search(url):
                         matched.append(url)
 
-        return {"total": total, "matched": matched}
+                # if the limit is exceeded, set nextOffset to the next
+                # step boundary and stop scanning
+                if len(matched) >= limit:
+                    next_offset = count + step
+                    break
+
+        return {"total": total, "matched": matched, "nextOffset": next_offset}
 
     async def filter_crawl_queue(self, crawl_id, regex):
         """filter out urls that match regex"""
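A caller drains every match by feeding nextOffset back in as offset until it comes back as -1. A sketch of that loop against the ops method; the helper and its arguments are illustrative, assuming ops is a CrawlOps instance:

async def collect_all_matches(ops, crawl_id, regex, limit=1000):
    # Illustrative helper, not part of the commit: pages through
    # match_crawl_queue until nextOffset comes back as -1.
    offset = 0
    matched = []
    while True:
        page = await ops.match_crawl_queue(crawl_id, regex, offset, limit)
        matched.extend(page["matched"])
        if page["nextOffset"] == -1:
            # the scan reached the end of the queue without hitting the limit
            return {"total": page["total"], "matched": matched}
        offset = page["nextOffset"]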
@@ -867,11 +877,15 @@ def init_crawls_api(
         tags=["crawls"],
     )
     async def match_crawl_queue(
-        crawl_id, regex: str, org: Organization = Depends(org_crawl_dep)
+        crawl_id,
+        regex: str,
+        offset: int = 0,
+        limit: int = 1000,
+        org: Organization = Depends(org_crawl_dep),
     ):
         await ops.get_crawl_raw(crawl_id, org)
 
-        return await ops.match_crawl_queue(crawl_id, regex)
+        return await ops.match_crawl_queue(crawl_id, regex, offset, limit)
 
     @app.post(
         "/orgs/{oid}/crawls/{crawl_id}/exclusions",
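No routing changes are needed for the new arguments: because offset and limit are plain ints with defaults in the handler signature, FastAPI exposes them as optional query parameters automatically. A self-contained toy app (not Browsertrix code) showing the same mechanism:

from fastapi import FastAPI

app = FastAPI()

# Toy endpoint: int parameters with defaults become optional query
# parameters, e.g. GET /demo?offset=100&limit=50.
@app.get("/demo")
async def demo(offset: int = 0, limit: int = 1000):
    return {"offset": offset, "limit": limit}

GET /demo returns {"offset": 0, "limit": 1000}, and GET /demo?offset=100 overrides only the offset; this is exactly how offset and limit reach match_crawl_queue in the handler above.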