Crawl Queue API (#342)

* crawl queue api work: (#329)
- add GET /archives/{aid}/crawls/{crawl_id}/queue api to fetch the crawl queue, with offset, count, and optional regex query params; returns the requested page of queued urls, the regex matches within that page, and the total number of urls in the queue
- add GET /archives/{aid}/crawls/{crawl_id}/queueMatchAll api with a 'regex' query arg, which scans the entire crawl queue on the backend and returns the full list of matching urls (more experimental); see the usage sketch below
- if the crawl has not yet started / redis is not available, return an empty queue
- only supported for k8s deployments at the moment
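
As a rough usage sketch, the two endpoints above can be exercised from a client like this. The base URL, auth header, and ids below are placeholders and not part of this change; only the paths, query params, and response keys come from the diff:

import requests

API = "https://btrix.example.com/api"            # placeholder base URL
HEADERS = {"Authorization": "Bearer TOKEN"}      # placeholder auth token
AID, CRAWL_ID = "my-archive-id", "my-crawl-id"   # placeholder ids

# page through the queue: 50 urls starting at offset 0, flagging any containing "login"
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": "login"},
    headers=HEADERS,
)
page = resp.json()   # {"total": ..., "results": [...], "matched": [...]}

# scan the entire queue on the backend (more expensive, experimental)
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": r"\.pdf$"},
    headers=HEADERS,
)
all_matches = resp.json()   # {"total": ..., "matched": [...]}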
Ilya Kreymer 2022-10-12 19:56:13 -07:00 committed by GitHub
parent 8708c24a74
commit f7836c345d

@@ -3,12 +3,15 @@
import asyncio
import uuid
import os
import json
import re
from typing import Optional, List, Dict, Union
from datetime import datetime, timedelta
from fastapi import Depends, HTTPException
from pydantic import BaseModel, UUID4, conint
from redis import asyncio as aioredis, exceptions
import pymongo
@@ -155,6 +158,7 @@ class CrawlOps:
        self.crawl_configs = crawl_configs
        self.user_manager = users
        self.archives = archives
        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

        self.crawl_configs.set_crawl_ops(self)
@@ -405,6 +409,66 @@ class CrawlOps:
            # return whatever detail may be included in the response
            raise HTTPException(status_code=400, detail=result.get("error"))

    async def get_crawl_queue(self, crawl_id, offset, count, regex):
        """ get crawl queue """

        total = 0
        results = []
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
            # fetch 'count' entries starting at 'offset' (lrange's end index is inclusive)
            results = await redis.lrange(f"{crawl_id}:q", offset, offset + count - 1)
            results = [json.loads(result)["url"] for result in results]

        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []

        if regex:
            regex = re.compile(regex)
            matched = [result for result in results if regex.search(result)]

        return {"total": total, "results": results, "matched": matched}

    async def match_crawl_queue(self, crawl_id, regex):
        """ match all urls in the crawl queue against the regex """

        total = 0
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        regex = re.compile(regex)
        step = 50

        # scan the queue in chunks of 'step' entries (lrange's end index is inclusive)
        for count in range(0, total, step):
            results = await redis.lrange(f"{crawl_id}:q", count, count + step - 1)
            for result in results:
                url = json.loads(result)["url"]
                if regex.search(url):
                    matched.append(url)

        return {"total": total, "matched": matched}

    def get_redis_url(self, crawl_id):
        """ get redis url for crawl id """
        # pylint: disable=line-too-long
        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"

# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals
@@ -505,6 +569,32 @@ def init_crawls_api(
        if await ops.get_crawl_raw(crawl_id, archive):
            return {}

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queue",
        tags=["crawls"],
    )
    async def get_crawl_queue(
        crawl_id,
        offset: int,
        count: int,
        regex: Optional[str] = "",
        archive: Archive = Depends(archive_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.get_crawl_queue(crawl_id, offset, count, regex)

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
        tags=["crawls"],
    )
    async def match_crawl_queue(
        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.match_crawl_queue(crawl_id, regex)

    return ops
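
For local experimentation, here is a minimal sketch of the queue layout these methods read: the queue is a redis list at "{crawl_id}:q" whose entries are JSON objects with at least a "url" field (that much follows from the parsing above). The local redis URL, the push direction, and the sample urls below are assumptions for illustration only:

import asyncio
import json

from redis import asyncio as aioredis


async def seed_queue():
    # assumed local redis; in the deployment the URL comes from get_redis_url()
    redis = await aioredis.from_url(
        "redis://localhost:6379/0", encoding="utf-8", decode_responses=True
    )

    crawl_id = "test-crawl"

    # each queue entry is a JSON object with at least a "url" field,
    # stored in the list "{crawl_id}:q" that the APIs above read
    for i in range(10):
        await redis.rpush(
            f"{crawl_id}:q", json.dumps({"url": f"https://example.com/page/{i}"})
        )

    print(await redis.llen(f"{crawl_id}:q"))


asyncio.run(seed_queue())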