Crawl Queue API (#342)
* crawl queue api work (#329):
  - add GET /crawls/{crawl_id}/queue API to page through the crawl queue, taking offset, count, and an optional regex; returns the page of results, the regex matches within that page, and the total number of URLs in the queue
  - add GET /crawls/{crawl_id}/queueMatchAll API with a 'regex' query arg, which scans the entire crawl queue on the backend and returns the list of matching URLs (more experimental)
  - if the crawl has not yet started / redis is not yet available, return an empty queue
  - only supported for the k8s deployment at the moment
parent 8708c24a74
commit f7836c345d
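
For context, a minimal usage sketch of the two endpoints added by this commit. The routes and query parameters come from the diff below; the API host, the /api prefix, the bearer-token auth, and the archive/crawl ids are placeholders assumed for illustration, not values from this commit.

import requests

# Hypothetical deployment values -- substitute a real host, token,
# archive id, and crawl id; none of these come from the commit itself.
API = "https://btrix.example.com/api"
HEADERS = {"Authorization": "Bearer <access-token>"}
AID = "<archive-id>"
CRAWL_ID = "<crawl-id>"

# Page through queued URLs, flagging any in this page that match the regex.
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": "login"},
    headers=HEADERS,
)
page = resp.json()
print(page["total"], "urls in queue")
print("matched within this page:", page["matched"])

# Match the regex against the entire queue (scanned on the backend).
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": "login"},
    headers=HEADERS,
)
print("matched across whole queue:", resp.json()["matched"])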
@@ -3,12 +3,15 @@
 import asyncio
 import uuid
 import os
+import json
+import re
 
 from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 
 from fastapi import Depends, HTTPException
 from pydantic import BaseModel, UUID4, conint
+from redis import asyncio as aioredis, exceptions
 import pymongo
 
 
@@ -155,6 +158,7 @@ class CrawlOps:
         self.crawl_configs = crawl_configs
         self.user_manager = users
         self.archives = archives
+        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
 
         self.crawl_configs.set_crawl_ops(self)
 
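
A side note on the new CRAWLER_NAMESPACE setting above: using "or" rather than a default argument means an empty value also falls back to "crawlers". A small standalone check, assuming nothing beyond the standard library:

import os

def crawler_namespace():
    # Same expression as in CrawlOps.__init__ above.
    return os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

os.environ.pop("CRAWLER_NAMESPACE", None)
assert crawler_namespace() == "crawlers"        # unset -> default

os.environ["CRAWLER_NAMESPACE"] = ""
assert crawler_namespace() == "crawlers"        # empty string -> default

os.environ["CRAWLER_NAMESPACE"] = "my-crawlers"
assert crawler_namespace() == "my-crawlers"     # explicit value wins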
@@ -405,6 +409,66 @@ class CrawlOps:
             # return whatever detail may be included in the response
             raise HTTPException(status_code=400, detail=result.get("error"))
 
+    async def get_crawl_queue(self, crawl_id, offset, count, regex):
+        """ get crawl queue """
+
+        total = 0
+        results = []
+        redis = None
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+            results = await redis.lrange(f"{crawl_id}:q", offset, count)
+            results = [json.loads(result)["url"] for result in results]
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        if regex:
+            regex = re.compile(regex)
+            matched = [result for result in results if regex.search(result)]
+
+        return {"total": total, "results": results, "matched": matched}
+
+    async def match_crawl_queue(self, crawl_id, regex):
+        """ get crawl queue """
+
+        total = 0
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        regex = re.compile(regex)
+
+        step = 50
+
+        for count in range(0, total, step):
+            results = await redis.lrange(f"{crawl_id}:q", count, count + step)
+            for result in results:
+                url = json.loads(result)["url"]
+                if regex.search(url):
+                    matched.append(url)
+
+        return {"total": total, "matched": matched}
+
+    def get_redis_url(self, crawl_id):
+        """ get redis url for crawl id """
+        # pylint: disable=line-too-long
+        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"
+
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals
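
To make the shape of the data concrete: the crawl queue lives in the Redis list "{crawl_id}:q", and each entry is a JSON string whose "url" field is what the API reads. The sketch below replays get_crawl_queue's decode-and-match step on an in-memory sample; the extra field on the sample entries ("depth") is an illustrative assumption, not something this commit defines.

import json
import re

# Sample entries shaped like items in the "{crawl_id}:q" Redis list.
# Only "url" matters to the API; "depth" here is an assumed extra field.
queue = [
    json.dumps({"url": "https://example.com/", "depth": 0}),
    json.dumps({"url": "https://example.com/about", "depth": 1}),
    json.dumps({"url": "https://example.com/blog/post-1", "depth": 2}),
]

# Same decode + regex filtering as get_crawl_queue, minus Redis.
results = [json.loads(entry)["url"] for entry in queue]
pattern = re.compile(r"/blog/")
matched = [url for url in results if pattern.search(url)]

print({"total": len(queue), "results": results, "matched": matched})
# -> {'total': 3, 'results': [...], 'matched': ['https://example.com/blog/post-1']}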
@@ -505,6 +569,32 @@ def init_crawls_api(
         if await ops.get_crawl_raw(crawl_id, archive):
             return {}
 
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queue",
+        tags=["crawls"],
+    )
+    async def get_crawl_queue(
+        crawl_id,
+        offset: int,
+        count: int,
+        regex: Optional[str] = "",
+        archive: Archive = Depends(archive_crawl_dep),
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.get_crawl_queue(crawl_id, offset, count, regex)
+
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
+        tags=["crawls"],
+    )
+    async def match_crawl_queue(
+        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.match_crawl_queue(crawl_id, regex)
+
     return ops
 
 
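
Finally, the response bodies implied by the two return statements above; the field names are taken from the code, while the concrete values are made up for illustration.

# GET .../crawls/{crawl_id}/queue?offset=0&count=3&regex=blog
# "results" is just the requested slice of the queue; "total" is the full
# queue length; "matched" is the regex hits within the returned slice.
queue_response = {
    "total": 1274,
    "results": [
        "https://example.com/",
        "https://example.com/about",
        "https://example.com/blog/post-1",
    ],
    "matched": ["https://example.com/blog/post-1"],
}

# GET .../crawls/{crawl_id}/queueMatchAll?regex=blog
# "matched" covers the entire queue, scanned on the backend in steps of 50.
match_all_response = {
    "total": 1274,
    "matched": ["https://example.com/blog/post-1"],
}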