Crawl Queue API (#342)

* crawl queue api work: (#329)
- add GET /crawls/{crawl_id}/queue API to page through the crawl queue, with 'offset', 'count', and an optional 'regex' query arg; returns the requested page of queued URLs, the regex matches within that page, and the total number of URLs in the queue.
- add GET /crawls/{crawl_id}/queueMatchAll API with a 'regex' query arg, which scans the entire crawl queue on the backend and returns the full list of matching URLs (more experimental). See the usage sketch after this list.
- if the crawl has not yet started or redis is not yet available, return an empty queue
- only supported for the k8s deployment at the moment
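A minimal usage sketch for the two new endpoints. The base URL, bearer token, archive id, and crawl id below are hypothetical placeholders; only the route paths, query args, and response keys ("total", "results", "matched") come from the code in this commit.

import requests

# Hypothetical values -- substitute a real API base URL, access token, archive id, and crawl id.
API_BASE = "https://example.com/api"
HEADERS = {"Authorization": "Bearer <access-token>"}
AID = "<archive-uuid>"
CRAWL_ID = "<crawl-id>"

# Page through the first 50 queued URLs, flagging which ones match a regex.
resp = requests.get(
    f"{API_BASE}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": r"\.pdf$"},
    headers=HEADERS,
)
data = resp.json()
print(data["total"], "urls in queue")
print("this page:", data["results"])
print("matches in this page:", data["matched"])

# Scan the entire queue on the backend for the same regex (heavier, more experimental).
resp = requests.get(
    f"{API_BASE}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": r"\.pdf$"},
    headers=HEADERS,
)
print("all matches:", resp.json()["matched"])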
Ilya Kreymer 2022-10-12 19:56:13 -07:00 committed by GitHub
parent 8708c24a74
commit f7836c345d


@@ -3,12 +3,15 @@
 import asyncio
 import uuid
 import os
+import json
+import re
 from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 from fastapi import Depends, HTTPException
 from pydantic import BaseModel, UUID4, conint
+from redis import asyncio as aioredis, exceptions
 import pymongo
@@ -20,21 +23,21 @@ from .storages import get_presigned_url

 # ============================================================================
 class DeleteCrawlList(BaseModel):
-    """ delete crawl list POST body """
+    """delete crawl list POST body"""

     crawl_ids: List[str]


 # ============================================================================
 class CrawlScale(BaseModel):
-    """ scale the crawl to N parallel containers """
+    """scale the crawl to N parallel containers"""

     scale: conint(ge=1, le=MAX_CRAWL_SCALE) = 1


 # ============================================================================
 class CrawlFile(BaseModel):
-    """ file from a crawl """
+    """file from a crawl"""

     filename: str
     hash: str

@@ -47,7 +50,7 @@ class CrawlFile(BaseModel):

 # ============================================================================
 class CrawlFileOut(BaseModel):
-    """ output for file from a crawl (conformance to Data Resource Spec) """
+    """output for file from a crawl (conformance to Data Resource Spec)"""

     name: str
     path: str

@@ -57,7 +60,7 @@ class CrawlFileOut(BaseModel):

 # ============================================================================
 class Crawl(BaseMongoModel):
-    """ Store State of a Crawl (Finished or Running) """
+    """Store State of a Crawl (Finished or Running)"""

     id: str

@@ -85,7 +88,7 @@ class Crawl(BaseMongoModel):

 # ============================================================================
 class CrawlOut(Crawl):
-    """ Output for single crawl, add configName and userName"""
+    """Output for single crawl, add configName and userName"""

     userName: Optional[str]
     configName: Optional[str]

@@ -96,7 +99,7 @@ class CrawlOut(Crawl):

 # ============================================================================
 class ListCrawlOut(BaseMongoModel):
-    """ Crawl output model for list view """
+    """Crawl output model for list view"""

     id: str

@@ -124,14 +127,14 @@ class ListCrawlOut(BaseMongoModel):

 # ============================================================================
 class ListCrawls(BaseModel):
-    """ Response model for list of crawls """
+    """Response model for list of crawls"""

     crawls: List[ListCrawlOut]


 # ============================================================================
 class CrawlCompleteIn(BaseModel):
-    """ Completed Crawl Webhook POST message """
+    """Completed Crawl Webhook POST message"""

     id: str

@@ -146,7 +149,7 @@ class CrawlCompleteIn(BaseModel):

 # ============================================================================
 class CrawlOps:
-    """ Crawl Ops """
+    """Crawl Ops"""

     # pylint: disable=too-many-arguments, too-many-instance-attributes
     def __init__(self, mdb, users, crawl_manager, crawl_configs, archives):
@@ -155,6 +158,7 @@ class CrawlOps:
         self.crawl_configs = crawl_configs
         self.user_manager = users
         self.archives = archives
+        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

         self.crawl_configs.set_crawl_ops(self)

@@ -163,7 +167,7 @@ class CrawlOps:
         asyncio.create_task(self.init_index())

     async def init_index(self):
-        """ init index for crawls db """
+        """init index for crawls db"""
         await self.crawls.create_index("colls")

     async def list_crawls(
@@ -174,7 +178,7 @@ class CrawlOps:
         exclude_files=True,
         running_only=False,
     ):
-        """List all finished crawls from the db """
+        """List all finished crawls from the db"""
         aid = archive.id if archive else None

@@ -233,7 +237,7 @@ class CrawlOps:
         return crawls

     async def get_crawl_raw(self, crawlid: str, archive: Archive):
-        """ Get data for single crawl """
+        """Get data for single crawl"""
         query = {"_id": crawlid}
         if archive:
@@ -247,7 +251,7 @@ class CrawlOps:
         return res

     async def get_crawl(self, crawlid: str, archive: Archive):
-        """ Get data for single crawl """
+        """Get data for single crawl"""

         res = await self.get_crawl_raw(crawlid, archive)

@@ -268,7 +272,7 @@ class CrawlOps:
     async def _resolve_crawl_refs(
         self, crawl: Union[CrawlOut, ListCrawlOut], archive: Archive
     ):
-        """ Resolve running crawl data """
+        """Resolve running crawl data"""
         config = await self.crawl_configs.get_crawl_config(
             crawl.cid, archive, active_only=False
         )
@@ -331,14 +335,14 @@ class CrawlOps:
         await self.crawls.find_one_and_update(*update)

     async def delete_crawls(self, aid: uuid.UUID, delete_list: DeleteCrawlList):
-        """ Delete a list of crawls by id for given archive """
+        """Delete a list of crawls by id for given archive"""
         res = await self.crawls.delete_many(
             {"_id": {"$in": delete_list.crawl_ids}, "aid": aid}
         )
         return res.deleted_count

     async def add_new_crawl(self, crawl_id: str, crawlconfig):
-        """ initialize new crawl """
+        """initialize new crawl"""
         crawl = Crawl(
             id=crawl_id,
             state="starting",
@@ -358,7 +362,7 @@ class CrawlOps:
         return False

     async def update_crawl_state(self, crawl_id: str, state: str):
-        """ called only when job container is being stopped/canceled """
+        """called only when job container is being stopped/canceled"""
         data = {"state": state}

         # if cancelation, set the finish time here
@@ -374,7 +378,7 @@ class CrawlOps:
         )

     async def shutdown_crawl(self, crawl_id: str, archive: Archive, graceful: bool):
-        """ stop or cancel specified crawl """
+        """stop or cancel specified crawl"""
         result = None
         try:
             result = await self.crawl_manager.shutdown_crawl(
@@ -405,13 +409,73 @@
         # return whatever detail may be included in the response
         raise HTTPException(status_code=400, detail=result.get("error"))

+    async def get_crawl_queue(self, crawl_id, offset, count, regex):
+        """ get crawl queue """
+
+        total = 0
+        results = []
+        redis = None
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+            results = await redis.lrange(f"{crawl_id}:q", offset, count)
+            results = [json.loads(result)["url"] for result in results]
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        if regex:
+            regex = re.compile(regex)
+            matched = [result for result in results if regex.search(result)]
+
+        return {"total": total, "results": results, "matched": matched}
+
+    async def match_crawl_queue(self, crawl_id, regex):
+        """ get crawl queue """
+        total = 0
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        regex = re.compile(regex)
+
+        step = 50
+
+        for count in range(0, total, step):
+            results = await redis.lrange(f"{crawl_id}:q", count, count + step)
+            for result in results:
+                url = json.loads(result)["url"]
+                if regex.search(url):
+                    matched.append(url)
+
+        return {"total": total, "matched": matched}
+
+    def get_redis_url(self, crawl_id):
+        """ get redis url for crawl id """
+        # pylint: disable=line-too-long
+        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"
+

 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals
 def init_crawls_api(
     app, mdb, users, crawl_manager, crawl_config_ops, archives, user_dep
 ):
-    """ API for crawl management, including crawl done callback"""
+    """API for crawl management, including crawl done callback"""

     ops = CrawlOps(mdb, users, crawl_manager, crawl_config_ops, archives)

@@ -505,14 +569,40 @@ def init_crawls_api(
         if await ops.get_crawl_raw(crawl_id, archive):
             return {}

+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queue",
+        tags=["crawls"],
+    )
+    async def get_crawl_queue(
+        crawl_id,
+        offset: int,
+        count: int,
+        regex: Optional[str] = "",
+        archive: Archive = Depends(archive_crawl_dep),
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.get_crawl_queue(crawl_id, offset, count, regex)
+
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
+        tags=["crawls"],
+    )
+    async def match_crawl_queue(
+        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.match_crawl_queue(crawl_id, regex)
+
     return ops


 def dt_now():
-    """ get current ts """
+    """get current ts"""
     return datetime.utcnow().replace(microsecond=0, tzinfo=None)


 def ts_now():
-    """ get current ts """
+    """get current ts"""
     return str(dt_now())
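For reference, a minimal sketch of the queue layout these methods read, assuming only what the diff above relies on: a Redis list at "{crawl_id}:q" whose entries are JSON objects containing at least a "url" key. The crawl id, Redis URL, and the use of rpush to seed the list are hypothetical illustration, not part of this commit.

import json
import redis  # synchronous redis-py client, used here only for illustration

crawl_id = "example-crawl"  # hypothetical crawl id
r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)

# Seed a fake queue in the shape the backend expects to find it.
for url in ["https://example.com/", "https://example.com/report.pdf"]:
    r.rpush(f"{crawl_id}:q", json.dumps({"url": url}))

print(r.llen(f"{crawl_id}:q"))           # total, as read by get_crawl_queue()
print(r.lrange(f"{crawl_id}:q", 0, 49))  # one page, as read by get_crawl_queue()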