Crawl Queue API (#342)
* crawl queue api work: (#329)
  - add /crawls/{crawl_id}/queue API to get the crawl queue, with offset, count, and optional regex; returns the results, the regex matches within those results, and the total number of URLs in the queue
  - add /crawls/{crawl_id}/queueMatchAll API with a 'regex' query arg, which processes the entire crawl queue on the backend and returns a list of matches (more experimental)
  - if the crawl has not yet started / redis is not available, return an empty queue
  - only supported for the k8s deployment at the moment
parent 8708c24a74
commit f7836c345d
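For reference, a rough sketch of how a client might call the two new endpoints once a crawl is running. The host, the /api prefix, the archive and crawl ids, and the bearer-token header are placeholders for a typical deployment, not part of this change:

import requests

API = "https://btrix.example.org/api"   # assumed deployment URL and prefix
AID = "<archive uuid>"
CRAWL_ID = "<crawl id>"
HEADERS = {"Authorization": "Bearer <access token>"}

# page through the queue, 50 URLs at a time, flagging queued PDF URLs
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": r"\.pdf$"},
    headers=HEADERS,
)
data = resp.json()
print(data["total"], data["results"][:5], data["matched"])

# scan the whole queue on the backend instead (more expensive, experimental)
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": r"\.pdf$"},
    headers=HEADERS,
)
print(resp.json()["matched"])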
@@ -3,12 +3,15 @@
import asyncio
import uuid
import os
import json
import re

from typing import Optional, List, Dict, Union
from datetime import datetime, timedelta

from fastapi import Depends, HTTPException
from pydantic import BaseModel, UUID4, conint
from redis import asyncio as aioredis, exceptions

import pymongo

@@ -155,6 +158,7 @@ class CrawlOps:
        self.crawl_configs = crawl_configs
        self.user_manager = users
        self.archives = archives
        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

        self.crawl_configs.set_crawl_ops(self)

@@ -405,6 +409,66 @@ class CrawlOps:
            # return whatever detail may be included in the response
            raise HTTPException(status_code=400, detail=result.get("error"))

    async def get_crawl_queue(self, crawl_id, offset, count, regex):
        """ get crawl queue """

        total = 0
        results = []
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
            # lrange stop index is inclusive, so return `count` entries from `offset`
            results = await redis.lrange(f"{crawl_id}:q", offset, offset + count - 1)
            results = [json.loads(result)["url"] for result in results]
        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        if regex:
            regex = re.compile(regex)
            matched = [result for result in results if regex.search(result)]

        return {"total": total, "results": results, "matched": matched}

    async def match_crawl_queue(self, crawl_id, regex):
        """ match the entire crawl queue against regex """

        total = 0
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        regex = re.compile(regex)

        step = 50

        for count in range(0, total, step):
            # stop index is inclusive; step - 1 avoids re-reading the boundary entry
            results = await redis.lrange(f"{crawl_id}:q", count, count + step - 1)
            for result in results:
                url = json.loads(result)["url"]
                if regex.search(url):
                    matched.append(url)

        return {"total": total, "matched": matched}

    def get_redis_url(self, crawl_id):
        """ get redis url for crawl id """
        # pylint: disable=line-too-long
        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals
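A minimal sketch of the queue layout these methods assume: the crawler keeps a Redis list named "<crawl_id>:q" whose entries are JSON objects with at least a "url" field. The key name and field come from the code above; the sample entries and the use of rpush here are illustrative only:

import asyncio
import json

from redis import asyncio as aioredis


async def demo():
    redis = await aioredis.from_url(
        "redis://localhost:6379/0", encoding="utf-8", decode_responses=True
    )

    # fake a couple of queued pages the way the API expects to find them
    for url in ("https://example.com/", "https://example.com/about"):
        await redis.rpush("example-crawl:q", json.dumps({"url": url}))

    total = await redis.llen("example-crawl:q")
    # lrange stop index is inclusive, so 0..1 returns both entries
    entries = await redis.lrange("example-crawl:q", 0, 1)
    print(total, [json.loads(entry)["url"] for entry in entries])


asyncio.run(demo())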
@@ -505,6 +569,32 @@ def init_crawls_api(
        if await ops.get_crawl_raw(crawl_id, archive):
            return {}

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queue",
        tags=["crawls"],
    )
    async def get_crawl_queue(
        crawl_id,
        offset: int,
        count: int,
        regex: Optional[str] = "",
        archive: Archive = Depends(archive_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.get_crawl_queue(crawl_id, offset, count, regex)

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
        tags=["crawls"],
    )
    async def match_crawl_queue(
        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.match_crawl_queue(crawl_id, regex)

    return ops
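Both endpoints apply the pattern with re.search, so it matches anywhere in the URL unless explicitly anchored. A quick illustration with made-up URLs:

import re

urls = [
    "https://example.com/docs/report.pdf",
    "https://example.com/blog/post-1",
]

# unanchored: any URL containing "/docs/"
print([url for url in urls if re.search(r"/docs/", url)])

# anchored: only URLs ending in ".pdf"
print([url for url in urls if re.search(r"\.pdf$", url)])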