Crawl Queue API (#342)

* crawl queue api work: (#329)
- add GET /archives/{aid}/crawls/{crawl_id}/queue api to fetch the crawl queue, with offset, count, and optional regex query params; returns the requested page of queued urls, the regex matches within that page, and the total number of urls in the queue
- add GET /archives/{aid}/crawls/{crawl_id}/queueMatchAll api with a 'regex' query arg, which scans the entire crawl queue on the backend and returns the full list of matching urls (more experimental); see the usage sketch below
- if the crawl has not yet started / redis is not available, return an empty queue
- only supported for k8s deployments at the moment
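
As a rough usage sketch, the two endpoints above can be exercised from a client like this. The base URL, auth header, and ids below are placeholders and not part of this change; only the paths, query params, and response keys come from the diff:

import requests

API = "https://btrix.example.com/api"            # placeholder base URL
HEADERS = {"Authorization": "Bearer TOKEN"}      # placeholder auth token
AID, CRAWL_ID = "my-archive-id", "my-crawl-id"   # placeholder ids

# page through the queue: 50 urls starting at offset 0, flagging any containing "login"
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": "login"},
    headers=HEADERS,
)
page = resp.json()   # {"total": ..., "results": [...], "matched": [...]}

# scan the entire queue on the backend (more expensive, experimental)
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": r"\.pdf$"},
    headers=HEADERS,
)
all_matches = resp.json()   # {"total": ..., "matched": [...]}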
Ilya Kreymer 2022-10-12 19:56:13 -07:00 committed by GitHub
parent 8708c24a74
commit f7836c345d

@@ -3,12 +3,15 @@
import asyncio
import uuid
import os
import json
import re
from typing import Optional, List, Dict, Union
from datetime import datetime, timedelta
from fastapi import Depends, HTTPException
from pydantic import BaseModel, UUID4, conint
from redis import asyncio as aioredis, exceptions
import pymongo
@@ -155,6 +158,7 @@ class CrawlOps:
        self.crawl_configs = crawl_configs
        self.user_manager = users
        self.archives = archives
        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

        self.crawl_configs.set_crawl_ops(self)
@@ -405,6 +409,66 @@ class CrawlOps:
            # return whatever detail may be included in the response
            raise HTTPException(status_code=400, detail=result.get("error"))

    async def get_crawl_queue(self, crawl_id, offset, count, regex):
        """ get crawl queue """

        total = 0
        results = []
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
            # fetch 'count' entries starting at 'offset' (lrange's end index is inclusive)
            results = await redis.lrange(f"{crawl_id}:q", offset, offset + count - 1)
            results = [json.loads(result)["url"] for result in results]

        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []

        if regex:
            regex = re.compile(regex)
            matched = [result for result in results if regex.search(result)]

        return {"total": total, "results": results, "matched": matched}

    async def match_crawl_queue(self, crawl_id, regex):
        """ match all urls in the crawl queue against the regex """

        total = 0
        redis = None

        try:
            redis = await aioredis.from_url(
                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
            )

            total = await redis.llen(f"{crawl_id}:q")
        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        regex = re.compile(regex)
        step = 50

        # scan the queue in chunks of 'step' entries (lrange's end index is inclusive)
        for count in range(0, total, step):
            results = await redis.lrange(f"{crawl_id}:q", count, count + step - 1)
            for result in results:
                url = json.loads(result)["url"]
                if regex.search(url):
                    matched.append(url)

        return {"total": total, "matched": matched}

    def get_redis_url(self, crawl_id):
        """ get redis url for crawl id """
        # pylint: disable=line-too-long
        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"

# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals
@@ -505,6 +569,32 @@ def init_crawls_api(
        if await ops.get_crawl_raw(crawl_id, archive):
            return {}

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queue",
        tags=["crawls"],
    )
    async def get_crawl_queue(
        crawl_id,
        offset: int,
        count: int,
        regex: Optional[str] = "",
        archive: Archive = Depends(archive_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.get_crawl_queue(crawl_id, offset, count, regex)

    @app.get(
        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
        tags=["crawls"],
    )
    async def match_crawl_queue(
        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
    ):
        await ops.get_crawl_raw(crawl_id, archive)

        return await ops.match_crawl_queue(crawl_id, regex)

    return ops
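
For local experimentation, here is a minimal sketch of the queue layout these methods read: the queue is a redis list at "{crawl_id}:q" whose entries are JSON objects with at least a "url" field (that much follows from the parsing above). The local redis URL, the push direction, and the sample urls below are assumptions for illustration only:

import asyncio
import json

from redis import asyncio as aioredis


async def seed_queue():
    # assumed local redis; in the deployment the URL comes from get_redis_url()
    redis = await aioredis.from_url(
        "redis://localhost:6379/0", encoding="utf-8", decode_responses=True
    )

    crawl_id = "test-crawl"

    # each queue entry is a JSON object with at least a "url" field,
    # stored in the list "{crawl_id}:q" that the APIs above read
    for i in range(10):
        await redis.rpush(
            f"{crawl_id}:q", json.dumps({"url": f"https://example.com/page/{i}"})
        )

    print(await redis.llen(f"{crawl_id}:q"))


asyncio.run(seed_queue())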