Crawl Queue API (#342)
* crawl queue api work (#329):
  - add GET /crawls/{crawl_id}/queue API to page through the crawl queue, taking offset, count, and an optional regex; returns the page of results, the regex matches within that page, and the total number of URLs in the queue
  - add GET /crawls/{crawl_id}/queueMatchAll API with a 'regex' query arg, which scans the entire crawl queue on the backend and returns the list of matching URLs (more experimental)
  - if the crawl has not yet started / redis is not yet available, return an empty queue
  - only supported for the k8s deployment at the moment
parent 8708c24a74
commit f7836c345d
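
For context, a minimal usage sketch of the two endpoints added by this commit. The routes and query parameters come from the diff below; the API host, the /api prefix, the bearer-token auth, and the archive/crawl ids are placeholders assumed for illustration, not values from this commit.

import requests

# Hypothetical deployment values -- substitute a real host, token,
# archive id, and crawl id; none of these come from the commit itself.
API = "https://btrix.example.com/api"
HEADERS = {"Authorization": "Bearer <access-token>"}
AID = "<archive-id>"
CRAWL_ID = "<crawl-id>"

# Page through queued URLs, flagging any in this page that match the regex.
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queue",
    params={"offset": 0, "count": 50, "regex": "login"},
    headers=HEADERS,
)
page = resp.json()
print(page["total"], "urls in queue")
print("matched within this page:", page["matched"])

# Match the regex against the entire queue (scanned on the backend).
resp = requests.get(
    f"{API}/archives/{AID}/crawls/{CRAWL_ID}/queueMatchAll",
    params={"regex": "login"},
    headers=HEADERS,
)
print("matched across whole queue:", resp.json()["matched"])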
@@ -3,12 +3,15 @@
 import asyncio
 import uuid
 import os
+import json
+import re
 
 from typing import Optional, List, Dict, Union
 from datetime import datetime, timedelta
 
 from fastapi import Depends, HTTPException
 from pydantic import BaseModel, UUID4, conint
+from redis import asyncio as aioredis, exceptions
 import pymongo
 
 
@@ -155,6 +158,7 @@ class CrawlOps:
         self.crawl_configs = crawl_configs
         self.user_manager = users
         self.archives = archives
+        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
 
         self.crawl_configs.set_crawl_ops(self)
 
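
A side note on the new CRAWLER_NAMESPACE setting above: using "or" rather than a default argument means an empty value also falls back to "crawlers". A small standalone check, assuming nothing beyond the standard library:

import os

def crawler_namespace():
    # Same expression as in CrawlOps.__init__ above.
    return os.environ.get("CRAWLER_NAMESPACE") or "crawlers"

os.environ.pop("CRAWLER_NAMESPACE", None)
assert crawler_namespace() == "crawlers"        # unset -> default

os.environ["CRAWLER_NAMESPACE"] = ""
assert crawler_namespace() == "crawlers"        # empty string -> default

os.environ["CRAWLER_NAMESPACE"] = "my-crawlers"
assert crawler_namespace() == "my-crawlers"     # explicit value wins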
@@ -405,6 +409,66 @@ class CrawlOps:
             # return whatever detail may be included in the response
             raise HTTPException(status_code=400, detail=result.get("error"))
 
+    async def get_crawl_queue(self, crawl_id, offset, count, regex):
+        """ get crawl queue """
+
+        total = 0
+        results = []
+        redis = None
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+            results = await redis.lrange(f"{crawl_id}:q", offset, count)
+            results = [json.loads(result)["url"] for result in results]
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        if regex:
+            regex = re.compile(regex)
+            matched = [result for result in results if regex.search(result)]
+
+        return {"total": total, "results": results, "matched": matched}
+
+    async def match_crawl_queue(self, crawl_id, regex):
+        """ get crawl queue """
+
+        total = 0
+
+        try:
+            redis = await aioredis.from_url(
+                self.get_redis_url(crawl_id), encoding="utf-8", decode_responses=True
+            )
+
+            total = await redis.llen(f"{crawl_id}:q")
+        except exceptions.ConnectionError:
+            # can't connect to redis, likely not initialized yet
+            pass
+
+        matched = []
+        regex = re.compile(regex)
+
+        step = 50
+
+        for count in range(0, total, step):
+            results = await redis.lrange(f"{crawl_id}:q", count, count + step)
+            for result in results:
+                url = json.loads(result)["url"]
+                if regex.search(url):
+                    matched.append(url)
+
+        return {"total": total, "matched": matched}
+
+    def get_redis_url(self, crawl_id):
+        """ get redis url for crawl id """
+        # pylint: disable=line-too-long
+        return f"redis://redis-{crawl_id}-0.redis-{crawl_id}.{self.namespace}.svc.cluster.local/0"
+
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals
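
To make the shape of the data concrete: the crawl queue lives in the Redis list "{crawl_id}:q", and each entry is a JSON string whose "url" field is what the API reads. The sketch below replays get_crawl_queue's decode-and-match step on an in-memory sample; the extra field on the sample entries ("depth") is an illustrative assumption, not something this commit defines.

import json
import re

# Sample entries shaped like items in the "{crawl_id}:q" Redis list.
# Only "url" matters to the API; "depth" here is an assumed extra field.
queue = [
    json.dumps({"url": "https://example.com/", "depth": 0}),
    json.dumps({"url": "https://example.com/about", "depth": 1}),
    json.dumps({"url": "https://example.com/blog/post-1", "depth": 2}),
]

# Same decode + regex filtering as get_crawl_queue, minus Redis.
results = [json.loads(entry)["url"] for entry in queue]
pattern = re.compile(r"/blog/")
matched = [url for url in results if pattern.search(url)]

print({"total": len(queue), "results": results, "matched": matched})
# -> {'total': 3, 'results': [...], 'matched': ['https://example.com/blog/post-1']}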
@@ -505,6 +569,32 @@ def init_crawls_api(
         if await ops.get_crawl_raw(crawl_id, archive):
             return {}
 
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queue",
+        tags=["crawls"],
+    )
+    async def get_crawl_queue(
+        crawl_id,
+        offset: int,
+        count: int,
+        regex: Optional[str] = "",
+        archive: Archive = Depends(archive_crawl_dep),
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.get_crawl_queue(crawl_id, offset, count, regex)
+
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/queueMatchAll",
+        tags=["crawls"],
+    )
+    async def match_crawl_queue(
+        crawl_id, regex: str, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        await ops.get_crawl_raw(crawl_id, archive)
+
+        return await ops.match_crawl_queue(crawl_id, regex)
+
     return ops
 
 
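
Finally, the response bodies implied by the two return statements above; the field names are taken from the code, while the concrete values are made up for illustration.

# GET .../crawls/{crawl_id}/queue?offset=0&count=3&regex=blog
# "results" is just the requested slice of the queue; "total" is the full
# queue length; "matched" is the regex hits within the returned slice.
queue_response = {
    "total": 1274,
    "results": [
        "https://example.com/",
        "https://example.com/about",
        "https://example.com/blog/post-1",
    ],
    "matched": ["https://example.com/blog/post-1"],
}

# GET .../crawls/{crawl_id}/queueMatchAll?regex=blog
# "matched" covers the entire queue, scanned on the backend in steps of 50.
match_all_response = {
    "total": 1274,
    "matched": ["https://example.com/blog/post-1"],
}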