backend: update queue apis to work with new sorted queue apis (also b… (#712)
* backend: update queue APIs to work with the new sorted-set based queue in browsertrix-crawler 0.9.0-beta.1, while remaining backwards compatible with the older list-based queue
parent b9a24fa5e2
commit 07e9f51292
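The change hinges on a simple detection trick: browsertrix-crawler 0.9.0-beta.1 stores the crawl queue as a Redis sorted set, while older crawlers store it as a list. Issuing a sorted-set command against a list key makes Redis answer with a WRONGTYPE error, which the Python client surfaces as a ResponseError, so each new helper tries the sorted-set command first and falls back to the list command when that exception fires. A minimal standalone sketch of the pattern (hypothetical function and key names, assuming a local Redis and the redis.asyncio client rather than whatever client the backend actually uses):

```python
# Hypothetical sketch, not part of the commit: queue length for either format.
import asyncio

from redis import asyncio as aioredis
from redis import exceptions


async def queue_len(redis, key):
    """Length of a crawl queue stored as a sorted set (new) or a list (old)."""
    try:
        # new sorted-set queue (browsertrix-crawler 0.9.0-beta.1+)
        return await redis.zcard(key)
    except exceptions.ResponseError:
        # WRONGTYPE reply: the key holds a list, i.e. the old queue format
        return await redis.llen(key)


async def main():
    redis = aioredis.from_url("redis://localhost", decode_responses=True)
    await redis.delete("old:q", "new:q")
    await redis.rpush("old:q", '{"url": "https://example.com/"}')
    await redis.zadd("new:q", {'{"url": "https://example.com/"}': 0})
    print(await queue_len(redis, "new:q"))  # 1, via zcard
    print(await queue_len(redis, "old:q"))  # 1, via llen fallback


if __name__ == "__main__":
    asyncio.run(main())
```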
@@ -502,6 +502,30 @@ class CrawlOps:
         # return whatever detail may be included in the response
         raise HTTPException(status_code=400, detail=result.get("error"))
 
+    async def _crawl_queue_len(self, redis, key):
+        try:
+            return await redis.zcard(key)
+        except exceptions.ResponseError:
+            # fallback to old crawler queue
+            return await redis.llen(key)
+
+    async def _crawl_queue_range(self, redis, key, offset, count):
+        try:
+            return await redis.zrangebyscore(key, 0, "inf", offset, count)
+        except exceptions.ResponseError:
+            # fallback to old crawler queue
+            return reversed(await redis.lrange(key, -offset - count, -offset - 1))
+
+    async def _crawl_queue_rem(self, redis, key, values, dircount=1):
+        try:
+            return await redis.zrem(key, *values)
+        except exceptions.ResponseError:
+            # fallback to old crawler queue
+            res = 0
+            for value in values:
+                res += await redis.lrem(key, dircount, value)
+            return res
+
     async def get_crawl_queue(self, crawl_id, offset, count, regex):
         """get crawl queue"""
 
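Of the three helpers, `_crawl_queue_range` does the real translation work: the sorted-set branch pages by ascending score using ZRANGEBYSCORE's offset/count arguments, while the list branch keeps the old behaviour of reading a window counted back from the tail and reversing it, so callers get the same offset/count paging for both formats (which is why the `reversed()` calls disappear from the call sites below). A small pure-Python illustration of that negative-index arithmetic (a toy LRANGE emulation, no Redis needed):

```python
# Toy illustration only: emulate LRANGE's inclusive, negative-index slicing to
# show what window the list fallback reads for a given offset/count.
def lrange(lst, start, stop):
    """Minimal LRANGE emulation: inclusive stop, negative indices count from the tail."""
    n = len(lst)
    if start < 0:
        start += n
    if stop < 0:
        stop += n
    return lst[max(start, 0) : stop + 1]


queue = [f"url-{i}" for i in range(10)]  # url-9 sits at the tail of the list

offset, count = 2, 3
page = list(reversed(lrange(queue, -offset - count, -offset - 1)))
print(page)  # ['url-7', 'url-6', 'url-5']: skip 2 entries from the tail, take 3
```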
@@ -511,9 +535,12 @@ class CrawlOps:
 
         try:
             redis = await self.get_redis(crawl_id)
-            total = await redis.llen(f"{crawl_id}:q")
-            results = await redis.lrange(f"{crawl_id}:q", -offset - count, -offset - 1)
-            results = [json.loads(result)["url"] for result in reversed(results)]
+            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
+            results = await self._crawl_queue_range(
+                redis, f"{crawl_id}:q", offset, count
+            )
+            results = [json.loads(result)["url"] for result in results]
         except exceptions.ConnectionError:
             # can't connect to redis, likely not initialized yet
             pass
@@ -525,9 +552,6 @@ class CrawlOps:
 
         return {"total": total, "results": results, "matched": matched}
 
-    async def iter_crawl_queue(self, regex, redis, crawl_id, total, step=50):
-        """iterate over urls that match regex in crawl queue list"""
-
     async def match_crawl_queue(self, crawl_id, regex):
         """get list of urls that match regex"""
         total = 0
@@ -535,7 +559,7 @@ class CrawlOps:
 
         try:
             redis = await self.get_redis(crawl_id)
-            total = await redis.llen(f"{crawl_id}:q")
+            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
         except exceptions.ConnectionError:
             # can't connect to redis, likely not initialized yet
             pass
@@ -545,8 +569,8 @@ class CrawlOps:
         step = 50
 
         for count in range(0, total, step):
-            results = await redis.lrange(f"{crawl_id}:q", -count - step, -count - 1)
-            for result in reversed(results):
+            results = await self._crawl_queue_range(redis, f"{crawl_id}:q", count, step)
+            for result in results:
                 url = json.loads(result)["url"]
                 if regex.search(url):
                     matched.append(url)
@@ -555,6 +579,7 @@ class CrawlOps:
 
     async def filter_crawl_queue(self, crawl_id, regex):
         """filter out urls that match regex"""
+        # pylint: disable=too-many-locals
         total = 0
         redis = None
 
@@ -563,7 +588,7 @@ class CrawlOps:
 
         try:
             redis = await self.get_redis(crawl_id)
-            total = await redis.llen(q_key)
+            total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
         except exceptions.ConnectionError:
             # can't connect to redis, likely not initialized yet
             pass
@@ -583,17 +608,29 @@ class CrawlOps:
         while count < total:
             if dircount == -1 and count > total / 2:
                 dircount = 1
-            results = await redis.lrange(q_key, -count - step, -count - 1)
+            results = await self._crawl_queue_range(redis, q_key, count, step)
             count += step
-            for result in reversed(results):
+
+            qrems = []
+            srems = []
+
+            for result in results:
                 url = json.loads(result)["url"]
                 if regex.search(url):
-                    await redis.srem(s_key, url)
-                    res = await redis.lrem(q_key, dircount, result)
-                    if res:
-                        count -= res
-                        num_removed += res
-                        print(f"Removed {result}: {res}", flush=True)
+                    srems.append(url)
+                    # await redis.srem(s_key, url)
+                    # res = await self._crawl_queue_rem(redis, q_key, result, dircount)
+                    qrems.append(result)
+
+            if not srems:
+                continue
+
+            await redis.srem(s_key, *srems)
+            res = await self._crawl_queue_rem(redis, q_key, qrems, dircount)
+            if res:
+                count -= res
+                num_removed += res
+                print(f"Removed {res} from queue", flush=True)
 
         return num_removed
 
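Beyond switching to the helpers, `filter_crawl_queue` now batches its removals: matching entries are collected into `qrems`/`srems` per page and removed with one SREM plus one `_crawl_queue_rem` call, instead of one SREM and one LREM round trip per matching URL. ZREM accepts many members at once and returns how many it removed, whereas LREM removes a single value per call, which is why the list fallback loops and sums the per-call counts. A hedged sketch of that batched removal (hypothetical names, same local-Redis assumption as the first example):

```python
# Hypothetical sketch, not part of the commit: batched removal with a list fallback.
import asyncio

from redis import asyncio as aioredis
from redis import exceptions


async def queue_remove(redis, key, values, dircount=1):
    """Remove a batch of queue entries; returns how many were actually removed."""
    try:
        # sorted-set queue: one ZREM round trip removes the whole batch
        return await redis.zrem(key, *values)
    except exceptions.ResponseError:
        # list queue: LREM takes a single value per call, so loop and sum
        removed = 0
        for value in values:
            removed += await redis.lrem(key, dircount, value)
        return removed


async def main():
    redis = aioredis.from_url("redis://localhost", decode_responses=True)
    await redis.delete("demo:q")
    entries = ['{"url": "https://example.com/page%d"}' % i for i in range(5)]
    await redis.zadd("demo:q", {entry: i for i, entry in enumerate(entries)})
    print(await queue_remove(redis, "demo:q", entries[:3]))  # 3


if __name__ == "__main__":
    asyncio.run(main())
```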