Watch Stream Directly from Browsertrix Crawler (#189)

* watch work: proxy watch streams directly to the crawl containers instead of relaying via Redis pubsub
- add 'watchIPs' to crawl detail output
- cache crawl container IPs in Redis for quick auth lookups
- add '/ipaccess/{ip}' endpoint so the watch WebSocket connection can verify access to the specified container IP
- enable 'auth_request' in the nginx frontend
- requirements: update to latest redis-py (redis>=4.2.0rc1)

Remaining fixes for #134.
This commit is contained in:
parent c18418ff09
commit cdd0ab34a3

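In the new design the browser opens its watch WebSocket against nginx, nginx authorizes the connection with an auth_request subrequest to the backend's new ipaccess endpoint, and on success proxies the socket straight to the crawler container. A minimal client-side sketch of that flow; the host, IDs, container IP, and token below are placeholders, not part of this commit:

import asyncio
import websockets

ARCHIVE_ID = "archive-123"      # placeholder archive id
CRAWL_ID = "crawl-abc"          # placeholder crawl id
CRAWLER_IP = "10.42.0.17"       # one of the crawl's watchIPs
TOKEN = "JWT_TOKEN"             # placeholder auth token

# nginx matches /watch/{archive}/{crawl}/{ip}/ws, runs auth_request against
# /ipaccess, then proxies the WebSocket directly to the crawler container.
URL = (
    f"wss://btrix.example.com/watch/{ARCHIVE_ID}/{CRAWL_ID}/{CRAWLER_IP}/ws"
    f"?auth_bearer={TOKEN}"
)

async def watch():
    async with websockets.connect(URL) as ws:
        while True:
            print(await ws.recv())  # screencast messages, treated as opaque here

asyncio.run(watch())
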
@@ -8,11 +8,11 @@ import os
 from typing import Optional, List, Dict, Union
 from datetime import datetime
 
-import websockets
-from fastapi import Depends, HTTPException, WebSocket
+from fastapi import Depends, HTTPException
 from pydantic import BaseModel, UUID4, conint
 import pymongo
-import aioredis
+from redis import asyncio as aioredis
 
 
 from db import BaseMongoModel
 from archives import Archive, MAX_CRAWL_SCALE

@@ -89,6 +89,8 @@ class CrawlOut(Crawl):
     configName: Optional[str]
     resources: Optional[List[CrawlFileOut]] = []
 
+    watchIPs: Optional[List[str]] = []
+
 
 # ============================================================================
 class ListCrawlOut(BaseMongoModel):

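For reference, a stripped-down sketch of how the new field surfaces in the crawl detail output; CrawlOut is heavily simplified here and the id and IP values are made up:

from typing import List, Optional
from pydantic import BaseModel

class CrawlOut(BaseModel):
    # simplified: the real model carries many more fields
    id: str
    watchIPs: Optional[List[str]] = []

print(CrawlOut(id="crawl-abc", watchIPs=["10.42.0.17"]).dict())
# {'id': 'crawl-abc', 'watchIPs': ['10.42.0.17']}
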
@@ -169,7 +171,6 @@ class CrawlOps:
         self.redis = await aioredis.from_url(
             redis_url, encoding="utf-8", decode_responses=True
         )
-        self.pubsub = self.redis.pubsub()
 
         loop = asyncio.get_running_loop()
         loop.create_task(self.run_crawl_complete_loop())

@@ -324,6 +325,7 @@ class CrawlOps:
         crawl = await self.crawl_manager.get_running_crawl(crawlid, archive.id_str)
         if crawl:
             await self.get_redis_stats([crawl])
+            await self.cache_ips(crawl)
 
         else:
             files = [CrawlFile(**data) for data in res["files"]]

@@ -421,6 +423,19 @@ class CrawlOps:
         for crawl, (done, total) in zip(crawl_list, pairwise(results)):
             crawl.stats = {"done": done, "found": total}
 
+    async def cache_ips(self, crawl: CrawlOut):
+        """ cache ips for ws auth check """
+        if crawl.watchIPs:
+            await self.redis.sadd(f"{crawl.id}:ips", *crawl.watchIPs)
+            await self.redis.expire(f"{crawl.id}:ips", 300)
+
+    async def ip_access_check(self, crawl_id, crawler_ip):
+        """ check if ip has access to this crawl based on redis cached ip """
+        if await self.redis.sismember(f"{crawl_id}:ips", crawler_ip):
+            return {}
+
+        raise HTTPException(status_code=403, detail="Unauthorized")
+
     async def delete_crawls(self, aid: uuid.UUID, delete_list: DeleteCrawlList):
         """ Delete a list of crawls by id for given archive """
         res = await self.crawls.delete_many(

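The auth cache is just a Redis set with a 5-minute TTL. A standalone sketch of the same pattern with the redis-py asyncio client; the redis:// URL, crawl id, and IPs are placeholders:

import asyncio
from redis import asyncio as aioredis

async def main():
    redis = await aioredis.from_url(
        "redis://localhost:6379/0", encoding="utf-8", decode_responses=True
    )
    crawl_id = "crawl-abc"

    # cache_ips: store the crawl's container IPs, expire the set after 300s
    await redis.sadd(f"{crawl_id}:ips", "10.42.0.17", "10.42.0.18")
    await redis.expire(f"{crawl_id}:ips", 300)

    # ip_access_check: membership test decides between 200 and 403
    print(await redis.sismember(f"{crawl_id}:ips", "10.42.0.17"))  # truthy -> allowed
    print(await redis.sismember(f"{crawl_id}:ips", "10.0.0.1"))    # falsy -> 403

asyncio.run(main())
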
@@ -428,37 +443,6 @@ class CrawlOps:
         )
         return res.deleted_count
 
-    async def handle_watch_ws(self, crawl_id: str, websocket: WebSocket):
-        """ Handle watch WS by proxying screencast data via redis pubsub """
-        # ensure websocket connected
-        await websocket.accept()
-
-        ctrl_channel = f"c:{crawl_id}:ctrl"
-        cast_channel = f"c:{crawl_id}:cast"
-
-        await self.redis.publish(ctrl_channel, "connect")
-
-        async with self.pubsub as chan:
-            await chan.subscribe(cast_channel)
-
-            # pylint: disable=broad-except
-            try:
-                while True:
-                    message = await chan.get_message(ignore_subscribe_messages=True)
-                    if not message:
-                        continue
-
-                    await websocket.send_text(message["data"])
-
-            except websockets.exceptions.ConnectionClosedOK:
-                pass
-
-            except Exception as exc:
-                print(exc, flush=True)
-
-            finally:
-                await self.redis.publish(ctrl_channel, "disconnect")
-
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals

@@ -559,13 +543,15 @@ def init_crawls_api(
 
         return {"scaled": scale.scale}
 
-    @app.websocket("/archives/{aid}/crawls/{crawl_id}/watch/ws")
-    async def watch_ws(
-        crawl_id, websocket: WebSocket, archive: Archive = Depends(archive_crawl_dep)
-    ):
-        # ensure crawl exists
-        await ops.get_crawl(crawl_id, archive)
-
-        await ops.handle_watch_ws(crawl_id, websocket)
+    @app.get(
+        "/archives/{aid}/crawls/{crawl_id}/ipaccess/{crawler_ip}",
+        tags=["crawls"],
+    )
+    # pylint: disable=unused-argument
+    async def ip_access_check(
+        crawl_id, crawler_ip, archive: Archive = Depends(archive_crawl_dep)
+    ):
+        return await ops.ip_access_check(crawl_id, crawler_ip)
 
     return ops

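nginx's auth_request subrequest is simply a GET against this route: 200 with an empty JSON body lets the proxy go ahead, 403 blocks it. A hedged sketch of making the same call by hand; the host, IDs, IP, and token are placeholders, and the auth_bearer query parameter mirrors what the nginx subrequest forwards:

import httpx

resp = httpx.get(
    "http://backend:8000/archives/archive-123/crawls/crawl-abc/ipaccess/10.42.0.17",
    params={"auth_bearer": "JWT_TOKEN"},
)
print(resp.status_code)  # 200 -> {} (IP belongs to the crawl), 403 -> Unauthorized
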
@@ -15,7 +15,8 @@ from tempfile import NamedTemporaryFile
 
 import aiodocker
 import aioprocessing
-import aioredis
+from redis import asyncio as aioredis
+
 
 from scheduler import run_scheduler
 

@@ -360,6 +361,7 @@ class DockerManager:
 
     async def get_running_crawl(self, crawl_id, aid=None):
         """ Return a single running crawl as CrawlOut """
+        # pylint: disable=broad-except,bare-except
         try:
             container = await self.client.containers.get(crawl_id)
 

@@ -373,10 +375,17 @@ class DockerManager:
             if stop_type == "canceled":
                 return None
 
-            return self._make_crawl_for_container(
+            crawl = self._make_crawl_for_container(
                 container, "stopping" if stop_type else "running", False, CrawlOut
             )
-        # pylint: disable=broad-except
+
+            try:
+                crawl.watchIPs = [container.attrs["NetworkSettings"]["IPAddress"]]
+            except:
+                crawl.watchIPs = []
+
+            return crawl
+
         except Exception as exc:
             print(exc, flush=True)
             return None

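The Docker path reads the crawler's IP from the container's inspect data, with the try/except guarding containers that lack a usable NetworkSettings entry. A standalone sketch of the same lookup via aiodocker's show(); the container name is a placeholder:

import asyncio
import aiodocker

async def container_ip(name: str):
    docker = aiodocker.Docker()
    try:
        container = await docker.containers.get(name)
        info = await container.show()  # full `docker inspect` data as a dict
        return info["NetworkSettings"]["IPAddress"] or None
    finally:
        await docker.close()

print(asyncio.run(container_ip("crawl-abc")))
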
@@ -5,7 +5,8 @@ import datetime
 import json
 import asyncio
 import base64
-import aioredis
+from redis import asyncio as aioredis
+
 
 from kubernetes_asyncio import client, config, watch
 from kubernetes_asyncio.stream import WsApiClient

@@ -388,7 +389,17 @@ class K8SManager:
             if not status:
                 return None
 
-            return self._make_crawl_for_job(job, status, False, CrawlOut)
+            crawl = self._make_crawl_for_job(job, status, False, CrawlOut)
+
+            pods = await self.core_api.list_namespaced_pod(
+                namespace=self.namespace,
+                label_selector=f"job-name={name},btrix.archive={aid}",
+            )
+
+            crawl.watchIPs = [
+                pod.status.pod_ip for pod in pods.items if pod.status.pod_ip
+            ]
+            return crawl
 
         # pylint: disable=broad-except
         except Exception:

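The Kubernetes path resolves the crawler pod IPs by label selector. A standalone sketch of the same lookup with kubernetes_asyncio; the namespace, job name, and archive id are placeholders:

import asyncio
from kubernetes_asyncio import client, config

async def watch_ips(namespace: str, job_name: str, aid: str):
    await config.load_kube_config()  # use config.load_incluster_config() inside the cluster
    core_api = client.CoreV1Api()
    pods = await core_api.list_namespaced_pod(
        namespace=namespace,
        label_selector=f"job-name={job_name},btrix.archive={aid}",
    )
    return [pod.status.pod_ip for pod in pods.items if pod.status.pod_ip]

print(asyncio.run(watch_ips("crawlers", "crawl-job-abc", "archive-123")))
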
@@ -8,5 +8,4 @@ aiodocker
 apscheduler
 aioprocessing
 aiobotocore
-aioredis
-websockets
+redis>=4.2.0rc1

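The standalone aioredis package has been folded into redis-py as redis.asyncio since 4.2.0, so the aliased import keeps the rest of the code unchanged. A minimal sketch of the swap; the redis:// URL is a placeholder:

# before: import aioredis
# after: redis-py >= 4.2 ships the asyncio client under redis.asyncio
from redis import asyncio as aioredis

redis = aioredis.from_url("redis://localhost:6379/0", decode_responses=True)
print(type(redis))  # <class 'redis.asyncio.client.Redis'>
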
@@ -64,18 +64,16 @@ spec:
            httpGet:
              path: /healthz
              port: 8000
-           failureThreshold: 12
+           initialDelaySeconds: 20
            periodSeconds: 5
-           timeoutSeconds: 3
-           failureThreshold: 5
+           failureThreshold: 30
 
          readinessProbe:
            httpGet:
              path: /healthz
              port: 8000
-           initialDelaySeconds: 5
+           initialDelaySeconds: 15
            periodSeconds: 30
-           timeoutSeconds: 3
            failureThreshold: 5
 
          livenessProbe:

@@ -84,7 +82,6 @@ spec:
              port: 8000
            initialDelaySeconds: 15
            periodSeconds: 30
-           timeoutSeconds: 3
            failureThreshold: 5
 
 

@@ -36,5 +36,28 @@ server {
         proxy_set_header Host $http_host;
         proxy_set_header X-Forwarded-Proto $scheme;
     }
 
+    location ~* /watch/([^/]+)/([^/]+)/([^/]+)/ws {
+        set $archive $1;
+        set $crawl $2;
+        set $crawlerip $3;
+        set $auth_bearer $arg_auth_bearer;
+        auth_request /ipaccess;
+
+        proxy_pass http://$crawlerip:9037/ws;
+        proxy_set_header Host "localhost";
+
+        proxy_http_version 1.1;
+        proxy_set_header Upgrade $http_upgrade;
+        proxy_set_header Connection $http_connection;
+    }
+
+    location = /ipaccess {
+        internal;
+        proxy_pass http://${BACKEND_HOST}:8000/archives/$archive/crawls/$crawl/ipaccess/$crawlerip?auth_bearer=$auth_bearer;
+        proxy_pass_request_body off;
+        proxy_set_header Content-Length "";
+    }
+
 }

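The watch location's regex does the routing work: $1, $2, and $3 become the archive, crawl, and crawler IP used by both the auth subrequest and the upstream proxy_pass. A quick sketch of what it captures; the URL below is a made-up example:

import re

WATCH_RE = re.compile(r"/watch/([^/]+)/([^/]+)/([^/]+)/ws")

m = WATCH_RE.search("/watch/archive-123/crawl-abc/10.42.0.17/ws")
archive, crawl, crawlerip = m.groups()
print(archive, crawl, crawlerip)  # archive-123 crawl-abc 10.42.0.17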