When a crawl is complete, the endpoint streams the logs from the log files in all of the created WACZ files, sorted by timestamp. The endpoint supports filtering by log_level and context whether or not the crawl is still running. This is not yet true streaming, because the entire log file is read into memory before being streamed to the client. We will want to switch to proper streaming eventually, but are currently blocked by an aiobotocore bug - see: https://github.com/aio-libs/aiobotocore/issues/991?#issuecomment-1490737762
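As a rough illustration of the filtering and ordering described above, the endpoint's behavior might look like the following sketch. This is not the actual endpoint code; the helper name filter_and_sort_logs and the field names logLevel, context, and timestamp are assumptions for illustration, based on the JSON log-line dicts returned by the module below.

def filter_and_sort_logs(log_lines, log_levels=None, contexts=None):
    """Hypothetical helper: filter parsed log-line dicts, then order by timestamp."""
    if log_levels:
        log_lines = [line for line in log_lines if line.get("logLevel") in log_levels]
    if contexts:
        log_lines = [line for line in log_lines if line.get("context") in contexts]
    # Assumes each parsed line carries an ISO-8601 "timestamp" string, so a
    # lexicographic sort is also a chronological sort.
    return sorted(log_lines, key=lambda line: line.get("timestamp", ""))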
"""
|
|
Storage API
|
|
"""
|
|
from typing import Union
|
|
from urllib.parse import urlsplit
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import Depends, HTTPException
|
|
from aiobotocore.session import get_session
|
|
|
|
from .orgs import Organization, DefaultStorage, S3Storage
|
|
from .users import User
|
|
from .zip import get_zip_file, extract_and_parse_log_file
|
|
|
|
|
|
# ============================================================================
|
|
def init_storages_api(org_ops, crawl_manager, user_dep):
|
|
"""API for updating storage for an org"""
|
|
|
|
router = org_ops.router
|
|
org_owner_dep = org_ops.org_owner_dep
|
|
|
|
# pylint: disable=bare-except, raise-missing-from
|
|
@router.patch("/storage", tags=["organizations"])
|
|
async def update_storage(
|
|
storage: Union[S3Storage, DefaultStorage],
|
|
org: Organization = Depends(org_owner_dep),
|
|
user: User = Depends(user_dep),
|
|
):
|
|
if storage.type == "default":
|
|
try:
|
|
await crawl_manager.check_storage(storage.name, is_default=True)
|
|
except:
|
|
raise HTTPException(
|
|
status_code=400, detail=f"Invalid default storage {storage.name}"
|
|
)
|
|
|
|
else:
|
|
try:
|
|
await verify_storage_upload(storage, ".btrix-upload-verify")
|
|
except:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="Could not verify custom storage. Check credentials are valid?",
|
|
)
|
|
|
|
await org_ops.update_storage(org, storage)
|
|
|
|
await crawl_manager.update_org_storage(org.id, str(user.id), org.storage)
|
|
|
|
return {"updated": True}
|
|
|
|
|
|
# ============================================================================
|
|
@asynccontextmanager
|
|
async def get_s3_client(storage, use_access=False):
|
|
"""context manager for s3 client"""
|
|
endpoint_url = (
|
|
storage.endpoint_url if not use_access else storage.access_endpoint_url
|
|
)
|
|
if not endpoint_url.endswith("/"):
|
|
endpoint_url += "/"
|
|
|
|
parts = urlsplit(endpoint_url)
|
|
bucket, key = parts.path[1:].split("/", 1)
|
|
|
|
endpoint_url = parts.scheme + "://" + parts.netloc
|
|
|
|
session = get_session()
|
|
|
|
async with session.create_client(
|
|
"s3",
|
|
region_name=storage.region,
|
|
endpoint_url=endpoint_url,
|
|
aws_access_key_id=storage.access_key,
|
|
aws_secret_access_key=storage.secret_key,
|
|
) as client:
|
|
yield client, bucket, key
|
|
|
|
|
|
# ============================================================================
|
|
async def verify_storage_upload(storage, filename):
|
|
"""Test credentials and storage endpoint by uploading an empty test file"""
|
|
|
|
async with get_s3_client(storage) as (client, bucket, key):
|
|
key += filename
|
|
data = b""
|
|
|
|
resp = await client.put_object(Bucket=bucket, Key=key, Body=data)
|
|
assert resp["ResponseMetadata"]["HTTPStatusCode"] == 200
|
|
|
|
|
|
# ============================================================================
|
|
async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
|
|
"""generate pre-signed url for crawl file"""
|
|
if crawlfile.def_storage_name:
|
|
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
|
|
|
|
elif org.storage.type == "s3":
|
|
s3storage = org.storage
|
|
|
|
else:
|
|
raise TypeError("No Default Storage Found, Invalid Storage Type")
|
|
|
|
async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
|
|
client,
|
|
bucket,
|
|
key,
|
|
):
|
|
key += crawlfile.filename
|
|
|
|
presigned_url = await client.generate_presigned_url(
|
|
"get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=duration
|
|
)
|
|
|
|
if (
|
|
not s3storage.use_access_for_presign
|
|
and s3storage.access_endpoint_url
|
|
and s3storage.access_endpoint_url != s3storage.endpoint_url
|
|
):
|
|
presigned_url = presigned_url.replace(
|
|
s3storage.endpoint_url, s3storage.access_endpoint_url
|
|
)
|
|
|
|
return presigned_url
|
|
|
|
|
|
# ============================================================================
|
|
async def delete_crawl_file_object(org, crawlfile, crawl_manager):
|
|
"""delete crawl file from storage."""
|
|
status_code = None
|
|
|
|
if crawlfile.def_storage_name:
|
|
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
|
|
|
|
elif org.storage.type == "s3":
|
|
s3storage = org.storage
|
|
|
|
else:
|
|
raise TypeError("No Default Storage Found, Invalid Storage Type")
|
|
|
|
async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
|
|
client,
|
|
bucket,
|
|
key,
|
|
):
|
|
key += crawlfile.filename
|
|
response = await client.delete_object(Bucket=bucket, Key=key)
|
|
status_code = response["ResponseMetadata"]["HTTPStatusCode"]
|
|
|
|
return status_code
|
|
|
|
|
|
# ============================================================================
|
|
async def get_wacz_logs(org, crawlfile, crawl_manager):
|
|
"""Return combined and sorted list of log line dicts from all logs in WACZ."""
|
|
if crawlfile.def_storage_name:
|
|
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
|
|
|
|
elif org.storage.type == "s3":
|
|
s3storage = org.storage
|
|
|
|
else:
|
|
raise TypeError("No Default Storage Found, Invalid Storage Type")
|
|
|
|
async with get_s3_client(s3storage, s3storage.use_access_for_presign) as (
|
|
client,
|
|
bucket,
|
|
key,
|
|
):
|
|
key += crawlfile.filename
|
|
cd_start, zip_file = await get_zip_file(client, bucket, key)
|
|
log_files = [
|
|
f
|
|
for f in zip_file.filelist
|
|
if f.filename.startswith("logs/") and not f.is_dir()
|
|
]
|
|
|
|
combined_log_lines = []
|
|
|
|
for log_zipinfo in log_files:
|
|
parsed_log_lines = await extract_and_parse_log_file(
|
|
client, bucket, key, log_zipinfo, cd_start
|
|
)
|
|
combined_log_lines.extend(parsed_log_lines)
|
|
|
|
return combined_log_lines
|
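For context, a hedged usage sketch of get_wacz_logs: the endpoint described at the top would gather the log lines from every WACZ file of a crawl and merge them. The stream_crawl_logs wrapper and the crawl.files attribute are assumptions for illustration, not part of this module.

async def stream_crawl_logs(crawl, org, crawl_manager):
    """Hypothetical caller: merge log lines from every WACZ file of a crawl."""
    combined = []
    for wacz_file in crawl.files:  # assumed list of crawl file records
        combined.extend(await get_wacz_logs(org, wacz_file, crawl_manager))
    # Sort across files by timestamp so the merged output reads chronologically.
    combined.sort(key=lambda line: line.get("timestamp", ""))
    return combined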