browsertrix/backend/btrixcloud/zip.py
Tessa Walsh fb80a04f18 Add crawl /log API endpoint
If a crawl is completed, the endpoint streams the logs from the log
files in all of the created WACZ files, sorted by timestamp.

The API endpoint supports filtering by log_level and context, whether or
not the crawl is still running.

This is not yet proper streaming because the entire log file is read
into memory before being streamed to the client. We will want to
switch to proper streaming eventually, but are currently blocked by
an aiobotocore bug - see:

https://github.com/aio-libs/aiobotocore/issues/991?#issuecomment-1490737762
2023-04-11 11:51:17 -04:00

148 lines
4.7 KiB
Python

"""
Methods for interacting with zip/WACZ files
"""
import io
import json
import os
import struct
import zipfile
import zlib
from fastapi import HTTPException
# ============================================================================
EOCD_RECORD_SIZE = 22
ZIP64_EOCD_RECORD_SIZE = 56
ZIP64_EOCD_LOCATOR_SIZE = 20
MAX_STANDARD_ZIP_SIZE = 4_294_967_295
# ============================================================================
async def extract_and_parse_log_file(client, bucket, key, log_zipinfo, cd_start):
    """Return parsed JSON from extracted and uncompressed log"""
    # pylint: disable=too-many-locals
    # The local file header stores the filename and extra-field lengths
    # at fixed offsets 26 and 28 (two little-endian uint16s).
    header_start = cd_start + log_zipinfo.header_offset
    lengths = await fetch(client, bucket, key, header_start + 26, 4)
    name_len = parse_little_endian_to_int(lengths[:2])
    extra_len = parse_little_endian_to_int(lengths[2:])

    # File data begins after the 30-byte fixed header plus the
    # variable-length name and extra fields.
    data_start = header_start + 30 + name_len + extra_len
    raw = await fetch(client, bucket, key, data_start, log_zipinfo.compress_size)

    if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
        # Zip entries are raw deflate streams (no zlib header), hence -MAX_WBITS.
        uncompressed = zlib.decompressobj(-zlib.MAX_WBITS).decompress(raw)
    else:
        uncompressed = raw

    actual_size = len(uncompressed)
    if log_zipinfo.file_size != actual_size:
        # pylint: disable=line-too-long
        detail = f"Error extracting log file {log_zipinfo.filename} from WACZ {os.path.basename(key)}."
        detail += f" Expected {log_zipinfo.file_size} bytes uncompressed but found {actual_size}"
        print(detail, flush=True)
        raise HTTPException(status_code=500, detail=detail)

    # Logs are JSON-lines; skip blanks and tolerate malformed lines.
    parsed_lines = []
    for json_line in uncompressed.decode("utf-8").split("\n"):
        if not json_line:
            continue
        try:
            parsed_lines.append(json.loads(json_line))
        except json.JSONDecodeError as err:
            print(f"Error decoding json-l line: {json_line}. Error: {err}", flush=True)
    return parsed_lines
async def get_zip_file(client, bucket, key):
    """Fetch just enough of the WACZ to be able to read the zip filelist"""
    file_size = await get_file_size(client, bucket, key)

    # The EOCD record always sits at the very end of the archive.
    eocd_record = await fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        # Standard zip: the EOCD record alone locates the central directory.
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = await fetch(client, bucket, key, cd_start, cd_size)
        buf = io.BytesIO(central_directory + eocd_record)
        return cd_start, zipfile.ZipFile(buf)

    # Zip64: the EOCD64 record and its locator precede the EOCD record.
    locator_and_eocd = EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE
    zip64_eocd_record = await fetch(
        client,
        bucket,
        key,
        file_size - (locator_and_eocd + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = await fetch(
        client,
        bucket,
        key,
        file_size - locator_and_eocd,
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = await fetch(client, bucket, key, cd_start, cd_size)
    buf = io.BytesIO(
        central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
    )
    return cd_start, zipfile.ZipFile(buf)
async def get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    # HEAD avoids downloading any of the (potentially huge) object body.
    response = await client.head_object(Bucket=bucket, Key=key)
    return response["ContentLength"]
async def fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    # HTTP Range headers use inclusive end offsets.
    byte_range = f"bytes={start}-{start + length - 1}"
    response = await client.get_object(Bucket=bucket, Key=key, Range=byte_range)
    return await response["Body"].read()
def get_central_directory_metadata_from_eocd(eocd):
    """Get central directory start and size"""
    # EOCD layout: cd size occupies bytes 12-15, cd start offset bytes 16-19
    # (both little-endian 32-bit fields).
    cd_start = parse_little_endian_to_int(eocd[16:20])
    cd_size = parse_little_endian_to_int(eocd[12:16])
    return cd_start, cd_size
def get_central_directory_metadata_from_eocd64(eocd64):
    """Get central directory start and size for zip64"""
    # Zip64 EOCD layout: cd size at bytes 40-47, cd start offset at
    # bytes 48-55 (both little-endian 64-bit fields).
    cd_start = parse_little_endian_to_int(eocd64[48:56])
    cd_size = parse_little_endian_to_int(eocd64[40:48])
    return cd_start, cd_size
def parse_little_endian_to_int(little_endian_bytes):
    """Convert a little-endian byte field from the zip spec to an int.

    All zip sizes and offsets are unsigned, so the bytes are decoded as an
    unsigned integer. The previous struct-based implementation used signed
    format characters ("h"/"i"/"q"), which returned negative values for any
    field >= 2**31 (or >= 2**15 for 2-byte fields) — e.g. central-directory
    offsets in archives larger than 2 GiB — corrupting subsequent range
    fetches. Decoding unsigned also generalizes to any field width.

    :param little_endian_bytes: bytes of the field (typically 2, 4, or 8)
    :return: the non-negative integer value
    """
    return int.from_bytes(little_endian_bytes, "little")