* Fix(backend): Stream logs without causing OOM

Also be smarter about when to use `heapq.merge` and when to use
`itertools.chain`: if all the logs are coming from the same instance we
`chain` them, otherwise we `merge` them.

Iterator fixes:
- group WACZ files by instance by suffix, e.g. -0.wacz, -1.wacz, -2.wacz
- sort WACZ files, and all logs within each WACZ file
- chain log iterators for all log files within a WACZ group
- merge log iterators across WACZ files in different groups
- add type hints to help keep track of iterator helper functions
- add iter_lines() from botocore, use that for line parsing for simplicity

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
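
The chain-vs-merge rule described above might look roughly like this (a
minimal sketch, not the actual implementation: the grouping shape,
`combine_log_streams`, and the assumption that every iterator yields lines
already ordered by a leading timestamp are all illustrative):

```python
import heapq
import itertools
from typing import Dict, Iterator, List


def combine_log_streams(
    log_iters_by_instance: Dict[str, List[Iterator[bytes]]]
) -> Iterator[bytes]:
    """Combine per-WACZ log line iterators into one ordered stream."""
    # Within one instance group (-0.wacz, -1.wacz, ...) the sorted log
    # files don't overlap in time, so chaining preserves order cheaply
    per_instance = [
        itertools.chain(*iters) for iters in log_iters_by_instance.values()
    ]
    if len(per_instance) == 1:
        # Single instance: plain chain, no heap overhead
        return per_instance[0]
    # Lines from different instances interleave in time, so lazily
    # merge-sort across groups (assumes lines sort by leading timestamp)
    return heapq.merge(*per_instance)
```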
"""
|
|
Methods for interacting with zip/WACZ files
|
|
"""
|
|
import io
|
|
import struct
|
|
import zipfile
|
|
import zlib
|
|
|
|
|
|
# ============================================================================
|
|
EOCD_RECORD_SIZE = 22
|
|
ZIP64_EOCD_RECORD_SIZE = 56
|
|
ZIP64_EOCD_LOCATOR_SIZE = 20
|
|
|
|
MAX_STANDARD_ZIP_SIZE = 4_294_967_295
|
|
|
|
CHUNK_SIZE = 1024 * 256
|
|
|
|
|
|


# ============================================================================
def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start):
    """Return uncompressed byte stream of log file in WACZ"""
    # pylint: disable=too-many-locals
    # The local file header stores the file name length at bytes 26-27 and
    # the extra field length at bytes 28-29; read both to locate the data
    file_head = sync_fetch(
        client, bucket, key, cd_start + log_zipinfo.header_offset + 26, 4
    )
    name_len = parse_little_endian_to_int(file_head[0:2])
    extra_len = parse_little_endian_to_int(file_head[2:4])

    # File data starts after the 30-byte fixed header, the name, and the
    # extra field
    content = sync_fetch_stream(
        client,
        bucket,
        key,
        cd_start + log_zipinfo.header_offset + 30 + name_len + extra_len,
        log_zipinfo.compress_size,
    )

    if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
        # content is a chunk iterator, so decompress incrementally instead
        # of materializing the whole log in memory
        decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
        uncompressed_content = (decompressor.decompress(chunk) for chunk in content)
    else:
        uncompressed_content = content

    return sync_iter_lines(uncompressed_content)
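

# Illustrative usage (bucket/key names and handle() are hypothetical):
#
#   cd_start, zip_file = sync_get_zip_file(client, "my-bucket", "crawl-0.wacz")
#   for info in zip_file.infolist():
#       if info.filename.startswith("logs/"):
#           for line in sync_get_log_stream(
#               client, "my-bucket", "crawl-0.wacz", info, cd_start
#           ):
#               handle(line)  # lines stream through; the file is never buffered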


def sync_iter_lines(chunk_iter, keepends=True):
    """
    Iter by lines, adapted from botocore
    """
    pending = b""
    for chunk in chunk_iter:
        # Prepend any partial line carried over from the previous chunk
        lines = (pending + chunk).splitlines(True)
        for line in lines[:-1]:
            yield line.splitlines(keepends)[0]
        pending = lines[-1]
    if pending:
        yield pending.splitlines(keepends)[0]
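
# For example, with chunks b"a\nb" and b"c\n" this yields b"a\n" and then
# b"bc\n" (keepends=True); a final partial line is flushed after the loop.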


async def get_zip_file(client, bucket, key):
    """Fetch enough of the WACZ file to be able to read the zip filelist"""
    file_size = await get_file_size(client, bucket, key)
    eocd_record = await fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = await fetch(client, bucket, key, cd_start, cd_size)
        # A ZipFile can be parsed from just the central directory + EOCD
        return (
            cd_start,
            zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
        )

    # zip64: the EOCD64 record and its locator sit directly before the EOCD
    zip64_eocd_record = await fetch(
        client,
        bucket,
        key,
        file_size
        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = await fetch(
        client,
        bucket,
        key,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = await fetch(client, bucket, key, cd_start, cd_size)
    return (
        cd_start,
        zipfile.ZipFile(
            io.BytesIO(
                central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
            )
        ),
    )
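
# Illustrative usage with an async (aiobotocore-style) client; names are
# hypothetical:
#
#   cd_start, zip_file = await get_zip_file(client, "my-bucket", "crawl-0.wacz")
#   log_files = [
#       info for info in zip_file.infolist() if info.filename.startswith("logs/")
#   ]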


def sync_get_zip_file(client, bucket, key):
    """Fetch enough of the WACZ file to be able to read the zip filelist"""
    file_size = sync_get_file_size(client, bucket, key)
    eocd_record = sync_fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
        with zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)) as zip_file:
            return (cd_start, zip_file)

    # zip64: the EOCD64 record and its locator sit directly before the EOCD
    zip64_eocd_record = sync_fetch(
        client,
        bucket,
        key,
        file_size
        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = sync_fetch(
        client,
        bucket,
        key,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
    with zipfile.ZipFile(
        io.BytesIO(
            central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
        )
    ) as zip_file:
        return (cd_start, zip_file)


async def get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    head_response = await client.head_object(Bucket=bucket, Key=key)
    return head_response["ContentLength"]


def sync_get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    head_response = client.head_object(Bucket=bucket, Key=key)
    return head_response["ContentLength"]


async def fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    # HTTP Range is inclusive on both ends, hence the - 1
    end = start + length - 1
    response = await client.get_object(
        Bucket=bucket, Key=key, Range=f"bytes={start}-{end}"
    )
    return await response["Body"].read()


def sync_fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    end = start + length - 1
    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
    return response["Body"].read()


def sync_fetch_stream(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage as a stream"""
    end = start + length - 1
    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
    return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
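
# Note: sync_fetch_stream never materializes the whole range; botocore's
# StreamingBody.iter_chunks() yields successive CHUNK_SIZE pieces, which is
# what lets sync_get_log_stream decompress and yield lines lazily.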


def get_central_directory_metadata_from_eocd(eocd):
    """Get central directory start and size"""
    # EOCD record: central directory size at bytes 12-15, offset at 16-19
    cd_size = parse_little_endian_to_int(eocd[12:16])
    cd_start = parse_little_endian_to_int(eocd[16:20])
    return cd_start, cd_size


def get_central_directory_metadata_from_eocd64(eocd64):
    """Get central directory start and size for zip64"""
    # zip64 EOCD record: central directory size at bytes 40-47, offset at 48-55
    cd_size = parse_little_endian_to_int(eocd64[40:48])
    cd_start = parse_little_endian_to_int(eocd64[48:56])
    return cd_start, cd_size


def parse_little_endian_to_int(little_endian_bytes):
    """Convert little endian used in zip spec to int"""
    byte_length = len(little_endian_bytes)
    # zip fields are unsigned, so use the unsigned struct formats to avoid
    # negative results for values at or above 2 GiB
    format_character = "Q"
    if byte_length == 4:
        format_character = "I"
    elif byte_length == 2:
        format_character = "H"

    return struct.unpack("<" + format_character, little_endian_bytes)[0]