browsertrix/backend/btrixcloud/zip.py
Tessa Walsh 2efc461b9b
Implement sync streaming for finished crawl logs (#1168)
- Crawl logs streamed from WACZs using the sync boto client
2023-09-14 17:05:19 -07:00

185 lines
6.0 KiB
Python

"""
Methods for interacting with zip/WACZ files
"""
import io
import struct
import zipfile
import zlib
# ============================================================================
# Size in bytes of the end-of-central-directory (EOCD) record, read from the
# very end of the file
EOCD_RECORD_SIZE = 22
# Size in bytes of the zip64 EOCD record fetched for files over the standard
# zip size limit
ZIP64_EOCD_RECORD_SIZE = 56
# Size in bytes of the zip64 EOCD locator, fetched from between the zip64 EOCD
# record and the EOCD record
ZIP64_EOCD_LOCATOR_SIZE = 20
# Largest file size (2**32 - 1 bytes) treated as a standard zip; anything
# larger is read via the zip64 records
MAX_STANDARD_ZIP_SIZE = 4_294_967_295
# Chunk size in bytes (256 KiB) requested when streaming an object body
CHUNK_SIZE = 1024 * 256
# ============================================================================
def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start):
    """Return uncompressed byte stream of log file in WACZ

    Reads the member's local file header to find where its data begins,
    then streams the (possibly deflated) data from object storage.

    :param client: sync storage client exposing ``get_object``
    :param bucket: bucket containing the WACZ
    :param key: object key of the WACZ
    :param log_zipinfo: zip entry for the log file; its ``header_offset``,
        ``compress_size`` and ``compress_type`` attributes are used
    :param cd_start: offset of the central directory within the WACZ
    :returns: iterator of uncompressed byte chunks
    """
    # pylint: disable=too-many-locals
    # NOTE(review): header_offset is presumably relative to the fetched
    # central directory (the ZipFile was built from a truncated file), which
    # is why cd_start is added back to get an absolute offset -- confirm.
    # Bytes 26-29 of the local file header hold the file name length and the
    # extra field length (2 bytes each).
    file_head = sync_fetch(
        client, bucket, key, cd_start + log_zipinfo.header_offset + 26, 4
    )
    name_len = parse_little_endian_to_int(file_head[0:2])
    extra_len = parse_little_endian_to_int(file_head[2:4])

    # The member data follows the 30-byte fixed header, the file name and
    # the extra field.
    content = sync_fetch_stream(
        client,
        bucket,
        key,
        cd_start + log_zipinfo.header_offset + 30 + name_len + extra_len,
        log_zipinfo.compress_size,
    )

    if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
        # content is an iterator of chunks, not bytes, so it has to be
        # inflated incrementally rather than handed to decompress() whole.
        return _inflate_chunks(content)

    return content


def _inflate_chunks(chunks):
    """Lazily inflate an iterable of raw-deflate byte chunks"""
    # Negative wbits selects a raw deflate stream (no zlib header/trailer),
    # which is how zip members are stored.
    decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
    for chunk in chunks:
        inflated = decompressor.decompress(chunk)
        if inflated:
            yield inflated

    # Emit anything still buffered once the input is exhausted.
    remainder = decompressor.flush()
    if remainder:
        yield remainder
async def get_zip_file(client, bucket, key):
    """Fetch just enough of the WACZ file to be able to read the zip filelist

    Only the central directory and the trailing end-of-central-directory
    records are downloaded; they are stitched together in memory and opened
    with ``zipfile.ZipFile``.

    :returns: tuple of (central directory start offset, ZipFile)
    """
    total_size = await get_file_size(client, bucket, key)
    eocd_offset = total_size - EOCD_RECORD_SIZE
    eocd = await fetch(client, bucket, key, eocd_offset, EOCD_RECORD_SIZE)

    if total_size > MAX_STANDARD_ZIP_SIZE:
        # Over the standard zip size limit: the central directory location is
        # read from the zip64 EOCD record, which sits before the zip64
        # locator, which in turn sits before the EOCD record.
        locator_offset = eocd_offset - ZIP64_EOCD_LOCATOR_SIZE
        eocd64_offset = locator_offset - ZIP64_EOCD_RECORD_SIZE
        eocd64 = await fetch(
            client, bucket, key, eocd64_offset, ZIP64_EOCD_RECORD_SIZE
        )
        locator = await fetch(
            client, bucket, key, locator_offset, ZIP64_EOCD_LOCATOR_SIZE
        )
        cd_start, cd_size = get_central_directory_metadata_from_eocd64(eocd64)
        trailer = eocd64 + locator + eocd
    else:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd)
        trailer = eocd

    directory = await fetch(client, bucket, key, cd_start, cd_size)
    return cd_start, zipfile.ZipFile(io.BytesIO(directory + trailer))
def sync_get_zip_file(client, bucket, key):
    """Fetch just enough of the WACZ file to be able to read the zip filelist

    Synchronous variant: only the central directory and the trailing
    end-of-central-directory records are downloaded, stitched together in
    memory and opened with ``zipfile.ZipFile``.

    :returns: tuple of (central directory start offset, ZipFile)
    """
    total_size = sync_get_file_size(client, bucket, key)
    eocd_offset = total_size - EOCD_RECORD_SIZE
    eocd = sync_fetch(client, bucket, key, eocd_offset, EOCD_RECORD_SIZE)

    if total_size > MAX_STANDARD_ZIP_SIZE:
        # Over the standard zip size limit: the central directory location is
        # read from the zip64 EOCD record, which sits before the zip64
        # locator, which in turn sits before the EOCD record.
        locator_offset = eocd_offset - ZIP64_EOCD_LOCATOR_SIZE
        eocd64_offset = locator_offset - ZIP64_EOCD_RECORD_SIZE
        eocd64 = sync_fetch(
            client, bucket, key, eocd64_offset, ZIP64_EOCD_RECORD_SIZE
        )
        locator = sync_fetch(
            client, bucket, key, locator_offset, ZIP64_EOCD_LOCATOR_SIZE
        )
        cd_start, cd_size = get_central_directory_metadata_from_eocd64(eocd64)
        trailer = eocd64 + locator + eocd
    else:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd)
        trailer = eocd

    directory = sync_fetch(client, bucket, key, cd_start, cd_size)
    # Returning from inside the with block closes the archive on the way out,
    # so the caller receives an already-closed ZipFile.
    with zipfile.ZipFile(io.BytesIO(directory + trailer)) as archive:
        return (cd_start, archive)
async def get_file_size(client, bucket, key):
    """Look up the WACZ file size with a HEAD request

    :returns: the ``ContentLength`` value of the ``head_object`` response
    """
    metadata = await client.head_object(Bucket=bucket, Key=key)
    size = metadata["ContentLength"]
    return size
def sync_get_file_size(client, bucket, key):
    """Look up the WACZ file size with a HEAD request (sync client)

    :returns: the ``ContentLength`` value of the ``head_object`` response
    """
    metadata = client.head_object(Bucket=bucket, Key=key)
    size = metadata["ContentLength"]
    return size
async def fetch(client, bucket, key, start, length):
    """Read ``length`` bytes starting at offset ``start`` from a stored object

    :returns: the bytes read from the response body
    """
    # The Range header names the last byte wanted, hence the -1.
    last_byte = start + length - 1
    byte_range = f"bytes={start}-{last_byte}"
    result = await client.get_object(Bucket=bucket, Key=key, Range=byte_range)
    body = result["Body"]
    return await body.read()
def sync_fetch(client, bucket, key, start, length):
    """Read ``length`` bytes starting at offset ``start`` from a stored object

    :returns: the bytes read from the response body
    """
    # The Range header names the last byte wanted, hence the -1.
    last_byte = start + length - 1
    byte_range = f"bytes={start}-{last_byte}"
    result = client.get_object(Bucket=bucket, Key=key, Range=byte_range)
    body = result["Body"]
    return body.read()
def sync_fetch_stream(client, bucket, key, start, length):
    """Stream ``length`` bytes starting at offset ``start`` from a stored object

    :returns: the response body's ``iter_chunks`` iterator, producing chunks
        of up to ``CHUNK_SIZE`` bytes
    """
    # The Range header names the last byte wanted, hence the -1.
    last_byte = start + length - 1
    byte_range = f"bytes={start}-{last_byte}"
    result = client.get_object(Bucket=bucket, Key=key, Range=byte_range)
    body = result["Body"]
    return body.iter_chunks(chunk_size=CHUNK_SIZE)
def get_central_directory_metadata_from_eocd(eocd):
    """Extract the central directory location from an EOCD record

    :param eocd: bytes of the end-of-central-directory record
    :returns: tuple of (central directory start offset, central directory size)
    """
    # Size is stored in bytes 12-15 and the start offset in bytes 16-19;
    # they are parsed in that order.
    field_bounds = ((12, 16), (16, 20))
    directory_size, directory_start = (
        parse_little_endian_to_int(eocd[begin:stop]) for begin, stop in field_bounds
    )
    return directory_start, directory_size
def get_central_directory_metadata_from_eocd64(eocd64):
    """Extract the central directory location from a zip64 EOCD record

    :param eocd64: bytes of the zip64 end-of-central-directory record
    :returns: tuple of (central directory start offset, central directory size)
    """
    # Size is stored in bytes 40-47 and the start offset in bytes 48-55;
    # they are parsed in that order.
    field_bounds = ((40, 48), (48, 56))
    directory_size, directory_start = (
        parse_little_endian_to_int(eocd64[begin:stop]) for begin, stop in field_bounds
    )
    return directory_start, directory_size
def parse_little_endian_to_int(little_endian_bytes):
    """Convert little endian used in zip spec to int

    Zip size, length and offset fields are unsigned, so the value is
    decoded as an unsigned integer; a signed decode would turn any field
    with its top bit set (e.g. an offset of 2 GiB or more in a 4-byte
    field) into a negative number.

    :param little_endian_bytes: 2, 4 or 8 bytes in little-endian order
    :returns: the decoded non-negative int
    :raises struct.error: if the input is not 2, 4 or 8 bytes long
    """
    # 2- and 4-byte fields get their own format; anything else is treated as
    # an 8-byte field, so struct rejects other lengths.
    unsigned_formats = {2: "H", 4: "I"}
    format_character = unsigned_formats.get(len(little_endian_bytes), "Q")
    return struct.unpack("<" + format_character, little_endian_bytes)[0]