If a crawl is complete, the endpoint streams the logs from the log files in all of the crawl's WACZ files, sorted by timestamp. The endpoint supports filtering by log_level and context whether or not the crawl is still running. This is not yet true streaming, because each log file is read into memory in full before being sent to the client. We will want to switch to proper streaming eventually, but are currently blocked by an aiobotocore bug - see: https://github.com/aio-libs/aiobotocore/issues/991?#issuecomment-1490737762
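As a rough sketch of what that merge-and-filter step can look like (this is illustrative, not the actual endpoint code: the function name is hypothetical, and it assumes crawler log lines are JSON objects carrying "timestamp", "logLevel", and "context" fields, with each per-WACZ list already sorted by timestamp):

import heapq

def merge_and_filter_logs(parsed_wacz_logs, log_levels=None, contexts=None):
    """Merge per-WACZ lists of JSON-decoded log dicts by timestamp, then filter"""
    merged = heapq.merge(*parsed_wacz_logs, key=lambda line: line["timestamp"])
    for line in merged:
        if log_levels and line.get("logLevel") not in log_levels:
            continue
        if contexts and line.get("context") not in contexts:
            continue
        yield line
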
"""
|
|
Methods for interacting with zip/WACZ files
|
|
"""
|
|
import io
|
|
import json
|
|
import os
|
|
import struct
|
|
import zipfile
|
|
import zlib
|
|
|
|
from fastapi import HTTPException
|
|
|
|
|
|
# ============================================================================
|
|
EOCD_RECORD_SIZE = 22
|
|
ZIP64_EOCD_RECORD_SIZE = 56
|
|
ZIP64_EOCD_LOCATOR_SIZE = 20
|
|
|
|
MAX_STANDARD_ZIP_SIZE = 4_294_967_295
|
|
|
|
|
|
# ============================================================================
|
|
async def extract_and_parse_log_file(client, bucket, key, log_zipinfo, cd_start):
    """Return parsed JSON from extracted and uncompressed log"""
    # pylint: disable=too-many-locals
    # The local file header stores the file name length at offset 26 and the
    # extra field length at offset 28, each as a 2-byte little-endian int.
    # header_offset is relative here because the ZipFile was built from only
    # the central directory bytes; adding cd_start gives the absolute offset.
    file_head = await fetch(
        client, bucket, key, cd_start + log_zipinfo.header_offset + 26, 4
    )
    name_len = parse_little_endian_to_int(file_head[0:2])
    extra_len = parse_little_endian_to_int(file_head[2:4])

    # File data starts after the 30-byte local file header, the file name,
    # and the extra field
    content = await fetch(
        client,
        bucket,
        key,
        cd_start + log_zipinfo.header_offset + 30 + name_len + extra_len,
        log_zipinfo.compress_size,
    )

    if log_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
        # Negative wbits: raw deflate stream with no zlib header
        uncompressed_content = zlib.decompressobj(-zlib.MAX_WBITS).decompress(content)
    else:
        uncompressed_content = content

    content_length = len(uncompressed_content)
    if log_zipinfo.file_size != content_length:
        # pylint: disable=line-too-long
        detail = f"Error extracting log file {log_zipinfo.filename} from WACZ {os.path.basename(key)}."
        detail += f" Expected {log_zipinfo.file_size} bytes uncompressed but found {content_length}"
        print(detail, flush=True)
        raise HTTPException(status_code=500, detail=detail)

    parsed_log_lines = []

    for json_line in uncompressed_content.decode("utf-8").split("\n"):
        if not json_line:
            continue
        try:
            result = json.loads(json_line)
            parsed_log_lines.append(result)
        except json.JSONDecodeError as err:
            print(f"Error decoding json-l line: {json_line}. Error: {err}", flush=True)

    return parsed_log_lines


async def get_zip_file(client, bucket, key):
    """Fetch enough of the WACZ file to be able to read the zip filelist"""
    file_size = await get_file_size(client, bucket, key)
    eocd_record = await fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = await fetch(client, bucket, key, cd_start, cd_size)
        return (
            cd_start,
            zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
        )

    # zip64: the EOCD record is preceded by a zip64 EOCD locator, which is in
    # turn preceded by a zip64 EOCD record
    zip64_eocd_record = await fetch(
        client,
        bucket,
        key,
        file_size
        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = await fetch(
        client,
        bucket,
        key,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = await fetch(client, bucket, key, cd_start, cd_size)
    return (
        cd_start,
        zipfile.ZipFile(
            io.BytesIO(
                central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
            )
        ),
    )


async def get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    head_response = await client.head_object(Bucket=bucket, Key=key)
    return head_response["ContentLength"]


async def fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    end = start + length - 1
    response = await client.get_object(
        Bucket=bucket, Key=key, Range=f"bytes={start}-{end}"
    )
    return await response["Body"].read()


def get_central_directory_metadata_from_eocd(eocd):
    """Get central directory start and size"""
    # Per the zip spec, the EOCD record stores the central directory size as a
    # 4-byte int at offset 12 and its start offset as a 4-byte int at offset 16
    cd_size = parse_little_endian_to_int(eocd[12:16])
    cd_start = parse_little_endian_to_int(eocd[16:20])
    return cd_start, cd_size


def get_central_directory_metadata_from_eocd64(eocd64):
    """Get central directory start and size for zip64"""
    # The zip64 EOCD record stores the same fields as 8-byte ints at
    # offsets 40 and 48
    cd_size = parse_little_endian_to_int(eocd64[40:48])
    cd_start = parse_little_endian_to_int(eocd64[48:56])
    return cd_start, cd_size


def parse_little_endian_to_int(little_endian_bytes):
    """Convert little endian used in zip spec to int"""
    byte_length = len(little_endian_bytes)
    # Use unsigned formats: zip sizes and offsets are unsigned, and signed
    # formats would go negative for values above half the type's range
    format_character = "Q"
    if byte_length == 4:
        format_character = "I"
    elif byte_length == 2:
        format_character = "H"

    return struct.unpack("<" + format_character, little_endian_bytes)[0]
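
For reference, a minimal sketch of how these helpers combine, assuming an aiobotocore S3 client configured from the environment and the WACZ convention of storing crawler logs under logs/ (the function name, bucket, and key are placeholders; error handling is omitted):

import asyncio
from aiobotocore.session import get_session

async def fetch_wacz_log_lines(bucket, key):
    session = get_session()
    async with session.create_client("s3") as client:
        # Read only the central directory to list the archive's members
        cd_start, zip_file = await get_zip_file(client, bucket, key)
        parsed = []
        for zipinfo in zip_file.infolist():
            if zipinfo.filename.startswith("logs/"):
                parsed.append(
                    await extract_and_parse_log_file(
                        client, bucket, key, zipinfo, cd_start
                    )
                )
        return parsed

# e.g. asyncio.run(fetch_wacz_log_lines("my-bucket", "crawls/example.wacz"))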