Fixes #1502

- Adds pages to database as they get added to Redis during crawl
- Adds migration to add pages to database for older crawls from pages.jsonl and extraPages.jsonl files in WACZ
- Adds GET, list GET, and PATCH update endpoints for pages
- Adds POST (add), PATCH, and POST (delete) endpoints for page notes, each with their own id, timestamp, and user info in addition to text
- Adds page_ops methods for 1. adding resources/urls to a page, and 2. adding automated heuristics and supplemental info (mime, type, etc.) to a page (for use in the crawl QA job)
- Modifies `Migration` class to accept kwargs so that we can pass in ops classes as needed for migrations (see the sketch after this list)
- Deletes WACZ files and pages from database for failed crawls during the crawl_finished process
- Deletes crawl pages when a crawl is deleted

Note: Requires a crawler version 1.0.0 beta3 or later, with support for `--writePagesToRedis` to populate pages at crawl completion. Beta 4 is configured in the test chart, which should be upgraded to stable 1.0.0 when it's released.

Connected to https://github.com/webrecorder/browsertrix-crawler/pull/464

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
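As context for the `Migration` kwargs change above, here is a minimal sketch of the pattern. The class shape and the `page_ops` / `add_crawl_pages_to_db_from_wacz` names are illustrative assumptions, not the actual Browsertrix code:

```python
class BaseMigration:
    """Base class for migrations; ops classes arrive via kwargs."""

    def __init__(self, mdb, **kwargs):
        self.mdb = mdb
        # Each migration pulls out only the ops it needs, so the base
        # class doesn't hardcode every ops dependency (hypothetical name)
        self.page_ops = kwargs.get("page_ops")

    async def migrate_up(self):
        """Perform migration; override in subclasses."""


class MigrationAddPages(BaseMigration):
    """Backfill pages for older crawls from pages.jsonl in their WACZs."""

    async def migrate_up(self):
        # Hypothetical method name: re-read pages.jsonl/extraPages.jsonl
        # from each finished crawl's WACZ files and insert page documents
        cursor = self.mdb["crawls"].find({"finished": {"$ne": None}})
        async for crawl in cursor:
            await self.page_ops.add_crawl_pages_to_db_from_wacz(crawl["_id"])
```

Instantiation would then pass in whatever ops a given migration needs, e.g. `MigrationAddPages(mdb, page_ops=page_ops)`.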
| """
 | |
| Methods for interacting with zip/WACZ files
 | |
| """
 | |
| 
 | |
| import io
 | |
| import struct
 | |
| import zipfile
 | |
| import zlib
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
| EOCD_RECORD_SIZE = 22
 | |
| ZIP64_EOCD_RECORD_SIZE = 56
 | |
| ZIP64_EOCD_LOCATOR_SIZE = 20
 | |
| 
 | |
| MAX_STANDARD_ZIP_SIZE = 4_294_967_295
 | |
| 
 | |
| CHUNK_SIZE = 1024 * 256
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
def sync_get_filestream(client, bucket, key, file_zipinfo, cd_start):
    """Return uncompressed byte stream of file in WACZ"""
    # pylint: disable=too-many-locals
    # The local file header stores the filename and extra-field lengths
    # at offsets 26-29; read them to find where the file data starts
    file_head = sync_fetch(
        client, bucket, key, cd_start + file_zipinfo.header_offset + 26, 4
    )
    name_len = parse_little_endian_to_int(file_head[0:2])
    extra_len = parse_little_endian_to_int(file_head[2:4])

    # File data begins after the 30-byte fixed local header, the filename,
    # and the extra field
    content = sync_fetch_stream(
        client,
        bucket,
        key,
        cd_start + file_zipinfo.header_offset + 30 + name_len + extra_len,
        file_zipinfo.compress_size,
    )

    decompress = False
    if file_zipinfo.compress_type == zipfile.ZIP_DEFLATED:
        decompress = True

    return sync_iter_lines(content, decompress=decompress)


def sync_iter_lines(chunk_iter, decompress=False, keepends=True):
    """
    Iter by lines, adapted from botocore
    """
    pending = b""
    # Use a single decompressor for the whole stream: creating a new
    # decompressobj per chunk would lose deflate state at chunk boundaries
    decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
    for chunk in chunk_iter:
        if decompress:
            chunk = decompressor.decompress(chunk)
        lines = (pending + chunk).splitlines(True)
        if not lines:
            continue
        for line in lines[:-1]:
            yield line.splitlines(keepends)[0]
        pending = lines[-1]
    if pending:
        yield pending.splitlines(keepends)[0]
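

# ============================================================================
# Illustrative usage sketch (not part of the original module): stream the
# entries of pages.jsonl out of a remote WACZ without downloading the whole
# archive. The "pages/pages.jsonl" path follows the WACZ spec layout, and
# `client` is assumed to be a boto3-style S3 client.
def iter_pages_jsonl(client, bucket, key):
    """Yield raw JSON lines from pages/pages.jsonl inside a remote WACZ"""
    cd_start, zip_file = sync_get_zip_file(client, bucket, key)
    zipinfo = zip_file.getinfo("pages/pages.jsonl")
    yield from sync_get_filestream(client, bucket, key, zipinfo, cd_start)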


async def get_zip_file(client, bucket, key):
    """Fetch enough of the WACZ file to be able to read the zip filelist"""
    file_size = await get_file_size(client, bucket, key)
    eocd_record = await fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = await fetch(client, bucket, key, cd_start, cd_size)
        # ZipFile can parse just central directory + EOCD: it treats the
        # missing leading bytes as prepended data, so callers use the
        # returned cd_start to compute absolute offsets into the archive
        return (
            cd_start,
            zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
        )

    zip64_eocd_record = await fetch(
        client,
        bucket,
        key,
        file_size
        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = await fetch(
        client,
        bucket,
        key,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = await fetch(client, bucket, key, cd_start, cd_size)
    return (
        cd_start,
        zipfile.ZipFile(
            io.BytesIO(
                central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
            )
        ),
    )


def sync_get_zip_file(client, bucket, key):
    """Fetch enough of the WACZ file to be able to read the zip filelist"""
    file_size = sync_get_file_size(client, bucket, key)
    eocd_record = sync_fetch(
        client, bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE
    )

    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
        # Return the ZipFile directly; wrapping it in a `with` block would
        # close it before the caller could use it
        return (
            cd_start,
            zipfile.ZipFile(io.BytesIO(central_directory + eocd_record)),
        )

    zip64_eocd_record = sync_fetch(
        client,
        bucket,
        key,
        file_size
        - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
        ZIP64_EOCD_RECORD_SIZE,
    )
    zip64_eocd_locator = sync_fetch(
        client,
        bucket,
        key,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = sync_fetch(client, bucket, key, cd_start, cd_size)
    return (
        cd_start,
        zipfile.ZipFile(
            io.BytesIO(
                central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record
            )
        ),
    )


async def get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    head_response = await client.head_object(Bucket=bucket, Key=key)
    return head_response["ContentLength"]


def sync_get_file_size(client, bucket, key):
    """Get WACZ file size from HEAD request"""
    head_response = client.head_object(Bucket=bucket, Key=key)
    return head_response["ContentLength"]


async def fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    end = start + length - 1
    response = await client.get_object(
        Bucket=bucket, Key=key, Range=f"bytes={start}-{end}"
    )
    return await response["Body"].read()


def sync_fetch(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage"""
    end = start + length - 1
    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
    return response["Body"].read()


def sync_fetch_stream(client, bucket, key, start, length):
    """Fetch a byte range from a file in object storage as a stream"""
    end = start + length - 1
    response = client.get_object(Bucket=bucket, Key=key, Range=f"bytes={start}-{end}")
    return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)


def get_central_directory_metadata_from_eocd(eocd):
    """Get central directory start and size"""
    cd_size = parse_little_endian_to_int(eocd[12:16])
    cd_start = parse_little_endian_to_int(eocd[16:20])
    return cd_start, cd_size


def get_central_directory_metadata_from_eocd64(eocd64):
    """Get central directory start and size for zip64"""
    cd_size = parse_little_endian_to_int(eocd64[40:48])
    cd_start = parse_little_endian_to_int(eocd64[48:56])
    return cd_start, cd_size


def parse_little_endian_to_int(little_endian_bytes):
    """Convert little endian used in zip spec to int"""
    byte_length = len(little_endian_bytes)
    # Zip fields are unsigned, so use unsigned format characters; signed
    # formats would go negative for offsets or sizes >= 2 GiB
    format_character = "Q"
    if byte_length == 4:
        format_character = "I"
    elif byte_length == 2:
        format_character = "H"

    return struct.unpack("<" + format_character, little_endian_bytes)[0]
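

# ============================================================================
# Quick sanity check of the EOCD parsing above (illustrative, not part of
# the original module): build an in-memory zip, slice off its 22-byte EOCD
# record, and confirm the parsed central directory metadata looks sane.
if __name__ == "__main__":
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("pages/pages.jsonl", b'{"url": "https://example.com/"}\n')
    data = buf.getvalue()
    eocd = data[-EOCD_RECORD_SIZE:]
    cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd)
    # 0x0016 little-endian is 22, the EOCD record size itself
    assert parse_little_endian_to_int(b"\x16\x00") == 22
    print("cd_start:", cd_start, "cd_size:", cd_size)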