Fix: Stream log downloading from WACZ (#1225)
* Fix(backend): Stream logs without causing OOM

Also be smarter about when to use `heapq.merge` and when to use `itertools.chain`: if all the logs are coming from the same instance we'll `chain` them, otherwise we'll `merge` them.

Iterator fixes:
- group wacz files by instance by suffix, eg. -0.wacz, -1.wacz, -2.wacz
- sort wacz files, and all logs within each wacz file
- chain log iterators for all log files within wacz group
- merge log iterators across wacz files in different groups
- add type hints to help keep track of iterator helper functions
- add iter_lines() from botocore, use that for line parsing for simplicity

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
parent d6bc467c54
commit 037396f3d9
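Before the diff, a small standalone sketch of the strategy the commit message describes: logs from WACZ parts written by the same crawler instance are already in time order, so their iterators can simply be chained, while streams from different instances are merged by timestamp. Filenames, timestamps, and the fake_log_stream helper below are made up for illustration; the real code operates on CrawlFile objects as shown in the diff.

import heapq
import itertools

# Hypothetical WACZ filenames; the instance number is the suffix before ".wacz"
filenames = [
    "crawl-part-1-0.wacz",
    "crawl-part-2-0.wacz",
    "crawl-part-1-1.wacz",
]

# Group by instance suffix, mirroring organize_based_on_instance_number() in the diff
groups = {}
for name in sorted(filenames):
    instance = name[name.rfind("-") + 1 : name.rfind(".")]
    groups.setdefault(instance, []).append(name)

def fake_log_stream(name):
    # Placeholder for the real per-file generator that yields parsed JSON log lines
    yield {"timestamp": "2023-10-01T00:00:00Z", "file": name}

# chain within an instance group, merge across groups
per_instance = [
    itertools.chain(*(fake_log_stream(n) for n in names))
    for names in groups.values()
]
merged = heapq.merge(*per_instance, key=lambda entry: entry["timestamp"])
for entry in merged:
    print(entry)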
@@ -1,13 +1,14 @@
 """
 Storage API
 """
-from typing import Optional, Union
+from typing import Optional, Union, Iterator, Iterable, List, Dict
 from urllib.parse import urlsplit
 from contextlib import asynccontextmanager

 import asyncio
 import heapq
 import json
+import itertools

 from datetime import datetime

@@ -17,7 +18,7 @@ from stream_zip import stream_zip, NO_COMPRESSION_64
 import aiobotocore.session
 import boto3

-from .models import Organization, DefaultStorage, S3Storage, User
+from .models import CrawlFile, Organization, DefaultStorage, S3Storage, User
 from .zip import (
     sync_get_zip_file,
     sync_get_log_stream,
@@ -348,33 +349,34 @@ def _parse_json(line):


 # ============================================================================
-def _sync_get_logs(wacz_files, log_levels, contexts, client, bucket, key):
+def _sync_get_logs(
+    wacz_files: List[CrawlFile],
+    log_levels: List[str],
+    contexts: List[str],
+    client,
+    bucket: str,
+    key: str,
+) -> Iterator[bytes]:
     """Generate filtered stream of logs from specified WACZs sorted by timestamp"""

     # pylint: disable=too-many-function-args
-    def stream_log_bytes_as_line_dicts(stream_generator):
-        """Yield parsed JSON lines as dicts from stream generator."""
-        last_line = ""
-        try:
-            while True:
-                next_chunk = next(stream_generator)
-                next_chunk = next_chunk.decode("utf-8", errors="ignore")
-                chunk = last_line + next_chunk
-                chunk_by_line = chunk.split("\n")
-                last_line = chunk_by_line.pop()
-                for line in chunk_by_line:
-                    if not line:
-                        continue
-                    json_dict = _parse_json(line)
-                    if json_dict:
-                        yield json_dict
-        except StopIteration:
-            if last_line:
-                json_dict = _parse_json(last_line)
-                if json_dict:
-                    yield json_dict
+    def stream_log_lines(
+        wacz_key, wacz_filename, cd_start, log_zipinfo
+    ) -> Iterator[dict]:
+        """Pass lines as json objects"""

-    def stream_json_lines(iterator, log_levels, contexts):
+        print(f"Fetching log {log_zipinfo.filename} from {wacz_filename}", flush=True)
+
+        line_iter: Iterator[bytes] = sync_get_log_stream(
+            client, bucket, wacz_key, log_zipinfo, cd_start
+        )
+
+        for line in line_iter:
+            yield _parse_json(line.decode("utf-8", errors="ignore"))
+
+    def stream_json_lines(
+        iterator: Iterable[dict], log_levels: List[str], contexts: List[str]
+    ) -> Iterator[bytes]:
         """Yield parsed JSON dicts as JSON-lines bytes after filtering as necessary"""
         for line_dict in iterator:
             if log_levels and line_dict["logLevel"] not in log_levels:
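stream_json_lines above turns the merged dict stream back into JSON-lines bytes while filtering. Only the logLevel check is visible in this hunk; the contexts filter presumably mirrors it. A rough illustration with made-up entries (field names other than logLevel and timestamp are assumptions):

import json

def filter_to_jsonl(entries, log_levels, contexts):
    for line_dict in entries:
        if log_levels and line_dict["logLevel"] not in log_levels:
            continue
        # assumed: a symmetric check against the entry's context field
        if contexts and line_dict.get("context") not in contexts:
            continue
        json_str = json.dumps(line_dict, ensure_ascii=False) + "\n"
        yield json_str.encode("utf-8")

entries = [
    {"timestamp": "2023-10-01T00:00:00Z", "logLevel": "info", "context": "general"},
    {"timestamp": "2023-10-01T00:00:01Z", "logLevel": "error", "context": "behavior"},
]
# keep only error-level lines, any context
print(b"".join(filter_to_jsonl(entries, ["error"], [])))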
@@ -384,29 +386,50 @@ def _sync_get_logs(wacz_files, log_levels, contexts, client, bucket, key):
             json_str = json.dumps(line_dict, ensure_ascii=False) + "\n"
             yield json_str.encode("utf-8")

-    log_generators = []
+    def organize_based_on_instance_number(
+        wacz_files: List[CrawlFile],
+    ) -> List[List[CrawlFile]]:
+        """Place wacz_files into their own list based on instance number"""
+        wacz_files.sort(key=lambda file: file.filename)
+        waczs_groups: Dict[str, List[CrawlFile]] = {}
+        for file in wacz_files:
+            instance_number = file.filename[
+                file.filename.rfind("-") + 1 : file.filename.rfind(".")
+            ]
+            if instance_number in waczs_groups:
+                waczs_groups[instance_number].append(file)
+            else:
+                waczs_groups[instance_number] = [file]
+        return list(waczs_groups.values())

-    for wacz_file in wacz_files:
-        wacz_key = key + wacz_file.filename
-        cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
+    log_generators: List[Iterator[dict]] = []

-        log_files = [
-            f
-            for f in zip_file.filelist
-            if f.filename.startswith("logs/") and not f.is_dir()
-        ]
+    waczs_groups = organize_based_on_instance_number(wacz_files)
+    for instance_list in waczs_groups:
+        wacz_log_streams: List[Iterator[dict]] = []

-        wacz_log_streams = []
+        for wacz_file in instance_list:
+            wacz_key = key + wacz_file.filename
+            cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)

-        for log_zipinfo in log_files:
-            log_stream = sync_get_log_stream(
-                client, bucket, wacz_key, log_zipinfo, cd_start
-            )
-            wacz_log_streams.extend(stream_log_bytes_as_line_dicts(log_stream))
+            log_files = [
+                f
+                for f in zip_file.filelist
+                if f.filename.startswith("logs/") and not f.is_dir()
+            ]
+            log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)

-        log_generators.append(wacz_log_streams)
+            for log_zipinfo in log_files:
+                wacz_log_streams.append(
+                    stream_log_lines(
+                        wacz_key, wacz_file.filename, cd_start, log_zipinfo
+                    )
+                )

+        log_generators.append(itertools.chain(*wacz_log_streams))
+
     heap_iter = heapq.merge(*log_generators, key=lambda entry: entry["timestamp"])

     return stream_json_lines(heap_iter, log_levels, contexts)

+
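Worth noting: nothing in the loop above reads any log data. stream_log_lines is a generator function, so wacz_log_streams collects unstarted generators, and itertools.chain / heapq.merge only wrap them in further iterators; bytes are fetched and parsed one line at a time as the response is consumed, which is presumably what avoids the OOM the commit title mentions, compared with the old extend() call that parsed every line up front. A tiny sketch (hypothetical data) of that lazy behaviour:

import heapq

def lazy_lines(label):
    for i in range(2):
        # the print stands in for the network fetch; it shows when work actually happens
        print(f"fetching line {i} from {label}")
        yield {"timestamp": f"2023-10-0{i + 1}", "source": label}

streams = [lazy_lines("a.wacz"), lazy_lines("b.wacz")]
pipeline = heapq.merge(*streams, key=lambda entry: entry["timestamp"])
print("pipeline built, nothing fetched yet")

for entry in pipeline:
    print(entry)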
@@ -40,7 +40,21 @@ def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start):
     else:
         uncompressed_content = content

-    return uncompressed_content
+    return sync_iter_lines(uncompressed_content)
+
+
+def sync_iter_lines(chunk_iter, keepends=True):
+    """
+    Iter by lines, adapted from botocore
+    """
+    pending = b""
+    for chunk in chunk_iter:
+        lines = (pending + chunk).splitlines(True)
+        for line in lines[:-1]:
+            yield line.splitlines(keepends)[0]
+        pending = lines[-1]
+    if pending:
+        yield pending.splitlines(keepends)[0]


 async def get_zip_file(client, bucket, key):
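For illustration, here is how sync_iter_lines (as added above) behaves when a log line is split across two chunks: the partial line is held in pending until the rest arrives. The sample bytes are made up.

chunks = [b'{"a": 1}\n{"b"', b': 2}\n{"c": 3}']
print(list(sync_iter_lines(chunks, keepends=False)))
# [b'{"a": 1}', b'{"b": 2}', b'{"c": 3}']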