Fix: Stream log downloading from WACZ (#1225)
* Fix(backend): Stream logs without causing OOM

Also be smarter about when to use `heapq.merge` and when to use `itertools.chain`: if all the logs are coming from the same instance we `chain` them; otherwise we `merge` them.

Iterator fixes:
- group wacz files by instance by suffix, e.g. -0.wacz, -1.wacz, -2.wacz
- sort wacz files, and all logs within each wacz file
- chain log iterators for all log files within a wacz group
- merge log iterators across wacz files in different groups
- add type hints to help keep track of iterator helper functions
- add iter_lines() from botocore, use that for line parsing for simplicity

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
parent d6bc467c54
commit 037396f3d9
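Before the diff, a minimal standalone sketch of the strategy the commit message describes: chain logs that come from the same crawler instance, merge across instances by timestamp. The entries and values below are made up for illustration only.

import heapq
import itertools

# Each inner list stands for the logs of one crawler instance, already in
# timestamp order because an instance writes its logs sequentially.
instance_a = [
    {"timestamp": "2023-09-01T10:00:00Z", "message": "a1"},
    {"timestamp": "2023-09-01T10:05:00Z", "message": "a2"},
]
instance_b = [{"timestamp": "2023-09-01T10:01:00Z", "message": "b1"}]

# Within one instance: concatenation preserves order, so chain is enough.
per_instance = [itertools.chain(instance_a), itertools.chain(instance_b)]

# Across instances: entries interleave in time, so merge on the timestamp.
merged = heapq.merge(*per_instance, key=lambda entry: entry["timestamp"])
for entry in merged:
    print(entry["message"])  # prints a1, b1, a2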
@@ -1,13 +1,14 @@
 """
 Storage API
 """
-from typing import Optional, Union
+from typing import Optional, Union, Iterator, Iterable, List, Dict
 from urllib.parse import urlsplit
 from contextlib import asynccontextmanager
 
 import asyncio
 import heapq
 import json
+import itertools
 
 from datetime import datetime
 
@@ -17,7 +18,7 @@ from stream_zip import stream_zip, NO_COMPRESSION_64
 import aiobotocore.session
 import boto3
 
-from .models import Organization, DefaultStorage, S3Storage, User
+from .models import CrawlFile, Organization, DefaultStorage, S3Storage, User
 from .zip import (
     sync_get_zip_file,
     sync_get_log_stream,
@@ -348,33 +349,34 @@ def _parse_json(line):
 
 
 # ============================================================================
-def _sync_get_logs(wacz_files, log_levels, contexts, client, bucket, key):
+def _sync_get_logs(
+    wacz_files: List[CrawlFile],
+    log_levels: List[str],
+    contexts: List[str],
+    client,
+    bucket: str,
+    key: str,
+) -> Iterator[bytes]:
     """Generate filtered stream of logs from specified WACZs sorted by timestamp"""
 
     # pylint: disable=too-many-function-args
-    def stream_log_bytes_as_line_dicts(stream_generator):
-        """Yield parsed JSON lines as dicts from stream generator."""
-        last_line = ""
-        try:
-            while True:
-                next_chunk = next(stream_generator)
-                next_chunk = next_chunk.decode("utf-8", errors="ignore")
-                chunk = last_line + next_chunk
-                chunk_by_line = chunk.split("\n")
-                last_line = chunk_by_line.pop()
-                for line in chunk_by_line:
-                    if not line:
-                        continue
-                    json_dict = _parse_json(line)
-                    if json_dict:
-                        yield json_dict
-        except StopIteration:
-            if last_line:
-                json_dict = _parse_json(last_line)
-                if json_dict:
-                    yield json_dict
-
-    def stream_json_lines(iterator, log_levels, contexts):
+    def stream_log_lines(
+        wacz_key, wacz_filename, cd_start, log_zipinfo
+    ) -> Iterator[dict]:
+        """Pass lines as json objects"""
+
+        print(f"Fetching log {log_zipinfo.filename} from {wacz_filename}", flush=True)
+
+        line_iter: Iterator[bytes] = sync_get_log_stream(
+            client, bucket, wacz_key, log_zipinfo, cd_start
+        )
+
+        for line in line_iter:
+            yield _parse_json(line.decode("utf-8", errors="ignore"))
+
+    def stream_json_lines(
+        iterator: Iterable[dict], log_levels: List[str], contexts: List[str]
+    ) -> Iterator[bytes]:
         """Yield parsed JSON dicts as JSON-lines bytes after filtering as necessary"""
         for line_dict in iterator:
             if log_levels and line_dict["logLevel"] not in log_levels:
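For reference, the filtering that stream_json_lines applies can be reproduced on plain dicts. This standalone sketch covers only the logLevel check visible in the hunk (the contexts filter, not shown here, works the same way), and the sample entries are invented.

import json
from typing import Iterable, Iterator, List

def filter_log_dicts(entries: Iterable[dict], log_levels: List[str]) -> Iterator[bytes]:
    # Keep an entry only if it matches one of the requested levels, then
    # re-serialize it as a single JSON line of UTF-8 bytes.
    for entry in entries:
        if log_levels and entry["logLevel"] not in log_levels:
            continue
        yield (json.dumps(entry, ensure_ascii=False) + "\n").encode("utf-8")

entries = [
    {"timestamp": "2023-09-01T10:00:00Z", "logLevel": "error", "message": "failed"},
    {"timestamp": "2023-09-01T10:00:01Z", "logLevel": "debug", "message": "noise"},
]
print(b"".join(filter_log_dicts(entries, ["error"])))  # only the error line remains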
@@ -384,29 +386,50 @@ def _sync_get_logs(wacz_files, log_levels, contexts, client, bucket, key):
             json_str = json.dumps(line_dict, ensure_ascii=False) + "\n"
             yield json_str.encode("utf-8")
 
-    log_generators = []
-
-    for wacz_file in wacz_files:
-        wacz_key = key + wacz_file.filename
-        cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
-
-        log_files = [
-            f
-            for f in zip_file.filelist
-            if f.filename.startswith("logs/") and not f.is_dir()
-        ]
-
-        wacz_log_streams = []
-
-        for log_zipinfo in log_files:
-            log_stream = sync_get_log_stream(
-                client, bucket, wacz_key, log_zipinfo, cd_start
-            )
-            wacz_log_streams.extend(stream_log_bytes_as_line_dicts(log_stream))
-
-        log_generators.append(wacz_log_streams)
+    def organize_based_on_instance_number(
+        wacz_files: List[CrawlFile],
+    ) -> List[List[CrawlFile]]:
+        """Place wacz_files into their own list based on instance number"""
+        wacz_files.sort(key=lambda file: file.filename)
+        waczs_groups: Dict[str, List[CrawlFile]] = {}
+        for file in wacz_files:
+            instance_number = file.filename[
+                file.filename.rfind("-") + 1 : file.filename.rfind(".")
+            ]
+            if instance_number in waczs_groups:
+                waczs_groups[instance_number].append(file)
+            else:
+                waczs_groups[instance_number] = [file]
+        return list(waczs_groups.values())
+
+    log_generators: List[Iterator[dict]] = []
+
+    waczs_groups = organize_based_on_instance_number(wacz_files)
+    for instance_list in waczs_groups:
+        wacz_log_streams: List[Iterator[dict]] = []
+
+        for wacz_file in instance_list:
+            wacz_key = key + wacz_file.filename
+            cd_start, zip_file = sync_get_zip_file(client, bucket, wacz_key)
+
+            log_files = [
+                f
+                for f in zip_file.filelist
+                if f.filename.startswith("logs/") and not f.is_dir()
+            ]
+            log_files.sort(key=lambda log_zipinfo: log_zipinfo.filename)
+
+            for log_zipinfo in log_files:
+                wacz_log_streams.append(
+                    stream_log_lines(
+                        wacz_key, wacz_file.filename, cd_start, log_zipinfo
+                    )
+                )
+
+        log_generators.append(itertools.chain(*wacz_log_streams))
 
     heap_iter = heapq.merge(*log_generators, key=lambda entry: entry["timestamp"])
 
     return stream_json_lines(heap_iter, log_levels, contexts)
 
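As a sanity check of the grouping above, here is the same instance-number logic applied to hypothetical filenames that follow the -0.wacz / -1.wacz suffix convention from the commit message. SimpleNamespace stands in for CrawlFile, and setdefault condenses the if/else used in the diff.

from types import SimpleNamespace

def organize_based_on_instance_number(wacz_files):
    # Same grouping rule as in the hunk: the instance number is the text
    # between the last "-" and the ".wacz" extension.
    wacz_files.sort(key=lambda file: file.filename)
    waczs_groups = {}
    for file in wacz_files:
        instance_number = file.filename[
            file.filename.rfind("-") + 1 : file.filename.rfind(".")
        ]
        waczs_groups.setdefault(instance_number, []).append(file)
    return list(waczs_groups.values())

files = [SimpleNamespace(filename=name) for name in [
    "crawl-20230901-1.wacz",   # hypothetical filenames
    "crawl-20230901-0.wacz",
    "crawl-20230902-0.wacz",
]]
for group in organize_based_on_instance_number(files):
    print([f.filename for f in group])
# ['crawl-20230901-0.wacz', 'crawl-20230902-0.wacz']  <- instance 0
# ['crawl-20230901-1.wacz']                           <- instance 1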
@@ -40,7 +40,21 @@ def sync_get_log_stream(client, bucket, key, log_zipinfo, cd_start):
     else:
         uncompressed_content = content
 
-    return uncompressed_content
+    return sync_iter_lines(uncompressed_content)
+
+
+def sync_iter_lines(chunk_iter, keepends=True):
+    """
+    Iter by lines, adapted from botocore
+    """
+    pending = b""
+    for chunk in chunk_iter:
+        lines = (pending + chunk).splitlines(True)
+        for line in lines[:-1]:
+            yield line.splitlines(keepends)[0]
+        pending = lines[-1]
+    if pending:
+        yield pending.splitlines(keepends)[0]
 
 
 async def get_zip_file(client, bucket, key):
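And a quick check of sync_iter_lines, feeding it byte chunks that split lines at arbitrary points, roughly the way chunks arrive from the object-store stream. The function body is copied from the hunk above so the sketch runs standalone; the chunk contents are invented.

def sync_iter_lines(chunk_iter, keepends=True):
    """
    Iter by lines, adapted from botocore
    """
    pending = b""
    for chunk in chunk_iter:
        lines = (pending + chunk).splitlines(True)
        for line in lines[:-1]:
            yield line.splitlines(keepends)[0]
        pending = lines[-1]
    if pending:
        yield pending.splitlines(keepends)[0]

# Chunk boundaries fall mid-line; lines still come out whole.
chunks = [b'{"logLevel": "err', b'or"}\n{"logLevel', b'": "info"}\n']
print(list(sync_iter_lines(chunks)))
# [b'{"logLevel": "error"}\n', b'{"logLevel": "info"}\n']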