From 2060ee78b4bc3aef82172008a9a7e9f5f9c14d45 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 19 Dec 2024 18:41:47 -0800 Subject: [PATCH] Support Presigning for use with custom domain (#2249) If access_endpoint_url is provided: - Use virtual host addressing style, so presigned URLs are of the form `https://bucket.s3-host.example.com/path/` instead of `https://s3-host.example.com/bucket/path/` - Allow for replacing `https://bucket.s3-host.example.com/path/` -> `https://my-custom-domain.example.com/path/`, where `https://my-custom-domain.example.com/path/` is the access_endpoint_url - Remove old `use_access_for_presign` which is no longer used - Fixes #2248 - docs: update deployment docs storages section to mention custom storages, access_endpoint_url --------- Co-authored-by: Tessa Walsh --- backend/btrixcloud/models.py | 1 - backend/btrixcloud/storages.py | 55 ++++++++--------- chart/values.yaml | 1 + frontend/docs/docs/deploy/customization.md | 71 +++++++++++++++++++--- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 6630ae18..84113324 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -1180,7 +1180,6 @@ class S3Storage(BaseModel): secret_key: str access_endpoint_url: str region: str = "" - use_access_for_presign: bool = True # ============================================================================ diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py index 50b9557a..3c796485 100644 --- a/backend/btrixcloud/storages.py +++ b/backend/btrixcloud/storages.py @@ -29,6 +29,7 @@ from zipfile import ZipInfo from fastapi import Depends, HTTPException from stream_zip import stream_zip, NO_COMPRESSION_64, Method from remotezip import RemoteZip +from aiobotocore.config import AioConfig import aiobotocore.session import requests @@ -50,7 +51,7 @@ from .models import ( AddedResponseName, ) -from .utils import is_bool, slug_from_name 
+from .utils import slug_from_name from .version import __version__ @@ -77,15 +78,12 @@ class StorageOps: org_ops: OrgOps crawl_manager: CrawlManager - is_local_minio: bool frontend_origin: str def __init__(self, org_ops, crawl_manager) -> None: self.org_ops = org_ops self.crawl_manager = crawl_manager - self.is_local_minio = is_bool(os.environ.get("IS_LOCAL_MINIO")) - frontend_origin = os.environ.get( "FRONTEND_ORIGIN", "http://browsertrix-cloud-frontend" ) @@ -138,12 +136,7 @@ class StorageOps: if bucket_name: endpoint_url += bucket_name + "/" - if self.is_local_minio: - access_endpoint_url = "/data/" - use_access_for_presign = False - else: - access_endpoint_url = storage.get("access_endpoint_url") or endpoint_url - use_access_for_presign = is_bool(storage.get("use_access_for_presign")) + access_endpoint_url = storage.get("access_endpoint_url") or endpoint_url return S3Storage( access_key=storage["access_key"], @@ -152,7 +145,6 @@ class StorageOps: endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=access_endpoint_url, - use_access_for_presign=use_access_for_presign, ) async def add_custom_storage( @@ -177,7 +169,6 @@ class StorageOps: endpoint_url=endpoint_url, endpoint_no_bucket_url=endpoint_no_bucket_url, access_endpoint_url=storagein.access_endpoint_url or storagein.endpoint_url, - use_access_for_presign=True, ) try: @@ -264,12 +255,12 @@ class StorageOps: @asynccontextmanager async def get_s3_client( - self, storage: S3Storage, use_access=False + self, storage: S3Storage, for_presign=False ) -> AsyncIterator[tuple[AIOS3Client, str, str]]: """context manager for s3 client""" - endpoint_url = ( - storage.endpoint_url if not use_access else storage.access_endpoint_url - ) + # parse bucket and key from standard endpoint_url + endpoint_url = storage.endpoint_url + if not endpoint_url.endswith("/"): endpoint_url += "/" @@ -280,12 +271,17 @@ class StorageOps: session = aiobotocore.session.get_session() + config = None + 
if for_presign and storage.access_endpoint_url != storage.endpoint_url: + config = AioConfig(s3={"addressing_style": "virtual"}) + async with session.create_client( "s3", - region_name=storage.region, + region_name=storage.region or "us-east-1", endpoint_url=endpoint_url, aws_access_key_id=storage.access_key, aws_secret_access_key=storage.secret_key, + config=config, ) as client: yield client, bucket, key @@ -454,11 +450,11 @@ class StorageOps: s3storage = self.get_org_storage_by_ref(org, crawlfile.storage) - async with self.get_s3_client(s3storage, s3storage.use_access_for_presign) as ( - client, - bucket, - key, - ): + async with self.get_s3_client( + s3storage, + for_presign=True, + ) as (client, bucket, key): + orig_key = key key += crawlfile.filename presigned_url = await client.generate_presigned_url( @@ -466,12 +462,15 @@ class StorageOps: ) if ( - not s3storage.use_access_for_presign - and s3storage.access_endpoint_url + s3storage.access_endpoint_url and s3storage.access_endpoint_url != s3storage.endpoint_url ): + parts = urlsplit(s3storage.endpoint_url) + host_endpoint_url = ( + f"{parts.scheme}://{bucket}.{parts.netloc}/{orig_key}" + ) presigned_url = presigned_url.replace( - s3storage.endpoint_url, s3storage.access_endpoint_url + host_endpoint_url, s3storage.access_endpoint_url ) return presigned_url @@ -490,11 +489,7 @@ class StorageOps: s3storage = self.get_org_storage_by_ref(org, storage) - async with self.get_s3_client(s3storage) as ( - client, - bucket, - key, - ): + async with self.get_s3_client(s3storage) as (client, bucket, key): key += filename response = await client.delete_object(Bucket=bucket, Key=key) status_code = response["ResponseMetadata"]["HTTPStatusCode"] diff --git a/chart/values.yaml b/chart/values.yaml index 80b8ed35..3a300ced 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -405,6 +405,7 @@ storages: bucket_name: *local_bucket_name endpoint_url: "http://local-minio.default:9000/" + access_endpoint_url: "/data/" # optional: 
duration in minutes for WACZ download links to be valid diff --git a/frontend/docs/docs/deploy/customization.md b/frontend/docs/docs/deploy/customization.md index 5b15bff9..e7c72872 100644 --- a/frontend/docs/docs/deploy/customization.md +++ b/frontend/docs/docs/deploy/customization.md @@ -32,7 +32,14 @@ crawler_channels: ## Storage -The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, as can be seen in the default configuration: +The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`. + +### Using Local Minio Storage + +Browsertrix includes a built-in Minio storage service, which is enabled by default (`minio_local: true` is set). + +The configuration for this is as follows: + ```yaml storages: @@ -40,14 +47,60 @@ storages: type: "s3" access_key: "ADMIN" secret_key: "PASSW0RD" - bucket_name: *local_bucket_name + bucket_name: btrix-data endpoint_url: "http://local-minio.default:9000/" + access_endpoint_url: /data/ ``` -It is possible to add one or more replica storage locations. If replica locations are enabled, all stored content in the application will be automatically replicated to each configured replica storage location in background jobs after being stored in the default primary storage. If replica locations are enabled, at least one must be set as the default replica location for primary backups. This is indicated with `is_default_replica: True`. If more than one storage location is configured, the primary storage must also be indicated with `is_default_primary: True`. 
+The `access_key` and `secret_key` should be changed, otherwise no additional changes are needed, and all local data will be stored in this Minio instance by default. -For example, here is what a storage configuration with two replica locations, one in another bucket on the same Minio S3 service as primary storage as well as another in an external S3 provider: +The S3 bucket is accessible via `/data/` path on the same host Browsertrix is running on, eg. `http://localhost:30870/data/`. + + +### Using External S3 Storage Providers + +Browsertrix can also be used with external S3 storage providers, which can be configured as follows: + +```yaml +storages: + - name: default + type: "s3" + access_key: "accesskey" + secret_key: "secret" + + endpoint_url: "https://s3provider.example.com/bucket/path/" + access_endpoint_url: "https://my-custom-domain.example.com/path/" #optional + is_default_primary: true +``` + + +When using an external S3 provider, a custom `access_endpoint_url` can be provided, and the `bucket_name` need to be specified separately. +This URL is used for direct access to WACZ files, and can be used to specify a custom domain to access the bucket. + +The `endpoint_url` should be provided in 'path prefix' form (with the bucket after the path), eg: +`https://s3provider.example.com/bucket/path/`. + +Browsertrix will handle presigning S3 URLs so that WACZ files (and other data) can be accessed directly, using URLs of the form: `https://s3provider.example.com/bucket/path/to/files/crawl.wacz?signature...` + +Since the local Minio service is not used, `minio_local: false` can be set to save resource in not deploying Minio. + + +### Custom Access Endpoint URL + +It may be useful to provide a custom access endpoint for accessing WACZ files and other data. if the `access_endpoint_url` is provided, +it should be in 'virtual host' form (the bucket is not added to the path, but is assumed to be the in the host). 
+ +The host portion of the URL is then replaced with the `access_endpoint_url`. For example, given `endpoint_url: https://s3provider.example.com/bucket/path/` and `access_endpoint_url: https://my-custom-domain.example.com/path/`, a URL to a WACZ files in 'virtual host' form may be `https://bucket.s3provider.example.com/path/to/files/crawl.wacz?signature...`. + +The `https://bucket.s3provider.example.com/path/` is then replaced with the `https://my-custom-domain.example.com/path/`, and the final URL becomes `https://my-custom-domain.example.com/path/to/files/crawl.wacz?signature...`. + + +### Storage Replicas + +It is possible to add one or more replica storage locations. If replica locations are enabled, all stored content in the application will be automatically replicated to each configured replica storage location in background jobs after being stored in the default primary storage. If replica locations are enabled, at least one must be set as the default replica location for primary backups. This is indicated with `is_default_replica: true`. If more than one storage location is configured, the primary storage must also be indicated with `is_default_primary: true`. 
For example, here is a storage configuration with two replica locations: one in another bucket on the same local Minio S3 service as the primary storage, and another in an external S3 provider: