storage: use s3v4 signature for presigning urls (#2611)

Use V4 ('s3v4') signature version for all presigning URLs to support
backblaze, fixes #2472
- add 'access_addressing_style' to be able to choose virtual/path
addressing for access endpoint (default to 'virtual' as before)
- fix minio presigning with v4 by using 'path' addressing style for
minio
- if path matches '/data/' for internal minio bucket, then always use
'path'
- also make minio access path '/data/' configurable

also simplify running in any namespace with default settings:
- don't hardcode 'local-minio.default'
- in crawlers namespace, add a 'local-minio' externalName service which
maps to the main namespace service.
This commit is contained in:
Ilya Kreymer 2025-05-19 15:44:36 -07:00 committed by GitHub
parent 4b1e416eb6
commit f1fd11c031
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 56 additions and 10 deletions

View File

@ -1666,6 +1666,7 @@ class S3StorageIn(BaseModel):
endpoint_url: str
bucket: str
access_endpoint_url: Optional[str] = None
access_addressing_style: Literal["virtual", "path"] = "virtual"
region: str = ""
@ -1680,6 +1681,7 @@ class S3Storage(BaseModel):
access_key: str
secret_key: str
access_endpoint_url: str
access_addressing_style: Literal["virtual", "path"] = "virtual"
region: str = ""

View File

@ -70,7 +70,7 @@ CHUNK_SIZE = 1024 * 256
# ============================================================================
# pylint: disable=broad-except,raise-missing-from
# pylint: disable=broad-except,raise-missing-from,too-many-instance-attributes
class StorageOps:
"""All storage handling, download/upload operations"""
@ -104,6 +104,8 @@ class StorageOps:
default_namespace = os.environ.get("DEFAULT_NAMESPACE", "default")
self.frontend_origin = f"{frontend_origin}.{default_namespace}"
self.local_minio_access_path = os.environ.get("LOCAL_MINIO_ACCESS_PATH")
with open(os.environ["STORAGES_JSON"], encoding="utf-8") as fh:
storage_list = json.loads(fh.read())
@ -158,6 +160,10 @@ class StorageOps:
access_endpoint_url = storage.get("access_endpoint_url") or endpoint_url
addressing_style = storage.get("access_addressing_style", "virtual")
if access_endpoint_url == self.local_minio_access_path:
addressing_style = "path"
return S3Storage(
access_key=storage["access_key"],
secret_key=storage["secret_key"],
@ -165,6 +171,7 @@ class StorageOps:
endpoint_url=endpoint_url,
endpoint_no_bucket_url=endpoint_no_bucket_url,
access_endpoint_url=access_endpoint_url,
access_addressing_style=addressing_style,
)
async def add_custom_storage(
@ -189,6 +196,7 @@ class StorageOps:
endpoint_url=endpoint_url,
endpoint_no_bucket_url=endpoint_no_bucket_url,
access_endpoint_url=storagein.access_endpoint_url or storagein.endpoint_url,
access_addressing_style=storagein.access_addressing_style,
)
try:
@ -291,9 +299,12 @@ class StorageOps:
session = aiobotocore.session.get_session()
config = None
s3 = None
if for_presign and storage.access_endpoint_url != storage.endpoint_url:
config = AioConfig(s3={"addressing_style": "virtual"})
s3 = {"addressing_style": storage.access_addressing_style}
config = AioConfig(signature_version="s3v4", s3=s3)
async with session.create_client(
"s3",
@ -498,9 +509,12 @@ class StorageOps:
s3storage.access_endpoint_url
and s3storage.access_endpoint_url != s3storage.endpoint_url
):
virtual = s3storage.access_addressing_style == "virtual"
parts = urlsplit(s3storage.endpoint_url)
host_endpoint_url = (
f"{parts.scheme}://{bucket}.{parts.netloc}/{orig_key}"
if virtual
else f"{parts.scheme}://{parts.netloc}/{bucket}/{orig_key}"
)
presigned_url = presigned_url.replace(
host_endpoint_url, s3storage.access_endpoint_url

View File

@ -14,7 +14,7 @@ data:
FRONTEND_ORIGIN: {{ .Values.frontend_alias | default "http://browsertrix-cloud-frontend" }}
CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}.svc.cluster.local"
CRAWLER_FQDN_SUFFIX: ".{{ .Values.crawler_namespace }}{{ .Values.fqdn_suffix }}"
DEFAULT_ORG: "{{ .Values.default_org }}"
@ -53,6 +53,8 @@ data:
IS_LOCAL_MINIO: "{{ .Values.minio_local }}"
LOCAL_MINIO_ACCESS_PATH: "{{ .Values.minio_access_path }}"
STORAGES_JSON: "/ops-configs/storages.json"
CRAWLER_CHANNELS_JSON: "/ops-configs/crawler_channels.json"

View File

@ -41,7 +41,7 @@ spec:
value: {{ .Values.name }}-backend
- name: CRAWLER_FQDN_SUFFIX
value: ".{{ .Values.crawler_namespace }}.svc.cluster.local"
value: ".{{ .Values.crawler_namespace }}{{ .Values.fqdn_suffix }}"
- name: NGINX_ENTRYPOINT_WORKER_PROCESSES_AUTOTUNE
value: "1"
@ -60,7 +60,10 @@ spec:
- name: LOCAL_BUCKET
value: "{{ .Values.minio_local_bucket_name }}"
{{- end }}
- name: LOCAL_ACCESS_PATH
value: "{{ .Values.minio_access_path }}"
{{- end }}
{{- if .Values.inject_extra }}
- name: INJECT_EXTRA

View File

@ -136,6 +136,23 @@ spec:
{{- end }}
name: minio
---
apiVersion: v1
kind: Service
metadata:
namespace: {{ .Values.crawler_namespace }}
name: local-minio
labels:
app: local-minio
spec:
type: ExternalName
externalName: "local-minio.{{ .Release.Namespace }}{{ .Values.fqdn_suffix }}"
ports:
- port: 9000
{{- if .Values.minio_local_console_port }}
---
apiVersion: v1

View File

@ -398,6 +398,9 @@ minio_pull_policy: "IfNotPresent"
minio_local_bucket_name: &local_bucket_name "btrix-data"
# path for serving from local minio bucket
minio_access_path: &minio_access_path "/data/"
minio_cpu: "10m"
minio_memory: "1024Mi"
@ -413,8 +416,8 @@ storages:
secret_key: "PASSW0RD"
bucket_name: *local_bucket_name
endpoint_url: "http://local-minio.default:9000/"
access_endpoint_url: "/data/"
endpoint_url: "http://local-minio:9000/"
access_endpoint_url: *minio_access_path
# optional: duration in minutes for WACZ download links to be valid
@ -495,6 +498,9 @@ signer_memory: "50Mi"
# Other Settings
# =========================================
# default FQDN suffix, shouldn't need to change
fqdn_suffix: .svc.cluster.local
# Optional: configure load balancing annotations
# service:
# annotations:

View File

@ -7,7 +7,9 @@ if [ -z "$LOCAL_MINIO_HOST" ]; then
echo "no local minio, clearing out minio route"
echo "" >/etc/nginx/includes/minio.conf
else
echo "local minio: replacing \$LOCAL_MINIO_HOST with \"$LOCAL_MINIO_HOST\", \$LOCAL_BUCKET with \"$LOCAL_BUCKET\""
LOCAL_ACCESS_PATH=$(printf '%s\n' "$LOCAL_ACCESS_PATH" | sed -e 's/[\/&]/\\&/g')
echo "local minio: replacing \$LOCAL_MINIO_HOST with \"$LOCAL_MINIO_HOST\", \$LOCAL_BUCKET with \"$LOCAL_BUCKET\", \$LOCAL_ACCESS_PATH with \"$LOCAL_ACCESS_PATH\""
sed -i "s/\$LOCAL_ACCESS_PATH/$LOCAL_ACCESS_PATH/g" /etc/nginx/includes/minio.conf
sed -i "s/\$LOCAL_MINIO_HOST/$LOCAL_MINIO_HOST/g" /etc/nginx/includes/minio.conf
sed -i "s/\$LOCAL_BUCKET/$LOCAL_BUCKET/g" /etc/nginx/includes/minio.conf
fi

View File

@ -1,4 +1,4 @@
location /data/ {
location $LOCAL_ACCESS_PATH {
proxy_pass http://$LOCAL_MINIO_HOST/$LOCAL_BUCKET/;
proxy_redirect off;
proxy_buffering off;