From 3af94ca03dfe3782b28f752900d06b0a683b1bcd Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 16 Jul 2025 10:48:24 -0700
Subject: [PATCH] Ensure replay.json returns correct origin for pagesQueryUrl (#2741)

- Use the Host + X-Forwarded-Proto header from API request
- Fixes #2740, better fix for #2720 avoiding need for separate alias
---
 backend/btrixcloud/basecrawls.py           | 20 ++++++++++++-------
 backend/btrixcloud/crawls.py               | 18 ++++++++++++-----
 backend/btrixcloud/uploads.py              | 24 +++++++++++++++++------
 backend/btrixcloud/utils.py                |  4 ++--
 backend/test/test_collections.py           | 14 +++++++------
 backend/test/test_stop_cancel_crawl.py     |  9 +++++++++
 backend/test_nightly/test_org_deletion.py  |  2 --
 7 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 633c4c8b..38aec2c7 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -18,7 +18,7 @@ import os
 import urllib.parse
 import asyncio
 
-from fastapi import HTTPException, Depends
+from fastapi import HTTPException, Depends, Request
 from fastapi.responses import StreamingResponse
 import pymongo
 
@@ -1057,27 +1057,33 @@ def init_base_crawls_api(app, user_dep, *args):
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_base_crawl(crawl_id: str, org: Organization = Depends(org_crawl_dep)):
-        return await ops.get_crawl_out(crawl_id, org)
+    async def get_base_crawl(
+        crawl_id: str, request: Request, org: Organization = Depends(org_crawl_dep)
+    ):
+        return await ops.get_crawl_out(crawl_id, org, headers=dict(request.headers))
 
     @app.get(
         "/orgs/all/all-crawls/{crawl_id}/replay.json",
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_base_crawl_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None)
+        return await ops.get_crawl_out(crawl_id, None, headers=dict(request.headers))
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}/replay.json",
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org)
+    async def get_crawl_out(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(crawl_id, org, headers=dict(request.headers))
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}/download",
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 2891957a..b4573f0a 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -12,7 +12,7 @@ from uuid import UUID
 
 from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator
 
-from fastapi import Depends, HTTPException
+from fastapi import Depends, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from redis import asyncio as exceptions
 from redis.asyncio.client import Redis
@@ -1345,19 +1345,27 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
         tags=["crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_crawl_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None, "crawl")
+        return await ops.get_crawl_out(
+            crawl_id, None, "crawl", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/replay.json",
         tags=["crawls"],
         response_model=CrawlOutWithResources,
    )
-    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org, "crawl")
+    async def get_crawl_out(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawl_id, org, "crawl", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index 74798c8c..23c0f125 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -367,27 +367,39 @@ def init_uploads_api(app, user_dep, *args):
         tags=["uploads"],
         response_model=CrawlOut,
     )
-    async def get_upload(crawlid: str, org: Organization = Depends(org_crawl_dep)):
-        return await ops.get_crawl_out(crawlid, org, "upload")
+    async def get_upload(
+        crawlid: str, request: Request, org: Organization = Depends(org_crawl_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawlid, org, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/all/uploads/{crawl_id}/replay.json",
         tags=["uploads"],
         response_model=CrawlOutWithResources,
     )
-    async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_upload_replay_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None, "upload")
+        return await ops.get_crawl_out(
+            crawl_id, None, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/uploads/{crawl_id}/replay.json",
         tags=["uploads"],
         response_model=CrawlOutWithResources,
     )
-    async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org, "upload")
+    async def get_upload_replay(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawl_id, org, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/uploads/{crawl_id}/download",
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 9b8935ff..fc71c98d 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -181,8 +181,8 @@ def get_origin(headers) -> str:
     if not headers:
         return default_origin
 
-    scheme = headers.get("X-Forwarded-Proto")
-    host = headers.get("Host")
+    scheme = headers.get("x-forwarded-proto")
+    host = headers.get("host")
 
     if not scheme or not host:
         return default_origin
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index 7073f0b1..f2608f91 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -401,7 +401,7 @@ def test_get_collection(crawler_auth_headers, default_org_id):
 def test_get_collection_replay(crawler_auth_headers, default_org_id):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
-        headers=crawler_auth_headers,
+        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
     )
     assert r.status_code == 200
     data = r.json()
@@ -421,8 +421,9 @@
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
     assert data["initialPages"]
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    assert (
+        data["pagesQueryUrl"]
+        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/pages"
     )
     assert data["downloadUrl"] is None
     assert "preloadResources" in data
@@ -455,12 +455,13 @@ def test_collection_public(crawler_auth_headers, default_org_id):
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
-        headers=crawler_auth_headers,
+        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
     )
     data = r.json()
     assert data["initialPages"]
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    assert (
+        data["pagesQueryUrl"]
+        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
     )
     assert data["downloadUrl"] is not None
     assert "preloadResources" in data
diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py
index f4f847e8..b82f718b 100644
--- a/backend/test/test_stop_cancel_crawl.py
+++ b/backend/test/test_stop_cancel_crawl.py
@@ -176,3 +176,12 @@ def test_stop_crawl_partial(
     assert data["stopping"] == True
 
     assert len(data["resources"]) == 1
+
+
+def test_crawl_with_hostname(default_org_id, crawler_auth_headers):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+        headers={"X-Forwarded-Proto": "https", "host": "custom-domain.example.com", **crawler_auth_headers},
+    )
+    assert r.status_code == 200
+    assert r.json()["pagesQueryUrl"].startswith("https://custom-domain.example.com/")
diff --git a/backend/test_nightly/test_org_deletion.py b/backend/test_nightly/test_org_deletion.py
index 291ba01b..a8692588 100644
--- a/backend/test_nightly/test_org_deletion.py
+++ b/backend/test_nightly/test_org_deletion.py
@@ -170,8 +170,6 @@ def test_delete_org_crawl_running(
         except:
             time.sleep(10)
 
-
-
             attempts += 1
 
     # Check that org was deleted