From 3af94ca03dfe3782b28f752900d06b0a683b1bcd Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Wed, 16 Jul 2025 10:48:24 -0700
Subject: [PATCH] Ensure replay.json returns correct origin for pagesQueryUrl (#2741)

- Use the Host + X-Forwarded-Proto header from API request
- Fixes #2740, better fix for #2720 avoiding need for separate alias
---
 backend/btrixcloud/basecrawls.py           | 20 ++++++++++++-------
 backend/btrixcloud/crawls.py               | 18 ++++++++++++-----
 backend/btrixcloud/uploads.py              | 24 +++++++++++++++++------
 backend/btrixcloud/utils.py                |  4 ++--
 backend/test/test_collections.py           | 14 +++++++------
 backend/test/test_stop_cancel_crawl.py     |  9 +++++++++
 backend/test_nightly/test_org_deletion.py  |  2 --
 7 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 633c4c8b..38aec2c7 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -18,7 +18,7 @@ import os
 import urllib.parse
 import asyncio
 
-from fastapi import HTTPException, Depends
+from fastapi import HTTPException, Depends, Request
 from fastapi.responses import StreamingResponse
 import pymongo
 
@@ -1057,27 +1057,33 @@ def init_base_crawls_api(app, user_dep, *args):
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_base_crawl(crawl_id: str, org: Organization = Depends(org_crawl_dep)):
-        return await ops.get_crawl_out(crawl_id, org)
+    async def get_base_crawl(
+        crawl_id: str, request: Request, org: Organization = Depends(org_crawl_dep)
+    ):
+        return await ops.get_crawl_out(crawl_id, org, headers=dict(request.headers))
 
     @app.get(
         "/orgs/all/all-crawls/{crawl_id}/replay.json",
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_base_crawl_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_base_crawl_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None)
+        return await ops.get_crawl_out(crawl_id, None, headers=dict(request.headers))
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}/replay.json",
         tags=["all-crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org)
+    async def get_crawl_out(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(crawl_id, org, headers=dict(request.headers))
 
     @app.get(
         "/orgs/{oid}/all-crawls/{crawl_id}/download",
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index 2891957a..b4573f0a 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -12,7 +12,7 @@ from uuid import UUID
 
 from typing import Optional, List, Dict, Union, Any, Sequence, AsyncIterator
 
-from fastapi import Depends, HTTPException
+from fastapi import Depends, HTTPException, Request
 from fastapi.responses import StreamingResponse
 from redis import asyncio as exceptions
 from redis.asyncio.client import Redis
@@ -1345,19 +1345,27 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
         tags=["crawls"],
         response_model=CrawlOutWithResources,
     )
-    async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_crawl_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None, "crawl")
+        return await ops.get_crawl_out(
+            crawl_id, None, "crawl", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/replay.json",
         tags=["crawls"],
         response_model=CrawlOutWithResources,
    )
-    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org, "crawl")
+    async def get_crawl_out(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawl_id, org, "crawl", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index 74798c8c..23c0f125 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -367,27 +367,39 @@ def init_uploads_api(app, user_dep, *args):
         tags=["uploads"],
         response_model=CrawlOut,
     )
-    async def get_upload(crawlid: str, org: Organization = Depends(org_crawl_dep)):
-        return await ops.get_crawl_out(crawlid, org, "upload")
+    async def get_upload(
+        crawlid: str, request: Request, org: Organization = Depends(org_crawl_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawlid, org, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/all/uploads/{crawl_id}/replay.json",
         tags=["uploads"],
         response_model=CrawlOutWithResources,
     )
-    async def get_upload_replay_admin(crawl_id, user: User = Depends(user_dep)):
+    async def get_upload_replay_admin(
+        crawl_id, request: Request, user: User = Depends(user_dep)
+    ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
 
-        return await ops.get_crawl_out(crawl_id, None, "upload")
+        return await ops.get_crawl_out(
+            crawl_id, None, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/uploads/{crawl_id}/replay.json",
         tags=["uploads"],
         response_model=CrawlOutWithResources,
     )
-    async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
-        return await ops.get_crawl_out(crawl_id, org, "upload")
+    async def get_upload_replay(
+        crawl_id, request: Request, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.get_crawl_out(
+            crawl_id, org, "upload", headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/uploads/{crawl_id}/download",
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index 9b8935ff..fc71c98d 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -181,8 +181,8 @@ def get_origin(headers) -> str:
     if not headers:
         return default_origin
 
-    scheme = headers.get("X-Forwarded-Proto")
-    host = headers.get("Host")
+    scheme = headers.get("x-forwarded-proto")
+    host = headers.get("host")
 
     if not scheme or not host:
         return default_origin
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index 7073f0b1..f2608f91 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -401,7 +401,7 @@ def test_get_collection(crawler_auth_headers, default_org_id):
 def test_get_collection_replay(crawler_auth_headers, default_org_id):
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/replay.json",
-        headers=crawler_auth_headers,
+        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
     )
     assert r.status_code == 200
     data = r.json()
@@ -421,8 +421,9 @@
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
     assert data["initialPages"]
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    assert (
+        data["pagesQueryUrl"]
+        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/pages"
     )
     assert data["downloadUrl"] is None
     assert "preloadResources" in data
@@ -455,12 +455,13 @@ def test_collection_public(crawler_auth_headers, default_org_id):
 
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
-        headers=crawler_auth_headers,
+        headers={"host": "custom-domain.example.com", **crawler_auth_headers},
     )
     data = r.json()
     assert data["initialPages"]
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    assert (
+        data["pagesQueryUrl"]
+        == f"http://custom-domain.example.com/api/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
     )
     assert data["downloadUrl"] is not None
     assert "preloadResources" in data
diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py
index f4f847e8..b82f718b 100644
--- a/backend/test/test_stop_cancel_crawl.py
+++ b/backend/test/test_stop_cancel_crawl.py
@@ -176,3 +176,12 @@ def test_stop_crawl_partial(
     assert data["stopping"] == True
 
     assert len(data["resources"]) == 1
+
+
+def test_crawl_with_hostname(default_org_id, crawler_auth_headers):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+        headers={"X-Forwarded-Proto": "https", "host": "custom-domain.example.com", **crawler_auth_headers},
+    )
+    assert r.status_code == 200
+    assert r.json()["pagesQueryUrl"].startswith("https://custom-domain.example.com/")
diff --git a/backend/test_nightly/test_org_deletion.py b/backend/test_nightly/test_org_deletion.py
index 291ba01b..a8692588 100644
--- a/backend/test_nightly/test_org_deletion.py
+++ b/backend/test_nightly/test_org_deletion.py
@@ -170,8 +170,6 @@ def test_delete_org_crawl_running(
         except:
             time.sleep(10)
 
-
-
             attempts += 1
 
     # Check that org was deleted