From d3e241ad030d6274f71ff4e3219d7a2802dab3a7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Fri, 1 Aug 2025 02:20:58 -0400 Subject: [PATCH] Validate seed files on backend and add tests (#2781) Fixes #2780 This PR adds additional backend validation for seed file uploads to fail a seed upload if no valid seeds are found. It adds two new test cases to ensure seed uploads will fail for binary files and for text files that do not contain any valid URLs. --- backend/btrixcloud/file_uploads.py | 20 ++++++++++++++++---- backend/test/data/invalid-seedfile.txt | 2 ++ backend/test/test_files.py | 22 ++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 backend/test/data/invalid-seedfile.txt diff --git a/backend/btrixcloud/file_uploads.py b/backend/btrixcloud/file_uploads.py index 82bf42c7..3c34bdd5 100644 --- a/backend/btrixcloud/file_uploads.py +++ b/backend/btrixcloud/file_uploads.py @@ -24,7 +24,7 @@ from .models import ( PaginatedUserFileResponse, ) from .pagination import DEFAULT_PAGE_SIZE, paginated_format -from .utils import dt_now +from .utils import dt_now, is_url from .storages import StorageOps, CHUNK_SIZE if TYPE_CHECKING: @@ -245,6 +245,13 @@ class FileUploadOps: ) first_seed, seed_count = await self._parse_seed_info_from_file(file_obj, org) + if not first_seed or seed_count == 0: + print( + f"{upload_type} stream upload failed: invalid seed file", + flush=True, + ) + await self.storage_ops.delete_file_object(org, file_obj) + raise HTTPException(status_code=400, detail="invalid_seed_file") # Save file to database file_to_insert = SeedFile( @@ -293,9 +300,14 @@ class FileUploadOps: if not line: continue - if not first_seed: - first_seed = line.decode("utf-8").strip() - seed_count += 1 + try: + seed_url = line.decode("utf-8").strip() + if not first_seed and is_url(seed_url): + first_seed = seed_url + seed_count += 1 + # pylint: disable=broad-exception-caught + except Exception: + pass return first_seed, seed_count diff --git a/backend/test/data/invalid-seedfile.txt b/backend/test/data/invalid-seedfile.txt new file mode 100644 index 00000000..d77eb3fa --- /dev/null +++ b/backend/test/data/invalid-seedfile.txt @@ -0,0 +1,2 @@ +notaurl +{"also": "not a url"} diff --git a/backend/test/test_files.py b/backend/test/test_files.py index 01a9ea3c..aab896aa 100644 --- a/backend/test/test_files.py +++ b/backend/test/test_files.py @@ -97,3 +97,25 @@ def test_delete_seed_file(crawler_auth_headers, default_org_id): headers=crawler_auth_headers, ) assert r.status_code == 404 + + +def test_invalid_seed_file_upload(crawler_auth_headers, default_org_id): + # Ensure we can't upload a binary file as a seed file + with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh: + r = requests.put( + f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=imposter.txt", + headers=crawler_auth_headers, + data=read_in_chunks(fh), + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_seed_file" + + # Ensure "seed file" with no valid seeds also fails to upload + with open(os.path.join(curr_dir, "data", "invalid-seedfile.txt"), "rb") as fh: + r = requests.put( + f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=novalidseeds.txt", + headers=crawler_auth_headers, + data=read_in_chunks(fh), + ) + assert r.status_code == 400 + assert r.json()["detail"] == "invalid_seed_file"