Validate seed files on backend and add tests (#2781)

Fixes #2780 

This PR adds backend validation for seed file uploads so that an upload
fails if no valid seeds are found. It also adds two new test cases to
ensure seed uploads fail for binary files and for text files that
do not contain any valid URLs.
This commit is contained in:
Tessa Walsh 2025-08-01 02:20:58 -04:00 committed by GitHub
parent 7047cbeda9
commit d3e241ad03
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 40 additions and 4 deletions

View File

@ -24,7 +24,7 @@ from .models import (
PaginatedUserFileResponse,
)
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .utils import dt_now
from .utils import dt_now, is_url
from .storages import StorageOps, CHUNK_SIZE
if TYPE_CHECKING:
@ -245,6 +245,13 @@ class FileUploadOps:
)
first_seed, seed_count = await self._parse_seed_info_from_file(file_obj, org)
if not first_seed or seed_count == 0:
print(
f"{upload_type} stream upload failed: invalid seed file",
flush=True,
)
await self.storage_ops.delete_file_object(org, file_obj)
raise HTTPException(status_code=400, detail="invalid_seed_file")
# Save file to database
file_to_insert = SeedFile(
@ -293,9 +300,14 @@ class FileUploadOps:
if not line:
continue
if not first_seed:
first_seed = line.decode("utf-8").strip()
seed_count += 1
try:
seed_url = line.decode("utf-8").strip()
if not first_seed and is_url(seed_url):
first_seed = seed_url
seed_count += 1
# pylint: disable=broad-exception-caught
except Exception:
pass
return first_seed, seed_count

View File

@ -0,0 +1,2 @@
notaurl
{"also": "not a url"}

View File

@ -97,3 +97,25 @@ def test_delete_seed_file(crawler_auth_headers, default_org_id):
headers=crawler_auth_headers,
)
assert r.status_code == 404
def test_invalid_seed_file_upload(crawler_auth_headers, default_org_id):
    """Seed file uploads that contain no usable seeds are rejected.

    Two uploads are attempted in order: a binary image sent under a ``.txt``
    filename, and a text file with no valid seeds. Each must come back as a
    400 response whose detail is ``invalid_seed_file``.
    """
    rejected_uploads = (
        # Ensure we can't upload a binary file as a seed file
        (
            "thumbnail.jpg",
            f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=imposter.txt",
        ),
        # Ensure "seed file" with no valid seeds also fails to upload
        (
            "invalid-seedfile.txt",
            f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=novalidseeds.txt",
        ),
    )

    for fixture_name, upload_url in rejected_uploads:
        fixture_path = os.path.join(curr_dir, "data", fixture_name)
        # Keep the handle open for the whole request: the body is streamed
        # from it chunk by chunk while the PUT is in flight.
        with open(fixture_path, "rb") as fixture:
            resp = requests.put(
                upload_url,
                headers=crawler_auth_headers,
                data=read_in_chunks(fixture),
            )

        assert resp.status_code == 400
        assert resp.json()["detail"] == "invalid_seed_file"