Validate seed files on backend and add tests (#2781)
Fixes #2780. This PR adds additional backend validation for seed file uploads, so that a seed upload fails if no valid seeds are found. It also adds two new test cases to ensure that seed uploads fail for binary files and for text files that do not contain any valid URLs.
This commit is contained in:
parent
7047cbeda9
commit
d3e241ad03
@ -24,7 +24,7 @@ from .models import (
|
||||
PaginatedUserFileResponse,
|
||||
)
|
||||
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
|
||||
from .utils import dt_now
|
||||
from .utils import dt_now, is_url
|
||||
from .storages import StorageOps, CHUNK_SIZE
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -245,6 +245,13 @@ class FileUploadOps:
|
||||
)
|
||||
|
||||
first_seed, seed_count = await self._parse_seed_info_from_file(file_obj, org)
|
||||
if not first_seed or seed_count == 0:
|
||||
print(
|
||||
f"{upload_type} stream upload failed: invalid seed file",
|
||||
flush=True,
|
||||
)
|
||||
await self.storage_ops.delete_file_object(org, file_obj)
|
||||
raise HTTPException(status_code=400, detail="invalid_seed_file")
|
||||
|
||||
# Save file to database
|
||||
file_to_insert = SeedFile(
|
||||
@ -293,9 +300,14 @@ class FileUploadOps:
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if not first_seed:
|
||||
first_seed = line.decode("utf-8").strip()
|
||||
seed_count += 1
|
||||
try:
|
||||
seed_url = line.decode("utf-8").strip()
|
||||
if not first_seed and is_url(seed_url):
|
||||
first_seed = seed_url
|
||||
seed_count += 1
|
||||
# pylint: disable=broad-exception-caught
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return first_seed, seed_count
|
||||
|
||||
|
2
backend/test/data/invalid-seedfile.txt
Normal file
2
backend/test/data/invalid-seedfile.txt
Normal file
@ -0,0 +1,2 @@
|
||||
notaurl
|
||||
{"also": "not a url"}
|
@ -97,3 +97,25 @@ def test_delete_seed_file(crawler_auth_headers, default_org_id):
|
||||
headers=crawler_auth_headers,
|
||||
)
|
||||
assert r.status_code == 404
|
||||
|
||||
|
||||
def test_invalid_seed_file_upload(crawler_auth_headers, default_org_id):
|
||||
# Ensure we can't upload a binary file as a seed file
|
||||
with open(os.path.join(curr_dir, "data", "thumbnail.jpg"), "rb") as fh:
|
||||
r = requests.put(
|
||||
f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=imposter.txt",
|
||||
headers=crawler_auth_headers,
|
||||
data=read_in_chunks(fh),
|
||||
)
|
||||
assert r.status_code == 400
|
||||
assert r.json()["detail"] == "invalid_seed_file"
|
||||
|
||||
# Ensure "seed file" with no valid seeds also fails to upload
|
||||
with open(os.path.join(curr_dir, "data", "invalid-seedfile.txt"), "rb") as fh:
|
||||
r = requests.put(
|
||||
f"{API_PREFIX}/orgs/{default_org_id}/files/seedFile?filename=novalidseeds.txt",
|
||||
headers=crawler_auth_headers,
|
||||
data=read_in_chunks(fh),
|
||||
)
|
||||
assert r.status_code == 400
|
||||
assert r.json()["detail"] == "invalid_seed_file"
|
||||
|
Loading…
Reference in New Issue
Block a user