Add ISO-639-1 language code validation to backend (#2602)

- Add backend validation for language codes
- Add migration to look for invalid ISO-639-1 language codes in
workflows, crawls, and org crawling defaults, and fix any found
This commit is contained in:
Tessa Walsh 2025-05-13 16:54:33 -04:00 committed by GitHub
parent e17772145e
commit 1492397656
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 338 additions and 2 deletions

View File

@ -46,7 +46,13 @@ from .models import (
CrawlerProxies,
ValidateCustomBehavior,
)
from .utils import dt_now, slug_from_name, validate_regexes, is_url
from .utils import (
dt_now,
slug_from_name,
validate_regexes,
validate_language_code,
is_url,
)
if TYPE_CHECKING:
from .orgs import OrgOps
@ -235,6 +241,9 @@ class CrawlConfigOps:
self._validate_link_selectors(config_in.config.selectLinks)
if config_in.config.lang:
validate_language_code(config_in.config.lang)
if config_in.config.customBehaviors:
for url in config_in.config.customBehaviors:
self._validate_custom_behavior_url_syntax(url)
@ -406,6 +415,9 @@ class CrawlConfigOps:
for url in update.config.customBehaviors:
self._validate_custom_behavior_url_syntax(url)
if update.config and update.config.lang:
validate_language_code(update.config.lang)
# indicates if any k8s crawl config settings changed
changed = False
changed = changed or (

View File

@ -32,7 +32,7 @@ else:
) = PageOps = BackgroundJobOps = object
CURR_DB_VERSION = "0045"
CURR_DB_VERSION = "0046"
# ============================================================================

View File

@ -0,0 +1,265 @@
"""
Migration 0046 - Invalid language codes
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0046"

# Two-letter language codes that this migration treats as valid.
# Kept as a frozen, self-contained listing (same entries, same order) so the
# migration does not depend on any external package at run time.
ISO_639_1_CODES = (
    "aa ab af ak am ar an as av ae ay az ba bm be bn "
    "bi bo bs br bg ca cs ch ce cu cv kw co cr cy da "
    "de dv dz el en eo et eu ee fo fa fj fi fr fy ff "
    "gd ga gl gv gn gu ht ha sh he hz hi ho hr hu hy "
    "ig io ii iu ie ia id ik is it jv ja kl kn ks ka "
    "kr kk km ki rw ky kv kg ko kj ku lo la lv li ln "
    "lt lb lu lg mh ml mr mk mg mt mn mi ms my na nv "
    "nr nd ng ne nl nn nb no ny oc oj or om os pa pi "
    "pl pt ps qu rm ro rn ru sg sa si sk sl se sm sn "
    "sd so st es sq sc sr ss su sw sv ty ta tt te tg "
    "tl th ti to tn ts tk tr tw ug uk ur uz ve vi vo "
    "wa wo xh yi yo za zh zu"
).split()
# pylint: disable=duplicate-code
class Migration(BaseMigration):
    """Migration 0046: replace invalid stored language codes.

    Any language value that is neither null nor listed in ISO_639_1_CODES is
    overwritten with "en" in workflows, crawls, and org crawling defaults.
    """

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Replace any invalid ISO-639-1 language codes that may be saved in
        the database with "en".

        Each collection is handled independently: a failure while updating
        one of them is printed and the remaining collections are still
        processed.
        """
        # None is part of the $nin list, so documents whose language is null
        # are not selected for rewriting.
        accepted_values = [None, *ISO_639_1_CODES]

        # (collection, dotted field path, noun for the success message,
        #  noun for the failure message) -- processed in this order.
        targets = (
            (self.mdb["crawl_configs"], "config.lang", "workflows", "crawl workflows"),
            (self.mdb["crawls"], "config.lang", "crawls", "crawls"),
            (
                self.mdb["organizations"],
                "crawlingDefaults.lang",
                "orgs",
                "org crawling defaults",
            ),
        )

        for collection, lang_field, fixed_noun, failed_noun in targets:
            try:
                outcome = await collection.update_many(
                    {lang_field: {"$nin": accepted_values}},
                    {"$set": {lang_field: "en"}},
                )
                print(
                    f"Fixed invalid language code for {outcome.modified_count} {fixed_noun}",
                    flush=True,
                )
            # pylint: disable=broad-exception-caught
            except Exception as err:
                print(
                    f"Unable to update invalid language codes for {failed_noun}: {err}",
                    flush=True,
                )

View File

@ -86,6 +86,7 @@ from .utils import (
slug_from_name,
validate_slug,
get_duplicate_key_error_field,
validate_language_code,
JSONSerializer,
)
@ -654,6 +655,9 @@ class OrgOps:
self, org: Organization, defaults: CrawlConfigDefaults
):
"""Update crawling defaults"""
if defaults.lang:
validate_language_code(defaults.lang)
res = await self.orgs.find_one_and_update(
{"_id": org.id},
{"$set": {"crawlingDefaults": defaults.model_dump()}},

View File

@ -16,6 +16,7 @@ from uuid import UUID
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from iso639 import is_language
from pymongo.errors import DuplicateKeyError
from slugify import slugify
@ -193,3 +194,9 @@ def validate_regexes(regexes: List[str]):
except re.error:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail="invalid_regex")
def validate_language_code(lang: str):
    """Validate ISO-639-1 language code, raise HTTPException if invalid

    The value is checked with ``is_language`` against the "pt1" identifier.
    Returns None when the check passes; otherwise raises an HTTPException
    with status 400 and detail "invalid_lang".
    """
    if is_language(lang, "pt1"):
        return

    raise HTTPException(status_code=400, detail="invalid_lang")

View File

@ -28,3 +28,4 @@ types-pyYAML
remotezip
json-stream
aiostream
iso639-lang>=2.6.0

View File

@ -172,6 +172,7 @@ def test_update_config_invalid_exclude_regex(
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_update_config_invalid_link_selector(
crawler_auth_headers, default_org_id, sample_crawl_data
):
@ -191,6 +192,20 @@ def test_update_config_invalid_link_selector(
assert r.status_code == 400
assert r.json()["detail"] == "invalid_link_selector"
def test_update_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Updating a workflow with an invalid language code returns 400 invalid_lang."""
    # one-letter, three-letter, and full-name values
    bad_codes = ("f", "fra", "french")

    for bad_code in bad_codes:
        payload = {"config": {"lang": bad_code}}
        resp = requests.patch(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
            headers=crawler_auth_headers,
            json=payload,
        )

        assert resp.status_code == 400
        assert resp.json()["detail"] == "invalid_lang"
def test_verify_default_select_links(
crawler_auth_headers, default_org_id, sample_crawl_data
):
@ -577,6 +592,20 @@ def test_add_crawl_config_invalid_exclude_regex(
assert r.json()["detail"] == "invalid_regex"
def test_add_crawl_config_invalid_lang(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Creating a workflow with an invalid language code returns 400 invalid_lang."""
    create_url = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/"

    for bad_code in ("f", "fra", "french"):
        # the fixture dict itself carries the invalid value into the request
        sample_crawl_data["config"]["lang"] = bad_code
        resp = requests.post(
            create_url,
            headers=crawler_auth_headers,
            json=sample_crawl_data,
        )

        assert resp.status_code == 400
        assert resp.json()["detail"] == "invalid_lang"
def test_add_crawl_config_invalid_link_selectors(
crawler_auth_headers, default_org_id, sample_crawl_data
):

View File

@ -81,6 +81,20 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
]
def test_update_org_crawling_defaults_invalid_lang(admin_auth_headers, default_org_id):
    """Setting org crawling defaults with an invalid language code returns 400.

    Each candidate value (one-letter, three-letter, full name) is posted in
    turn and must be rejected with detail "invalid_lang".
    """
    for invalid_code in ("f", "fra", "french"):
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
            headers=admin_auth_headers,
            json={
                # send the loop value itself, not the literal text "invalid_code",
                # so every candidate is actually exercised
                "lang": invalid_code,
            },
        )

        assert r.status_code == 400
        assert r.json()["detail"] == "invalid_lang"
def test_rename_org(admin_auth_headers, default_org_id):
UPDATED_NAME = "updated org name"
UPDATED_SLUG = "updated-org-name"

View File

@ -2340,6 +2340,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
"Page exclusion contains invalid regex",
);
break;
case APIErrorDetail.InvalidLang:
errorDetailMessage = msg("Invalid language code");
break;
default:
break;
}

View File

@ -40,6 +40,7 @@ export type APISortQuery<T = Record<string, unknown>> = {
export enum APIErrorDetail {
InvalidLinkSelector = "invalid_link_selector",
InvalidRegex = "invalid_regex",
InvalidLang = "invalid_lang",
InvalidCustomBehavior = "invalid_custom_behavior",
CustomBehaviorNotFound = "custom_behavior_not_found",
CustomBehaviorBranchNotFound = "custom_behavior_branch_not_found",