From 1492397656840f33ba02bd604395e96039211211 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 13 May 2025 16:54:33 -0400
Subject: [PATCH] Add ISO-639-1 language code validation to backend (#2602)

- Add backend validation for language codes
- Add migration to look for invalid ISO-639-1 language codes in workflows, crawls, and org crawling defaults, and fix any found
---
 backend/btrixcloud/crawlconfigs.py            |  14 +-
 backend/btrixcloud/db.py                      |   2 +-
 .../migrations/migration_0046_invalid_lang.py | 265 ++++++++++++++++++
 backend/btrixcloud/orgs.py                    |   4 +
 backend/btrixcloud/utils.py                   |   7 +
 backend/requirements.txt                      |   1 +
 backend/test/test_crawlconfigs.py             |  29 ++
 backend/test/test_org.py                      |  14 +
 .../crawl-workflows/workflow-editor.ts        |   3 +
 frontend/src/types/api.ts                     |   1 +
 10 files changed, 338 insertions(+), 2 deletions(-)
 create mode 100644 backend/btrixcloud/migrations/migration_0046_invalid_lang.py

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index c0ab98f6..28cbc6a5 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -46,7 +46,13 @@ from .models import (
     CrawlerProxies,
     ValidateCustomBehavior,
 )
-from .utils import dt_now, slug_from_name, validate_regexes, is_url
+from .utils import (
+    dt_now,
+    slug_from_name,
+    validate_regexes,
+    validate_language_code,
+    is_url,
+)
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -235,6 +241,9 @@ class CrawlConfigOps:
 
         self._validate_link_selectors(config_in.config.selectLinks)
 
+        if config_in.config.lang:
+            validate_language_code(config_in.config.lang)
+
         if config_in.config.customBehaviors:
             for url in config_in.config.customBehaviors:
                 self._validate_custom_behavior_url_syntax(url)
@@ -406,6 +415,9 @@ class CrawlConfigOps:
             for url in update.config.customBehaviors:
                 self._validate_custom_behavior_url_syntax(url)
 
+        if update.config and update.config.lang:
+            validate_language_code(update.config.lang)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py
index 471de598..e27b499d 100644
--- a/backend/btrixcloud/db.py
+++ b/backend/btrixcloud/db.py
@@ -32,7 +32,7 @@ else:
     ) = PageOps = BackgroundJobOps = object
 
 
-CURR_DB_VERSION = "0045"
+CURR_DB_VERSION = "0046"
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/migrations/migration_0046_invalid_lang.py b/backend/btrixcloud/migrations/migration_0046_invalid_lang.py
new file mode 100644
index 00000000..98541a0b
--- /dev/null
+++ b/backend/btrixcloud/migrations/migration_0046_invalid_lang.py
@@ -0,0 +1,265 @@
+"""
+Migration 0046 - Invalid language codes
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0046"
+
+ISO_639_1_CODES = [
+    "aa",
+    "ab",
+    "af",
+    "ak",
+    "am",
+    "ar",
+    "an",
+    "as",
+    "av",
+    "ae",
+    "ay",
+    "az",
+    "ba",
+    "bm",
+    "be",
+    "bn",
+    "bi",
+    "bo",
+    "bs",
+    "br",
+    "bg",
+    "ca",
+    "cs",
+    "ch",
+    "ce",
+    "cu",
+    "cv",
+    "kw",
+    "co",
+    "cr",
+    "cy",
+    "da",
+    "de",
+    "dv",
+    "dz",
+    "el",
+    "en",
+    "eo",
+    "et",
+    "eu",
+    "ee",
+    "fo",
+    "fa",
+    "fj",
+    "fi",
+    "fr",
+    "fy",
+    "ff",
+    "gd",
+    "ga",
+    "gl",
+    "gv",
+    "gn",
+    "gu",
+    "ht",
+    "ha",
+    "sh",
+    "he",
+    "hz",
+    "hi",
+    "ho",
+    "hr",
+    "hu",
+    "hy",
+    "ig",
+    "io",
+    "ii",
+    "iu",
+    "ie",
+    "ia",
+    "id",
+    "ik",
+    "is",
+    "it",
+    "jv",
+    "ja",
+    "kl",
+    "kn",
+    "ks",
+    "ka",
+    "kr",
+    "kk",
+    "km",
+    "ki",
+    "rw",
+    "ky",
+    "kv",
+    "kg",
+    "ko",
+    "kj",
+    "ku",
+    "lo",
+    "la",
+    "lv",
+    "li",
+    "ln",
+    "lt",
+    "lb",
+    "lu",
+    "lg",
+    "mh",
+    "ml",
+    "mr",
+    "mk",
+    "mg",
+    "mt",
+    "mn",
+    "mi",
+    "ms",
+    "my",
+    "na",
+    "nv",
+    "nr",
+    "nd",
+    "ng",
+    "ne",
+    "nl",
+    "nn",
+    "nb",
+    "no",
+    "ny",
+    "oc",
+    "oj",
+    "or",
+    "om",
+    "os",
+    "pa",
+    "pi",
+    "pl",
+    "pt",
+    "ps",
+    "qu",
+    "rm",
+    "ro",
+    "rn",
+    "ru",
+    "sg",
+    "sa",
+    "si",
+    "sk",
+    "sl",
+    "se",
+    "sm",
+    "sn",
+    "sd",
+    "so",
+    "st",
+    "es",
+    "sq",
+    "sc",
+    "sr",
+    "ss",
+    "su",
+    "sw",
+    "sv",
+    "ty",
+    "ta",
+    "tt",
+    "te",
+    "tg",
+    "tl",
+    "th",
+    "ti",
+    "to",
+    "tn",
+    "ts",
+    "tk",
+    "tr",
+    "tw",
+    "ug",
+    "uk",
+    "ur",
+    "uz",
+    "ve",
+    "vi",
+    "vo",
+    "wa",
+    "wo",
+    "xh",
+    "yi",
+    "yo",
+    "za",
+    "zh",
+    "zu",
+]
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Replace any invalid ISO-639-1 language codes that may be saved in
+        the database with "en".
+        """
+        configs_mdb = self.mdb["crawl_configs"]
+        crawls_mdb = self.mdb["crawls"]
+        orgs_mdb = self.mdb["organizations"]
+
+        # Workflows
+        try:
+            result = await configs_mdb.update_many(
+                {"config.lang": {"$nin": [None, *ISO_639_1_CODES]}},
+                {"$set": {"config.lang": "en"}},
+            )
+            print(
+                f"Fixed invalid language code for {result.modified_count} workflows",
+                flush=True,
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Unable to update invalid language codes for crawl workflows: {err}",
+                flush=True,
+            )
+
+        # Crawls
+        try:
+            result = await crawls_mdb.update_many(
+                {"config.lang": {"$nin": [None, *ISO_639_1_CODES]}},
+                {"$set": {"config.lang": "en"}},
+            )
+            print(
+                f"Fixed invalid language code for {result.modified_count} crawls",
+                flush=True,
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Unable to update invalid language codes for crawls: {err}",
+                flush=True,
+            )
+
+        # Org crawling defaults
+        try:
+            result = await orgs_mdb.update_many(
+                {"crawlingDefaults.lang": {"$nin": [None, *ISO_639_1_CODES]}},
+                {"$set": {"crawlingDefaults.lang": "en"}},
+            )
+            print(
+                f"Fixed invalid language code for {result.modified_count} orgs",
+                flush=True,
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Unable to update invalid language codes for org crawling defaults: {err}",
+                flush=True,
+            )
diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py
index bbaa790c..40130a31 100644
--- a/backend/btrixcloud/orgs.py
+++ b/backend/btrixcloud/orgs.py
@@ -86,6 +86,7 @@ from .utils import (
     slug_from_name,
     validate_slug,
     get_duplicate_key_error_field,
+    validate_language_code,
     JSONSerializer,
 )
 
@@ -654,6 +655,9 @@ class OrgOps:
         self, org: Organization, defaults: CrawlConfigDefaults
     ):
         """Update crawling defaults"""
+        if defaults.lang:
+            validate_language_code(defaults.lang)
+
         res = await self.orgs.find_one_and_update(
             {"_id": org.id},
             {"$set": {"crawlingDefaults": defaults.model_dump()}},
diff --git a/backend/btrixcloud/utils.py b/backend/btrixcloud/utils.py
index ad72a2ad..0c145c31 100644
--- a/backend/btrixcloud/utils.py
+++ b/backend/btrixcloud/utils.py
@@ -16,6 +16,7 @@ from uuid import UUID
 
 from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
+from iso639 import is_language
 from pymongo.errors import DuplicateKeyError
 from slugify import slugify
 
@@ -193,3 +194,9 @@ def validate_regexes(regexes: List[str]):
         except re.error:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=400, detail="invalid_regex")
+
+
+def validate_language_code(lang: str):
+    """Validate ISO-639-1 language code, raise HTTPException if invalid"""
+    if not is_language(lang, "pt1"):
+        raise HTTPException(status_code=400, detail="invalid_lang")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index e1a863cc..21c7d4ee 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -28,3 +28,4 @@ types-pyYAML
 remotezip
 json-stream
 aiostream
+iso639-lang>=2.6.0
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index b94eed1f..eaafb10c 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -172,6 +172,7 @@ def test_update_config_invalid_exclude_regex(
     assert r.status_code == 400
     assert r.json()["detail"] == "invalid_regex"
 
+
 def test_update_config_invalid_link_selector(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):
@@ -191,6 +192,20 @@ def test_update_config_invalid_link_selector(
     assert r.status_code == 400
     assert r.json()["detail"] == "invalid_link_selector"
 
+
+def test_update_config_invalid_lang(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    for invalid_code in ("f", "fra", "french"):
+        r = requests.patch(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+            headers=crawler_auth_headers,
+            json={"config": {"lang": invalid_code}},
+        )
+        assert r.status_code == 400
+        assert r.json()["detail"] == "invalid_lang"
+
+
 def test_verify_default_select_links(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):
@@ -577,6 +592,20 @@ def test_add_crawl_config_invalid_exclude_regex(
     assert r.json()["detail"] == "invalid_regex"
 
 
+def test_add_crawl_config_invalid_lang(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    for invalid_code in ("f", "fra", "french"):
+        sample_crawl_data["config"]["lang"] = invalid_code
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+            headers=crawler_auth_headers,
+            json=sample_crawl_data,
+        )
+        assert r.status_code == 400
+        assert r.json()["detail"] == "invalid_lang"
+
+
 def test_add_crawl_config_invalid_link_selectors(
     crawler_auth_headers, default_org_id, sample_crawl_data
 ):
diff --git a/backend/test/test_org.py b/backend/test/test_org.py
index 755e3711..5c41a0fa 100644
--- a/backend/test/test_org.py
+++ b/backend/test/test_org.py
@@ -81,6 +81,20 @@ def test_update_org_crawling_defaults(admin_auth_headers, default_org_id):
     ]
 
 
+def test_update_org_crawling_defaults_invalid_lang(admin_auth_headers, default_org_id):
+    for invalid_code in ("f", "fra", "french"):
+        r = requests.post(
+            f"{API_PREFIX}/orgs/{default_org_id}/defaults/crawling",
+            headers=admin_auth_headers,
+            json={
+                "lang": invalid_code,  # send each invalid code in turn
+            },
+        )
+
+        assert r.status_code == 400
+        assert r.json()["detail"] == "invalid_lang"
+
+
 def test_rename_org(admin_auth_headers, default_org_id):
     UPDATED_NAME = "updated org name"
     UPDATED_SLUG = "updated-org-name"
diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts
index 0a02c63e..d5bad688 100644
--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -2340,6 +2340,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
               "Page exclusion contains invalid regex",
             );
             break;
+          case APIErrorDetail.InvalidLang:
+            errorDetailMessage = msg("Invalid language code");
+            break;
           default:
             break;
         }
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 2491fb52..c5a3c28f 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -40,6 +40,7 @@ export type APISortQuery> = {
 export enum APIErrorDetail {
   InvalidLinkSelector = "invalid_link_selector",
   InvalidRegex = "invalid_regex",
+  InvalidLang = "invalid_lang",
   InvalidCustomBehavior = "invalid_custom_behavior",
   CustomBehaviorNotFound = "custom_behavior_not_found",
   CustomBehaviorBranchNotFound = "custom_behavior_branch_not_found",