browsertrix/backend/btrixcloud/migrations/migration_0046_invalid_lang.py
Tessa Walsh 1492397656
Add ISO-639-1 language code validation to backend (#2602)
- Add backend validation for language codes
- Add migration to look for invalid ISO-639-1 language codes in
workflows, crawls, and org crawling defaults, and fix any found
2025-05-13 16:54:33 -04:00

266 lines
4.2 KiB
Python

"""
Migration 0046 - Invalid language codes
"""
from btrixcloud.migrations import BaseMigration
MIGRATION_VERSION = "0046"
ISO_639_1_CODES = [
"aa",
"ab",
"af",
"ak",
"am",
"ar",
"an",
"as",
"av",
"ae",
"ay",
"az",
"ba",
"bm",
"be",
"bn",
"bi",
"bo",
"bs",
"br",
"bg",
"ca",
"cs",
"ch",
"ce",
"cu",
"cv",
"kw",
"co",
"cr",
"cy",
"da",
"de",
"dv",
"dz",
"el",
"en",
"eo",
"et",
"eu",
"ee",
"fo",
"fa",
"fj",
"fi",
"fr",
"fy",
"ff",
"gd",
"ga",
"gl",
"gv",
"gn",
"gu",
"ht",
"ha",
"sh",
"he",
"hz",
"hi",
"ho",
"hr",
"hu",
"hy",
"ig",
"io",
"ii",
"iu",
"ie",
"ia",
"id",
"ik",
"is",
"it",
"jv",
"ja",
"kl",
"kn",
"ks",
"ka",
"kr",
"kk",
"km",
"ki",
"rw",
"ky",
"kv",
"kg",
"ko",
"kj",
"ku",
"lo",
"la",
"lv",
"li",
"ln",
"lt",
"lb",
"lu",
"lg",
"mh",
"ml",
"mr",
"mk",
"mg",
"mt",
"mn",
"mi",
"ms",
"my",
"na",
"nv",
"nr",
"nd",
"ng",
"ne",
"nl",
"nn",
"nb",
"no",
"ny",
"oc",
"oj",
"or",
"om",
"os",
"pa",
"pi",
"pl",
"pt",
"ps",
"qu",
"rm",
"ro",
"rn",
"ru",
"sg",
"sa",
"si",
"sk",
"sl",
"se",
"sm",
"sn",
"sd",
"so",
"st",
"es",
"sq",
"sc",
"sr",
"ss",
"su",
"sw",
"sv",
"ty",
"ta",
"tt",
"te",
"tg",
"tl",
"th",
"ti",
"to",
"tn",
"ts",
"tk",
"tr",
"tw",
"ug",
"uk",
"ur",
"uz",
"ve",
"vi",
"vo",
"wa",
"wo",
"xh",
"yi",
"yo",
"za",
"zh",
"zu",
]
# pylint: disable=duplicate-code
class Migration(BaseMigration):
"""Migration class."""
# pylint: disable=unused-argument
def __init__(self, mdb, **kwargs):
super().__init__(mdb, migration_version=MIGRATION_VERSION)
async def migrate_up(self):
"""Perform migration up.
Replace any invalid ISO-639-1 language codes that may be saved in
the database with "en".
"""
configs_mdb = self.mdb["crawl_configs"]
crawls_mdb = self.mdb["crawls"]
orgs_mdb = self.mdb["organizations"]
# Workflows
try:
result = await configs_mdb.update_many(
{"config.lang": {"$nin": [None, *ISO_639_1_CODES]}},
{"$set": {"config.lang": "en"}},
)
print(
f"Fixed invalid language code for {result.modified_count} workflows",
flush=True,
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to update invalid language codes for crawl workflows: {err}",
flush=True,
)
# Crawls
try:
result = await crawls_mdb.update_many(
{"config.lang": {"$nin": [None, *ISO_639_1_CODES]}},
{"$set": {"config.lang": "en"}},
)
print(
f"Fixed invalid language code for {result.modified_count} crawls",
flush=True,
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to update invalid language codes for crawls: {err}",
flush=True,
)
# Org crawling defaults
try:
result = await orgs_mdb.update_many(
{"crawlingDefaults.lang": {"$nin": [None, *ISO_639_1_CODES]}},
{"$set": {"crawlingDefaults.lang": "en"}},
)
print(
f"Fixed invalid language code for {result.modified_count} orgs",
flush=True,
)
# pylint: disable=broad-exception-caught
except Exception as err:
print(
f"Unable to update invalid language codes for org crawling defaults: {err}",
flush=True,
)