Source code for kerb.preprocessing.language

"""Language detection functionality."""

import re
from typing import List

from .enums import LanguageDetectionMode
from .types import LanguageResult


def detect_language(
    text: str, mode: LanguageDetectionMode = LanguageDetectionMode.FAST
) -> LanguageResult:
    """Identify the language of ``text``.

    In ``ACCURATE`` mode the optional ``langdetect`` package is tried first;
    if it is not installed, raises, or returns no candidates, detection falls
    through to the built-in heuristic. Every other mode goes straight to the
    heuristic (script detection followed by Latin-script word/diacritic
    scoring).

    Args:
        text: Input text.
        mode: Detection mode. Only ``ACCURATE`` changes the path taken by
            this function; the mode is otherwise just forwarded to the
            heuristic helper.

    Returns:
        LanguageResult with the detected language code and a confidence.
        Empty input, or input whose stripped length is below 3 characters,
        yields ``language="unknown"`` with ``confidence=0.0``. When
        ``langdetect`` answers, up to three runner-up ``(lang, prob)`` pairs
        are attached as ``alternatives``.

    Examples:
        >>> detect_language("Hello world").language
        'en'
        >>> detect_language("こんにちは世界").language
        'ja'
    """
    too_short = not text or len(text.strip()) < 3
    if too_short:
        return LanguageResult(language="unknown", confidence=0.0)

    if mode == LanguageDetectionMode.ACCURATE:
        try:
            from langdetect import detect_langs

            ranked = detect_langs(text)
            if ranked:
                runners_up = [
                    (candidate.lang, candidate.prob) for candidate in ranked[1:4]
                ]
                best = ranked[0]
                return LanguageResult(
                    language=best.lang,
                    confidence=best.prob,
                    alternatives=runners_up,
                )
        except Exception:
            # Deliberate best-effort: a missing langdetect install
            # (ImportError) and langdetect failing on odd input are both
            # handled by falling back to the heuristic below.
            pass

    return _detect_language_heuristic(text, mode)
def detect_language_batch(
    texts: List[str], mode: LanguageDetectionMode = LanguageDetectionMode.FAST
) -> List[LanguageResult]:
    """Run :func:`detect_language` over several texts.

    Args:
        texts: Input texts, processed in order.
        mode: Detection mode applied to every text.

    Returns:
        One LanguageResult per input text, in the same order.

    Examples:
        >>> results = detect_language_batch(["Hello world", "こんにちは世界"])
        >>> [r.language for r in results]
        ['en', 'ja']
    """
    detected: List[LanguageResult] = []
    for item in texts:
        detected.append(detect_language(item, mode))
    return detected
def is_language(text: str, language: str, threshold: float = 0.5) -> bool:
    """Report whether ``text`` is detected as ``language`` confidently enough.

    Detection uses :func:`detect_language` with its default mode.

    Args:
        text: Input text.
        language: Language code to check for (e.g. ``'en'``, ``'fr'``).
        threshold: Minimum confidence the detection must reach.

    Returns:
        True only when the detected code equals ``language`` and the
        reported confidence is at least ``threshold``.

    Examples:
        >>> is_language("This is the house", "en")
        True
    """
    detected = detect_language(text)
    if detected.language != language:
        return False
    return detected.confidence >= threshold
def filter_by_language(
    texts: List[str], language: str, threshold: float = 0.5
) -> List[str]:
    """Keep only the texts that :func:`is_language` accepts.

    Args:
        texts: Candidate texts.
        language: Language code to keep.
        threshold: Minimum detection confidence, forwarded to
            :func:`is_language`.

    Returns:
        The matching texts, in their original order.

    Examples:
        >>> filter_by_language(["This is the house", "こんにちは世界"], "en")
        ['This is the house']
    """
    kept: List[str] = []
    for candidate in texts:
        if is_language(candidate, language, threshold):
            kept.append(candidate)
    return kept
def get_supported_languages() -> List[str]:
    """Get the list of language codes this module can report.

    The built-in heuristic codes are always returned (including the
    ``"unknown"`` sentinel). When the optional ``langdetect`` package can be
    imported, additional codes are appended. Each code appears exactly once.

    Returns:
        List of language codes: heuristic codes first, then any extra codes
        in their declared order.

    Examples:
        >>> langs = get_supported_languages()
        >>> "en" in langs
        True
        >>> len(langs) >= 20
        True
    """
    # Languages supported by heuristic detection
    heuristic_langs = [
        "en",  # English
        "fr",  # French
        "de",  # German
        "es",  # Spanish
        "pt",  # Portuguese
        "it",  # Italian
        "nl",  # Dutch
        "pl",  # Polish
        "ro",  # Romanian
        "cs",  # Czech
        "tr",  # Turkish
        "sv",  # Swedish
        "no",  # Norwegian
        "da",  # Danish
        "fi",  # Finnish
        "hu",  # Hungarian
        "ru",  # Russian
        "ar",  # Arabic
        "he",  # Hebrew
        "zh",  # Chinese
        "ja",  # Japanese
        "ko",  # Korean
        "th",  # Thai
        "hi",  # Hindi
        "el",  # Greek
        "unknown",
    ]

    try:
        # Availability probe only; nothing from the package is used here.
        import langdetect  # noqa: F401
    except ImportError:
        return heuristic_langs

    # NOTE(review): this list looks hand-maintained rather than derived from
    # langdetect's shipped profiles -- confirm the codes against the
    # installed langdetect version.
    langdetect_extra = [
        "af", "sq", "am", "bg", "bn", "ca", "hr", "et", "tl", "ka", "gu", "ht",
        "he", "id", "ga", "kn", "lv", "lt", "mk", "ml", "mr", "mn", "ne", "pa",
        "fa", "sk", "sl", "so", "sw", "ta", "te", "uk", "ur", "vi", "cy", "yi",
    ]

    # "he" is already a heuristic language; skip anything already listed so
    # the combined result contains no duplicates.
    already_listed = set(heuristic_langs)
    return heuristic_langs + [
        code for code in langdetect_extra if code not in already_listed
    ]
# ============================================================================
# Helper Functions
# ============================================================================

# Inclusive code-point ranges for each script counted by _detect_by_script.
# The Latin entry lists letters only: 0x5B-0x60 ("[", "\", "]", "^", "_",
# "`") and the signs U+00D7 / U+00F7 sit inside the surrounding ranges but
# are not letters, so they are excluded.
_SCRIPT_RANGES = (
    (
        "latin",
        (
            (0x0041, 0x005A),
            (0x0061, 0x007A),
            (0x00C0, 0x00D6),
            (0x00D8, 0x00F6),
            (0x00F8, 0x024F),
        ),
    ),
    ("cyrillic", ((0x0400, 0x04FF),)),
    ("arabic", ((0x0600, 0x06FF), (0x0750, 0x077F))),
    ("hebrew", ((0x0590, 0x05FF),)),
    ("greek", ((0x0370, 0x03FF),)),
    ("devanagari", ((0x0900, 0x097F),)),
    ("thai", ((0x0E00, 0x0E7F),)),
    ("hangul", ((0x1100, 0x11FF), (0x3130, 0x318F), (0xAC00, 0xD7AF))),
    ("hiragana", ((0x3040, 0x309F),)),
    ("katakana", ((0x30A0, 0x30FF),)),
    ("cjk", ((0x4E00, 0x9FFF), (0x3400, 0x4DBF))),
)

# Scripts that map directly to one language, evaluated in this order:
# (script, language, minimum share of counted chars, confidence cap, bonus).
_SINGLE_SCRIPT_RULES = (
    ("arabic", "ar", 0.3, 0.95, 0.2),
    ("hebrew", "he", 0.3, 0.95, 0.2),
    # Cyrillic could be Russian, Ukrainian, Bulgarian, etc.; capped lower.
    ("cyrillic", "ru", 0.3, 0.85, 0.1),
    ("greek", "el", 0.3, 0.95, 0.2),
    ("devanagari", "hi", 0.3, 0.90, 0.15),
    ("thai", "th", 0.3, 0.95, 0.2),
    ("hangul", "ko", 0.2, 0.95, 0.25),
)

# Lower-case Latin-script word: ASCII a-z, the lower-case Latin-1 letters
# (ß-ö, ø-ÿ) and Latin Extended-A/B, so words such as "się", "în" or "için"
# are tokenised instead of being dropped.
_LATIN_WORD_RE = re.compile(
    r"\b[a-z\u00df-\u00f6\u00f8-\u00ff\u0100-\u024f]+\b"
)


def _latin_profile(diacritics, common_words):
    """Build a (compiled diacritic regex, frozenset of common words) pair."""
    return re.compile(diacritics), frozenset(common_words)


# Per-language diacritic class and common function words. Insertion order is
# significant: it decides which language wins when scores tie.
_LATIN_LANGUAGE_PROFILES = {
    "fr": _latin_profile(
        r"[àâæçéèêëîïôùûüÿœ]",
        [
            "le", "la", "les", "de", "et", "est", "un", "une", "dans", "pour",
            "que", "qui", "avec", "ce", "il", "ne", "pas", "se", "vous", "sont",
        ],
    ),
    "de": _latin_profile(
        r"[äöüß]",
        [
            "der", "die", "das", "und", "ist", "ein", "eine", "nicht", "mit",
            "den", "sich", "auf", "für", "von", "dem", "zu", "im", "werden",
            "auch", "wie",
        ],
    ),
    "es": _latin_profile(
        r"[áéíñóúü¿¡]",
        [
            "el", "la", "de", "que", "y", "en", "un", "es", "por", "los",
            "una", "con", "del", "las", "al", "se", "lo", "como", "más", "pero",
        ],
    ),
    "pt": _latin_profile(
        r"[ãõáàâéêíóôõúüç]",
        [
            "o", "a", "de", "que", "e", "do", "da", "em", "um", "para", "com",
            "não", "os", "as", "dos", "uma", "na", "no", "ao", "ser",
        ],
    ),
    "it": _latin_profile(
        r"[àèéìíîòóùú]",
        [
            "il", "di", "e", "la", "che", "per", "un", "non", "in", "una",
            "è", "sono", "del", "le", "da", "si", "con", "dei", "alla", "anche",
        ],
    ),
    "pl": _latin_profile(
        r"[ąćęłńóśźż]",
        [
            "się", "na", "jest", "z", "do", "i", "w", "nie", "to", "co", "o",
            "za", "od", "po", "dla", "te", "jak", "ze", "może", "być",
        ],
    ),
    "ro": _latin_profile(
        r"[ăâîșțşţ]",
        [
            "de", "în", "și", "la", "cu", "pe", "ca", "pentru", "este", "un",
            "o", "ce", "din", "al", "se", "sunt", "să", "mai", "sau", "a",
        ],
    ),
    "cs": _latin_profile(
        r"[áčďéěíňóřšťúůýž]",
        [
            "je", "se", "na", "v", "že", "a", "s", "z", "o", "k", "do", "i",
            "to", "jako", "pro", "jsou", "si", "od", "po", "ale",
        ],
    ),
    "tr": _latin_profile(
        r"[çğıİöşü]",
        [
            "ve", "bir", "bu", "için", "ile", "olan", "da", "de", "var", "mi",
            "ne", "olarak", "daha", "gibi", "en", "her", "kadar", "çok", "o",
            "ya",
        ],
    ),
    "sv": _latin_profile(
        r"[åäö]",
        [
            "och", "att", "i", "en", "är", "det", "som", "på", "för", "med",
            "till", "av", "om", "har", "den", "inte", "var", "ett", "han", "men",
        ],
    ),
    "no": _latin_profile(
        r"[åæø]",
        [
            "og", "i", "det", "er", "en", "til", "på", "som", "for", "med",
            "ikke", "av", "han", "har", "den", "var", "om", "så", "hun", "kan",
        ],
    ),
    "da": _latin_profile(
        r"[åæø]",
        [
            "og", "i", "det", "er", "at", "en", "til", "på", "som", "for",
            "med", "ikke", "den", "af", "har", "de", "han", "var", "jeg", "om",
        ],
    ),
    "nl": _latin_profile(
        r"[áéíóúàèëïöü]",
        [
            "de", "het", "en", "van", "een", "in", "is", "dat", "op", "te",
            "voor", "met", "die", "aan", "niet", "als", "zijn", "wordt", "ook",
            "om",
        ],
    ),
    "fi": _latin_profile(
        r"[äö]",
        [
            "ja", "on", "ei", "että", "se", "oli", "kun", "hän", "mutta",
            "tai", "olla", "ovat", "voi", "kuin", "niin", "jos", "siitä",
            "olen", "ne", "mitä",
        ],
    ),
    "hu": _latin_profile(
        r"[áéíóöőúüű]",
        [
            "a", "az", "és", "van", "egy", "hogy", "nem", "meg", "de", "ha",
            "volt", "is", "ki", "csak", "mint", "már", "el", "be", "még", "le",
        ],
    ),
}

_ENGLISH_COMMON_WORDS = frozenset(
    [
        "the", "is", "are", "of", "and", "to", "in", "a", "that", "it", "for",
        "as", "with", "was", "be", "on", "at", "by", "this", "have",
    ]
)


def _detect_language_heuristic(
    text: str, mode: LanguageDetectionMode
) -> LanguageResult:
    """Detect a language without external libraries.

    Script detection runs first. A script verdict with confidence above 0.85
    is returned immediately. If the script verdict is ``"en"`` (mostly Latin
    letters) or ``"unknown"``, Latin-script scoring is tried and accepted
    when its confidence exceeds 0.6. Otherwise the script verdict is
    returned if its confidence exceeds 0.3, else ``"unknown"`` at 0.2.

    Args:
        text: Input text.
        mode: Detection mode. Accepted for interface compatibility; this
            function does not currently branch on it.

    Returns:
        LanguageResult with the chosen language and confidence.
    """
    # Character-based script detection (highest priority)
    script_result = _detect_by_script(text)
    if script_result.confidence > 0.85:
        return script_result

    # Latin-script language detection with diacritic and common-word analysis
    if script_result.language in ("en", "unknown"):
        latin_result = _detect_latin_language(text.lower())
        if latin_result.confidence > 0.6:
            return latin_result

    # Return script-based result if nothing better found
    if script_result.confidence > 0.3:
        return script_result
    return LanguageResult(language="unknown", confidence=0.2)


def _detect_by_script(text: str) -> LanguageResult:
    """Detect language from the Unicode scripts of the characters in ``text``.

    Only characters falling in one of the known script ranges are counted;
    everything else (digits, whitespace, punctuation, other scripts) is
    ignored. Shares below are relative to the counted characters.

    Args:
        text: Input text.

    Returns:
        LanguageResult. ``"unknown"`` with confidence 0.0 when fewer than
        three characters were counted. Mostly-Latin text returns ``"en"``
        with confidence 0.4 as a placeholder to be refined by word-level
        analysis. If no rule matches, ``"unknown"`` with confidence 0.2.
    """
    char_counts = {script: 0 for script, _ in _SCRIPT_RANGES}
    for char in text:
        code = ord(char)
        for script, ranges in _SCRIPT_RANGES:
            if any(low <= code <= high for low, high in ranges):
                char_counts[script] += 1
                break

    total_chars = sum(char_counts.values())
    if total_chars < 3:
        return LanguageResult(language="unknown", confidence=0.0)

    # Scripts that identify a language on their own
    for script, language, min_share, cap, bonus in _SINGLE_SCRIPT_RULES:
        share = char_counts[script] / total_chars
        if share > min_share:
            return LanguageResult(
                language=language, confidence=min(cap, share + bonus)
            )

    # Japanese detection: any meaningful amount of kana wins over Chinese,
    # since Japanese text mixes kana with CJK ideographs.
    japanese_chars = char_counts["hiragana"] + char_counts["katakana"]
    if japanese_chars / total_chars > 0.1:
        return LanguageResult(
            language="ja",
            confidence=min(0.95, (japanese_chars / total_chars) * 2 + 0.3),
        )

    # Chinese if CJK chars without Japanese kana
    if char_counts["cjk"] / total_chars > 0.3 and japanese_chars == 0:
        return LanguageResult(
            language="zh",
            confidence=min(0.90, char_counts["cjk"] / total_chars + 0.15),
        )

    # Latin script - low confidence, needs further analysis
    if char_counts["latin"] / total_chars > 0.5:
        return LanguageResult(language="en", confidence=0.4)

    return LanguageResult(language="unknown", confidence=0.2)


def _detect_latin_language(text_lower: str) -> LanguageResult:
    """Score Latin-script languages using diacritics and common words.

    Each profiled language gets up to 0.6 for the density of its
    characteristic diacritics plus up to 0.6 for the share of words that are
    in its common-word list. English is scored from common words only, with
    a 0.2 bonus when the text is more than 95% ASCII.

    Args:
        text_lower: Input text, already lower-cased by the caller.

    Returns:
        LanguageResult for the top-scoring language with confidence capped
        at 0.95, plus up to three alternatives scoring above 0.3 (also
        capped at 0.95). ``"unknown"`` with confidence 0.2 when no language
        scores at least 0.3.
    """
    words = _LATIN_WORD_RE.findall(text_lower)
    word_count = len(words)
    text_len = max(1, len(text_lower))

    scores = {}
    for lang, (diacritics, common_words) in _LATIN_LANGUAGE_PROFILES.items():
        score = 0.0

        # Diacritics (strong signal)
        diacritic_matches = len(diacritics.findall(text_lower))
        if diacritic_matches > 0:
            score += min(0.6, diacritic_matches / text_len * 20)

        # Common words (moderate signal)
        if word_count > 0:
            common_word_matches = sum(1 for word in words if word in common_words)
            score += min(0.6, common_word_matches / word_count * 4)

        scores[lang] = score

    # English detection (no diacritics, English common words)
    if word_count > 0:
        en_matches = sum(1 for word in words if word in _ENGLISH_COMMON_WORDS)
        if en_matches > 0:
            en_score = min(0.95, en_matches / word_count * 3)
            # Boost English if mostly ASCII
            ascii_ratio = sum(1 for c in text_lower if ord(c) < 128) / text_len
            if ascii_ratio > 0.95:
                en_score += 0.2
        else:
            en_score = 0.2
        scores["en"] = en_score

    # The sort is stable, so ties keep profile order and the first entry is
    # the same language max() would pick.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best_lang, best_score = ranked[0]
    if best_score < 0.3:
        return LanguageResult(language="unknown", confidence=0.2)

    # Raw scores can exceed 1.0 (two 0.6 components, or the English bonus);
    # cap alternatives the same way as the primary confidence.
    alternatives = [
        (lang, min(0.95, score)) for lang, score in ranked[1:4] if score > 0.3
    ]
    return LanguageResult(
        language=best_lang,
        confidence=min(0.95, best_score),
        alternatives=alternatives,
    )