Source code for kerb.preprocessing.language

"""Language detection functionality."""

import re
from typing import List

from .enums import LanguageDetectionMode
from .types import LanguageResult



[docs]
def detect_language(
    text: str, mode: LanguageDetectionMode = LanguageDetectionMode.FAST
) -> LanguageResult:
    """Identify the language a piece of text is written in.

    Input that is empty, or shorter than three characters once surrounding
    whitespace is stripped, is reported as ``"unknown"`` with zero
    confidence. In ``ACCURATE`` mode the optional ``langdetect`` package is
    tried first; when it is not installed, raises, or yields no result, the
    built-in heuristic detector is used instead. Every other mode goes
    straight to the heuristic detector.

    Args:
        text: Input text.
        mode: Detection mode. Only ``ACCURATE`` is treated specially here;
            the value is forwarded unchanged to the heuristic detector.

    Returns:
        LanguageResult with the detected language code and a confidence.
        Results produced by langdetect additionally carry up to three
        runner-up ``(language, probability)`` pairs as alternatives.

    Examples:
        >>> detect_language("Hello world").language
        'en'
        >>> detect_language("こんにちは世界").language
        'ja'
        >>> detect_language("  ").language
        'unknown'
    """
    too_short = not text or len(text.strip()) < 3
    if too_short:
        return LanguageResult(language="unknown", confidence=0.0)

    if mode == LanguageDetectionMode.ACCURATE:
        try:
            from langdetect import detect_langs

            ranked = detect_langs(text)
            if ranked:
                best, runners_up = ranked[0], ranked[1:4]
                return LanguageResult(
                    language=best.lang,
                    confidence=best.prob,
                    alternatives=[(cand.lang, cand.prob) for cand in runners_up],
                )
        except Exception:
            # Deliberate best-effort: a missing langdetect install
            # (ImportError) and any failure langdetect raises on the input
            # both fall through to the heuristic detector below.
            pass

    return _detect_language_heuristic(text, mode)




[docs]
def detect_language_batch(
    texts: List[str], mode: LanguageDetectionMode = LanguageDetectionMode.FAST
) -> List[LanguageResult]:
    """Detect the language of every text in a list.

    Each text is passed to ``detect_language`` independently with the same
    ``mode``; results are returned in input order.

    Args:
        texts: List of input texts.
        mode: Detection mode applied to every text.

    Returns:
        One LanguageResult per input text, in the same order.

    Examples:
        >>> results = detect_language_batch(["Hello world", "こんにちは世界"])
        >>> [r.language for r in results]
        ['en', 'ja']
    """
    detected: List[LanguageResult] = []
    for item in texts:
        detected.append(detect_language(item, mode))
    return detected




[docs]
def is_language(text: str, language: str, threshold: float = 0.5) -> bool:
    """Tell whether text is detected as a given language.

    Detection runs through ``detect_language`` with its default mode. The
    check passes only when the detected code equals ``language`` and the
    reported confidence is at least ``threshold``.

    Args:
        text: Input text.
        language: Language code to check for (e.g., 'en', 'fr').
        threshold: Minimum confidence (inclusive) required for a match.

    Returns:
        True if the text is detected as ``language`` with sufficient
        confidence, otherwise False.

    Examples:
        >>> is_language("The cat is on the mat", "en")
        True
        >>> is_language("こんにちは世界", "en")
        False
    """
    detected = detect_language(text)
    if detected.language != language:
        return False
    return detected.confidence >= threshold




[docs]
def filter_by_language(
    texts: List[str], language: str, threshold: float = 0.5
) -> List[str]:
    """Keep only the texts detected as a given language.

    Each text is tested with ``is_language``; texts that pass are returned
    in their original order.

    Args:
        texts: List of texts.
        language: Language code to keep.
        threshold: Minimum confidence required for a text to be kept.

    Returns:
        List of the texts detected as ``language``.

    Examples:
        >>> filter_by_language(["The cat is on the mat", "こんにちは世界"], "en")
        ['The cat is on the mat']
    """

    def _in_requested_language(candidate: str) -> bool:
        return is_language(candidate, language, threshold)

    return list(filter(_in_requested_language, texts))




[docs]
def get_supported_languages() -> List[str]:
    """Get list of supported languages.

    The base list holds the codes the built-in heuristic detection can
    return, including the ``"unknown"`` sentinel. When the optional
    ``langdetect`` package can be imported, further codes are appended.
    Every code appears at most once; heuristic codes come first.

    Returns:
        List of language codes

    Examples:
        >>> langs = get_supported_languages()
        >>> "en" in langs
        True
        >>> len(langs) >= 20
        True
        >>> len(langs) == len(set(langs))
        True
    """
    # Languages supported by heuristic detection
    heuristic_langs = [
        "en",  # English
        "fr",  # French
        "de",  # German
        "es",  # Spanish
        "pt",  # Portuguese
        "it",  # Italian
        "nl",  # Dutch
        "pl",  # Polish
        "ro",  # Romanian
        "cs",  # Czech
        "tr",  # Turkish
        "sv",  # Swedish
        "no",  # Norwegian
        "da",  # Danish
        "fi",  # Finnish
        "hu",  # Hungarian
        "ru",  # Russian
        "ar",  # Arabic
        "he",  # Hebrew
        "zh",  # Chinese
        "ja",  # Japanese
        "ko",  # Korean
        "th",  # Thai
        "hi",  # Hindi
        "el",  # Greek
        "unknown",
    ]

    try:
        # Imported purely as an availability probe; the value is not used.
        from langdetect import PROFILES_DIRECTORY  # noqa: F401
    except ImportError:
        return heuristic_langs

    # Extra codes reported only when langdetect is installed. "he" is not
    # repeated here because the heuristic list already contains it.
    langdetect_only = [
        "af",
        "sq",
        "am",
        "bg",
        "bn",
        "ca",
        "hr",
        "et",
        "tl",
        "ka",
        "gu",
        "ht",
        "id",
        "ga",
        "kn",
        "lv",
        "lt",
        "mk",
        "ml",
        "mr",
        "mn",
        "ne",
        "pa",
        "fa",
        "sk",
        "sl",
        "so",
        "sw",
        "ta",
        "te",
        "uk",
        "ur",
        "vi",
        "cy",
        "yi",
    ]

    # dict.fromkeys keeps first-seen order and guards against a code being
    # listed in both groups.
    return list(dict.fromkeys(heuristic_langs + langdetect_only))



# ============================================================================
# Helper Functions
# ============================================================================


def _detect_language_heuristic(
    text: str, mode: LanguageDetectionMode
) -> LanguageResult:
    """Combine script detection with Latin-script word/diacritic analysis.

    Decision order:
      1. A script-based result with confidence above 0.85 is returned as is.
      2. If the script pass reported ``"en"`` or ``"unknown"``, the
         lowercased text is analysed as Latin-script text; that result is
         used only when its confidence is strictly greater than 0.6.
      3. Otherwise the script-based result is returned when its confidence
         exceeds 0.3, else ``"unknown"`` with confidence 0.2.

    Args:
        text: Input text.
        mode: Detection mode. Accepted for interface symmetry; this function
            does not consult it.

    Returns:
        LanguageResult chosen by the rules above.
    """
    by_script = _detect_by_script(text)

    # A clearly dominant script settles the question on its own.
    if by_script.confidence > 0.85:
        return by_script

    # Latin-looking or undecided text gets the word/diacritic analysis.
    if by_script.language in ("en", "unknown"):
        by_words = _detect_latin_language(text.lower())
        if by_words.confidence > 0.6:
            return by_words

    # Nothing better found: keep the script verdict if it is not too weak.
    if by_script.confidence > 0.3:
        return by_script
    return LanguageResult(language="unknown", confidence=0.2)


def _detect_by_script(text: str) -> LanguageResult:
    """Detect language by the dominant Unicode script of the text.

    Every character is classified by code point into one of the tracked
    scripts; characters outside all tracked ranges (digits, whitespace,
    punctuation, symbols, other scripts) are ignored. Ratios are computed
    against the number of classified characters only.

    Args:
        text: Input text.

    Returns:
        LanguageResult for the first rule that matches, checked in this
        order: Arabic, Hebrew, Cyrillic (reported as ``"ru"``), Greek,
        Devanagari (reported as ``"hi"``), Thai, Hangul, Japanese kana,
        CJK ideographs without any kana (``"zh"``), Latin (``"en"`` with a
        fixed low confidence of 0.4). ``"unknown"`` with confidence 0.0 is
        returned when fewer than three characters were classified, and with
        confidence 0.2 when no rule matches.
    """
    # Count characters in different Unicode ranges
    char_counts = {
        "latin": 0,
        "cyrillic": 0,
        "arabic": 0,
        "hebrew": 0,
        "cjk": 0,
        "hiragana": 0,
        "katakana": 0,
        "hangul": 0,
        "thai": 0,
        "devanagari": 0,
        "greek": 0,
    }

    for char in text:
        code = ord(char)

        # Basic Latin letters only (A-Z, a-z). The code points between the
        # two runs, 0x5B-0x60 ([ \ ] ^ _ `), are punctuation and must not be
        # counted as letters.
        if (0x0041 <= code <= 0x005A) or (0x0061 <= code <= 0x007A):
            char_counts["latin"] += 1
        # Latin-1 Supplement letters through Latin Extended-B, skipping the
        # multiplication (0xD7) and division (0xF7) signs inside that range.
        elif (0x00C0 <= code <= 0x024F) and code not in (0x00D7, 0x00F7):
            char_counts["latin"] += 1
        # Cyrillic
        elif 0x0400 <= code <= 0x04FF:
            char_counts["cyrillic"] += 1
        # Arabic
        elif (0x0600 <= code <= 0x06FF) or (0x0750 <= code <= 0x077F):
            char_counts["arabic"] += 1
        # Hebrew
        elif 0x0590 <= code <= 0x05FF:
            char_counts["hebrew"] += 1
        # Greek
        elif 0x0370 <= code <= 0x03FF:
            char_counts["greek"] += 1
        # Devanagari (Hindi, Marathi, Nepali)
        elif 0x0900 <= code <= 0x097F:
            char_counts["devanagari"] += 1
        # Thai
        elif 0x0E00 <= code <= 0x0E7F:
            char_counts["thai"] += 1
        # Hangul (Korean)
        elif (
            (0x1100 <= code <= 0x11FF)
            or (0x3130 <= code <= 0x318F)
            or (0xAC00 <= code <= 0xD7AF)
        ):
            char_counts["hangul"] += 1
        # Hiragana (Japanese)
        elif 0x3040 <= code <= 0x309F:
            char_counts["hiragana"] += 1
        # Katakana (Japanese)
        elif 0x30A0 <= code <= 0x30FF:
            char_counts["katakana"] += 1
        # CJK Unified Ideographs (Chinese/Japanese/Korean)
        elif (0x4E00 <= code <= 0x9FFF) or (0x3400 <= code <= 0x4DBF):
            char_counts["cjk"] += 1

    total_chars = sum(char_counts.values())
    if total_chars < 3:
        return LanguageResult(language="unknown", confidence=0.0)

    # Scripts that map directly onto one reported language, in priority
    # order: (script, language, minimum ratio, confidence cap, bonus).
    # Confidence is the script's ratio plus the bonus, limited by the cap.
    # Cyrillic could be Russian, Ukrainian, Bulgarian, etc., so it gets the
    # lowest cap and bonus.
    dominant_scripts = (
        ("arabic", "ar", 0.3, 0.95, 0.2),
        ("hebrew", "he", 0.3, 0.95, 0.2),
        ("cyrillic", "ru", 0.3, 0.85, 0.1),
        ("greek", "el", 0.3, 0.95, 0.2),
        ("devanagari", "hi", 0.3, 0.90, 0.15),
        ("thai", "th", 0.3, 0.95, 0.2),
        ("hangul", "ko", 0.2, 0.95, 0.25),
    )
    for script, lang, min_ratio, cap, bonus in dominant_scripts:
        ratio = char_counts[script] / total_chars
        if ratio > min_ratio:
            return LanguageResult(language=lang, confidence=min(cap, ratio + bonus))

    # Japanese detection (prioritize Hiragana/Katakana): even a modest share
    # of kana marks the text as Japanese, since kana mixes with ideographs.
    japanese_chars = char_counts["hiragana"] + char_counts["katakana"]
    if japanese_chars / total_chars > 0.1:
        return LanguageResult(
            language="ja",
            confidence=min(0.95, (japanese_chars / total_chars) * 2 + 0.3),
        )

    # Chinese if CJK chars without Japanese kana
    if char_counts["cjk"] / total_chars > 0.3 and japanese_chars == 0:
        return LanguageResult(
            language="zh", confidence=min(0.90, char_counts["cjk"] / total_chars + 0.15)
        )

    # Latin script: report "en" with low confidence as a placeholder; the
    # script alone cannot tell Latin-script languages apart.
    if char_counts["latin"] / total_chars > 0.5:
        return LanguageResult(language="en", confidence=0.4)

    return LanguageResult(language="unknown", confidence=0.2)


# Per-language profiles for Latin-script detection, built once at import
# time: language code -> (regex matching characteristic diacritics,
# frozenset of frequent function words). Dictionary order matters: on equal
# scores the language listed first wins.
_LATIN_LANGUAGE_PROFILES = {
    "fr": (
        re.compile(r"[àâæçéèêëîïôùûüÿœ]"),
        frozenset((
            "le", "la", "les", "de", "et", "est", "un", "une", "dans", "pour",
            "que", "qui", "avec", "ce", "il", "ne", "pas", "se", "vous", "sont",
        )),
    ),
    "de": (
        re.compile(r"[äöüß]"),
        frozenset((
            "der", "die", "das", "und", "ist", "ein", "eine", "nicht", "mit", "den",
            "sich", "auf", "für", "von", "dem", "zu", "im", "werden", "auch", "wie",
        )),
    ),
    "es": (
        re.compile(r"[áéíñóúü¿¡]"),
        frozenset((
            "el", "la", "de", "que", "y", "en", "un", "es", "por", "los",
            "una", "con", "del", "las", "al", "se", "lo", "como", "más", "pero",
        )),
    ),
    "pt": (
        re.compile(r"[ãõáàâéêíóôõúüç]"),
        frozenset((
            "o", "a", "de", "que", "e", "do", "da", "em", "um", "para",
            "com", "não", "os", "as", "dos", "uma", "na", "no", "ao", "ser",
        )),
    ),
    "it": (
        re.compile(r"[àèéìíîòóùú]"),
        frozenset((
            "il", "di", "e", "la", "che", "per", "un", "non", "in", "una",
            "è", "sono", "del", "le", "da", "si", "con", "dei", "alla", "anche",
        )),
    ),
    "pl": (
        re.compile(r"[ąćęłńóśźż]"),
        frozenset((
            "się", "na", "jest", "z", "do", "i", "w", "nie", "to", "co",
            "o", "za", "od", "po", "dla", "te", "jak", "ze", "może", "być",
        )),
    ),
    "ro": (
        re.compile(r"[ăâîșțşţ]"),
        frozenset((
            "de", "în", "și", "la", "cu", "pe", "ca", "pentru", "este", "un",
            "o", "ce", "din", "al", "se", "sunt", "să", "mai", "sau", "a",
        )),
    ),
    "cs": (
        re.compile(r"[áčďéěíňóřšťúůýž]"),
        frozenset((
            "je", "se", "na", "v", "že", "a", "s", "z", "o", "k",
            "do", "i", "to", "jako", "pro", "jsou", "si", "od", "po", "ale",
        )),
    ),
    "tr": (
        re.compile(r"[çğıİöşü]"),
        frozenset((
            "ve", "bir", "bu", "için", "ile", "olan", "da", "de", "var", "mi",
            "ne", "olarak", "daha", "gibi", "en", "her", "kadar", "çok", "o", "ya",
        )),
    ),
    "sv": (
        re.compile(r"[åäö]"),
        frozenset((
            "och", "att", "i", "en", "är", "det", "som", "på", "för", "med",
            "till", "av", "om", "har", "den", "inte", "var", "ett", "han", "men",
        )),
    ),
    "no": (
        re.compile(r"[åæø]"),
        frozenset((
            "og", "i", "det", "er", "en", "til", "på", "som", "for", "med",
            "ikke", "av", "han", "har", "den", "var", "om", "så", "hun", "kan",
        )),
    ),
    "da": (
        re.compile(r"[åæø]"),
        frozenset((
            "og", "i", "det", "er", "at", "en", "til", "på", "som", "for",
            "med", "ikke", "den", "af", "har", "de", "han", "var", "jeg", "om",
        )),
    ),
    "nl": (
        re.compile(r"[áéíóúàèëïöü]"),
        frozenset((
            "de", "het", "en", "van", "een", "in", "is", "dat", "op", "te",
            "voor", "met", "die", "aan", "niet", "als", "zijn", "wordt", "ook", "om",
        )),
    ),
    "fi": (
        re.compile(r"[äö]"),
        frozenset((
            "ja", "on", "ei", "että", "se", "oli", "kun", "hän", "mutta", "tai",
            "olla", "ovat", "voi", "kuin", "niin", "jos", "siitä", "olen", "ne", "mitä",
        )),
    ),
    "hu": (
        re.compile(r"[áéíóöőúüű]"),
        frozenset((
            "a", "az", "és", "van", "egy", "hogy", "nem", "meg", "de", "ha",
            "volt", "is", "ki", "csak", "mint", "már", "el", "be", "még", "le",
        )),
    ),
}

# Frequent English function words; English has no diacritic signal.
_ENGLISH_COMMON_WORDS = frozenset((
    "the", "is", "are", "of", "and", "to", "in", "a", "that", "it",
    "for", "as", "with", "was", "be", "on", "at", "by", "this", "have",
))

# Lowercase Latin-script words: ASCII a-z, Latin-1 letters (ß-ö, ø-ÿ; the
# division sign 0xF7 is skipped) and Latin Extended-A/B (U+0100-U+024F).
# The extended range is required so that words such as "się", "być", "și",
# "že" or "için" are tokenised at all and can match their word lists.
_LATIN_WORD_RE = re.compile(r"\b[a-z\u00df-\u00f6\u00f8-\u024f]+\b")


def _detect_latin_language(text_lower: str) -> LanguageResult:
    """Detect language for Latin-script text using diacritics and common words.

    Each profiled language is scored from two signals, each capped at 0.6:
    the density of its characteristic diacritics in the text, and the share
    of words that are among its frequent function words. English is scored
    from function words only (capped at 0.95, or a 0.2 baseline when none
    match) and gets a 0.2 boost when at least one word matched and more
    than 95% of the characters are ASCII.

    Args:
        text_lower: Input text, already lowercased by the caller.

    Returns:
        LanguageResult for the highest-scoring language, with confidence
        capped at 0.95 and up to three runner-up ``(language, confidence)``
        alternatives whose score exceeds 0.3 (also capped at 0.95, so an
        alternative never reports more confidence than the winner).
        ``"unknown"`` with confidence 0.2 when the best score is below 0.3.
    """
    words = _LATIN_WORD_RE.findall(text_lower)
    word_count = len(words)
    # Guard against division by zero for empty input.
    text_length = max(1, len(text_lower))

    scores = {}

    # Score each profiled language
    for lang, (diacritic_re, common_words) in _LATIN_LANGUAGE_PROFILES.items():
        score = 0.0

        # Diacritics (strong signal): density scaled by 20, capped at 0.6
        diacritic_matches = len(diacritic_re.findall(text_lower))
        if diacritic_matches > 0:
            score += min(0.6, diacritic_matches / text_length * 20)

        # Common words (moderate signal): match ratio scaled by 4, capped at 0.6
        if word_count > 0:
            common_word_matches = sum(1 for word in words if word in common_words)
            score += min(0.6, common_word_matches / word_count * 4)

        scores[lang] = score

    # English detection (no diacritics, English common words)
    if word_count > 0:
        en_matches = sum(1 for word in words if word in _ENGLISH_COMMON_WORDS)
        if en_matches > 0:
            en_score = min(0.95, en_matches / word_count * 3)
            # Boost English if mostly ASCII and no diacritics
            ascii_ratio = sum(1 for c in text_lower if ord(c) < 128) / text_length
            if ascii_ratio > 0.95:
                en_score += 0.2
        else:
            en_score = 0.2
        scores["en"] = en_score

    # Highest score first; the sort is stable, so ties keep profile order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best_lang, best_score = ranked[0]
    if best_score < 0.3:
        return LanguageResult(language="unknown", confidence=0.2)

    # Raw scores can exceed 0.95 (two 0.6 signals, or the English boost), so
    # the winner and the alternatives are capped the same way.
    alternatives = [
        (lang, min(0.95, score)) for lang, score in ranked[1:4] if score > 0.3
    ]

    return LanguageResult(
        language=best_lang,
        confidence=min(0.95, best_score),
        alternatives=alternatives,
    )