Source code for kerb.preprocessing.types

"""Data types for text preprocessing."""

from dataclasses import dataclass, field
from typing import List, Tuple

from .enums import NormalizationLevel


[docs] @dataclass class LanguageResult: """Language detection result.""" language: str confidence: float alternatives: List[Tuple[str, float]] = field(default_factory=list)
[docs] @dataclass class QualityMetrics: """Text quality metrics.""" length: int word_count: int avg_word_length: float sentence_count: int avg_sentence_length: float special_char_ratio: float digit_ratio: float uppercase_ratio: float readability_score: float
[docs] @dataclass class NormalizationConfig: """Configuration for text normalization operations. Attributes: level: Normalization intensity level lowercase: Convert to lowercase remove_urls: Remove URLs from text remove_emails: Remove email addresses remove_extra_spaces: Remove redundant whitespace """ level: NormalizationLevel = NormalizationLevel.STANDARD lowercase: bool = False remove_urls: bool = True remove_emails: bool = True remove_extra_spaces: bool = True