# Source code for kerb.preprocessing.enums

"""Enumeration types for text preprocessing."""

from enum import Enum


class NormalizationLevel(Enum):
    """Text normalization intensity.

    Each member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g.
    ``NormalizationLevel("standard")``.
    """

    # Only basic whitespace normalization.
    MINIMAL = "minimal"
    # Standard cleaning (recommended).
    STANDARD = "standard"
    # Remove most non-alphanumeric content.
    AGGRESSIVE = "aggressive"
class LanguageDetectionMode(Enum):
    """Language detection strategy.

    Each member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g.
    ``LanguageDetectionMode("fast")``.
    """

    # Fast heuristic-based detection.
    FAST = "fast"
    # More accurate but slower.
    ACCURATE = "accurate"
    # Simple character-based detection.
    SIMPLE = "simple"
class DeduplicationMode(Enum):
    """Deduplication strategy.

    Each member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g.
    ``DeduplicationMode("exact")``.
    """

    # Exact string matching.
    EXACT = "exact"
    # Fuzzy matching (similar strings).
    FUZZY = "fuzzy"
    # Semantic similarity (requires embeddings).
    SEMANTIC = "semantic"
class ContentType(Enum):
    """Text content type classification.

    Each member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g.
    ``ContentType("markdown")``.
    """

    PLAIN_TEXT = "plain_text"
    CODE = "code"
    MARKDOWN = "markdown"
    HTML = "html"
    JSON = "json"
    MIXED = "mixed"
    UNKNOWN = "unknown"