"""Text normalization, cleaning, and manipulation operations."""
import html
import re
import unicodedata
from typing import List, Optional, Union
from kerb.core.enums import CaseMode, TruncateStrategy, validate_enum_or_string
from .enums import NormalizationLevel
from .types import NormalizationConfig
# ============================================================================
# Text Normalization & Cleaning
# ============================================================================
[docs]
def normalize_text(
    text: str,
    level: NormalizationLevel = NormalizationLevel.STANDARD,
    lowercase: bool = False,
    remove_urls: bool = True,
    remove_emails: bool = True,
    remove_extra_spaces: bool = True,
    config: Optional[NormalizationConfig] = None,
) -> str:
    """Run the normalization pipeline selected by ``level`` over ``text``.

    Unicode normalization always runs first, followed by the optional URL and
    e-mail removal. The level then selects the remaining steps:

    * ``MINIMAL``: whitespace normalization only (if enabled).
    * ``STANDARD``: quotes, dashes, whitespace (if enabled), control characters.
    * ``AGGRESSIVE``: quotes, dashes, special-character removal, whitespace
      (if enabled), control characters.

    Any other ``level`` value adds no level-specific step.

    Args:
        text: Input text to normalize.
        level: Normalization intensity (overridden by ``config``).
        lowercase: Lowercase the result (overridden by ``config``).
        remove_urls: Strip URLs (overridden by ``config``).
        remove_emails: Strip e-mail addresses (overridden by ``config``).
        remove_extra_spaces: Collapse redundant whitespace (overridden by
            ``config``).
        config: When given, its ``level``, ``lowercase``, ``remove_urls``,
            ``remove_emails`` and ``remove_extra_spaces`` attributes replace
            the individual keyword arguments.

    Returns:
        The normalized text with leading/trailing whitespace stripped. Falsy
        input is returned unchanged.

    Examples:
        >>> from kerb.preprocessing import NormalizationConfig, NormalizationLevel
        >>> cfg = NormalizationConfig(
        ...     level=NormalizationLevel.STANDARD,
        ...     lowercase=True,
        ...     remove_urls=True,
        ... )
        >>> normalized = normalize_text("Check this: https://example.com", config=cfg)
        >>> normalized = normalize_text("HELLO WORLD", lowercase=True)
    """
    # A config object, when present, wins over every individual keyword.
    if config is not None:
        level = config.level
        lowercase = config.lowercase
        remove_urls = config.remove_urls
        remove_emails = config.remove_emails
        remove_extra_spaces = config.remove_extra_spaces

    if not text:
        return text

    cleaned = normalize_unicode(text)

    # The boolean parameters shadow the public remove_urls/remove_emails
    # functions inside this scope, hence the private helpers.
    if remove_urls:
        cleaned = _remove_urls(cleaned)
    if remove_emails:
        cleaned = _remove_emails(cleaned)

    # Assemble the level-specific steps, then apply them in order.
    pipeline = []
    if level == NormalizationLevel.MINIMAL:
        if remove_extra_spaces:
            pipeline.append(normalize_whitespace)
    elif level == NormalizationLevel.STANDARD:
        pipeline.extend((normalize_quotes, normalize_dashes))
        if remove_extra_spaces:
            pipeline.append(normalize_whitespace)
        pipeline.append(remove_control_chars)
    elif level == NormalizationLevel.AGGRESSIVE:
        pipeline.extend((normalize_quotes, normalize_dashes))
        pipeline.append(lambda value: remove_special_chars(value, keep_basic=True))
        if remove_extra_spaces:
            pipeline.append(normalize_whitespace)
        pipeline.append(remove_control_chars)

    for step in pipeline:
        cleaned = step(cleaned)

    if lowercase:
        cleaned = cleaned.lower()
    return cleaned.strip()
[docs]
def normalize_whitespace(text: str) -> str:
    """Collapse redundant spaces and blank lines.

    Runs of spaces/tabs become a single space, three or more consecutive
    line breaks (possibly separated by whitespace) become one blank line,
    and every line is stripped of surrounding whitespace.

    Args:
        text: Input text.

    Returns:
        The text with normalized whitespace; falsy input is returned unchanged.

    Examples:
        >>> normalize_whitespace("Hello   world\\n\\n\\n\\ntest")
        'Hello world\\n\\ntest'
    """
    if not text:
        return text

    # Horizontal whitespace first, so later line handling sees single spaces.
    collapsed = re.sub(r"[ \t]+", " ", text)
    # Keep at most one empty line between paragraphs.
    collapsed = re.sub(r"\n\s*\n\s*\n+", "\n\n", collapsed)
    # Trim each line individually, then the text as a whole.
    return "\n".join(map(str.strip, collapsed.split("\n"))).strip()
[docs]
def normalize_unicode(text: str, form: str = "NFKC") -> str:
    """Apply a Unicode normalization form to ``text``.

    Args:
        text: Input text.
        form: Normalization form passed to ``unicodedata.normalize``
            (NFC, NFD, NFKC or NFKD).

    Returns:
        The normalized text; falsy input is returned unchanged.

    Examples:
        >>> normalize_unicode("cafe\\u0301") == "caf\\u00e9"
        True
    """
    return unicodedata.normalize(form, text) if text else text
[docs]
# Typographic quotation marks and their ASCII replacements. Escapes are used
# instead of literal characters so the mapping survives editors and tools
# that silently "straighten" quotes.
_QUOTE_TRANSLATION = str.maketrans(
    {
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u201e": '"',  # double low-9 quotation mark
        "\u201f": '"',  # double high-reversed-9 quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u201a": "'",  # single low-9 quotation mark
        "\u201b": "'",  # single high-reversed-9 quotation mark
    }
)


def normalize_quotes(text: str) -> str:
    r"""Convert typographic ("smart") quotes to plain ASCII quotes.

    Args:
        text: Input text.

    Returns:
        The text with curly/low/reversed double quotes replaced by ``"`` and
        their single-quote counterparts replaced by ``'``. Falsy input is
        returned unchanged.

    Examples:
        >>> normalize_quotes("\u201cHello\u201d and \u2018world\u2019")
        '"Hello" and \'world\''
    """
    if not text:
        return text
    # A single translate pass replaces the chain of str.replace calls.
    return text.translate(_QUOTE_TRANSLATION)
[docs]
def normalize_dashes(text: str) -> str:
    """Replace dash-like characters with the ASCII hyphen-minus.

    Args:
        text: Input text.

    Returns:
        The text with each handled dash variant replaced by ``-``; falsy input
        is returned unchanged.

    Examples:
        >>> normalize_dashes("em—dash and en–dash")
        'em-dash and en-dash'
    """
    if not text:
        return text

    # Em and en dash first, then the remaining dash variants.
    for dash in ("—", "–", "―", "‐", "‑", "⁃"):
        text = text.replace(dash, "-")
    return text
[docs]
def remove_accents(text: str) -> str:
    """Remove diacritical (combining) marks from text.

    The text is decomposed with NFKD, characters with a non-zero combining
    class are dropped, and the remainder is recomposed with NFC. The final
    recomposition keeps text that had no accents to begin with (for example
    Hangul syllables, which NFKD splits into separate jamo) from being
    returned in decomposed form.

    Args:
        text: Input text.

    Returns:
        The text without accents; falsy input is returned unchanged. Because
        NFKD is a compatibility decomposition, compatibility characters such
        as ligatures are also expanded.

    Examples:
        >>> remove_accents("café résumé")
        'cafe resume'
    """
    if not text:
        return text

    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() returns 0 for base characters.
    without_marks = "".join(
        char for char in decomposed if not unicodedata.combining(char)
    )
    # Recompose what NFKD split apart but which was not an accent.
    return unicodedata.normalize("NFC", without_marks)
[docs]
def clean_html(text: str, keep_newlines: bool = True) -> str:
    """Remove HTML tags and entities, returning plain text.

    Comments and the complete ``<script>``/``<style>`` elements (including
    their content) are dropped first, so that neither JavaScript/CSS source
    nor comment text containing ``>`` ends up in the output.

    Args:
        text: Input text with HTML.
        keep_newlines: Turn ``<br>``, ``</p>``, ``</div>`` and ``</li>`` into
            line breaks before the remaining tags are removed.

    Returns:
        Plain text with entities unescaped and whitespace normalized; falsy
        input is returned unchanged.

    Examples:
        >>> clean_html("<p>Hello <b>world</b></p>")
        'Hello world'
        >>> clean_html("<style>p {color: red}</style><p>Hi</p>")
        'Hi'
    """
    if not text:
        return text

    # Comments may contain ">" and would otherwise be cut at the wrong place
    # by the generic tag pattern below.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Script and style bodies are code, not document text.
    text = re.sub(
        r"<(script|style)\b[^>]*>.*?</\1\s*>",
        "",
        text,
        flags=re.IGNORECASE | re.DOTALL,
    )

    if keep_newlines:
        text = re.sub(r"<br\s*/?>|</p>|</div>|</li>", "\n", text, flags=re.IGNORECASE)

    # Remove all remaining tags.
    text = re.sub(r"<[^>]+>", "", text)
    # Unescape after tag removal so escaped markup such as "&lt;b&gt;" is kept
    # as literal text instead of being stripped as a tag.
    text = html.unescape(text)
    text = normalize_whitespace(text)
    return text.strip()
[docs]
def clean_markdown(text: str, keep_structure: bool = False) -> str:
    """Remove or reduce Markdown formatting.

    Fenced and inline code is dropped, images are removed, and links are
    replaced by their link text. Unless ``keep_structure`` is set, heading
    markers, list markers, emphasis and strikethrough markers are removed too.

    Args:
        text: Input Markdown text.
        keep_structure: Keep headings, list markers and emphasis untouched.

    Returns:
        Plain or lightly formatted text with normalized whitespace; falsy
        input is returned unchanged.

    Examples:
        >>> clean_markdown("# Hello **world**")
        'Hello world'
        >>> clean_markdown("Logo ![logo](logo.png) and [docs](https://example.com)")
        'Logo and docs'
    """
    if not text:
        return text

    result = text

    # Remove code blocks, then inline code.
    result = re.sub(r"```[\s\S]*?```", "", result)
    result = re.sub(r"`[^`]+`", "", result)

    # Images must go before links: "![alt](url)" contains "[alt](url)", so
    # handling links first would turn every image into a stray "!alt".
    result = re.sub(r"!\[([^\]]*)\]\([^)]+\)", "", result)
    # Replace links by their text.
    result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)

    if not keep_structure:
        # Heading markers.
        result = re.sub(r"^#+\s+", "", result, flags=re.MULTILINE)
        # Bullet and numbered list markers.
        result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)
        result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)
        # Emphasis: the two-character markers are handled before the
        # one-character ones so "**bold**" is not left as "*bold*".
        result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)
        result = re.sub(r"\*([^*]+)\*", r"\1", result)
        result = re.sub(r"__([^_]+)__", r"\1", result)
        result = re.sub(r"_([^_]+)_", r"\1", result)
        # Strikethrough.
        result = re.sub(r"~~([^~]+)~~", r"\1", result)

    result = normalize_whitespace(result)
    return result.strip()
[docs]
def remove_urls(text: str, replacement: str = "") -> str:
    """Remove or replace URLs.

    Args:
        text: Input text.
        replacement: String to put in place of each URL. It is interpreted
            like a ``re.sub`` replacement template.

    Returns:
        The text with ``http(s)://`` and ``www.`` URLs replaced. Sentence
        punctuation directly after a URL is kept, and surrounding whitespace
        is left untouched. Falsy input is returned unchanged.

    Examples:
        >>> remove_urls("Check https://example.com for info")
        'Check  for info'
        >>> remove_urls("See https://example.com/page. Next", "[URL]")
        'See [URL]. Next'
    """
    return _remove_urls(text, replacement)


# http(s) and bare "www." URLs in one pattern: a single pass can never
# re-match text that was inserted as the replacement.
_URL_PATTERN = re.compile(r'(?:https?://|www\.)[^\s<>"{}|\\^`\[\]]+', re.IGNORECASE)
# Characters that, at the very end of a match, are far more likely to be
# sentence punctuation than part of the URL.
_URL_TRAILING_PUNCTUATION = ".,;:!?"


def _remove_urls(text: str, replacement: str = "") -> str:
    """Internal URL removal shared by ``remove_urls`` and ``normalize_text``.

    Args:
        text: Input text.
        replacement: ``re.sub``-style replacement template for each URL.

    Returns:
        The text with URLs replaced; trailing ``.,;:!?`` after a URL is
        preserved. Falsy input is returned unchanged.
    """
    if not text:
        return text

    def _substitute(match):
        url = match.group(0)
        trailing = url[len(url.rstrip(_URL_TRAILING_PUNCTUATION)):]
        # match.expand() keeps the template semantics a plain string
        # replacement would have had in re.sub().
        return match.expand(replacement) + trailing

    return _URL_PATTERN.sub(_substitute, text)
[docs]
def remove_emails(text: str, replacement: str = "") -> str:
    """Remove or replace email addresses.

    Args:
        text: Input text.
        replacement: String to put in place of each address. It is
            interpreted like a ``re.sub`` replacement template.

    Returns:
        The text with email addresses replaced; falsy input is returned
        unchanged.

    Examples:
        >>> remove_emails("Contact me@example.com")
        'Contact '
    """
    return _remove_emails(text, replacement)


# The top-level domain class is [A-Za-z]; writing it as [A-Z|a-z] would add a
# literal "|" to the set and let matches run across pipe-separated text.
_EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")


def _remove_emails(text: str, replacement: str = "") -> str:
    """Internal email removal shared by ``remove_emails`` and ``normalize_text``.

    Args:
        text: Input text.
        replacement: ``re.sub``-style replacement template for each address.

    Returns:
        The text with email addresses replaced; falsy input is returned
        unchanged.
    """
    if not text:
        return text
    return _EMAIL_PATTERN.sub(replacement, text)
[docs]
# Phone number formats, most specific first so that the plain 10-digit form
# cannot strip the core out of an international or parenthesised number.
# "+" and "(" are not word characters, so a leading \b would only match when
# a letter or digit precedes them; (?<!\w) expresses "not glued to a word".
_PHONE_PATTERN = re.compile(
    r"(?<!\w)\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b"  # International
    r"|(?<!\w)\(\d{3}\)\s*\d{3}[-.\s]?\d{4}\b"  # (555) 123-4567
    r"|\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"  # 555-123-4567
)


def remove_phone_numbers(text: str, replacement: str = "") -> str:
    """Remove or replace phone numbers.

    Recognizes ``555-123-4567`` style numbers (with ``-``, ``.``, whitespace
    or no separator), ``(555) 123-4567`` and ``+``-prefixed international
    numbers.

    Args:
        text: Input text.
        replacement: String to put in place of each phone number. It is
            interpreted like a ``re.sub`` replacement template.

    Returns:
        The text with phone numbers replaced; falsy input is returned
        unchanged.

    Examples:
        >>> remove_phone_numbers("Call 555-123-4567")
        'Call '
        >>> remove_phone_numbers("Call (555) 123-4567")
        'Call '
    """
    if not text:
        return text
    return _PHONE_PATTERN.sub(replacement, text)
[docs]
def remove_special_chars(text: str, keep_basic: bool = True) -> str:
    """Replace special characters with spaces and collapse whitespace.

    Only ASCII letters and digits count as regular characters; every other
    non-whitespace character (including accented letters) is replaced.

    Args:
        text: Input text.
        keep_basic: Also keep the punctuation ``. , ! ? ; : ' -``.

    Returns:
        The cleaned text with all whitespace runs (line breaks included)
        reduced to single spaces and the ends stripped. Falsy input is
        returned unchanged.

    Examples:
        >>> remove_special_chars("Hello@#$world!")
        'Hello world!'
    """
    if not text:
        return text

    disallowed = r"[^a-zA-Z0-9\s.,!?;:\'-]" if keep_basic else r"[^a-zA-Z0-9\s]"
    spaced = re.sub(disallowed, " ", text)
    # Each removed character left a space behind; squeeze the runs.
    return re.sub(r"\s+", " ", spaced).strip()
[docs]
def remove_control_chars(text: str) -> str:
    """Drop characters whose Unicode category starts with ``C``.

    That covers control characters and the other ``C*`` categories (for
    example format characters). Newline, tab and carriage return are kept.

    Args:
        text: Input text.

    Returns:
        The filtered text; falsy input is returned unchanged.

    Examples:
        >>> remove_control_chars("Hello\\x00world\\x01")
        'Helloworld'
    """
    if not text:
        return text

    preserved = "\n\t\r"
    kept = [
        char
        for char in text
        if char in preserved or not unicodedata.category(char).startswith("C")
    ]
    return "".join(kept)
[docs]
def strip_punctuation(text: str, keep_internal: bool = True) -> str:
    """Replace punctuation with spaces and collapse whitespace.

    Punctuation means any character that is neither a word character
    (``\\w``, which includes the underscore) nor whitespace.

    Args:
        text: Input text.
        keep_internal: Only remove punctuation that is not enclosed by word
            characters on both sides, so apostrophes inside words survive.

    Returns:
        The text without the selected punctuation, whitespace runs reduced to
        single spaces and the ends stripped. Falsy input is returned unchanged.

    Examples:
        >>> strip_punctuation("Hello, world!")
        'Hello world'
        >>> strip_punctuation("don't stop!")
        "don't stop"
    """
    if not text:
        return text

    # First alternative: punctuation not preceded by a word character;
    # second alternative: punctuation not followed by one.
    pattern = r"(?<!\w)[^\w\s]+|[^\w\s]+(?!\w)" if keep_internal else r"[^\w\s]"
    spaced = re.sub(pattern, " ", text)
    return re.sub(r"\s+", " ", spaced).strip()
# ============================================================================
# Case Handling
# ============================================================================
[docs]
def normalize_case(text: str, mode: Union[CaseMode, str] = "sentence") -> str:
    """Convert ``text`` to the requested case.

    Args:
        text: Input text.
        mode: ``CaseMode`` member or one of the strings ``"lower"``,
            ``"upper"``, ``"title"``, ``"sentence"``. The value is passed
            through ``validate_enum_or_string`` before use.

    Returns:
        The case-converted text. Falsy input is returned unchanged, as is the
        text when the validated mode matches none of the four names.

    Examples:
        >>> normalize_case("HELLO WORLD", mode=CaseMode.SENTENCE)
        'Hello world'
        >>> normalize_case("hello world", mode="title")
        'Hello World'
    """
    if not text:
        return text

    validated = validate_enum_or_string(mode, CaseMode, "mode")
    mode_name = validated.value if isinstance(validated, CaseMode) else validated

    # Checked in order; the first matching name decides the conversion.
    converters = (
        ("lower", lambda value: value.lower()),
        ("upper", lambda value: value.upper()),
        ("title", lambda value: to_title_case(value)),
        ("sentence", lambda value: to_sentence_case(value)),
    )
    for name, convert in converters:
        if mode_name == name:
            return convert(text)
    return text
[docs]
# A word is a run of letters, optionally continued across apostrophes
# (ASCII or typographic), so contractions and possessives stay one word.
_TITLE_WORD = re.compile(r"[^\W\d_]+(?:['\u2019][^\W\d_]+)*")


def to_title_case(text: str) -> str:
    """Convert to title case.

    Every word gets an uppercase first letter and lowercase remainder. Unlike
    a plain ``str.title()`` call, letters following an apostrophe inside a
    word are not capitalized ("they're" becomes "They're", not "They'Re").

    Args:
        text: Input text.

    Returns:
        The title-cased text; falsy input is returned unchanged.

    Examples:
        >>> to_title_case("hello world from python")
        'Hello World From Python'
        >>> to_title_case("they're bill's friends")
        "They're Bill's Friends"
    """
    if not text:
        return text
    return _TITLE_WORD.sub(lambda match: match.group(0).capitalize(), text)
[docs]
def to_sentence_case(text: str) -> str:
    """Convert to sentence case.

    The text is split after runs of ``.``, ``!`` or ``?`` that are followed
    by whitespace. In every resulting sentence the first character is
    uppercased and the rest lowercased; the separators are kept as they are.

    Args:
        text: Input text.

    Returns:
        The sentence-cased text; falsy input is returned unchanged.

    Examples:
        >>> to_sentence_case("hello world. this is a test.")
        'Hello world. This is a test.'
    """
    if not text:
        return text

    # The capturing group makes re.split keep the separators: sentences land
    # on even indices, separators on odd ones.
    pieces = re.split(r"([.!?]+\s+)", text)
    return "".join(
        piece[:1].upper() + piece[1:].lower() if index % 2 == 0 else piece
        for index, piece in enumerate(pieces)
    )
[docs]
def preserve_acronyms(text: str, acronyms: Optional[List[str]] = None) -> str:
    """Restore the canonical spelling of acronyms, whatever their case in ``text``.

    Only whole words are replaced: "usa" becomes "USA", but "usage" is left
    alone.

    Args:
        text: Input text.
        acronyms: Acronyms in their canonical spelling. Defaults to a small
            built-in list (NASA, FBI, CIA, USA, UK, UN, EU, WHO, NATO).
            Empty strings are ignored.

    Returns:
        The text with every standalone, case-insensitive occurrence of an
        acronym replaced by its canonical spelling. Falsy input is returned
        unchanged.

    Examples:
        >>> preserve_acronyms("nasa and fbi are agencies", ["NASA", "FBI"])
        'NASA and FBI are agencies'
        >>> preserve_acronyms("usage of usa funds", ["USA"])
        'usage of USA funds'
    """
    if not text:
        return text

    if acronyms is None:
        acronyms = ["NASA", "FBI", "CIA", "USA", "UK", "UN", "EU", "WHO", "NATO"]

    result = text
    for acronym in acronyms:
        if not acronym:
            continue
        # Lookarounds instead of \b so acronyms that start or end with a
        # non-word character (e.g. "C++") are still delimited correctly.
        pattern = re.compile(
            r"(?<!\w)" + re.escape(acronym) + r"(?!\w)", re.IGNORECASE
        )
        # A function replacement inserts the acronym literally; as a template
        # string, backslashes in it would be interpreted by re.
        result = pattern.sub(lambda _match, canonical=acronym: canonical, result)
    return result
# ============================================================================
# Utilities
# ============================================================================
[docs]
def truncate_text(
    text: str,
    max_length: int,
    strategy: Union[TruncateStrategy, str] = "end",
    suffix: str = "...",
) -> str:
    """Truncate text to at most ``max_length`` characters.

    Args:
        text: Input text.
        max_length: Maximum length of the result, suffix included.
        strategy: ``TruncateStrategy`` member or one of the strings
            ``"end"`` (keep the start), ``"start"`` (keep the end),
            ``"middle"`` (keep both ends) or ``"smart"`` (cut at a sentence
            or word boundary near the limit when one exists). The value is
            passed through ``validate_enum_or_string``; any other validated
            value behaves like ``"end"``.
        suffix: Marker inserted where text was removed.

    Returns:
        ``text`` unchanged if it is falsy or already short enough. If the
        suffix alone does not fit into ``max_length``, the first
        ``max_length`` characters without a suffix (an empty string for a
        non-positive ``max_length``). Otherwise the truncated text with
        ``suffix``.

    Examples:
        >>> truncate_text("Hello world", max_length=8)
        'Hello...'
        >>> truncate_text("Hello world", max_length=8, strategy=TruncateStrategy.MIDDLE)
        'He...rld'
        >>> truncate_text("This is a sentence. And another one.", max_length=25, strategy="smart")
        'This is a sentence....'
    """
    if not text or len(text) <= max_length:
        return text

    # Validate and normalize strategy
    strategy_val = validate_enum_or_string(strategy, TruncateStrategy, "strategy")
    if isinstance(strategy_val, TruncateStrategy):
        strategy_str = strategy_val.value
    else:
        strategy_str = strategy_val

    # Room left for actual text once the suffix is accounted for.
    available_length = max_length - len(suffix)
    if available_length <= 0:
        # Clamp: a negative max_length used as a slice bound would count from
        # the end and return almost the entire text.
        return text[: max(max_length, 0)]

    if strategy_str == "start":
        return suffix + text[-available_length:]

    if strategy_str == "middle":
        # The tail gets the extra character when available_length is odd.
        half = available_length // 2
        return text[:half] + suffix + text[-(available_length - half):]

    if strategy_str == "smart":
        truncated = text[:available_length]

        # Prefer the last sentence end, but only if it keeps at least ~70%
        # of the available room.
        last_sentence_end = max(
            truncated.rfind("."), truncated.rfind("?"), truncated.rfind("!")
        )
        if last_sentence_end > available_length * 0.7:
            return text[: last_sentence_end + 1] + suffix

        # Otherwise fall back to a word boundary within the last ~20%.
        last_space = truncated.rfind(" ")
        if last_space > available_length * 0.8:
            return text[:last_space] + suffix

        # No good boundary, just truncate.
        return truncated + suffix

    # "end" and any unrecognized strategy.
    return text[:available_length] + suffix
[docs]
def split_long_text(
    text: str, max_length: int, overlap: int = 0, preserve_words: bool = True
) -> List[str]:
    """Split text that exceeds a length limit into chunks.

    Args:
        text: Input text.
        max_length: Maximum length per chunk.
        overlap: Number of characters repeated from the end of one chunk at
            the start of the next. In word mode only whole words are
            repeated, each costing its length plus one separator. Negative
            values are treated as 0.
        preserve_words: Split on whitespace and never inside a word. A single
            word longer than ``max_length`` becomes a chunk of its own.

    Returns:
        The list of chunks: empty for falsy input, ``[text]`` if the text
        already fits. In word mode, words are re-joined with single spaces.

    Raises:
        ValueError: In character mode, if ``max_length - overlap`` is not
            positive (the split could never advance).

    Examples:
        >>> split_long_text("Hello world test", max_length=8)
        ['Hello', 'world', 'test']
        >>> split_long_text("abcdefghij", max_length=4, overlap=2, preserve_words=False)
        ['abcd', 'cdef', 'efgh', 'ghij']
    """
    if not text:
        return []
    if len(text) <= max_length:
        return [text]

    overlap = max(overlap, 0)

    if preserve_words:
        return _split_on_words(text.split(), max_length, overlap)

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            "max_length must be positive and greater than overlap "
            "for character-based splitting"
        )

    chunks = []
    start = 0
    while True:
        end = start + max_length
        chunks.append(text[start:end])
        # Stop once the text is covered; stepping further would only emit a
        # chunk that repeats the overlap of the previous one.
        if end >= len(text):
            break
        start += step
    return chunks


def _split_on_words(words: List[str], max_length: int, overlap: int) -> List[str]:
    """Greedily pack words into space-joined chunks of at most max_length."""
    chunks = []
    current = []
    current_length = 0  # invariant: len(" ".join(current))

    for word in words:
        if current and current_length + 1 + len(word) > max_length:
            chunks.append(" ".join(current))
            # Carry over trailing words, but never more than still fits in
            # front of the word that starts the next chunk.
            current = _overlap_tail(current, min(overlap, max_length - len(word)))
            current_length = len(" ".join(current))
        if current:
            current_length += 1  # separator before the new word
        current.append(word)
        current_length += len(word)

    if current:
        chunks.append(" ".join(current))
    return chunks


def _overlap_tail(words: List[str], budget: int) -> List[str]:
    """Return the longest run of trailing words whose cost fits the budget."""
    tail = []
    cost = 0
    for word in reversed(words):
        cost += len(word) + 1  # the word plus the separator that follows it
        if cost > budget:
            break
        tail.append(word)
    tail.reverse()
    return tail