Source code for kerb.document.cleaners

"""Text cleaning utilities.

This module provides functions for cleaning and normalizing text:
- General text cleaning
- Newline normalization
"""

import re


[docs] def clean_text( text: str, normalize_whitespace: bool = True, remove_urls: bool = False, remove_emails: bool = False, remove_special_chars: bool = False, lowercase: bool = False, ) -> str: """Clean and normalize text. Args: text (str): Text to clean normalize_whitespace (bool): Normalize whitespace to single spaces remove_urls (bool): Remove URLs remove_emails (bool): Remove email addresses remove_special_chars (bool): Remove special characters lowercase (bool): Convert to lowercase Returns: str: Cleaned text Examples: >>> text = "Check out https://example.com for more info!" >>> clean_text(text, normalize_whitespace=True, remove_urls=True) 'Check out for more info!' """ cleaned = text if remove_urls: cleaned = re.sub(r"https?://\S+", "", cleaned) cleaned = re.sub(r"www\.\S+", "", cleaned) if remove_emails: cleaned = re.sub( r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", cleaned ) if remove_special_chars: cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?;:\-\'"()]', "", cleaned) if normalize_whitespace: cleaned = re.sub(r"\s+", " ", cleaned) cleaned = cleaned.strip() if lowercase: cleaned = cleaned.lower() return cleaned
[docs] def remove_extra_newlines(text: str, max_consecutive: int = 2) -> str: """Remove excessive newlines from text. Args: text (str): Text to process max_consecutive (int): Maximum consecutive newlines to keep Returns: str: Text with limited newlines Examples: >>> remove_extra_newlines("Hello\\n\\n\\n\\nWorld", max_consecutive=2) 'Hello\\n\\nWorld' """ pattern = r"\n{" + str(max_consecutive + 1) + r",}" replacement = "\n" * max_consecutive return re.sub(pattern, replacement, text)