Source code for kerb.preprocessing.transforms

"""Text transformation operations."""

import re
from typing import List, Optional


[docs] def expand_contractions(text: str) -> str: """Expand English contractions. Args: text: Input text with contractions Returns: Text with expanded contractions Examples: >>> expand_contractions("I'm doesn't can't") "I am does not cannot" """ if not text: return text contractions = { "ain't": "am not", "aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", "i'd": "I would", "i'll": "I will", "i'm": "I am", "i've": "I have", "isn't": "is not", "it's": "it is", "let's": "let us", "shouldn't": "should not", "that's": "that is", "there's": "there is", "they'd": "they would", "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not", "we'd": "we would", "we'll": "we will", "we're": "we are", "we've": "we have", "weren't": "were not", "what's": "what is", "won't": "will not", "wouldn't": "would not", "you'd": "you would", "you'll": "you will", "you're": "you are", "you've": "you have", } result = text for contraction, expansion in contractions.items(): # Case-insensitive replacement with word boundaries pattern = r"\b" + re.escape(contraction) + r"\b" result = re.sub(pattern, expansion, result, flags=re.IGNORECASE) return result
[docs] def standardize_numbers(text: str) -> str: """Convert number words to digits. Args: text: Input text Returns: Text with standardized numbers Examples: >>> standardize_numbers("I have three apples and five oranges") 'I have 3 apples and 5 oranges' """ if not text: return text number_words = { "zero": "0", "one": "1", "two": "2", "three": "3", "four": "4", "five": "5", "six": "6", "seven": "7", "eight": "8", "nine": "9", "ten": "10", "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14", "fifteen": "15", "sixteen": "16", "seventeen": "17", "eighteen": "18", "nineteen": "19", "twenty": "20", "thirty": "30", "forty": "40", "fifty": "50", "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90", "hundred": "100", "thousand": "1000", } result = text for word, digit in number_words.items(): pattern = r"\b" + word + r"\b" result = re.sub(pattern, digit, result, flags=re.IGNORECASE) return result
[docs] def standardize_dates(text: str) -> str: """Normalize date formats. Args: text: Input text with dates Returns: Text with standardized dates (YYYY-MM-DD) Examples: >>> standardize_dates("Meeting on 12/25/2024") 'Meeting on 2024-12-25' """ if not text: return text result = text # Match MM/DD/YYYY format result = re.sub( r"\b(\d{1,2})/(\d{1,2})/(\d{4})\b", lambda m: f"{m.group(3)}-{m.group(1).zfill(2)}-{m.group(2).zfill(2)}", result, ) # Match DD-MM-YYYY format result = re.sub( r"\b(\d{1,2})-(\d{1,2})-(\d{4})\b", lambda m: f"{m.group(3)}-{m.group(2).zfill(2)}-{m.group(1).zfill(2)}", result, ) return result
[docs] def extract_entities(text: str, entity_type: Optional[str] = None) -> List[str]: """Extract named entities (basic). Args: text: Input text entity_type: Type of entities to extract (None for all) Returns: List of extracted entities Examples: >>> extract_entities("Apple Inc. is in California") ['Apple Inc.', 'California'] """ if not text: return [] # Simple pattern-based entity extraction entities = [] # Capitalized words (potential names/places) capitalized = re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text) entities.extend(capitalized) # Organizations (with Inc., LLC, etc.) orgs = re.findall( r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s+(?:Inc|LLC|Corp|Ltd)\.?\b", text ) entities.extend(orgs) return list(set(entities))
[docs] def segment_sentences(text: str) -> List[str]: """Sentence segmentation. Args: text: Input text Returns: List of sentences Examples: >>> segment_sentences("Hello world. How are you?") ['Hello world.', 'How are you?'] """ if not text: return [] # Split on sentence terminators followed by space and capital letter sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text) return [s.strip() for s in sentences if s.strip()]
[docs] def segment_words(text: str) -> List[str]: """Word segmentation (tokenization). Args: text: Input text Returns: List of words Examples: >>> segment_words("Hello, world!") ['Hello', 'world'] """ if not text: return [] # Split on whitespace and punctuation words = re.findall(r"\b\w+\b", text) return words