Source code for kerb.document.preprocessors

"""Format-specific text preprocessing utilities.

This module provides preprocessing functions for specific document formats:
- PDF text preprocessing
- HTML text preprocessing
- Markdown preprocessing
"""

import re


# Two word characters separated by a line-break hyphen and a newline.
# Besides the ASCII hyphen-minus this accepts the soft hyphen (U+00AD) and
# the Unicode hyphen (U+2010), which text extracted from PDFs may contain
# at hyphenation points.
_HYPHENATED_LINE_BREAK = re.compile(r"(\w)[-\u00ad\u2010]\s*\n\s*(\w)")


def preprocess_pdf_text(text: str) -> str:
    """Preprocess text extracted from PDF.

    PDFs often have formatting artifacts like broken lines, extra spaces,
    etc. This function:

    1. Re-joins words that were hyphenated across a line break
       (``"sen-\\ntence"`` becomes ``"sentence"``). The hyphen may be an
       ASCII ``-``, a soft hyphen (U+00AD) or a Unicode hyphen (U+2010).
    2. Collapses every run of whitespace, including newlines, into a single
       space and strips leading/trailing whitespace. Line breaks in the
       middle of a sentence therefore become ordinary spaces.

    Hyphens that are not directly followed by a line break are left alone,
    so ``"well-known"`` is unchanged.

    Args:
        text (str): Text extracted from PDF

    Returns:
        str: Cleaned text on a single line

    Examples:
        >>> pdf_text = "This is a sen-\\ntence with line break."
        >>> preprocess_pdf_text(pdf_text)
        'This is a sentence with line break.'
    """
    # Fix hyphenated line breaks. The neighbouring word characters are
    # captured and re-emitted so only the hyphen and the surrounding
    # whitespace are dropped.
    text = _HYPHENATED_LINE_BREAK.sub(r"\1\2", text)

    # Normalize whitespace. Splitting on any whitespace and re-joining with
    # single spaces also turns mid-sentence line breaks into spaces, so no
    # separate pass is needed for them.
    return " ".join(text.split())
def preprocess_html_text(html: str) -> str:
    """Preprocess HTML to extract clean text.

    The markup is handed to ``extract_text_from_html`` from the sibling
    ``extractors`` module, and the text it returns is whitespace-normalized:
    every run of whitespace becomes a single space and leading/trailing
    whitespace is removed.

    Args:
        html (str): HTML content

    Returns:
        str: Cleaned text

    Examples:
        >>> html = '<div>Hello <span>World</span></div>'
        >>> preprocess_html_text(html)
        'Hello World'
    """
    from .extractors import extract_text_from_html

    extracted = extract_text_from_html(html)
    # split() without arguments breaks on any whitespace run and discards
    # empty pieces, so re-joining yields single-space separated words.
    words = extracted.split()
    return " ".join(words)
def preprocess_markdown(text: str, keep_structure: bool = True) -> str:
    """Preprocess Markdown text.

    When ``keep_structure`` is false the text is first passed through
    ``strip_markdown`` from the sibling ``extractors`` module (presumably
    removing Markdown markup; see that function for details). In both modes
    every run of whitespace, including newlines, is then collapsed into a
    single space and leading/trailing whitespace is removed, so the result
    never contains line breaks.

    NOTE(review): because newlines are collapsed, markers retained with
    ``keep_structure=True`` (for example a leading ``#``) end up inline with
    the following text. Confirm this is what callers expect.

    Args:
        text (str): Markdown text
        keep_structure (bool): Keep headings and structure markers. When
            true the Markdown markup is left in the text untouched.

    Returns:
        str: Processed text on a single line

    Examples:
        >>> md = "# Title\\n\\nSome **bold** text"
        >>> preprocess_markdown(md)
        '# Title Some **bold** text'
    """
    if not keep_structure:
        # Imported only on the path that needs it, so the default call does
        # not load the extractors module.
        from .extractors import strip_markdown

        text = strip_markdown(text)

    # Splitting on any whitespace and re-joining with single spaces
    # normalizes spacing and removes all line breaks.
    return " ".join(text.split())