Source code for kerb.document.extractors

"""Text extraction utilities.

This module provides functions for extracting text from various formats:
- HTML text extraction
- Markdown stripping
- Sentence splitting
- Paragraph splitting
"""

import re
from typing import List



[docs]
def extract_text_from_html(html: str, remove_scripts: bool = True) -> str:
    """Extract plain text from HTML content.

    Args:
        html (str): HTML content
        remove_scripts (bool): Remove script and style tags

    Returns:
        str: Extracted plain text

    Examples:
        >>> html = '<html><body><p>Hello World</p></body></html>'
        >>> extract_text_from_html(html)
        'Hello World'
    """
    text = html

    # Remove script and style tags
    if remove_scripts:
        text = re.sub(
            r"<script[^>]*>.*?</script>", "", text, flags=re.DOTALL | re.IGNORECASE
        )
        text = re.sub(
            r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE
        )

    # Remove HTML tags
    text = re.sub(r"<[^>]+>", " ", text)

    # Decode HTML entities
    text = text.replace("&nbsp;", " ")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&amp;", "&")
    text = text.replace("&quot;", '"')
    text = text.replace("&#39;", "'")

    # Clean up whitespace
    text = re.sub(r"\s+", " ", text)
    text = text.strip()

    return text




[docs]
def strip_markdown(text: str) -> str:
    """Remove Markdown formatting from text.

    Args:
        text (str): Markdown text

    Returns:
        str: Plain text without Markdown formatting

    Examples:
        >>> strip_markdown("# Hello **World**")
        'Hello World'
    """
    # Remove headers
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

    # Remove bold and italic
    text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", text)  # Bold italic
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)  # Bold
    text = re.sub(r"\*(.+?)\*", r"\1", text)  # Italic
    text = re.sub(r"__(.+?)__", r"\1", text)  # Bold
    text = re.sub(r"_(.+?)_", r"\1", text)  # Italic

    # Remove inline code
    text = re.sub(r"`(.+?)`", r"\1", text)

    # Remove code blocks
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # Remove links
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

    # Remove images
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"\1", text)

    # Remove blockquotes
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)

    # Remove horizontal rules
    text = re.sub(r"^[-*_]{3,}$", "", text, flags=re.MULTILINE)

    # Remove list markers
    text = re.sub(r"^\s*[-*+]\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*\d+\.\s+", "", text, flags=re.MULTILINE)

    return text.strip()




[docs]
def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences.

    Args:
        text (str): Text to split

    Returns:
        List[str]: List of sentences

    Examples:
        >>> split_into_sentences("Hello world. This is a test!")
        ['Hello world.', 'This is a test!']
    """
    # Simple sentence splitting
    sentences = re.split(r"(?<=[.!?])\s+", text)
    return [s.strip() for s in sentences if s.strip()]




[docs]
def split_into_paragraphs(text: str) -> List[str]:
    """Split text into paragraphs.

    Args:
        text (str): Text to split

    Returns:
        List[str]: List of paragraphs

    Examples:
        >>> split_into_paragraphs("Para 1\\n\\nPara 2\\n\\nPara 3")
        ['Para 1', 'Para 2', 'Para 3']
    """
    paragraphs = re.split(r"\n\s*\n", text)
    return [p.strip() for p in paragraphs if p.strip()]