"""Text extraction utilities.
This module provides functions for extracting text from various formats:
- HTML text extraction
- Markdown stripping
- Sentence splitting
- Paragraph splitting
"""
import re
from typing import List
[docs]
def strip_markdown(text: str) -> str:
    """Remove Markdown formatting from text.

    The substitutions are order-sensitive: constructs whose delimiters
    contain another construct's delimiters are handled first (fenced code
    before inline code, images before links, horizontal rules and list
    bullets before ``*``/``_`` emphasis).

    Args:
        text (str): Markdown text

    Returns:
        str: Plain text without Markdown formatting. Fenced code blocks are
        dropped entirely; for every other construct the visible text is
        kept. Leading and trailing whitespace is stripped.

    Examples:
        >>> strip_markdown("# Hello **World**")
        'Hello World'
        >>> strip_markdown("![logo](logo.png) for snake_case")
        'logo for snake_case'
    """
    # Fenced code blocks go first: the inline-code pattern below would
    # otherwise collapse each ``` fence into a single backtick and the
    # block would survive.
    text = re.sub(r"```.*?```", "", text, flags=re.DOTALL)

    # Images before links: the link pattern also matches the "[alt](url)"
    # tail of an image and would leave a stray "!" behind.
    text = re.sub(r"!\[([^\]]*)\]\([^\)]+\)", r"\1", text)
    text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)

    # Blockquote markers before headers so that "> # Title" is fully
    # unwrapped.
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.MULTILINE)

    # Horizontal rules before emphasis: "***" and "___" would otherwise be
    # reduced to "*" / "_" by the italic patterns and never match here.
    text = re.sub(r"^[-*_]{3,}$", "", text, flags=re.MULTILINE)

    # List markers before emphasis so a "* " bullet is not paired with a
    # later "*". Only horizontal whitespace is matched: "\s" would also
    # consume preceding blank lines and destroy paragraph breaks.
    text = re.sub(r"^[ \t]*[-*+][ \t]+", "", text, flags=re.MULTILINE)
    text = re.sub(r"^[ \t]*\d+\.[ \t]+", "", text, flags=re.MULTILINE)

    # Emphasis, longest delimiter first.
    text = re.sub(r"\*\*\*(.+?)\*\*\*", r"\1", text)  # Bold italic
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)  # Bold
    text = re.sub(r"\*(.+?)\*", r"\1", text)  # Italic
    # Underscore emphasis must not start or end inside a word, otherwise
    # identifiers such as snake_case_name lose their underscores.
    text = re.sub(r"(?<!\w)__(.+?)__(?!\w)", r"\1", text)  # Bold
    text = re.sub(r"(?<!\w)_(.+?)_(?!\w)", r"\1", text)  # Italic

    # Inline code: keep the code text, drop the backticks.
    text = re.sub(r"`(.+?)`", r"\1", text)

    return text.strip()
[docs]
def split_into_sentences(text: str) -> List[str]:
    """Break text into a list of sentences.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. The terminating punctuation stays attached to
    its sentence. No special handling is done for abbreviations.

    Args:
        text (str): Text to split

    Returns:
        List[str]: Sentences with surrounding whitespace removed; empty
        fragments are omitted.

    Examples:
        >>> split_into_sentences("Hello world. This is a test!")
        ['Hello world.', 'This is a test!']
    """
    boundary = re.compile(r"(?<=[.!?])\s+")
    trimmed = map(str.strip, boundary.split(text))
    # filter(None, ...) drops the empty strings left by leading/trailing
    # whitespace or an empty input.
    return list(filter(None, trimmed))
[docs]
def split_into_paragraphs(text: str) -> List[str]:
    """Break text into a list of paragraphs.

    Paragraphs are separated by one or more blank lines, where a blank
    line may contain only whitespace. Single newlines inside a paragraph
    are preserved.

    Args:
        text (str): Text to split

    Returns:
        List[str]: Paragraphs with surrounding whitespace removed; empty
        paragraphs are omitted.

    Examples:
        >>> split_into_paragraphs("Para 1\\n\\nPara 2\\n\\nPara 3")
        ['Para 1', 'Para 2', 'Para 3']
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", text))
    return [chunk for chunk in stripped_chunks if chunk]