Source code for kerb.parsing.text
"""Text extraction utilities.
This module provides functions for extracting structured content from text,
including XML tags, markdown sections, lists, and tables.
"""
import re
from typing import Dict, List
[docs]
def extract_xml_tag(text: str, tag: str) -> List[str]:
    """Extract the contents of every ``<tag>...</tag>`` pair in *text*.

    The tag name is matched literally: regex metacharacters in ``tag`` (for
    example the ``.`` in ``a.b`` or the parentheses in ``f(x)``) are escaped,
    so they can neither match unrelated tags nor change the shape of the
    match results.

    Args:
        text (str): Text containing XML-style tags.
        tag (str): Tag name to extract (without < >).

    Returns:
        List[str]: Whitespace-stripped contents of each match, in order of
        appearance. Empty when the tag does not occur.

    Examples:
        >>> extract_xml_tag('<answer>42</answer>', 'answer')
        ['42']
    """
    name = re.escape(tag)
    # Non-greedy so consecutive pairs are returned as separate items;
    # DOTALL lets a tag's content span multiple lines.
    pattern = f"<{name}>(.*?)</{name}>"
    return [match.strip() for match in re.findall(pattern, text, re.DOTALL)]
[docs]
def extract_markdown_sections(text: str, heading_level: int = 2) -> Dict[str, str]:
    """Split markdown into sections keyed by headings of a single level.

    A heading is a line made of exactly ``heading_level`` ``#`` characters,
    followed by whitespace and the heading text. Every following line that is
    not such a heading (including headings of other levels) belongs to that
    section. Lines before the first heading are ignored. If the same heading
    text appears more than once, the last occurrence wins.

    Args:
        text (str): Markdown text
        heading_level (int): Heading level to split on (1-6)

    Returns:
        Dict[str, str]: Mapping of stripped heading text to the stripped
        section body.
    """
    heading_re = re.compile(f'^{"#" * heading_level}\\s+(.+?)$')
    result = {}
    title = None
    body = []

    for raw in text.split("\n"):
        found = heading_re.match(raw)
        if found is None:
            # Body line: only collected once a section has been opened.
            if title:
                body.append(raw)
            continue
        # A new heading closes the section that was open, if any.
        if title:
            result[title] = "\n".join(body).strip()
        title = found.group(1).strip()
        body = []

    # The final section has no following heading to close it.
    if title:
        result[title] = "\n".join(body).strip()
    return result
[docs]
def extract_list_items(text: str, ordered: bool = False) -> List[str]:
    """Collect the text of markdown list items, one per line.

    Each line is stripped of surrounding whitespace before matching, so
    indented (nested) items are returned alongside top-level ones.

    Args:
        text (str): Markdown text
        ordered (bool): If True, match numbered items (``1. item``);
            otherwise match bullet items starting with ``-``, ``*`` or ``+``.

    Returns:
        List[str]: Item texts without their list marker, in document order.
    """
    marker = r"^\d+\.\s+(.+)$" if ordered else r"^[-*+]\s+(.+)$"
    item_re = re.compile(marker)
    hits = (item_re.match(raw.strip()) for raw in text.split("\n"))
    return [hit.group(1) for hit in hits if hit]
[docs]
def _split_table_row(line: str) -> List[str]:
    """Split one table line on ``|`` into whitespace-stripped cells.

    Only the single empty cell produced by a leading or trailing pipe is
    removed; empty cells inside the row are kept so column positions are
    preserved.
    """
    cells = [cell.strip() for cell in line.split("|")]
    if cells and cells[0] == "":
        cells = cells[1:]
    if cells and cells[-1] == "":
        cells = cells[:-1]
    return cells


def parse_markdown_table(text: str) -> List[Dict[str, str]]:
    """Parse a markdown table into a list of dictionaries.

    The first non-blank line is the header and the second is assumed to be
    the separator row and is always skipped. Header and data rows are split
    the same way, so a blank header cell keeps its column (under the key
    ``''``) instead of shifting the header out of alignment with the rows.

    Args:
        text (str): Markdown table text

    Returns:
        List[Dict[str, str]]: One dict per data row, mapping header cell to
        row cell. Rows whose cell count differs from the header's, and any
        further lines made up only of ``|``, ``:``, ``-`` and whitespace,
        are skipped. Returns an empty list when there are fewer than two
        non-blank lines.

    Examples:
        >>> table = '''
        ... | Name | Age |
        ... |------|-----|
        ... | John | 30 |
        ... | Jane | 25 |
        ... '''
        >>> parse_markdown_table(table)
        [{'Name': 'John', 'Age': '30'}, {'Name': 'Jane', 'Age': '25'}]
    """
    lines = [line.strip() for line in text.strip().split("\n") if line.strip()]
    if len(lines) < 2:
        return []

    header = _split_table_row(lines[0])
    separator_re = re.compile(r"^[\s|:-]+$")

    rows = []
    # lines[1] is the separator row; data starts at index 2.
    for line in lines[2:]:
        if separator_re.match(line):
            continue
        values = _split_table_row(line)
        # Ragged rows cannot be mapped onto the header reliably; drop them.
        if len(values) == len(header):
            rows.append(dict(zip(header, values)))
    return rows