Source code for kerb.parsing.text

"""Text extraction utilities.

This module provides functions for extracting structured content from text,
including XML tags, markdown sections, lists, and tables.
"""

import re
from typing import Dict, List


def extract_xml_tag(text: str, tag: str) -> List[str]:
    """Extract content from XML-style tags.

    Every non-overlapping ``<tag>...</tag>`` pair is matched non-greedily,
    content may span multiple lines, and each captured body is stripped of
    surrounding whitespace. Only bare opening tags (no attributes) are
    recognised.

    Args:
        text (str): Text containing XML tags
        tag (str): Tag name to extract (without < >). The name is matched
            literally, so characters such as ``.`` or ``+`` have no regex
            meaning.

    Returns:
        List[str]: List of tag contents, in order of appearance

    Examples:
        >>> extract_xml_tag('<answer>42</answer>', 'answer')
        ['42']
    """
    # Escape the tag so regex metacharacters in the name are matched literally
    # instead of being interpreted as pattern syntax.
    literal_tag = re.escape(tag)
    pattern = f"<{literal_tag}>(.*?)</{literal_tag}>"
    # DOTALL lets "." cross newlines so multi-line bodies are captured.
    matches = re.findall(pattern, text, re.DOTALL)
    return [match.strip() for match in matches]
def extract_markdown_sections(text: str, heading_level: int = 2) -> Dict[str, str]:
    """Split markdown into sections keyed by headings of one level.

    A heading is a line that starts with exactly ``heading_level`` ``#``
    characters followed by whitespace. Everything up to the next heading of
    that same level becomes the section body; text that appears before the
    first such heading is discarded. If a heading title repeats, the later
    section replaces the earlier one.

    Args:
        text (str): Markdown text
        heading_level (int): Heading level to split on (1-6)

    Returns:
        Dict[str, str]: Mapping of heading names to section content, with
            both the names and the content stripped of surrounding whitespace
    """
    heading_re = re.compile(f'^{"#" * heading_level}\\s+(.+?)$')

    result: Dict[str, str] = {}
    title = None
    body: List[str] = []

    for raw_line in text.split("\n"):
        found = heading_re.match(raw_line)

        if found is None:
            # Ordinary line: keep it only once a section has been opened.
            if title:
                body.append(raw_line)
            continue

        # Heading line: flush the section collected so far, then start anew.
        if title:
            result[title] = "\n".join(body).strip()
        title = found.group(1).strip()
        body = []

    # The final section has no following heading to trigger its flush.
    if title:
        result[title] = "\n".join(body).strip()

    return result
def extract_list_items(text: str, ordered: bool = False) -> List[str]:
    """Collect the text of markdown list items, one per matching line.

    Each line is stripped before matching, so indented (nested) items are
    picked up as well. A marker must be followed by at least one whitespace
    character to count as a list item.

    Args:
        text (str): Markdown text
        ordered (bool): Extract ordered lists (1. 2. 3.) vs unordered (- * +)

    Returns:
        List[str]: List items, in order of appearance, without their markers
    """
    item_re = re.compile(r"^\d+\.\s+(.+)$" if ordered else r"^[-*+]\s+(.+)$")
    candidates = (item_re.match(raw.strip()) for raw in text.split("\n"))
    return [hit.group(1) for hit in candidates if hit]
def _split_table_row(line: str) -> List[str]:
    """Split one markdown table line into stripped cell values."""
    cells = [cell.strip() for cell in line.split("|")]
    # A pipe at the very start/end of the line yields an empty first/last
    # element that is not a real cell; interior empty cells are kept.
    if cells and cells[0] == "":
        cells = cells[1:]
    if cells and cells[-1] == "":
        cells = cells[:-1]
    return cells


def parse_markdown_table(text: str) -> List[Dict[str, str]]:
    """Parse a markdown table into a list of dictionaries.

    The first non-blank line is the header. Separator lines (made only of
    pipes, dashes, colons and whitespace) are ignored wherever they appear,
    so a table that lacks the separator row is still parsed. Header and data
    rows are split the same way, so empty cells are preserved in both. Rows
    whose cell count differs from the header are skipped. If two header
    cells share a name, the later column's value wins in each row dict.

    Args:
        text (str): Markdown table text

    Returns:
        List[Dict[str, str]]: List of rows as dictionaries keyed by header
            cell; empty when there are fewer than two non-blank lines

    Examples:
        >>> table = '''
        ... | Name | Age |
        ... |------|-----|
        ... | John | 30 |
        ... | Jane | 25 |
        ... '''
        >>> parse_markdown_table(table)
        [{'Name': 'John', 'Age': '30'}, {'Name': 'Jane', 'Age': '25'}]
    """
    lines = [line.strip() for line in text.strip().split("\n") if line.strip()]

    if len(lines) < 2:
        return []

    # Parse the header exactly like a data row so column counts stay
    # comparable even when a header cell is blank.
    header = _split_table_row(lines[0])

    # Drop separator lines by content rather than by position: blindly
    # skipping line 1 would lose the first data row when no separator exists.
    separator = re.compile(r"^[\s|:-]+$")
    data_lines = [line for line in lines[1:] if not separator.match(line)]

    rows = []
    for line in data_lines:
        values = _split_table_row(line)
        # Only well-formed rows (same width as the header) are returned.
        if len(values) == len(header):
            rows.append(dict(zip(header, values)))

    return rows