"""Markdown-aware chunking implementations."""
import re
from typing import List
from .text import Chunker, paragraph_chunker
class MarkdownChunker(Chunker):
    """Split markdown text based on heading hierarchy.

    The text is cut into sections at ATX headers (a line starting with one
    to six ``#`` characters followed by whitespace). Consecutive sections are
    then packed together, joined by a blank line, for as long as the joined
    result stays within ``max_chunk_size`` characters.

    Header-looking lines inside fenced code blocks (fences opened with three
    or more backticks or tildes) are NOT treated as section boundaries, so a
    code sample containing ``# comment`` lines is never cut in half.

    Args:
        max_chunk_size (int): Maximum size per chunk, in characters.
            Defaults to 1000.

    Note:
        A single section longer than ``max_chunk_size`` is handed to
        ``paragraph_chunker(section, max_paragraphs=2)``. The chunks that
        helper returns are appended as-is; their length is not re-checked
        against ``max_chunk_size`` here.

    Examples:
        >>> chunker = MarkdownChunker(max_chunk_size=500)
        >>> chunks = chunker.chunk(markdown_text)
    """

    # A line opens a new section when it starts with 1-6 '#' followed by
    # whitespace; the line's own trailing newline counts as that whitespace.
    _HEADER_RE = re.compile(r"#{1,6}\s")
    # Candidate fence line: up to three spaces of indent, then a run of at
    # least three backticks or tildes, then the rest of the line (info string).
    _FENCE_RE = re.compile(r" {0,3}(`{3,}|~{3,})(.*)")
    # Yields the text line by line, keeping each line's trailing "\n".
    _LINE_RE = re.compile(r"[^\n]*\n|[^\n]+")

    def __init__(self, max_chunk_size: int = 1000):
        """Store the chunk size limit.

        Args:
            max_chunk_size (int): Maximum size per chunk, in characters.
        """
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> List[str]:
        """Split markdown text into chunks.

        Args:
            text (str): Markdown text to chunk

        Returns:
            List[str]: List of markdown-aware chunks. Empty when ``text`` is
            empty or contains only whitespace.
        """
        if not text:
            return []

        chunks: List[str] = []
        pending: List[str] = []  # sections collected for the next chunk
        pending_size = 0  # joined length of `pending` plus one separator

        for section in self._split_sections(text):
            section = section.strip()
            if not section:
                continue
            section_size = len(section)

            # A section that cannot fit in any chunk is split on its own.
            if section_size > self.max_chunk_size:
                # Emit what was collected first so document order is kept.
                if pending:
                    chunks.append("\n\n".join(pending))
                    pending = []
                    pending_size = 0
                # Split large section by paragraphs
                chunks.extend(paragraph_chunker(section, max_paragraphs=2))
                continue

            # Close the current chunk if this section would overflow it.
            if pending and pending_size + section_size > self.max_chunk_size:
                chunks.append("\n\n".join(pending))
                pending = []
                pending_size = 0

            pending.append(section)
            # +2 accounts for the "\n\n" that will precede the next section,
            # so `pending_size + next_size` is the exact joined length.
            pending_size += section_size + 2

        # Add remaining chunk
        if pending:
            chunks.append("\n\n".join(pending))
        return chunks

    @classmethod
    def _split_sections(cls, text: str) -> List[str]:
        """Cut ``text`` before every header line that is outside a code fence.

        Args:
            text (str): Markdown text.

        Returns:
            List[str]: The sections in document order. Concatenating them
            gives back ``text`` unchanged; callers strip each section.
        """
        sections: List[str] = []
        current: List[str] = []
        open_fence = ""  # marker that opened the current fence; "" if outside

        for line in cls._LINE_RE.findall(text):
            fence = cls._FENCE_RE.match(line)
            if open_fence:
                # A fence closes only on a line made of the same character,
                # at least as long as the opener, with nothing else after it.
                if (
                    fence
                    and fence.group(1)[0] == open_fence[0]
                    and len(fence.group(1)) >= len(open_fence)
                    and not fence.group(2).strip()
                ):
                    open_fence = ""
            else:
                # `current` is empty only on the very first line, where there
                # is nothing before the header to split off.
                if current and cls._HEADER_RE.match(line):
                    sections.append("".join(current))
                    current = []
                if fence:
                    marker, info = fence.groups()
                    # A backtick fence may not have backticks in its info
                    # string; such a line is inline code, not a fence.
                    if marker[0] == "~" or "`" not in info:
                        open_fence = marker
            current.append(line)

        if current:
            sections.append("".join(current))
        return sections