Source code for kerb.chunk.markdown

"""Markdown-aware chunking implementations."""

import re
from typing import List

from .text import Chunker, paragraph_chunker


class MarkdownChunker(Chunker):
    """Split markdown text based on heading hierarchy.

    Respects markdown structure by splitting on headers while trying to
    keep related content together. Consecutive sections are packed into
    one chunk (joined by a blank line) as long as the combined length
    stays within ``max_chunk_size``. Lines that merely look like headers
    inside fenced code blocks (``` or ~~~) are not treated as split
    points, so a shell or Python comment does not tear a code block apart.

    Args:
        max_chunk_size (int): Maximum size per chunk, in characters.
            Defaults to 1000.

    Examples:
        >>> chunker = MarkdownChunker(max_chunk_size=500)
        >>> chunks = chunker.chunk(markdown_text)
    """

    # ATX heading marker at the start of a line: 1-6 '#' followed by
    # whitespace (the whitespace may be the line's own trailing newline).
    _HEADER_RE = re.compile(r"#{1,6}\s")

    # Code-fence line: up to three spaces of indent, then a run of at
    # least three backticks or tildes. Group 1 is the fence run, group 2
    # is whatever follows it on the line (the info string, if any).
    _FENCE_RE = re.compile(r" {0,3}(`{3,}|~{3,})(.*)")

    def __init__(self, max_chunk_size: int = 1000):
        """Store the size limit used when packing sections into chunks.

        Args:
            max_chunk_size (int): Maximum size per chunk, in characters.
        """
        self.max_chunk_size = max_chunk_size

    @classmethod
    def _split_sections(cls, text: str) -> List[str]:
        """Split text before every header line that is outside a code fence.

        The newline that precedes each header is consumed as the separator,
        so outside of fenced code the result is the same as
        ``re.split(r"\\n(?=#{1,6}\\s)", text)``.

        Args:
            text (str): Markdown text to split.

        Returns:
            List[str]: Raw (unstripped) sections in document order.
        """
        boundaries = []  # offsets where a header line starts
        open_fence = None  # fence run that opened the current code block
        offset = 0  # offset of the current line within ``text``

        for line in text.split("\n"):
            fence = cls._FENCE_RE.match(line)
            if open_fence is None:
                # A backtick fence may not have backticks in its info
                # string; such a line is an inline code span instead.
                if fence and not (
                    fence.group(1)[0] == "`" and "`" in fence.group(2)
                ):
                    open_fence = fence.group(1)
                elif offset and cls._HEADER_RE.match(text, offset):
                    # ``offset`` is non-zero, so a newline precedes this
                    # line; a header on the very first line needs no split.
                    boundaries.append(offset)
            elif (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not fence.group(2).strip()
            ):
                # Closing fence: same character, at least as long as the
                # opening run, and nothing but whitespace after it.
                open_fence = None
            offset += len(line) + 1  # +1 for the newline removed by split

        sections = []
        start = 0
        for boundary in boundaries:
            # Drop the newline just before the header (the separator).
            sections.append(text[start:boundary - 1])
            start = boundary
        sections.append(text[start:])
        return sections

    def chunk(self, text: str) -> List[str]:
        """Split markdown text into chunks.

        Args:
            text (str): Markdown text to chunk

        Returns:
            List[str]: List of markdown-aware chunks. Empty when ``text``
            is empty or contains only whitespace.
        """
        if not text:
            return []

        chunks = []
        pending = []  # sections accumulated for the chunk being built
        pending_size = 0  # length of the pending sections incl. separators

        for raw_section in self._split_sections(text):
            section = raw_section.strip()
            if not section:
                continue

            section_size = len(section)

            # A single section that is too large on its own is split further.
            if section_size > self.max_chunk_size:
                # Flush first so the output keeps document order.
                if pending:
                    chunks.append("\n\n".join(pending))
                    pending = []
                    pending_size = 0

                # NOTE(review): the fallback is driven by paragraph count;
                # max_chunk_size is not passed to paragraph_chunker, so
                # whether its output respects the limit depends on that
                # helper - confirm against its implementation.
                chunks.extend(paragraph_chunker(section, max_paragraphs=2))
                continue

            # Start a new chunk when this section would overflow the current one.
            if pending and pending_size + section_size > self.max_chunk_size:
                chunks.append("\n\n".join(pending))
                pending = []
                pending_size = 0

            pending.append(section)
            pending_size += section_size + 2  # +2 for the "\n\n" joiner

        # Emit whatever is left over.
        if pending:
            chunks.append("\n\n".join(pending))

        return chunks