Source code for kerb.chunk.text

"""Text-based chunking implementations."""

import re
from abc import ABC, abstractmethod
from typing import List, Optional


class Chunker(ABC):
    """Common interface shared by every chunker class.

    Concrete chunkers subclass this and provide their own ``chunk``
    implementation; the base class itself cannot be instantiated.
    """

    @abstractmethod
    def chunk(self, text: str) -> List[str]:
        """Break ``text`` into a list of chunks.

        Args:
            text (str): The text to chunk

        Returns:
            List[str]: List of text chunks
        """
        ...
class RecursiveChunker(Chunker):
    """Recursively split text using a hierarchy of separators.

    Tries to split on larger semantic boundaries first (paragraphs,
    sentences) before falling back to character-level splitting. Similar to
    LangChain's RecursiveCharacterTextSplitter.

    Args:
        chunk_size (int): Target size for each chunk. Defaults to 1000.
        separators (List[str], optional): List of separators in priority
            order. Defaults to ['\\n\\n', '\\n', '. ', ' ', ''].

    Examples:
        >>> chunker = RecursiveChunker(chunk_size=500)
        >>> chunks = chunker.chunk("Your long text here...")
    """

    def __init__(self, chunk_size: int = 1000, separators: Optional[List[str]] = None):
        self.chunk_size = chunk_size
        self.separators = (
            separators if separators is not None else ["\n\n", "\n", ". ", " ", ""]
        )

    def chunk(self, text: str) -> List[str]:
        """Split text into chunks recursively.

        Args:
            text (str): The text to chunk

        Returns:
            List[str]: List of recursively split chunks
        """
        return self._recursive_split(text, self.separators)

    @staticmethod
    def _join_and_trim(parts: List[str], separator: str) -> str:
        """Join ``parts`` and drop trailing whole occurrences of ``separator``.

        ``str.rstrip(separator)`` is deliberately not used: it treats its
        argument as a *set of characters*, so with a separator such as ". "
        it would also eat dots and spaces that belong to the content
        (e.g. an ellipsis at the end of a sentence).
        """
        joined = "".join(parts)
        width = len(separator)
        end = len(joined)
        while width and joined.endswith(separator, 0, end):
            end -= width
        return joined[:end]

    def _recursive_split(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` on the first applicable separator, recursing as needed.

        Separators are tried in order. The first one found in ``text`` is
        used to cut it into pieces, which are then packed greedily into
        chunks of at most ``chunk_size`` characters. A piece that is itself
        too large is split again using only the separators that come after
        the current one. An empty-string separator delegates to
        ``chunk_text`` from ``.utils`` as the last resort. If no separator
        applies, the text is returned as a single (possibly oversized) chunk.

        Empty chunks are never emitted.
        """
        if not text:
            return []
        if len(text) <= self.chunk_size:
            return [text]

        for position, separator in enumerate(separators):
            if separator == "":
                # Last resort: split by character
                from .utils import chunk_text as _chunk_text

                return _chunk_text(text, chunk_size=self.chunk_size)

            if separator not in text:
                continue

            # Use the loop position rather than separators.index() so that
            # duplicate separators cannot make us retry the same one.
            remaining_seps = separators[position + 1 :]
            pieces = text.split(separator)
            last_index = len(pieces) - 1

            chunks: List[str] = []
            current_chunk: List[str] = []
            current_size = 0

            for index, piece in enumerate(pieces):
                # Identify the final piece by position, not by value: an
                # earlier piece that happens to equal the last one must
                # still keep its separator.
                piece_with_sep = piece + separator if index != last_index else piece
                piece_size = len(piece_with_sep)

                # A single piece that is too large: flush what we have and
                # recurse on the piece with the finer-grained separators.
                if piece_size > self.chunk_size:
                    if current_chunk:
                        finished = self._join_and_trim(current_chunk, separator)
                        if finished:
                            chunks.append(finished)
                        current_chunk = []
                        current_size = 0
                    chunks.extend(self._recursive_split(piece, remaining_seps))
                    continue

                # Adding this piece would overflow the current chunk.
                if current_chunk and current_size + piece_size > self.chunk_size:
                    finished = self._join_and_trim(current_chunk, separator)
                    if finished:
                        chunks.append(finished)
                    current_chunk = []
                    current_size = 0

                current_chunk.append(piece_with_sep)
                current_size += piece_size

            # Flush whatever is left over.
            if current_chunk:
                finished = self._join_and_trim(current_chunk, separator)
                if finished:
                    chunks.append(finished)

            return chunks

        return [text]
class SentenceChunker(Chunker):
    """Split text into chunks based on sentence boundaries with optional overlap.

    Sentences are detected with a simple split on ``"."``; each non-empty
    fragment is stripped and given a trailing period.

    Args:
        window_sentences (int): Number of sentences per chunk. Defaults to 5.
        overlap_sentences (int): Number of sentences to overlap. Defaults to 1.

    Examples:
        >>> chunker = SentenceChunker(window_sentences=3, overlap_sentences=1)
        >>> chunks = chunker.chunk("First sentence. Second sentence. Third sentence.")
    """

    def __init__(self, window_sentences: int = 5, overlap_sentences: int = 1):
        self.window_sentences = window_sentences
        self.overlap_sentences = overlap_sentences

    def chunk(self, text: str) -> List[str]:
        """Split text into sentence-based chunks with overlap.

        The window advances by ``window_sentences - overlap_sentences``
        sentences (at least 1) and stops as soon as a window reaches the
        last sentence, so no trailing chunk is emitted that merely repeats
        sentences already contained in the previous chunk.

        Args:
            text (str): The text to chunk

        Returns:
            List[str]: List of sentence-windowed chunks
        """
        if not text:
            return []

        # Split into sentences (simple approach)
        fragments = (fragment.strip() for fragment in text.split("."))
        sentences = [fragment + "." for fragment in fragments if fragment]
        if not sentences:
            return []

        # Always move forward by at least one sentence so the loop terminates.
        stride = max(1, self.window_sentences - self.overlap_sentences)
        total = len(sentences)

        chunks = []
        for start in range(0, total, stride):
            end = start + self.window_sentences
            chunks.append(" ".join(sentences[start:end]))
            # This window already covers the final sentence; any further
            # window would only duplicate the tail of this one.
            if end >= total:
                break
        return chunks
def simple_chunker(text: str, chunk_size: int = 1000, overlap: int = 0) -> List[str]:
    """Cut text into fixed-size chunks.

    Thin wrapper that forwards its arguments unchanged to ``chunk_text``
    from the sibling ``utils`` module.

    Args:
        text (str): The text to chunk
        chunk_size (int): Maximum size of each chunk. Defaults to 1000.
        overlap (int): Number of characters to overlap between chunks.
            Defaults to 0.

    Returns:
        List[str]: List of text chunks
    """
    from .utils import chunk_text

    return chunk_text(text, chunk_size, overlap)
def overlap_chunker(
    text: str, chunk_size: int = 1000, overlap_ratio: float = 0.1
) -> List[str]:
    """Chunk text with an overlap expressed as a fraction of the chunk size.

    The overlap passed on to ``chunk_text`` (from the sibling ``utils``
    module) is ``chunk_size * overlap_ratio`` truncated to an integer.

    Args:
        text (str): The text to chunk
        chunk_size (int): Maximum size of each chunk. Defaults to 1000.
        overlap_ratio (float): Proportion of chunk to overlap (0.0-1.0).
            Defaults to 0.1.

    Returns:
        List[str]: List of overlapping text chunks
    """
    from .utils import chunk_text

    return chunk_text(text, chunk_size, int(chunk_size * overlap_ratio))
def paragraph_chunker(text: str, max_paragraphs: int = 3) -> List[str]:
    """Group the paragraphs of a text into chunks.

    Paragraphs are separated by blank lines (a newline, optional
    whitespace, and another newline). Each paragraph is stripped, empty
    ones are discarded, and consecutive paragraphs are joined with a
    blank line in groups of ``max_paragraphs``.

    Args:
        text (str): The text to chunk
        max_paragraphs (int): Maximum number of paragraphs per chunk.
            Defaults to 3.

    Returns:
        List[str]: List of paragraph-based chunks
    """
    if not text:
        return []

    stripped_blocks = (block.strip() for block in re.split(r"\n\s*\n", text))
    paragraphs = [block for block in stripped_blocks if block]

    return [
        "\n\n".join(paragraphs[start : start + max_paragraphs])
        for start in range(0, len(paragraphs), max_paragraphs)
    ]
def sliding_window_chunker(
    text: str, window_size: int = 1000, stride: int = 500
) -> List[str]:
    """Create chunks using a sliding window approach.

    Similar to simple_chunker with overlap, but stride-based for more
    control. Common in NLP tasks and document processing pipelines.

    The window starts at offset 0 and advances by ``stride`` characters.
    Iteration stops as soon as a window reaches the end of the text, so the
    last chunk may be shorter than ``window_size``.

    Args:
        text (str): The text to chunk
        window_size (int): Size of each window/chunk. Defaults to 1000.
        stride (int): Number of characters to move forward for next window.
            Defaults to 500.

    Returns:
        List[str]: List of sliding window chunks

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive. A
            non-positive stride would never advance the window (an endless
            loop), and a non-positive window would only yield empty chunks.
    """
    if not text:
        return []

    if window_size <= 0:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    text_length = len(text)
    chunks = []
    for start in range(0, text_length, stride):
        end = start + window_size
        # Slicing clamps ``end`` to the end of the text automatically.
        chunks.append(text[start:end])
        if end >= text_length:
            break
    return chunks
def token_based_chunker(text: str, max_tokens: int = 512, tokenizer=None) -> List[str]:
    """Chunk text so that each chunk holds roughly ``max_tokens`` tokens.

    The token budget is converted into a character budget, and the text is
    then cut by ``chunk_text`` (from the sibling ``utils`` module) with no
    overlap. With a tokenizer, the conversion is done by ``tokens_to_chars``
    from the package's ``tokenizer`` module; without one, 4 characters per
    token are assumed.

    Args:
        text (str): The text to chunk
        max_tokens (int): Maximum tokens per chunk. Defaults to 512.
        tokenizer: Tokenizer to use for estimation. If None, uses character
            approximation.

    Returns:
        List[str]: List of token-based chunks

    Examples:
        >>> from kerb.tokenizer import Tokenizer
        >>> chunks = token_based_chunker(text, max_tokens=512, tokenizer=Tokenizer.CL100K_BASE)
    """
    if not text:
        return []

    from .utils import chunk_text

    if tokenizer is None:
        # Rough approximation: 1 token ≈ 4 characters
        char_budget = max_tokens * 4
    else:
        from ..tokenizer import tokens_to_chars

        char_budget = tokens_to_chars(max_tokens, tokenizer)

    return chunk_text(text, chunk_size=char_budget, overlap=0)
def recursive_chunker(
    text: str, chunk_size: int = 1000, separators: Optional[List[str]] = None
) -> List[str]:
    """Function-style shortcut for ``RecursiveChunker``.

    Builds a ``RecursiveChunker`` from the given settings and returns the
    result of its ``chunk`` method for ``text``.

    Args:
        text (str): The text to chunk
        chunk_size (int): Target size for each chunk. Defaults to 1000.
        separators (List[str], optional): List of separators in priority
            order. Defaults to ['\\n\\n', '\\n', '. ', ' ', ''].

    Returns:
        List[str]: List of recursively split chunks
    """
    return RecursiveChunker(chunk_size=chunk_size, separators=separators).chunk(text)
def sentence_window_chunker(
    text: str, window_sentences: int = 5, overlap_sentences: int = 1
) -> List[str]:
    """Function-style shortcut for ``SentenceChunker``.

    Builds a ``SentenceChunker`` from the given settings and returns the
    result of its ``chunk`` method for ``text``.

    Args:
        text (str): The text to chunk
        window_sentences (int): Number of sentences per chunk. Defaults to 5.
        overlap_sentences (int): Number of sentences to overlap. Defaults to 1.

    Returns:
        List[str]: List of sentence-windowed chunks
    """
    return SentenceChunker(
        window_sentences=window_sentences,
        overlap_sentences=overlap_sentences,
    ).chunk(text)