# Source code for kerb.context.optimization

"""Context optimization utilities.

This module provides functions for optimizing context windows through
deduplication, reordering, merging, and query-specific optimization.
"""

from collections import defaultdict
from typing import TYPE_CHECKING, List, Optional, Union

from .priority import priority_by_diversity, priority_by_relevance
from .types import ContextItem, ContextWindow
from .window import truncate_context_window

if TYPE_CHECKING:
    from kerb.core.enums import ReorderStrategy


def deduplicate_context(
    items: List[ContextItem], similarity_threshold: float = 0.9
) -> List[ContextItem]:
    """Remove duplicate or highly similar context items.

    Similarity is the Jaccard index (intersection size / union size) of the
    lower-cased, whitespace-split word sets of two items' ``content``. Items
    are scanned in order; an item is dropped when its similarity to any
    already-kept item is at or above ``similarity_threshold``. The first
    occurrence of each near-duplicate cluster is therefore the one retained.

    Items whose content has no words (empty or whitespace-only) are never
    treated as duplicates of anything, so all of them are kept.

    Args:
        items: List of context items
        similarity_threshold: Threshold for considering items duplicates (0-1)

    Returns:
        List[ContextItem]: Deduplicated items, in their original relative order

    Example:
        >>> unique_items = deduplicate_context(items, similarity_threshold=0.85)
    """
    if not items:
        return []

    unique_items = []
    # Word sets of the kept items, index-aligned with ``unique_items``, so
    # each item's content is tokenized exactly once instead of once per
    # pairwise comparison.
    kept_word_sets = []

    for item in items:
        words = set(item.content.lower().split())

        # An empty word set can never match (and would make the ratio
        # undefined), so such items skip the comparison entirely; empty kept
        # sets are likewise skipped by the ``kept and ...`` guard.
        is_duplicate = bool(words) and any(
            kept and len(words & kept) / len(words | kept) >= similarity_threshold
            for kept in kept_word_sets
        )

        if not is_duplicate:
            unique_items.append(item)
            kept_word_sets.append(words)

    return unique_items
def reorder_context(
    items: List[ContextItem],
    strategy: Union["ReorderStrategy", str] = "chronological",
) -> List[ContextItem]:
    """Reorder context items using specified strategy.

    Strategies:
        * ``"chronological"``: ascending by ``timestamp``; a falsy timestamp
          (e.g. ``None``) sorts as ``0``.
        * ``"priority"``: descending by ``priority``.
        * ``"relevance"``: currently identical to ``"priority"`` (descending
          by ``priority``).
        * ``"alternating"``: items are grouped by ``item_type`` and emitted
          round-robin, visiting the types in sorted order, until every group
          is exhausted.

    The sorting strategies use a stable sort, so ties keep their input order.

    Args:
        items: List of context items
        strategy: Reordering strategy (ReorderStrategy enum or string:
            "chronological", "priority", "relevance", "alternating")

    Returns:
        List[ContextItem]: Reordered items. Recognised strategies return a
        new list; if the validated strategy matches none of the names above,
        the input list itself is returned unchanged.

    Examples:
        >>> from kerb.core.enums import ReorderStrategy
        >>> reordered = reorder_context(items, strategy=ReorderStrategy.PRIORITY)
    """
    # Imported here rather than at module level; the top-level import of
    # ReorderStrategy is guarded by TYPE_CHECKING.
    from kerb.core.enums import ReorderStrategy, validate_enum_or_string

    # Validate and normalize strategy to its plain string value
    strategy_val = validate_enum_or_string(strategy, ReorderStrategy, "strategy")
    if isinstance(strategy_val, ReorderStrategy):
        strategy_str = strategy_val.value
    else:
        strategy_str = strategy_val

    if strategy_str == "chronological":
        return sorted(items, key=lambda x: x.timestamp or 0)

    if strategy_str in ("priority", "relevance"):
        return sorted(items, key=lambda x: x.priority, reverse=True)

    if strategy_str == "alternating":
        # Alternate between different item types
        type_groups = defaultdict(list)
        for item in items:
            type_groups[item.item_type].append(item)

        # ``default=0`` keeps an empty ``items`` list from raising
        # ValueError (max() of an empty sequence); the result is then [].
        max_len = max((len(group) for group in type_groups.values()), default=0)
        ordered_types = sorted(type_groups)

        result = []
        for i in range(max_len):
            for item_type in ordered_types:
                group = type_groups[item_type]
                if i < len(group):
                    result.append(group[i])
        return result

    return items
def merge_context_windows(
    windows: List[ContextWindow],
    max_tokens: Optional[int] = None,
    deduplication: bool = True,
) -> ContextWindow:
    """Merge multiple context windows into one.

    Items from every window are concatenated in window order, optionally
    passed through ``deduplicate_context`` (with its default threshold), and
    wrapped in a new ``ContextWindow`` whose ``current_tokens`` is the sum of
    the items' ``token_count`` values (a falsy count is treated as 0). When a
    truthy ``max_tokens`` is exceeded, the merged window is handed to
    ``truncate_context_window``.

    Args:
        windows: List of context windows to merge
        max_tokens: Maximum tokens for merged window
        deduplication: Whether to deduplicate items

    Returns:
        ContextWindow: Merged context window

    Example:
        >>> merged = merge_context_windows([window1, window2], max_tokens=2000)
    """
    # Nothing to merge: hand back an empty window carrying the limit.
    if not windows:
        return ContextWindow(max_tokens=max_tokens)

    # Flatten every window's items, preserving window order.
    combined = [entry for source in windows for entry in source.items]

    if deduplication:
        combined = deduplicate_context(combined)

    token_sum = 0
    for entry in combined:
        token_sum += entry.token_count or 0

    result = ContextWindow(
        items=combined, max_tokens=max_tokens, current_tokens=token_sum
    )

    # A falsy limit (None or 0) means the merged window is never truncated.
    if not (max_tokens and token_sum > max_tokens):
        return result

    return truncate_context_window(result, max_tokens)
def optimize_context_for_query(
    window: ContextWindow,
    query: str,
    max_tokens: int,
    relevance_weight: float = 0.7,
    diversity_weight: float = 0.3,
) -> ContextWindow:
    """Optimize context window for a specific query.

    Each item is scored by ``priority_by_relevance`` and then by
    ``priority_by_diversity``; the two scores are blended as
    ``relevance_weight * relevance + diversity_weight * diversity`` and
    written to the item's ``priority``. Items are then taken greedily in
    descending blended score while they still fit within ``max_tokens``
    (an item that does not fit is skipped, and later items may still be
    taken). The chosen items are finally returned in the order they had in
    ``window.items``.

    Args:
        window: Context window to optimize
        query: Query to optimize for
        max_tokens: Maximum tokens allowed
        relevance_weight: Weight for relevance scoring
        diversity_weight: Weight for diversity scoring

    Returns:
        ContextWindow: Optimized context window, with ``query_optimized``
        set to ``True`` in its metadata

    Example:
        >>> optimized = optimize_context_for_query(window, "What is AI?", max_tokens=1000)
    """
    # Relevance pass. The priorities are snapshotted by object identity
    # right away because the diversity pass below is read back through the
    # same ``priority`` attribute.
    # NOTE(review): presumably both helpers store their score on
    # ``item.priority`` of the same objects - confirm in .priority.
    candidates = priority_by_relevance(window.items.copy(), query)
    relevance_by_id = {}
    for entry in candidates:
        relevance_by_id[id(entry)] = entry.priority

    # Diversity pass, snapshotted the same way.
    candidates = priority_by_diversity(candidates)
    diversity_by_id = {}
    for entry in candidates:
        diversity_by_id[id(entry)] = entry.priority

    # Blend the two scores into the final priority.
    for entry in candidates:
        key = id(entry)
        rel_score = relevance_by_id.get(key, 0)
        div_score = diversity_by_id.get(key, 0)
        entry.priority = relevance_weight * rel_score + diversity_weight * div_score

    # Best blended score first.
    candidates.sort(key=lambda entry: entry.priority, reverse=True)

    # Greedy fill: keep whatever still fits in the token budget.
    chosen = []
    used_tokens = 0
    for entry in candidates:
        cost = entry.token_count or 0
        if used_tokens + cost <= max_tokens:
            chosen.append(entry)
            used_tokens += cost

    # Put the survivors back in the order they had in the input window;
    # objects not found there sort as position 0.
    position_of = {id(entry): index for index, entry in enumerate(window.items)}
    chosen.sort(key=lambda entry: position_of.get(id(entry), 0))

    merged_metadata = {**window.metadata}
    merged_metadata["query_optimized"] = True

    return ContextWindow(
        items=chosen,
        max_tokens=max_tokens,
        current_tokens=used_tokens,
        metadata=merged_metadata,
    )