Source code for kerb.prompt.optimization

"""Prompt compression and optimization utilities.

This module provides tools for reducing prompt size and optimizing
prompt structure while maintaining clarity and effectiveness.
"""

import re
from typing import Any, Dict, List, Optional

from kerb.preprocessing import truncate_text

from .template import extract_template_variables



[docs]
def compress_prompt(
    prompt: str,
    max_length: Optional[int] = None,
    strategies: Optional[List[str]] = None,
) -> str:
    """Shrink a prompt by running the selected compression passes in order.

    The passes always run in a fixed order (whitespace first, then
    abbreviations), regardless of their order in ``strategies``. Strategy
    names that are not recognised are silently skipped.

    Args:
        prompt (str): Prompt to compress. A falsy prompt (e.g. ``""``) is
            returned unchanged.
        max_length (Optional[int]): Target maximum length in characters. When
            falsy (``None`` or ``0``) no truncation is attempted. Defaults to
            None.
        strategies (Optional[List[str]]): Names of the passes to apply.
            Options: ["whitespace", "abbreviations"].
            If None, every pass is applied. Defaults to None.

    Returns:
        str: The compressed prompt, truncated with the "smart" strategy of
        ``truncate_text`` when it still exceeds ``max_length``.

    Examples:
        >>> compress_prompt("Hello    world!  This   is   a    test.")
        'Hello world! This is a test.'
    """
    # Nothing to compress: hand the input back exactly as received.
    if not prompt:
        return prompt

    active = ["whitespace", "abbreviations"] if strategies is None else strategies

    compressed = prompt
    if "whitespace" in active:
        compressed = optimize_whitespace(compressed)
    if "abbreviations" in active:
        compressed = _apply_abbreviations(compressed)

    # Only fall back to truncation when a budget was given and is exceeded.
    over_budget = bool(max_length) and len(compressed) > max_length
    if not over_budget:
        return compressed

    return truncate_text(compressed, max_length, strategy="smart")




[docs]
def optimize_whitespace(prompt: str) -> str:
    """Optimize whitespace in a prompt.

    Collapses runs of spaces into a single space, reduces three or more
    consecutive line breaks (optionally separated by whitespace) to one blank
    line, removes trailing whitespace from every line, and strips the prompt
    as a whole.

    Leading spaces on a line are collapsed to a single space but not removed
    (except on the very first line, which is covered by the final strip).
    Tabs are not collapsed by the space pass.

    Args:
        prompt (str): Prompt to optimize. A falsy prompt is returned unchanged.

    Returns:
        str: Prompt with optimized whitespace

    Examples:
        >>> optimize_whitespace("Hello    world!  \\n\\n\\n  Test")
        'Hello world!\\n\\n Test'
    """
    if not prompt:
        return prompt

    # Replace multiple spaces with single space
    result = re.sub(r" +", " ", prompt)

    # Replace three or more line breaks (possibly with whitespace between
    # them) with a double newline, i.e. a single paragraph break
    result = re.sub(r"\n\s*\n\s*\n+", "\n\n", result)

    # Remove trailing whitespace from each line
    result = "\n".join(line.rstrip() for line in result.split("\n"))

    # Remove leading/trailing whitespace from entire prompt
    return result.strip()



def _apply_abbreviations(prompt: str) -> str:
    """Apply common abbreviations to reduce prompt length.

    Args:
        prompt (str): Prompt to abbreviate

    Returns:
        str: Prompt with abbreviations applied
    """
    # Common abbreviations that maintain clarity
    abbreviations = {
        r"\bfor example\b": "e.g.",
        r"\bthat is\b": "i.e.",
        r"\band so on\b": "etc.",
        r"\band others\b": "et al.",
    }

    result = prompt
    for pattern, replacement in abbreviations.items():
        result = re.sub(pattern, replacement, result, flags=re.IGNORECASE)

    return result



[docs]
def analyze_prompt(prompt: str, tokenizer: Optional[Any] = None) -> Dict[str, Any]:
    """Analyze a prompt and return statistics.

    Args:
        prompt (str): Prompt to analyze
        tokenizer (Optional[Any]): Tokenizer to use for token counting.
            If None, uses character approximation. Defaults to None.

    Returns:
        Dict[str, Any]: Analysis results with the same keys for every input:

            - ``length``: number of characters
            - ``words``: number of whitespace-separated words
            - ``lines``: number of newline-separated lines (0 for an empty prompt)
            - ``sentences``: number of runs of ``.``, ``!`` or ``?`` (approximate)
            - ``tokens_approx``: token count from the tokenizer when one is
              given and counting succeeds, otherwise ``length // 4``
            - ``variables``: template variables found in the prompt
            - ``avg_word_length``: ``length / words`` (total characters,
              whitespace and punctuation included, per word), or 0 without words
            - ``avg_sentence_length``: ``words / sentences``, or 0 without
              sentences

    Examples:
        >>> analyze_prompt("Hello world! This is a test.")
        {
            'length': 28,
            'words': 6,
            'lines': 1,
            'sentences': 2,
            'tokens_approx': 7,
            'variables': [],
            'avg_word_length': 4.666666666666667,
            'avg_sentence_length': 3.0
        }
    """
    if not prompt:
        # Return the full key set so callers can index the result without
        # special-casing empty prompts.
        return {
            "length": 0,
            "words": 0,
            "lines": 0,
            "sentences": 0,
            "tokens_approx": 0,
            "variables": [],
            "avg_word_length": 0,
            "avg_sentence_length": 0,
        }

    # Basic statistics
    length = len(prompt)
    words = len(prompt.split())
    lines = len(prompt.split("\n"))

    # Count sentences (approximate): each run of terminators counts once
    sentences = len(re.findall(r"[.!?]+", prompt))

    # Token approximation (4 chars per token is typical for English)
    tokens_approx = length // 4

    # Try to use actual tokenizer if provided
    if tokenizer is not None:
        try:
            from ..tokenizer import count_tokens

            tokens_approx = count_tokens(prompt, tokenizer)
        except Exception:
            # Deliberately best-effort: if the tokenizer module cannot be
            # imported or counting fails, keep the character-based estimate.
            pass

    # Extract template variables
    variables = extract_template_variables(prompt)

    return {
        "length": length,
        "words": words,
        "lines": lines,
        "sentences": sentences,
        "tokens_approx": tokens_approx,
        "variables": variables,
        "avg_word_length": length / words if words > 0 else 0,
        "avg_sentence_length": words / sentences if sentences > 0 else 0,
    }