# Source code for kerb.evaluation.benchmarks

"""Benchmarking utilities for evaluation.

This module provides functions for running benchmarks on test cases
and comparing different prompts or models.
"""

import statistics
import time
from typing import Callable, Dict, List, Tuple

from .types import BenchmarkResult, TestCase

# ============================================================================
# Benchmarking Functions
# ============================================================================



# [docs]
def run_benchmark(
    test_cases: List[TestCase],
    generation_fn: Callable[[str], str],
    evaluation_fn: Callable[[str, str], float],
    threshold: float = 0.7,
    name: str = "benchmark",
) -> BenchmarkResult:
    """Run a benchmark on a set of test cases.

    Every test case is generated and scored independently. If generating,
    scoring, or recording a case raises, that case is counted as failed with
    a score of 0.0 and the error message is stored in its detail record; the
    remaining cases still run. Each case contributes exactly one entry to
    ``scores`` and to the detail list.

    Args:
        test_cases: List of test cases
        generation_fn: Function to generate output from input
        evaluation_fn: Function to evaluate output against the expected
            output (returns score 0-1). Cases without an expected output
            are evaluated against an empty string.
        threshold: Minimum score (inclusive) for a case to pass
            (default: 0.7)
        name: Benchmark name

    Returns:
        BenchmarkResult: Benchmark results, with per-case records under
        ``details["test_results"]``

    Example:
        >>> cases = [TestCase(id="1", input="What is AI?", expected_output="Artificial Intelligence")]
        >>> result = run_benchmark(cases, lambda x: "AI means " + x, lambda o, e: 0.8)
        >>> result.pass_rate
        100.0
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments made while the benchmark runs.
    start_time = time.perf_counter()

    scores = []
    passed = 0
    failed = 0
    details = []

    for test_case in test_cases:
        try:
            output = generation_fn(test_case.input)

            # A missing/empty expected output is scored against "".
            score = evaluation_fn(output, test_case.expected_output or "")

            # Build the complete record BEFORE touching any accumulator:
            # the comparison and the slice can both raise (non-numeric
            # score, non-sliceable output), and a failure here must not
            # leave the case half-recorded and then recorded again below.
            record = {
                "test_id": test_case.id,
                "score": score,
                "passed": score >= threshold,
                "output": output[:100],
            }
        except Exception as e:
            record = {
                "test_id": test_case.id,
                "score": 0.0,
                "passed": False,
                "error": str(e),
            }

        scores.append(record["score"])
        if record["passed"]:
            passed += 1
        else:
            failed += 1
        details.append(record)

    execution_time = time.perf_counter() - start_time

    return BenchmarkResult(
        name=name,
        total_tests=len(test_cases),
        passed_tests=passed,
        failed_tests=failed,
        average_score=statistics.mean(scores) if scores else 0.0,
        scores=scores,
        execution_time=execution_time,
        details={"test_results": details},
    )




# [docs]
def benchmark_prompts(
    prompts: List[Tuple[str, str]],
    test_inputs: List[str],
    generation_fn: Callable[[str, str], str],
    evaluation_fn: Callable[[str], float],
    threshold: float = 0.7,
) -> Dict[str, BenchmarkResult]:
    """Benchmark multiple prompts against test inputs.

    Every (prompt, input) pair is generated and scored independently. If
    generation or evaluation raises, or the score cannot be compared with
    the threshold, that input is counted as failed with a score of 0.0 and
    the error message is kept in the prompt's detail records.

    Args:
        prompts: List of (prompt_id, prompt_template) tuples
        test_inputs: List of test inputs
        generation_fn: Function(prompt, input) -> output
        evaluation_fn: Function(output) -> score
        threshold: Minimum score (inclusive) for an input to pass
            (default: 0.7)

    Returns:
        dict: Benchmark results for each prompt, keyed by prompt_id. Each
        result carries per-input records under ``details["test_results"]``
        and the time spent on that prompt in ``execution_time``.

    Example:
        >>> results = benchmark_prompts(
        ...     [("v1", "Answer: {input}"), ("v2", "Detailed answer: {input}")],
        ...     ["What is AI?", "What is ML?"],
        ...     lambda p, i: p.format(input=i),
        ...     lambda o: len(o.split()) / 10
        ... )
        >>> len(results)
        2
    """
    results = {}

    for prompt_id, prompt_template in prompts:
        # Timed per prompt so the results can be compared against each other.
        start_time = time.perf_counter()

        scores = []
        passed = 0
        details = []

        for index, test_input in enumerate(test_inputs):
            try:
                output = generation_fn(prompt_template, test_input)
                score = evaluation_fn(output)
                # The comparison stays inside the try: a non-comparable
                # score (e.g. None) should fail this input only, not abort
                # the benchmark for every prompt.
                record = {
                    "test_index": index,
                    "score": score,
                    "passed": score >= threshold,
                }
            except Exception as e:
                record = {
                    "test_index": index,
                    "score": 0.0,
                    "passed": False,
                    "error": str(e),
                }

            scores.append(record["score"])
            if record["passed"]:
                passed += 1
            details.append(record)

        results[prompt_id] = BenchmarkResult(
            name=prompt_id,
            total_tests=len(test_inputs),
            passed_tests=passed,
            # Everything that did not pass is a failure, so the two counts
            # always add up to the total (a NaN score is neither >= nor <).
            failed_tests=len(scores) - passed,
            average_score=statistics.mean(scores) if scores else 0.0,
            scores=scores,
            execution_time=time.perf_counter() - start_time,
            details={"test_results": details},
        )

    return results