Source code for kerb.evaluation.statistics

"""Statistical analysis utilities for evaluation.

This module provides statistical functions for analyzing evaluation scores:
- Calculate descriptive statistics
- Compute confidence intervals
"""

import math
import statistics
from typing import Dict, List, Tuple

# ============================================================================
# Statistical Analysis
# ============================================================================



[docs]
def calculate_statistics(scores: List[float]) -> Dict[str, float]:
    """Calculate statistical measures for a list of scores.

    The returned dictionary always contains the same keys, whether or not
    ``scores`` is empty, so callers can index it without checking first.

    Percentiles are picked by integer index into the sorted scores
    (``sorted_scores[k * n // 100]``-style); no interpolation is performed,
    so every percentile is one of the input values.

    Args:
        scores: List of numeric scores

    Returns:
        dict: Statistical measures with keys ``mean``, ``median``, ``stdev``,
        ``variance``, ``min``, ``max``, ``count``, ``p25``, ``p75``, ``p90``
        and ``p95``. For empty input every measure is ``0.0`` and ``count``
        is ``0``. ``stdev`` and ``variance`` are the sample statistics and
        are ``0.0`` when fewer than two scores are given.

    Example:
        >>> stats = calculate_statistics([0.5, 0.7, 0.8, 0.9, 1.0])
        >>> stats['mean']
        0.78
    """
    if not scores:
        # Keep the schema identical to the non-empty case (including the
        # percentile keys) so lookups such as stats["p95"] never raise.
        return {
            "mean": 0.0,
            "median": 0.0,
            "stdev": 0.0,
            "variance": 0.0,
            "min": 0.0,
            "max": 0.0,
            "count": 0,
            "p25": 0.0,
            "p75": 0.0,
            "p90": 0.0,
            "p95": 0.0,
        }

    sorted_scores = sorted(scores)
    n = len(sorted_scores)
    # Sample stdev/variance are undefined for a single observation.
    has_spread = n > 1

    return {
        "mean": statistics.mean(scores),
        "median": statistics.median(scores),
        "stdev": statistics.stdev(scores) if has_spread else 0.0,
        "variance": statistics.variance(scores) if has_spread else 0.0,
        "min": min(scores),
        "max": max(scores),
        "count": n,
        "p25": sorted_scores[n // 4],
        "p75": sorted_scores[3 * n // 4],
        "p90": sorted_scores[9 * n // 10],
        # For n < 20 the index 19 * n // 20 equals n - 1, i.e. the maximum,
        # so no special case is needed for small samples.
        "p95": sorted_scores[19 * n // 20],
    }




[docs]
def confidence_interval(
    scores: List[float], confidence: float = 0.95
) -> Tuple[float, float]:
    """Calculate confidence interval for the mean of the scores.

    The interval is ``mean ± critical_value * stdev / sqrt(n)`` using the
    sample standard deviation. The critical value is a simplified
    approximation: the fixed multipliers 2.0 (95%) and 2.6 (99%) are used for
    the two common levels, and the normal-distribution quantile is used for
    every other level. No exact t-distribution is applied, so intervals for
    very small samples are narrower than a true t-interval would be.

    Args:
        scores: List of scores
        confidence: Confidence level strictly between 0 and 1 (default: 0.95)

    Returns:
        tuple: (lower_bound, upper_bound). ``(0.0, 0.0)`` is returned when
        fewer than two scores are supplied, because the sample standard
        deviation is undefined in that case.

    Raises:
        ValueError: If ``confidence`` is not strictly between 0 and 1.

    Example:
        >>> lower, upper = confidence_interval([0.7, 0.8, 0.9])
        >>> lower < 0.8 < upper
        True
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    if not scores or len(scores) < 2:
        return (0.0, 0.0)

    mean = statistics.mean(scores)
    stdev = statistics.stdev(scores)
    n = len(scores)

    # Compare with a tolerance so that a computed level such as 1 - 0.05
    # still selects the intended multiplier.
    if math.isclose(confidence, 0.95):
        # Rounded t-value for 95% confidence and reasonable n.
        critical_value = 2.0
    elif math.isclose(confidence, 0.99):
        critical_value = 2.6
    else:
        # Two-sided interval: leave (1 - confidence) / 2 in each tail.
        critical_value = statistics.NormalDist().inv_cdf((1.0 + confidence) / 2.0)

    margin = critical_value * stdev / math.sqrt(n)

    return (mean - margin, mean + margin)