Source code for kerb.evaluation.statistics

"""Statistical analysis utilities for evaluation.

This module provides statistical functions for analyzing evaluation scores:
- Calculate descriptive statistics
- Compute confidence intervals
"""

import math
import statistics
from typing import Dict, List, Tuple

# ============================================================================
# Statistical Analysis
# ============================================================================


def calculate_statistics(scores: List[float]) -> Dict[str, float]:
    """Calculate descriptive statistics for a list of scores.

    Percentiles are read directly from the sorted scores by integer index
    (``sorted_scores[k * n // d]``); no interpolation is performed. With
    fewer than 20 scores, ``p95`` is the largest score.

    Args:
        scores: List of numeric scores.

    Returns:
        dict: Always contains the keys ``mean``, ``median``, ``stdev``,
        ``variance``, ``min``, ``max``, ``count``, ``p25``, ``p75``, ``p90``
        and ``p95``. ``stdev`` and ``variance`` are the sample statistics and
        are ``0.0`` when fewer than two scores are given. For an empty list
        every value is zero.

    Example:
        >>> stats = calculate_statistics([0.5, 0.7, 0.8, 0.9, 1.0])
        >>> stats['mean']
        0.78
    """
    if not scores:
        # Return the same key set as the non-empty case so callers can index
        # any statistic (including percentiles) without a KeyError.
        return {
            "mean": 0.0,
            "median": 0.0,
            "stdev": 0.0,
            "variance": 0.0,
            "min": 0.0,
            "max": 0.0,
            "count": 0,
            "p25": 0.0,
            "p75": 0.0,
            "p90": 0.0,
            "p95": 0.0,
        }

    sorted_scores = sorted(scores)
    n = len(sorted_scores)
    # Sample stdev/variance are undefined for a single observation.
    has_spread = n > 1

    return {
        "mean": statistics.mean(scores),
        "median": statistics.median(scores),
        "stdev": statistics.stdev(scores) if has_spread else 0.0,
        "variance": statistics.variance(scores) if has_spread else 0.0,
        "min": min(scores),
        "max": max(scores),
        "count": n,
        "p25": sorted_scores[n // 4],
        "p75": sorted_scores[3 * n // 4],
        "p90": sorted_scores[9 * n // 10],
        # Small samples report the maximum as the 95th percentile.
        "p95": sorted_scores[19 * n // 20] if n >= 20 else sorted_scores[-1],
    }
def confidence_interval(
    scores: List[float], confidence: float = 0.95
) -> Tuple[float, float]:
    """Calculate an approximate confidence interval for the mean of scores.

    The interval is ``mean ± c * stdev / sqrt(n)`` where ``stdev`` is the
    sample standard deviation and ``c`` is a critical value chosen as follows:

    - ``confidence == 0.95``: ``2.0`` (rough t-value approximation).
    - ``confidence == 0.99``: ``2.6`` (rough t-value approximation).
    - any other level strictly between 0 and 1: the two-sided standard
      normal quantile for that level (e.g. about 1.645 for 0.90).
    - anything else (not a valid confidence level): ``1.96``.

    No exact t-distribution is used, so intervals for very small samples
    are narrower than a true t-interval would be.

    Args:
        scores: List of scores.
        confidence: Confidence level as a fraction (default: 0.95).

    Returns:
        tuple: (lower_bound, upper_bound). ``(0.0, 0.0)`` when fewer than
        two scores are given.

    Example:
        >>> lower, upper = confidence_interval([0.7, 0.8, 0.9])
        >>> lower < 0.8 < upper
        True
    """
    if not scores or len(scores) < 2:
        return (0.0, 0.0)

    mean = statistics.mean(scores)
    stdev = statistics.stdev(scores)
    n = len(scores)

    if confidence == 0.95:
        critical = 2.0
    elif confidence == 0.99:
        critical = 2.6
    else:
        # Two-sided interval: the critical value is the quantile at
        # (1 + confidence) / 2 of the standard normal distribution.
        upper_quantile = (1.0 + confidence) / 2.0
        if 0.5 < upper_quantile < 1.0:
            critical = statistics.NormalDist().inv_cdf(upper_quantile)
        else:
            # Not a usable confidence level; keep the historical 95% z-value
            # rather than raising, so existing callers are unaffected.
            critical = 1.96

    margin = critical * stdev / math.sqrt(n)
    return (mean - margin, mean + margin)