# Source code for kerb.evaluation.judges

"""LLM-as-judge evaluation functions.

This module provides functions for using LLMs to evaluate output quality:
- LLM as Judge: Use LLM to rate outputs on specific criteria
- Pairwise Comparison: Compare two outputs to determine the better one
"""

import re
from typing import Callable, Optional, Tuple, Union

from .metrics import calculate_f1_score, calculate_semantic_similarity
from .quality import assess_coherence
from .types import ComparisonResult, EvaluationResult, JudgmentCriterion

# ============================================================================
# LLM-as-Judge Functions
# ============================================================================


def llm_as_judge(
    output: str,
    criterion: Union[str, JudgmentCriterion],
    context: Optional[str] = None,
    reference: Optional[str] = None,
    scale: int = 5,
    llm_function: Optional[Callable] = None,
) -> EvaluationResult:
    """Use an LLM to judge the quality of an output.

    A rating prompt is built from the arguments and sent to ``llm_function``;
    the reply is parsed by ``_parse_judge_response``. When no ``llm_function``
    is supplied, ``_heuristic_judge`` produces the rating instead.

    Args:
        output: The text to evaluate.
        criterion: Judgment criterion (relevance, accuracy, coherence, etc.),
            given either as a plain string or as a ``JudgmentCriterion``
            member, in which case its ``.value`` is used.
        context: Optional context (e.g., the prompt or question).
        reference: Optional reference answer.
        scale: Upper bound of the rating scale; ratings run from 1 to
            ``scale`` (default: 5). Must be at least 1.
        llm_function: Function to call the LLM. It should accept the prompt
            string and return the model's reply as a string.

    Returns:
        EvaluationResult: ``metric`` is ``"llm_judge_<criterion>"``, ``score``
        is the raw rating divided by ``scale``, and ``details`` holds
        ``raw_score``, ``scale`` and ``reasoning``.

    Raises:
        ValueError: If ``scale`` is less than 1.

    Example:
        >>> result = llm_as_judge(
        ...     "Python is a programming language",
        ...     "relevance",
        ...     context="What is Python?",
        ... )
        >>> result.metric
        'llm_judge_relevance'
    """
    # Reject unusable scales up front: 0 would otherwise surface as a
    # ZeroDivisionError during normalisation, and negative values would
    # produce meaningless ratings.
    if scale < 1:
        raise ValueError(f"scale must be at least 1, got {scale!r}")

    if isinstance(criterion, JudgmentCriterion):
        criterion_name = criterion.value
    else:
        criterion_name = criterion

    # Build evaluation prompt
    prompt_parts = [
        f"Evaluate the following output on a scale of 1 to {scale} for {criterion_name}."
    ]
    if context:
        prompt_parts.append(f"\nContext/Question: {context}")
    if reference:
        prompt_parts.append(f"\nReference Answer: {reference}")
    prompt_parts.append(f"\nOutput to Evaluate: {output}")
    prompt_parts.append(
        f"\nProvide a rating from 1 to {scale} and explain your reasoning."
    )
    prompt_parts.append("Format: Rating: X\nReasoning: <explanation>")
    prompt = "\n".join(prompt_parts)

    if llm_function is not None:
        try:
            response = llm_function(prompt)
            score, reasoning = _parse_judge_response(response, scale)
        except Exception as e:
            # Deliberate best-effort: a failing LLM call (or an unparsable
            # reply) yields a mid-scale rating with the error recorded in
            # the reasoning instead of propagating.
            score = scale / 2
            reasoning = f"LLM evaluation failed: {str(e)}"
    else:
        # Without LLM, use heuristic scoring
        score, reasoning = _heuristic_judge(
            output, criterion_name, context, reference, scale
        )

    return EvaluationResult(
        metric=f"llm_judge_{criterion_name}",
        score=score / scale,  # Normalize by the scale
        details={"raw_score": score, "scale": scale, "reasoning": reasoning},
    )
def pairwise_comparison(
    output_a: str,
    output_b: str,
    criterion: Union[str, JudgmentCriterion],
    context: Optional[str] = None,
    llm_function: Optional[Callable] = None,
) -> ComparisonResult:
    """Compare two outputs using LLM-as-judge.

    A comparison prompt is sent to ``llm_function`` and the reply is parsed
    by ``_parse_comparison_response``. When no ``llm_function`` is supplied,
    ``_heuristic_comparison`` decides instead.

    Args:
        output_a: First output to compare.
        output_b: Second output to compare.
        criterion: Comparison criterion, as a plain string or a
            ``JudgmentCriterion`` member (its ``.value`` is used, matching
            ``llm_as_judge``).
        context: Optional context (e.g., the prompt).
        llm_function: Function to call the LLM. It should accept the prompt
            string and return the model's reply as a string.

    Returns:
        ComparisonResult: ``winner`` is ``"a"``, ``"b"`` or ``None`` (tie,
        undecided, or failed LLM call). ``scores`` gives the winner 1.0 and
        the loser 0.25, or 0.5 to both when there is no winner.

    Example:
        >>> result = pairwise_comparison(
        ...     "Python is great",
        ...     "Python is a high-level programming language",
        ...     "completeness"
        ... )
        >>> result.winner
        'b'
    """
    # Accept enum members the same way llm_as_judge does, so the prompt and
    # the heuristic branches always see the plain criterion string.
    if isinstance(criterion, JudgmentCriterion):
        criterion_name = criterion.value
    else:
        criterion_name = criterion

    # NOTE(review): the line breaks inside this prompt were reconstructed
    # from a whitespace-collapsed copy of the source; confirm the exact
    # layout against the upstream file.
    prompt = (
        f"Compare the following two outputs based on {criterion_name}.\n\n"
        f"Output A: {output_a}\n\n"
        f"Output B: {output_b}\n"
    )
    if context:
        prompt = f"Context: {context}\n\n" + prompt
    prompt += (
        "\nWhich output is better? Respond with 'A', 'B', or 'TIE' and explain why."
    )

    if llm_function is not None:
        try:
            response = llm_function(prompt)
            winner, reasoning, confidence = _parse_comparison_response(response)
        except Exception as e:
            # Deliberate best-effort: report the failure in the result
            # rather than raising to the caller.
            winner = None
            reasoning = f"Comparison failed: {str(e)}"
            confidence = 0.0
    else:
        # Heuristic comparison without LLM
        winner, reasoning, confidence = _heuristic_comparison(
            output_a, output_b, criterion_name
        )

    return ComparisonResult(
        output_a_id="a",
        output_b_id="b",
        winner=winner,
        scores={
            "a": 0.5 + (0.5 if winner == "a" else -0.25 if winner == "b" else 0.0),
            "b": 0.5 + (0.5 if winner == "b" else -0.25 if winner == "a" else 0.0),
        },
        confidence=confidence,
        reasoning=reasoning,
    )
# ============================================================================
# Helper Functions
# ============================================================================

# "Rating: 4", "RATING: 4.5" and markdown-decorated forms like "**Rating:** 4".
_RATING_RE = re.compile(r"rating\s*:\W*(\d+(?:\.\d+)?)", re.IGNORECASE)
# Everything after "Reasoning:" up to the end of the reply.
_REASONING_RE = re.compile(r"reasoning:\s*(.+)", re.IGNORECASE | re.DOTALL)
# First free-standing number, used when no explicit "Rating:" label exists.
_NUMBER_RE = re.compile(r"\b(\d+(?:\.\d+)?)\b")

# Verdict at the very start of the reply: "A", "'B'", "Output A ...", "TIE".
# The negative lookahead keeps the article in "A tie" from counting as a
# vote for output A.
_LEADING_VERDICT_RE = re.compile(
    r"^\W*(?:OUTPUT\s+)?(A|B|TIE)\b(?!\s+TIE\b)", re.IGNORECASE
)
# Earliest explicit mention anywhere in the reply; group 1 is empty for TIE.
_EXPLICIT_VERDICT_RE = re.compile(r"\b(?:OUTPUT\s+(A|B)|TIE)\b", re.IGNORECASE)
# Last resort: a stand-alone capital letter (case-sensitive on purpose, so
# the lowercase article "a" is not mistaken for a verdict).
_BARE_LETTER_RE = re.compile(r"\b(A|B)\b")

# Words whose presence raises the estimated confidence of a comparison.
_CONFIDENCE_WORDS = (
    "clearly",
    "definitely",
    "obviously",
    "significantly",
    "much better",
)


def _parse_judge_response(response: str, scale: int) -> Tuple[float, str]:
    """Parse LLM judge response to extract rating and reasoning.

    Args:
        response: Raw text returned by the LLM.
        scale: Upper bound of the rating scale.

    Returns:
        ``(rating, reasoning)``. The rating comes from a ``Rating: X`` label
        if present, otherwise from the first number in the reply; either is
        clamped to ``[1, scale]``. With no number at all the rating is
        ``scale / 2``. The reasoning is the text after ``Reasoning:``, or the
        first 200 characters of the reply when that label is missing.
    """
    number_match = _RATING_RE.search(response) or _NUMBER_RE.search(response)
    if number_match:
        # float() keeps the return type stable even when the clamp selects
        # one of the integer bounds.
        rating = float(min(scale, max(1, float(number_match.group(1)))))
    else:
        rating = scale / 2  # Default to middle

    reasoning_match = _REASONING_RE.search(response)
    if reasoning_match:
        reasoning = reasoning_match.group(1).strip()
    else:
        reasoning = response[:200]

    return rating, reasoning


def _parse_comparison_response(response: str) -> Tuple[Optional[str], str, float]:
    """Parse comparison response to extract winner, reasoning, and confidence.

    The winner is decided by the first rule that matches:

    1. a verdict token at the very start of the reply (``A``, ``B``,
       ``Output A``, ``TIE`` ...);
    2. the earliest whole-word ``Output A`` / ``Output B`` / ``TIE`` anywhere
       in the reply;
    3. the first stand-alone capital ``A`` or ``B``.

    Whole-word matching is what prevents replies such as "Both are equal" or
    "Answer: B" from being classified by their first letter.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        ``(winner, reasoning, confidence)`` where ``winner`` is ``"a"``,
        ``"b"`` or ``None`` (tie or undecided), ``reasoning`` is the full
        reply, and ``confidence`` starts at 0.5 and gains 0.1 for each
        emphatic word found in the reply, capped at 1.0.
    """
    verdict = _LEADING_VERDICT_RE.search(response) or _EXPLICIT_VERDICT_RE.search(
        response
    )
    if verdict is None:
        verdict = _BARE_LETTER_RE.search(response)

    winner: Optional[str] = None
    if verdict is not None:
        # group(1) is None when the explicit pattern matched "TIE".
        token = (verdict.group(1) or "TIE").upper()
        winner = {"A": "a", "B": "b"}.get(token)

    # Extract reasoning
    reasoning = response

    # Estimate confidence based on language strength
    lowered = response.lower()
    confidence = 0.5
    for word in _CONFIDENCE_WORDS:
        if word in lowered:
            confidence += 0.1
    confidence = min(1.0, confidence)

    return winner, reasoning, confidence


def _heuristic_judge(
    output: str,
    criterion: str,
    context: Optional[str],
    reference: Optional[str],
    scale: int,
) -> Tuple[float, str]:
    """Heuristic-based judgment when LLM is not available.

    Args:
        output: The text to evaluate.
        criterion: Criterion name; ``"relevance"``, ``"completeness"``,
            ``"coherence"`` and ``"accuracy"`` have dedicated heuristics.
        context: Context text; required for the relevance heuristic.
        reference: Reference answer; required for the accuracy heuristic.
        scale: Upper bound of the rating scale.

    Returns:
        ``(score, reasoning)``. Any criterion without a usable heuristic
        (unknown name, or missing ``context`` / ``reference``) gets the
        mid-scale default ``scale / 2``.
    """
    score = scale / 2  # Default to middle
    reasoning = f"Heuristic evaluation for {criterion}"

    if criterion == "relevance":
        if context:
            similarity = calculate_semantic_similarity(
                output, context, method="jaccard"
            )
            # Maps the similarity onto 1..scale (presumably similarity is in
            # [0, 1] - confirm against calculate_semantic_similarity).
            score = 1 + (scale - 1) * similarity
            reasoning = f"Relevance based on overlap: {similarity:.2f}"

    elif criterion == "completeness":
        # Longer outputs score higher
        word_count = len(output.split())
        if word_count > 50:
            score = scale * 0.9
        elif word_count > 20:
            score = scale * 0.7
        else:
            score = scale * 0.5
        reasoning = f"Completeness based on length: {word_count} words"

    elif criterion == "coherence":
        result = assess_coherence(output)
        score = 1 + (scale - 1) * result.score
        reasoning = f"Coherence score: {result.score:.2f}"

    elif criterion == "accuracy":
        if reference:
            f1 = calculate_f1_score(output, reference)
            score = 1 + (scale - 1) * f1
            reasoning = f"Accuracy (F1) vs reference: {f1:.2f}"

    return score, reasoning


def _heuristic_comparison(
    output_a: str, output_b: str, criterion: str
) -> Tuple[Optional[str], str, float]:
    """Heuristic-based comparison when LLM is not available.

    Args:
        output_a: First output.
        output_b: Second output.
        criterion: ``"completeness"`` compares word counts with a 20% margin,
            ``"coherence"`` compares ``assess_coherence`` scores with a 10%
            margin, and anything else compares raw word counts, treating a
            difference of fewer than 5 words as a tie.

    Returns:
        ``(winner, reasoning, confidence)`` where ``winner`` is ``"a"``,
        ``"b"`` or ``None`` for a tie.
    """
    len_a = len(output_a.split())
    len_b = len(output_b.split())

    if criterion == "completeness":
        if len_a > len_b * 1.2:
            return "a", f"Output A is more complete ({len_a} vs {len_b} words)", 0.7
        if len_b > len_a * 1.2:
            return "b", f"Output B is more complete ({len_b} vs {len_a} words)", 0.7
        return (
            None,
            f"Both outputs similar in length ({len_a} vs {len_b} words)",
            0.5,
        )

    if criterion == "coherence":
        coherence_a = assess_coherence(output_a).score
        coherence_b = assess_coherence(output_b).score
        if coherence_a > coherence_b * 1.1:
            return (
                "a",
                f"Output A is more coherent ({coherence_a:.2f} vs {coherence_b:.2f})",
                0.6,
            )
        if coherence_b > coherence_a * 1.1:
            return (
                "b",
                f"Output B is more coherent ({coherence_b:.2f} vs {coherence_a:.2f})",
                0.6,
            )
        return (
            None,
            f"Both outputs similarly coherent ({coherence_a:.2f} vs {coherence_b:.2f})",
            0.5,
        )

    # Default: compare by length
    if abs(len_a - len_b) < 5:
        return None, "Outputs are similar in length", 0.5
    if len_a > len_b:
        return "a", f"Output A is longer ({len_a} vs {len_b} words)", 0.5
    return "b", f"Output B is longer ({len_b} vs {len_a} words)", 0.5