Source code for kerb.evaluation.judges

"""LLM-as-judge evaluation functions.

This module provides functions for using LLMs to evaluate output quality:
- LLM as Judge: Use LLM to rate outputs on specific criteria
- Pairwise Comparison: Compare two outputs to determine the better one
"""

import re
from typing import Callable, Optional, Tuple, Union

from .metrics import calculate_f1_score, calculate_semantic_similarity
from .quality import assess_coherence
from .types import ComparisonResult, EvaluationResult, JudgmentCriterion

# ============================================================================
# LLM-as-Judge Functions
# ============================================================================



[docs]
def llm_as_judge(
    output: str,
    criterion: Union[str, JudgmentCriterion],
    context: Optional[str] = None,
    reference: Optional[str] = None,
    scale: int = 5,
    llm_function: Optional[Callable] = None,
) -> EvaluationResult:
    """Use an LLM to judge the quality of an output.

    Args:
        output: The text to evaluate
        criterion: Judgment criterion (relevance, accuracy, coherence, etc.)
        context: Optional context (e.g., the prompt or question)
        reference: Optional reference answer
        scale: Rating scale (default: 1-5)
        llm_function: Function to call LLM (should accept prompt and return string)

    Returns:
        EvaluationResult: Result with score and reasoning

    Example:
        >>> result = llm_as_judge(
        ...     "Python is a programming language",
        ...     JudgmentCriterion.RELEVANCE,
        ...     context="What is Python?"
        ... )
        >>> result.score
        4.5
    """
    if isinstance(criterion, JudgmentCriterion):
        criterion_name = criterion.value
    else:
        criterion_name = criterion

    # Build evaluation prompt
    prompt_parts = [
        f"Evaluate the following output on a scale of 1 to {scale} for {criterion_name}."
    ]

    if context:
        prompt_parts.append(f"\nContext/Question: {context}")

    if reference:
        prompt_parts.append(f"\nReference Answer: {reference}")

    prompt_parts.append(f"\nOutput to Evaluate: {output}")
    prompt_parts.append(
        f"\nProvide a rating from 1 to {scale} and explain your reasoning."
    )
    prompt_parts.append(f"Format: Rating: X\nReasoning: <explanation>")

    prompt = "\n".join(prompt_parts)

    # Call LLM if function provided
    if llm_function is not None:
        try:
            response = llm_function(prompt)
            score, reasoning = _parse_judge_response(response, scale)
        except Exception as e:
            # If LLM call fails, return a default result
            score = scale / 2
            reasoning = f"LLM evaluation failed: {str(e)}"
    else:
        # Without LLM, use heuristic scoring
        score, reasoning = _heuristic_judge(
            output, criterion_name, context, reference, scale
        )

    return EvaluationResult(
        metric=f"llm_judge_{criterion_name}",
        score=score / scale,  # Normalize to 0-1
        details={"raw_score": score, "scale": scale, "reasoning": reasoning},
    )




[docs]
def pairwise_comparison(
    output_a: str,
    output_b: str,
    criterion: str,
    context: Optional[str] = None,
    llm_function: Optional[Callable] = None,
) -> ComparisonResult:
    """Compare two outputs using LLM-as-judge.

    Args:
        output_a: First output to compare
        output_b: Second output to compare
        criterion: Comparison criterion
        context: Optional context (e.g., the prompt)
        llm_function: Function to call LLM

    Returns:
        ComparisonResult: Winner and reasoning

    Example:
        >>> result = pairwise_comparison(
        ...     "Python is great",
        ...     "Python is a high-level programming language",
        ...     "completeness"
        ... )
        >>> result.winner
        'b'
    """
    prompt = f"""Compare the following two outputs based on {criterion}.

Output A: {output_a}

Output B: {output_b}
"""

    if context:
        prompt = f"Context: {context}\n\n" + prompt

    prompt += (
        "\nWhich output is better? Respond with 'A', 'B', or 'TIE' and explain why."
    )

    if llm_function is not None:
        try:
            response = llm_function(prompt)
            winner, reasoning, confidence = _parse_comparison_response(response)
        except Exception as e:
            winner = None
            reasoning = f"Comparison failed: {str(e)}"
            confidence = 0.0
    else:
        # Heuristic comparison without LLM
        winner, reasoning, confidence = _heuristic_comparison(
            output_a, output_b, criterion
        )

    return ComparisonResult(
        output_a_id="a",
        output_b_id="b",
        winner=winner,
        scores={
            "a": 0.5 + (0.5 if winner == "a" else -0.25 if winner == "b" else 0.0),
            "b": 0.5 + (0.5 if winner == "b" else -0.25 if winner == "a" else 0.0),
        },
        confidence=confidence,
        reasoning=reasoning,
    )



# ============================================================================
# Helper Functions
# ============================================================================


def _parse_judge_response(response: str, scale: int) -> Tuple[float, str]:
    """Parse LLM judge response to extract rating and reasoning."""
    # Look for rating pattern
    rating_match = re.search(r"[Rr]ating:\s*(\d+(?:\.\d+)?)", response)
    reasoning_match = re.search(r"[Rr]easoning:\s*(.+)", response, re.DOTALL)

    if rating_match:
        rating = float(rating_match.group(1))
        rating = min(scale, max(1, rating))  # Clamp to scale
    else:
        # Try to find any number
        numbers = re.findall(r"\b(\d+(?:\.\d+)?)\b", response)
        if numbers:
            rating = float(numbers[0])
            rating = min(scale, max(1, rating))
        else:
            rating = scale / 2  # Default to middle

    reasoning = reasoning_match.group(1).strip() if reasoning_match else response[:200]

    return rating, reasoning


def _parse_comparison_response(response: str) -> Tuple[Optional[str], str, float]:
    """Parse comparison response to extract winner, reasoning, and confidence."""
    response_upper = response.upper()

    # Determine winner
    if "OUTPUT A" in response_upper or response_upper.startswith("A"):
        winner = "a"
    elif "OUTPUT B" in response_upper or response_upper.startswith("B"):
        winner = "b"
    elif "TIE" in response_upper:
        winner = None
    else:
        # Look for first A or B
        if "A" in response_upper[:20]:
            winner = "a"
        elif "B" in response_upper[:20]:
            winner = "b"
        else:
            winner = None

    # Extract reasoning
    reasoning = response

    # Estimate confidence based on language strength
    confidence_words = [
        "clearly",
        "definitely",
        "obviously",
        "significantly",
        "much better",
    ]
    confidence = 0.5

    for word in confidence_words:
        if word in response.lower():
            confidence += 0.1

    confidence = min(1.0, confidence)

    return winner, reasoning, confidence


def _heuristic_judge(
    output: str,
    criterion: str,
    context: Optional[str],
    reference: Optional[str],
    scale: int,
) -> Tuple[float, str]:
    """Heuristic-based judgment when LLM is not available."""
    score = scale / 2  # Default to middle
    reasoning = f"Heuristic evaluation for {criterion}"

    if criterion == "relevance":
        if context:
            similarity = calculate_semantic_similarity(
                output, context, method="jaccard"
            )
            score = 1 + (scale - 1) * similarity
            reasoning = f"Relevance based on overlap: {similarity:.2f}"

    elif criterion == "completeness":
        # Longer outputs score higher
        word_count = len(output.split())
        if word_count > 50:
            score = scale * 0.9
        elif word_count > 20:
            score = scale * 0.7
        else:
            score = scale * 0.5
        reasoning = f"Completeness based on length: {word_count} words"

    elif criterion == "coherence":
        result = assess_coherence(output)
        score = 1 + (scale - 1) * result.score
        reasoning = f"Coherence score: {result.score:.2f}"

    elif criterion == "accuracy":
        if reference:
            f1 = calculate_f1_score(output, reference)
            score = 1 + (scale - 1) * f1
            reasoning = f"Accuracy (F1) vs reference: {f1:.2f}"

    return score, reasoning


def _heuristic_comparison(
    output_a: str, output_b: str, criterion: str
) -> Tuple[Optional[str], str, float]:
    """Heuristic-based comparison when LLM is not available."""
    len_a = len(output_a.split())
    len_b = len(output_b.split())

    if criterion == "completeness":
        if len_a > len_b * 1.2:
            return "a", f"Output A is more complete ({len_a} vs {len_b} words)", 0.7
        elif len_b > len_a * 1.2:
            return "b", f"Output B is more complete ({len_b} vs {len_a} words)", 0.7
        else:
            return (
                None,
                f"Both outputs similar in length ({len_a} vs {len_b} words)",
                0.5,
            )

    elif criterion == "coherence":
        coherence_a = assess_coherence(output_a).score
        coherence_b = assess_coherence(output_b).score

        if coherence_a > coherence_b * 1.1:
            return (
                "a",
                f"Output A is more coherent ({coherence_a:.2f} vs {coherence_b:.2f})",
                0.6,
            )
        elif coherence_b > coherence_a * 1.1:
            return (
                "b",
                f"Output B is more coherent ({coherence_b:.2f} vs {coherence_a:.2f})",
                0.6,
            )
        else:
            return (
                None,
                f"Both outputs similarly coherent ({coherence_a:.2f} vs {coherence_b:.2f})",
                0.5,
            )

    else:
        # Default: compare by length
        if abs(len_a - len_b) < 5:
            return None, "Outputs are similar in length", 0.5
        elif len_a > len_b:
            return "a", f"Output A is longer ({len_a} vs {len_b} words)", 0.5
        else:
            return "b", f"Output B is longer ({len_b} vs {len_a} words)", 0.5