Source code for kerb.evaluation.quality

"""Quality assessment functions for generated text.

This module provides functions to assess various quality aspects of generated text:
- Coherence: Logical flow and structure
- Fluency: Naturalness and readability
- Faithfulness: Alignment with source material
- Answer Relevance: Relevance to questions
- Hallucination Detection: Unfounded claims
"""

import re
import statistics
from collections import Counter
from typing import TYPE_CHECKING, List, Union

from .metrics import calculate_semantic_similarity
from .types import EvaluationResult

if TYPE_CHECKING:
    from kerb.core.enums import FaithfulnessMethod


# ============================================================================
# Quality Assessment Functions
# ============================================================================


def assess_coherence(text: str) -> EvaluationResult:
    """Assess the coherence and logical flow of text.

    The score starts at 1.0 and heuristic penalties are subtracted:

    - 0.15 when the text has more than three sentences and none of them
      contains a transition word ("however", "therefore", "first", ...).
    - 0.10 when the variance of the sentence lengths (in words) exceeds
      twice the mean sentence length.
    - 0.10 when the text has more than two sentences and no sentence after
      the first contains a pronoun ("it", "they", "this", ...).

    Transition words and pronouns are matched as whole words, so "then" is
    not detected inside "strengthen" and "it," still counts as "it".

    Args:
        text: Text to assess

    Returns:
        EvaluationResult: Coherence score and details. Empty text scores 0.0;
        text with at most one sentence scores 1.0. Otherwise ``details``
        holds the sentence count, the list of detected issues and whether a
        transition word was found.

    Example:
        >>> result = assess_coherence("First point. Second point follows. Conclusion makes sense.")
        >>> result.score > 0.7
        True
    """
    if not text:
        return EvaluationResult(metric="coherence", score=0.0)

    sentences = _split_sentences(text)

    if len(sentences) <= 1:
        # A single sentence is coherent by default. Report the real count:
        # punctuation-only text yields zero sentences, not one.
        return EvaluationResult(
            metric="coherence", score=1.0, details={"sentences": len(sentences)}
        )

    # Heuristics for coherence
    score = 1.0
    issues = []

    # Tokenise once so both word-level checks compare whole words rather
    # than substrings or punctuation-laden whitespace tokens.
    tokenised = [re.findall(r"\w+", sent.lower()) for sent in sentences]

    # Check for transition words
    transition_words = {
        "however",
        "therefore",
        "furthermore",
        "moreover",
        "additionally",
        "consequently",
        "thus",
        "hence",
        "meanwhile",
        "similarly",
        "first",
        "second",
        "third",
        "finally",
        "next",
        "then",
    }
    has_transitions = any(
        not transition_words.isdisjoint(tokens) for tokens in tokenised
    )
    if not has_transitions and len(sentences) > 3:
        score -= 0.15
        issues.append("Few transition words")

    # Penalise wildly uneven sentence lengths. There are always at least
    # two sentences here, so the sample variance is well defined.
    sentence_lengths = [len(sent.split()) for sent in sentences]
    avg_sent_length = statistics.mean(sentence_lengths)
    sent_length_variance = statistics.variance(sentence_lengths)
    if sent_length_variance > avg_sent_length * 2:
        score -= 0.1
        issues.append("High sentence length variance")

    # Check for pronoun usage (indicates reference to previous content).
    # The first sentence is skipped because it has nothing to refer back to.
    pronouns = {"it", "they", "this", "that", "these", "those", "he", "she"}
    pronoun_usage = sum(
        1 for tokens in tokenised[1:] for token in tokens if token in pronouns
    )
    if len(sentences) > 2 and pronoun_usage == 0:
        score -= 0.1
        issues.append("No pronouns referencing previous content")

    return EvaluationResult(
        metric="coherence",
        score=max(0.0, score),
        details={
            "sentences": len(sentences),
            "issues": issues,
            "has_transitions": has_transitions,
        },
    )
def assess_fluency(text: str) -> EvaluationResult:
    """Assess the fluency and naturalness of text.

    The score starts at 1.0 and heuristic penalties are subtracted:

    - 0.20 when a significant word (longer than three characters and not a
      common function word) occurs more than once and accounts for more
      than 10% of all words.
    - 0.15 when more than a third of the sentences have fewer than three
      words.
    - 0.10 when the article "a" is directly followed by a word starting
      with a vowel letter.
    - 0.10 when there are more than 0.3 punctuation marks per word.

    Args:
        text: Text to assess

    Returns:
        EvaluationResult: Fluency score and details (detected issues and
        sentence count). Empty text scores 0.0.

    Example:
        >>> result = assess_fluency("This is a well-written sentence.")
        >>> result.score > 0.8
        True
    """
    if not text:
        return EvaluationResult(metric="fluency", score=0.0)

    score = 1.0
    issues = []

    # Check for repetitive words
    words = text.lower().split()
    if words:
        # Strip surrounding punctuation so "cat" and "cat." are counted as
        # the same word; tokens that were pure punctuation are dropped.
        stripped = (word.strip(".,!?;:\"'()[]") for word in words)
        word_freq = Counter(word for word in stripped if word)
        # Exclude common words
        common_words = {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
        }
        significant_counts = [
            count
            for word, count in word_freq.items()
            if word not in common_words and len(word) > 3
        ]
        max_repetition = max(significant_counts, default=0)
        # A word must actually repeat before it can be "excessive"; without
        # this guard every text shorter than ten words was penalised because
        # a single occurrence already exceeds 10% of the word count.
        if max_repetition > 1 and max_repetition > len(words) / 10:
            score -= 0.2
            issues.append("Excessive word repetition")

    # Check for incomplete sentences
    sentences = _split_sentences(text)
    incomplete = sum(1 for sent in sentences if len(sent.split()) < 3)
    if incomplete > len(sentences) / 3:
        score -= 0.15
        issues.append("Many incomplete sentences")

    # Check for grammar patterns (very basic): "a apple" instead of "an apple".
    # NOTE: this is spelling-based, so it also fires on "a user" or
    # "a university", where "a" is correct.
    if re.search(r"\b(a)\s+([aeiou])", text.lower()):
        score -= 0.1
        issues.append("Article agreement errors")

    # Check for excessive punctuation
    punct_count = sum(1 for char in text if char in "!?.,:;")
    word_count = len(words)
    if word_count > 0 and punct_count / word_count > 0.3:
        score -= 0.1
        issues.append("Excessive punctuation")

    return EvaluationResult(
        metric="fluency",
        score=max(0.0, score),
        details={"issues": issues, "sentences": len(sentences)},
    )
def detect_hallucination(
    output: str, context: str, threshold: float = 0.3
) -> EvaluationResult:
    """Detect potential hallucinations (unfounded claims not supported by context).

    Each output sentence is reduced to its significant words (common function
    words removed). A sentence is unsupported when the fraction of those
    words that also occur in the context, compared as whole words and
    case-insensitively, is below ``threshold``. The score is the share of
    unsupported sentences.

    This is a lexical heuristic: a sentence that reuses most of the context's
    vocabulary is treated as supported even if it states something false.

    Args:
        output: Generated text to check
        context: Source context that should support the output
        threshold: Minimum fraction of a sentence's significant words that
            must appear in the context for the sentence to count as
            supported (higher = stricter)

    Returns:
        EvaluationResult: Hallucination score (0 = no hallucination,
        1 = likely hallucination). ``details`` holds the unsupported and
        total sentence counts and up to three example messages. ``passed``
        is True when the score is below a fixed 0.3, independent of
        ``threshold``. Returns a score of 0.0 when either input is empty.

    Example:
        >>> result = detect_hallucination(
        ...     "The moon is made of cheese",
        ...     "Paris is the capital of France"
        ... )
        >>> result.score > 0.5
        True
    """
    if not output or not context:
        return EvaluationResult(metric="hallucination", score=0.0)

    # Function words that carry no factual content; built once, not per
    # sentence.
    common_words = {
        "the",
        "a",
        "an",
        "and",
        "or",
        "but",
        "in",
        "on",
        "at",
        "to",
        "for",
        "of",
        "with",
        "by",
        "is",
        "are",
        "was",
        "were",
    }

    output_sentences = _split_sentences(output)
    # Compare against context *words*, not the raw string: substring matching
    # let "he" match inside "the" and made "france," fail to match "france".
    context_words = set(re.findall(r"\w+", context.lower()))

    unsupported_sentences = 0
    total_sentences = len(output_sentences)
    details = []

    for sent in output_sentences:
        significant_words = set(re.findall(r"\w+", sent.lower())) - common_words

        if not significant_words:
            # Nothing checkable in this sentence; it still counts toward the
            # total, so it dilutes the score rather than raising it.
            continue

        # Check how many significant words appear in context
        words_in_context = len(significant_words & context_words)
        support_ratio = words_in_context / len(significant_words)

        if support_ratio < threshold:
            unsupported_sentences += 1
            details.append(
                f"Unsupported: '{sent[:50]}...' (support: {support_ratio:.2f})"
            )

    hallucination_score = (
        unsupported_sentences / total_sentences if total_sentences > 0 else 0.0
    )

    return EvaluationResult(
        metric="hallucination",
        score=hallucination_score,
        details={
            "unsupported_sentences": unsupported_sentences,
            "total_sentences": total_sentences,
            "examples": details[:3],  # Limit to 3 examples
        },
        passed=hallucination_score < 0.3,
    )
def assess_faithfulness(
    output: str, source: str, method: Union["FaithfulnessMethod", str] = "entailment"
) -> EvaluationResult:
    """Assess whether output is faithful to the source material.

    Supported methods, as implemented here:

    - ``"overlap"``: fraction of the output's distinct whitespace-separated
      tokens that also occur in the source.
    - ``"semantic"``: TF-IDF similarity via ``calculate_semantic_similarity``.
    - ``"entailment"`` / ``"nli"``: heuristic check; a sentence is supported
      when more than half of its key terms (common function words removed)
      occur in the source as whole words. The score is the share of
      supported sentences. Sentences without any key terms count as
      unsupported.

    Args:
        output: Generated text
        source: Source material
        method: Assessment method (FaithfulnessMethod enum or string). The
            value is first checked by ``validate_enum_or_string``.

    Returns:
        EvaluationResult: Faithfulness score (1 = fully faithful, 0 = not
        faithful). Returns a score of 0.0 when either input is empty.

    Raises:
        ValueError: If the validated method is not one of the methods listed
            above. NOTE(review): ``validate_enum_or_string`` may itself
            reject unknown values before this point - confirm against
            kerb.core.enums.

    Examples:
        >>> result = assess_faithfulness(
        ...     "Paris is the capital of France",
        ...     "Paris is the capital of France and its largest city",
        ...     method="entailment",
        ... )
        >>> result.score > 0.7
        True
    """
    # Imported here rather than at module level; the module only imports
    # FaithfulnessMethod under TYPE_CHECKING.
    from kerb.core.enums import FaithfulnessMethod, validate_enum_or_string

    if not output or not source:
        return EvaluationResult(metric="faithfulness", score=0.0)

    # Validate and normalize method
    method_val = validate_enum_or_string(method, FaithfulnessMethod, "method")
    if isinstance(method_val, FaithfulnessMethod):
        method_str = method_val.value
    else:
        method_str = method_val

    if method_str == "overlap":
        # Token overlap method
        output_words = set(output.lower().split())
        source_words = set(source.lower().split())

        if not output_words:
            return EvaluationResult(metric="faithfulness", score=0.0)

        overlap = output_words & source_words
        faithfulness_score = len(overlap) / len(output_words)

        return EvaluationResult(
            metric="faithfulness",
            score=faithfulness_score,
            details={"method": "overlap", "overlap_tokens": len(overlap)},
        )

    elif method_str == "semantic":
        # Use semantic similarity
        similarity = calculate_semantic_similarity(output, source, method="tfidf")
        return EvaluationResult(
            metric="faithfulness", score=similarity, details={"method": "semantic"}
        )

    elif method_str in ("entailment", "nli"):
        # Simple entailment check (heuristic-based): are the key claims of
        # each output sentence lexically supported by the source?
        common_words = {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "is",
            "are",
        }
        output_sents = _split_sentences(output)
        # Match whole words; substring tests against the raw source let
        # short terms match inside longer words and broke on punctuation.
        source_words = set(re.findall(r"\w+", source.lower()))

        supported = 0
        for sent in output_sents:
            key_terms = set(re.findall(r"\w+", sent.lower())) - common_words
            if not key_terms:
                continue

            # Supported when most key terms are in the source
            terms_in_source = len(key_terms & source_words)
            if terms_in_source / len(key_terms) > 0.5:
                supported += 1

        faithfulness_score = supported / len(output_sents) if output_sents else 0.0

        return EvaluationResult(
            metric="faithfulness",
            score=faithfulness_score,
            details={
                "method": "entailment",
                "supported_sentences": supported,
                "total_sentences": len(output_sents),
            },
        )

    else:
        raise ValueError(f"Unknown faithfulness method: {method}")
def assess_answer_relevance(
    answer: str, question: str, threshold: float = 0.3
) -> EvaluationResult:
    """Assess whether an answer is relevant to the question.

    The question is reduced to its key terms (interrogatives and a few
    function words removed). Relevance is the fraction of those terms that
    occur in the answer as whole words, case-insensitively. If the question
    has no key terms, Jaccard similarity between answer and question is used
    instead. Answers longer than ten words get a 10% boost, capped at 1.0.

    Args:
        answer: The answer text
        question: The question text
        threshold: Minimum relevance for ``passed`` to be True

    Returns:
        EvaluationResult: Relevance score. ``details`` holds the sorted key
        terms and how many of them were found in the answer. Returns a score
        of 0.0 when either input is empty.

    Example:
        >>> result = assess_answer_relevance(
        ...     "Python is a programming language",
        ...     "What is Python?"
        ... )
        >>> result.score > 0.5
        True
    """
    if not answer or not question:
        return EvaluationResult(metric="answer_relevance", score=0.0)

    # Extract key terms from question. Tokenising on word characters drops
    # the trailing "?" that previously turned "Python?" into a term no
    # answer could ever contain.
    question_words = set(re.findall(r"\w+", question.lower()))

    # Remove question words
    question_stopwords = {
        "what",
        "when",
        "where",
        "why",
        "how",
        "who",
        "which",
        "is",
        "are",
        "the",
        "a",
        "an",
    }
    key_terms = question_words - question_stopwords

    # Check presence in answer (whole-word match, not substring)
    answer_words = set(re.findall(r"\w+", answer.lower()))
    terms_in_answer = len(key_terms & answer_words)

    if not key_terms:
        # If no key terms, use semantic similarity
        relevance = calculate_semantic_similarity(answer, question, method="jaccard")
    else:
        relevance = terms_in_answer / len(key_terms)

    # Boost score if answer is substantive
    answer_length = len(answer.split())
    if answer_length > 10:
        relevance = min(1.0, relevance * 1.1)

    return EvaluationResult(
        metric="answer_relevance",
        score=relevance,
        # Sorted so the reported terms are in a deterministic order.
        details={"key_terms": sorted(key_terms), "terms_found": terms_in_answer},
        passed=relevance >= threshold,
    )
# ============================================================================ # Helper Functions # ============================================================================ def _split_sentences(text: str) -> List[str]: """Split text into sentences.""" # Simple sentence splitter sentences = re.split(r"[.!?]+", text) return [s.strip() for s in sentences if s.strip()]