Source code for kerb.embedding.utils.metrics

"""Similarity and distance metrics for embeddings."""

import math
from typing import List, Tuple, Union


def cosine_similarity(vector1: List[float], vector2: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        vector1 (List[float]): First embedding vector
        vector2 (List[float]): Second embedding vector

    Returns:
        float: Cosine similarity score between -1 and 1 (1 = same direction).
            Returns 0.0 when the vectors are empty or when either vector
            has zero magnitude, since the angle is undefined in those cases.

    Raises:
        ValueError: If the two vectors do not have the same length.

    Examples:
        from kerb.embedding import embed

        sim = cosine_similarity(embed("hello"), embed("hi"))
    """
    if len(vector1) != len(vector2):
        raise ValueError(
            f"Vectors must have same dimensions: {len(vector1)} vs {len(vector2)}"
        )

    if not vector1 or not vector2:
        return 0.0

    dot_prod = sum(a * b for a, b in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(x * x for x in vector1))
    magnitude2 = math.sqrt(sum(x * x for x in vector2))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    similarity = dot_prod / (magnitude1 * magnitude2)

    # Rounding error can push the quotient marginally outside [-1, 1]
    # (e.g. 1.0000000000000002 for parallel vectors), which violates the
    # documented range and breaks callers that feed the result to acos().
    # Explicit comparisons are used instead of min()/max() so that a NaN
    # result (from NaN inputs) is passed through rather than masked.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
def euclidean_distance(vector1: List[float], vector2: List[float]) -> float:
    """Return the straight-line (L2) distance separating two vectors.

    Args:
        vector1 (List[float]): First embedding vector
        vector2 (List[float]): Second embedding vector

    Returns:
        float: Euclidean distance (0 = identical, higher = more different)

    Raises:
        ValueError: If the two vectors do not have the same length.
    """
    if len(vector1) == len(vector2):
        # Square each per-component gap, total them, then take the root.
        squared_gaps = [(p - q) ** 2 for p, q in zip(vector1, vector2)]
        return math.sqrt(sum(squared_gaps))

    raise ValueError(
        f"Vectors must have same dimensions: {len(vector1)} vs {len(vector2)}"
    )
def manhattan_distance(vector1: List[float], vector2: List[float]) -> float:
    """Return the city-block (L1) distance separating two vectors.

    Args:
        vector1 (List[float]): First embedding vector
        vector2 (List[float]): Second embedding vector

    Returns:
        float: Manhattan distance, i.e. the sum of the absolute
            per-component differences.

    Raises:
        ValueError: If the two vectors do not have the same length.
    """
    if len(vector1) == len(vector2):
        absolute_gaps = [abs(p - q) for p, q in zip(vector1, vector2)]
        return sum(absolute_gaps)

    raise ValueError(
        f"Vectors must have same dimensions: {len(vector1)} vs {len(vector2)}"
    )
def dot_product(vector1: List[float], vector2: List[float]) -> float:
    """Return the inner (dot) product of two vectors.

    Args:
        vector1 (List[float]): First embedding vector
        vector2 (List[float]): Second embedding vector

    Returns:
        float: Dot product score, i.e. the sum of the per-component products.

    Raises:
        ValueError: If the two vectors do not have the same length.
    """
    if len(vector1) == len(vector2):
        component_products = [p * q for p, q in zip(vector1, vector2)]
        return sum(component_products)

    raise ValueError(
        f"Vectors must have same dimensions: {len(vector1)} vs {len(vector2)}"
    )
def batch_similarity(
    query_vector: List[float], vectors: List[List[float]], metric: str = "cosine"
) -> List[float]:
    """Score one query vector against every vector in a collection.

    Args:
        query_vector (List[float]): Query embedding vector
        vectors (List[List[float]]): List of embedding vectors to compare
        metric (str): Distance metric ("cosine", "euclidean", "manhattan", "dot")

    Returns:
        List[float]: Similarity/distance scores, one per entry of ``vectors``
            and in the same order.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.

    Examples:
        from kerb.embedding import embed, embed_batch

        query = embed("search query")
        docs = embed_batch(["doc1", "doc2", "doc3"])
        scores = batch_similarity(query, docs, metric="cosine")
    """
    # Dispatch table mapping each metric name to its scoring function.
    scorers = {
        "cosine": cosine_similarity,
        "euclidean": euclidean_distance,
        "manhattan": manhattan_distance,
        "dot": dot_product,
    }

    scorer = scorers.get(metric)
    if scorer is None:
        raise ValueError(
            f"Unknown metric: {metric}. Choose from {list(scorers.keys())}"
        )

    results = []
    for candidate in vectors:
        results.append(scorer(query_vector, candidate))
    return results
def top_k_similar(
    query_vector: List[float],
    vectors: List[List[float]],
    k: int = 5,
    metric: str = "cosine",
    return_scores: bool = False,
) -> Union[List[int], List[Tuple[int, float]]]:
    """Find top-k most similar vectors to a query vector.

    Args:
        query_vector (List[float]): Query embedding vector
        vectors (List[List[float]]): List of embedding vectors to search
        k (int): Number of top results to return. Must be non-negative; if it
            exceeds ``len(vectors)``, all vectors are returned.
        metric (str): Distance metric ("cosine", "euclidean", "manhattan", "dot")
        return_scores (bool): If True, return (index, score) tuples

    Returns:
        List[int] or List[Tuple[int, float]]: Top-k indices (or index-score
            pairs), ordered from most to least similar.

    Raises:
        ValueError: If ``k`` is negative, or if ``metric`` is not supported.

    Examples:
        from kerb.embedding import embed, embed_batch

        query = embed("search query")
        docs = embed_batch(["doc1", "doc2", "doc3"])
        indices = top_k_similar(query, docs, k=2)

        # Or with scores
        results = top_k_similar(query, docs, k=2, return_scores=True)
    """
    # A negative k would otherwise reach the slice below and silently return
    # "everything except the last |k| results", which is never what a
    # top-k caller wants.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    scores = batch_similarity(query_vector, vectors, metric=metric)

    # Cosine and dot product are similarities (higher is better), so they are
    # sorted descending; the remaining metrics are distances (lower is
    # better) and are sorted ascending.
    descending = metric in ("cosine", "dot")

    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=descending)
    top_k_results = ranked[:k]

    if return_scores:
        return top_k_results
    return [idx for idx, _ in top_k_results]