# Source code for kerb.embedding.providers.local

"""Local embedding providers (run on your machine, no API calls).

This module includes:
1. Hash-based embeddings (no dependencies, for testing/prototyping)
2. Sentence Transformers (local ML models, high quality)
"""

import hashlib
from typing import Any, Dict, List

# Process-wide cache of loaded Sentence Transformers models, keyed by model
# name. The sentence_transformer_embed* functions below populate it on first
# use of a model name and reuse the stored instance on later calls, so each
# model is constructed at most once. No eviction is visible in this module.
_model_cache: Dict[str, Any] = {}


def normalize_vector(vector: List[float]) -> List[float]:
    """Scale a vector so that its Euclidean (L2) length becomes 1.

    Args:
        vector (List[float]): Components of the vector to rescale.

    Returns:
        List[float]: A new list holding the unit-length vector. When the
        input has zero length (all zeros, or empty) the input list itself
        is returned unchanged, since it has no direction to preserve.
    """
    import math

    norm = math.sqrt(sum(component * component for component in vector))
    if norm != 0:
        return [component / norm for component in vector]

    # Zero magnitude: dividing is impossible, so hand back the caller's list.
    return vector


# ============================================================================
# Hash-based Local Embeddings
# ============================================================================


def local_embed(text: str, dimensions: int = 384) -> List[float]:
    """Build a deterministic embedding from the MD5 digest of ``text``.

    No external model is involved, so the result carries no semantic
    meaning. It is intended for testing, prototyping, or situations where
    semantic quality does not matter.

    Args:
        text (str): Text to embed
        dimensions (int): Embedding dimension

    Returns:
        List[float]: Normalized embedding vector. For empty ``text`` an
        all-zero vector of length ``dimensions`` is returned instead.
    """
    # Nothing to hash: short-circuit with zeros (no normalization applied).
    if not text:
        return [0.0] * dimensions

    # Hash-based embedding
    digest = hashlib.md5(text.encode()).hexdigest()
    digest_length = len(digest)

    # Walk the hex digest cyclically, mapping each character's code point
    # through (code - 127.5) / 127.5 to get one component per dimension.
    raw_vector = [
        (ord(digest[position % digest_length]) - 127.5) / 127.5
        for position in range(dimensions)
    ]
    return normalize_vector(raw_vector)
class LocalEmbedder:
    """Object-style wrapper around the hash-based ``local_embed`` function.

    The embeddings are simple and deterministic and need no external
    models. Suitable for testing, prototyping, or when you don't need
    semantic quality.

    Args:
        dimensions (int): Embedding dimension (default: 384)

    Examples:
        embedder = LocalEmbedder(dimensions=512)
        vec = embedder.embed("Hello world")
        vecs = embedder.embed_batch(["Hello", "World"])
    """

    def __init__(self, dimensions: int = 384):
        """Remember the vector length to use for every embedding.

        Args:
            dimensions (int): Embedding dimension
        """
        self.dimensions = dimensions

    def embed(self, text: str) -> List[float]:
        """Embed one piece of text.

        Args:
            text (str): Text to embed

        Returns:
            List[float]: Embedding vector
        """
        size = self.dimensions
        return local_embed(text, dimensions=size)

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed several texts, one ``embed`` call per entry, in order.

        Args:
            texts (List[str]): Texts to embed

        Returns:
            List[List[float]]: List of embedding vectors
        """
        return list(map(self.embed, texts))
# ============================================================================
# Sentence Transformers (Local ML Models)
# ============================================================================
def sentence_transformer_embed(
    text: str, model_name: str = "all-MiniLM-L6-v2", **kwargs
) -> List[float]:
    """Generate embedding using Sentence Transformers (local ML model).

    Requires: pip install sentence-transformers

    The model for ``model_name`` is constructed on first use and then kept
    in the module-level ``_model_cache`` so later calls reuse it.

    Args:
        text (str): Text to embed
        model_name (str): Model name (default: "all-MiniLM-L6-v2")
        **kwargs: Additional parameters forwarded to ``model.encode``

    Returns:
        List[float]: Embedding vector (the encoder output converted with
        ``.tolist()``)

    Raises:
        ImportError: If ``sentence_transformers`` cannot be imported. The
            underlying import failure is attached as ``__cause__``.

    Popular models:
        - "all-MiniLM-L6-v2" (384 dim, fast)
        - "all-mpnet-base-v2" (768 dim, quality)
        - "all-MiniLM-L12-v2" (384 dim, balanced)
    """
    # Imported lazily so the rest of the module works without the package.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        # Chain explicitly: the original error may point at a broken
        # transitive dependency rather than a missing package, and that
        # detail should stay visible in the traceback.
        raise ImportError(
            "sentence-transformers not installed. "
            "Install with: pip install sentence-transformers"
        ) from exc

    # Get or cache model
    if model_name not in _model_cache:
        _model_cache[model_name] = SentenceTransformer(model_name)
    model = _model_cache[model_name]

    embedding = model.encode(text, **kwargs)
    return embedding.tolist()
def sentence_transformer_embed_batch(
    texts: List[str],
    model_name: str = "all-MiniLM-L6-v2",
    batch_size: int = 32,
    **kwargs,
) -> List[List[float]]:
    """Generate embeddings for multiple texts using Sentence Transformers.

    More efficient than calling sentence_transformer_embed repeatedly,
    because all texts go through a single ``model.encode`` call.

    The model for ``model_name`` is constructed on first use and then kept
    in the module-level ``_model_cache`` so later calls reuse it.

    Args:
        texts (List[str]): Texts to embed
        model_name (str): Model name (default: "all-MiniLM-L6-v2")
        batch_size (int): Batch size, forwarded to ``model.encode``
        **kwargs: Additional parameters forwarded to ``model.encode``

    Returns:
        List[List[float]]: List of embedding vectors, one per element of
        the encoder output (each converted with ``.tolist()``)

    Raises:
        ImportError: If ``sentence_transformers`` cannot be imported. The
            underlying import failure is attached as ``__cause__``.
    """
    # Imported lazily so the rest of the module works without the package.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        # Chain explicitly: the original error may point at a broken
        # transitive dependency rather than a missing package, and that
        # detail should stay visible in the traceback.
        raise ImportError(
            "sentence-transformers not installed. "
            "Install with: pip install sentence-transformers"
        ) from exc

    # Get or cache model
    if model_name not in _model_cache:
        _model_cache[model_name] = SentenceTransformer(model_name)
    model = _model_cache[model_name]

    embeddings = model.encode(texts, batch_size=batch_size, **kwargs)
    return [emb.tolist() for emb in embeddings]
class SentenceTransformerEmbedder:
    """Embedding provider backed by a local Sentence Transformers model.

    Requires: pip install sentence-transformers

    Args:
        model_name (str): Model name (default: "all-MiniLM-L6-v2")

    Examples:
        embedder = SentenceTransformerEmbedder(model_name="all-mpnet-base-v2")
        vec = embedder.embed("Hello world")
        vecs = embedder.embed_batch(["Hello", "World"])
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Record which model later ``embed`` / ``embed_batch`` calls use.

        Args:
            model_name (str): Model name
        """
        self.model_name = model_name

    def embed(self, text: str, **kwargs) -> List[float]:
        """Embed one piece of text with this embedder's model.

        Args:
            text (str): Text to embed
            **kwargs: Additional model parameters, passed through unchanged

        Returns:
            List[float]: Embedding vector
        """
        chosen_model = self.model_name
        return sentence_transformer_embed(text, chosen_model, **kwargs)

    def embed_batch(
        self, texts: List[str], batch_size: int = 32, **kwargs
    ) -> List[List[float]]:
        """Embed several texts with this embedder's model.

        Args:
            texts (List[str]): Texts to embed
            batch_size (int): Batch size for processing
            **kwargs: Additional model parameters, passed through unchanged

        Returns:
            List[List[float]]: List of embedding vectors
        """
        chosen_model = self.model_name
        vectors = sentence_transformer_embed_batch(
            texts, chosen_model, batch_size, **kwargs
        )
        return vectors