Source code for kerb.retrieval.search

"""Search methods for retrieval.

This module provides keyword, semantic, and hybrid search implementations.
"""

import math
from collections import Counter
from typing import TYPE_CHECKING, List, Optional, Union

from kerb.core.types import Document

from .structures import HybridSearchConfig, SearchResult

if TYPE_CHECKING:
    from kerb.core.enums import FusionMethod



[docs]
def keyword_search(
    query: str, documents: List[Document], top_k: int = 10, field: str = "content"
) -> List[SearchResult]:
    """Perform keyword-based search using BM25-like scoring.

    Args:
        query: Search query
        documents: List of documents to search
        top_k: Number of top results to return
        field: Document field to search ("content" or metadata key)

    Returns:
        List[SearchResult]: Ranked search results

    Example:
        >>> docs = [Document(id="1", content="Python is great"), ...]
        >>> results = keyword_search("python programming", docs)
    """
    if not documents:
        return []

    query_terms = query.lower().split()

    # Calculate document frequencies
    doc_freq = Counter()
    for doc in documents:
        text = doc.content if field == "content" else doc.metadata.get(field, "")
        terms = set(text.lower().split())
        for term in terms:
            if term in query_terms:
                doc_freq[term] += 1

    # Score documents
    scores = []
    num_docs = len(documents)

    for doc in documents:
        text = doc.content if field == "content" else doc.metadata.get(field, "")
        text_terms = text.lower().split()
        term_freq = Counter(text_terms)

        score = 0.0
        for term in query_terms:
            if term in term_freq:
                # BM25-like scoring
                tf = term_freq[term]
                df = doc_freq[term]
                idf = math.log((num_docs - df + 0.5) / (df + 0.5) + 1.0)

                # BM25 parameters
                k1 = 1.5
                b = 0.75
                avg_doc_len = sum(len(d.content.split()) for d in documents) / num_docs
                doc_len = len(text_terms)

                norm_tf = (tf * (k1 + 1)) / (
                    tf + k1 * (1 - b + b * (doc_len / avg_doc_len))
                )
                score += idf * norm_tf

        scores.append((doc, score))

    # Sort by score
    scores.sort(key=lambda x: x[1], reverse=True)

    # Create search results
    results = []
    for rank, (doc, score) in enumerate(scores[:top_k], 1):
        results.append(
            SearchResult(document=doc, score=score, rank=rank, method="keyword")
        )

    return results




[docs]
def semantic_search(
    query_embedding: List[float],
    documents: List[Document],
    document_embeddings: List[List[float]],
    top_k: int = 10,
    similarity_metric: str = "cosine",
) -> List[SearchResult]:
    """Perform semantic search using embeddings.

    Args:
        query_embedding: Embedding vector of the query
        documents: List of documents
        document_embeddings: Embedding vectors for documents (same order as documents)
        top_k: Number of top results to return
        similarity_metric: "cosine", "dot", or "euclidean"

    Returns:
        List[SearchResult]: Ranked search results

    Example:
        >>> from kerb.embedding import embed
        >>> query_emb = embed("python programming")
        >>> doc_embs = [embed(doc.content) for doc in docs]
        >>> results = semantic_search(query_emb, docs, doc_embs)
    """
    if not documents or len(documents) != len(document_embeddings):
        return []

    scores = []

    for doc, doc_emb in zip(documents, document_embeddings):
        if similarity_metric == "cosine":
            # Cosine similarity
            dot = sum(a * b for a, b in zip(query_embedding, doc_emb))
            norm_q = math.sqrt(sum(a * a for a in query_embedding))
            norm_d = math.sqrt(sum(b * b for b in doc_emb))
            score = dot / (norm_q * norm_d) if norm_q and norm_d else 0.0

        elif similarity_metric == "dot":
            # Dot product
            score = sum(a * b for a, b in zip(query_embedding, doc_emb))

        elif similarity_metric == "euclidean":
            # Euclidean distance (inverted for ranking)
            dist = math.sqrt(
                sum((a - b) ** 2 for a, b in zip(query_embedding, doc_emb))
            )
            score = 1.0 / (1.0 + dist)  # Convert to similarity
        else:
            score = 0.0

        scores.append((doc, score))

    # Sort by score
    scores.sort(key=lambda x: x[1], reverse=True)

    # Create search results
    results = []
    for rank, (doc, score) in enumerate(scores[:top_k], 1):
        results.append(
            SearchResult(document=doc, score=score, rank=rank, method="semantic")
        )

    return results




[docs]
def hybrid_search(
    query: str,
    query_embedding: List[float],
    documents: List[Document],
    document_embeddings: List[List[float]],
    top_k: int = 10,
    keyword_weight: float = 0.5,
    semantic_weight: float = 0.5,
    fusion_method: Union["FusionMethod", str] = "weighted",
    config: Optional[HybridSearchConfig] = None,
) -> List[SearchResult]:
    """Perform hybrid search combining keyword and semantic search.

    Args:
        query: Search query text
        query_embedding: Embedding vector of the query
        documents: List of documents
        document_embeddings: Embedding vectors for documents
        top_k: Number of top results to return (ignored if config is provided)
        keyword_weight: Weight for keyword scores (ignored if config is provided)
        semantic_weight: Weight for semantic scores (ignored if config is provided)
        fusion_method: Fusion method (ignored if config is provided)
        config: HybridSearchConfig object with all parameters (recommended)

    Returns:
        List[SearchResult]: Ranked search results

    Examples:
        >>> # Using config object (recommended)
        >>> from kerb.retrieval import HybridSearchConfig
        >>> from kerb.core.enums import FusionMethod
        >>> config = HybridSearchConfig(
        ...     top_k=10,
        ...     keyword_weight=0.4,
        ...     semantic_weight=0.6,
        ...     fusion_method=FusionMethod.RRF
        ... )
        >>> results = hybrid_search(
        ...     query="python async",
        ...     query_embedding=embed("python async"),
        ...     documents=docs,
        ...     document_embeddings=doc_embs,
        ...     config=config
        ... )

        >>> # Using individual parameters (backward compatible)
        >>> results = hybrid_search(
        ...     query="python async",
        ...     query_embedding=embed("python async"),
        ...     documents=docs,
        ...     document_embeddings=doc_embs,
        ...     keyword_weight=0.4,
        ...     semantic_weight=0.6
        ... )
    """
    from kerb.core.enums import FusionMethod, validate_enum_or_string

    # Use config if provided, otherwise use individual parameters
    if config is not None:
        top_k = config.top_k
        keyword_weight = config.keyword_weight
        semantic_weight = config.semantic_weight
        fusion_method = config.fusion_method

    # Validate and normalize fusion_method
    method_val = validate_enum_or_string(fusion_method, FusionMethod, "fusion_method")
    if isinstance(method_val, FusionMethod):
        method_str = method_val.value
    else:
        method_str = method_val

    # Get keyword search results
    keyword_results = keyword_search(query, documents, top_k=len(documents))

    # Get semantic search results
    semantic_results = semantic_search(
        query_embedding, documents, document_embeddings, top_k=len(documents)
    )

    # Create score maps
    keyword_scores = {r.document.id: r.score for r in keyword_results}
    semantic_scores = {r.document.id: r.score for r in semantic_results}
    keyword_ranks = {r.document.id: r.rank for r in keyword_results}
    semantic_ranks = {r.document.id: r.rank for r in semantic_results}

    # Normalize scores to 0-1 range
    if keyword_scores:
        max_kw = max(keyword_scores.values())
        if max_kw > 0:
            keyword_scores = {k: v / max_kw for k, v in keyword_scores.items()}

    if semantic_scores:
        max_sem = max(semantic_scores.values())
        if max_sem > 0:
            semantic_scores = {k: v / max_sem for k, v in semantic_scores.items()}

    # Combine scores
    combined_scores = []

    for doc in documents:
        doc_id = doc.id
        kw_score = keyword_scores.get(doc_id, 0.0)
        sem_score = semantic_scores.get(doc_id, 0.0)

        if method_str == "weighted":
            # Weighted combination
            score = keyword_weight * kw_score + semantic_weight * sem_score

        elif method_str == "rrf":
            # Reciprocal Rank Fusion
            k = 60  # RRF constant
            kw_rank = keyword_ranks.get(doc_id, len(documents) + 1)
            sem_rank = semantic_ranks.get(doc_id, len(documents) + 1)
            score = (1.0 / (k + kw_rank)) + (1.0 / (k + sem_rank))

        elif method_str in ("dbsf", "normalized"):
            # Distribution-Based Score Fusion or Normalized
            score = (kw_score + sem_score) / 2.0

        elif method_str == "max":
            # Take maximum score
            score = max(kw_score, sem_score)
        else:
            score = kw_score + sem_score

        combined_scores.append((doc, score))

    # Sort by combined score
    combined_scores.sort(key=lambda x: x[1], reverse=True)

    # Create search results
    results = []
    for rank, (doc, score) in enumerate(combined_scores[:top_k], 1):
        results.append(
            SearchResult(document=doc, score=score, rank=rank, method="hybrid")
        )

    return results