Source code for kerb.safety.classification

"""Content classification and pattern matching functions.

This module provides functions for pattern matching, content classification,
risk scoring, and entity extraction.
"""

import re
from typing import Dict, List, Optional, Tuple

from .enums import ContentCategory
from .moderation import (check_hate_speech, check_profanity,
                         check_sexual_content, check_toxicity, check_violence)
from .pii import (detect_credit_card, detect_email, detect_ip_address,
                  detect_phone, detect_ssn, detect_url)



[docs]
def match_patterns(
    text: str, patterns: List[str], case_sensitive: bool = False
) -> List[Tuple[str, List[str]]]:
    """Match text against safety patterns.

    Args:
        text: Text to match
        patterns: List of regex patterns
        case_sensitive: Whether matching is case sensitive

    Returns:
        List of tuples (pattern, list of matches)

    Examples:
        >>> patterns = [r'\b\d{3}-\d{2}-\d{4}\b', r'\b\w+@\w+\.\w+\b']
        >>> matches = match_patterns(text, patterns)
    """
    results = []
    flags = 0 if case_sensitive else re.IGNORECASE

    for pattern in patterns:
        matches = re.findall(pattern, text, flags)
        if matches:
            results.append((pattern, matches))

    return results




[docs]
def classify_content(
    text: str, categories: Optional[List[ContentCategory]] = None
) -> Dict[ContentCategory, float]:
    """Classify content into safety categories.

    Args:
        text: Text to classify
        categories: Specific categories to check (None = all)

    Returns:
        Dictionary mapping categories to confidence scores

    Examples:
        >>> scores = classify_content("I hate this stupid thing")
        >>> print(scores)
        {ContentCategory.TOXICITY: 0.7, ContentCategory.HATE_SPEECH: 0.6, ...}
    """
    if categories is None:
        categories = [
            ContentCategory.TOXICITY,
            ContentCategory.SEXUAL,
            ContentCategory.VIOLENCE,
            ContentCategory.HATE_SPEECH,
            ContentCategory.PROFANITY,
        ]

    scores = {}

    for category in categories:
        if category == ContentCategory.TOXICITY:
            result = check_toxicity(text)
            scores[category] = 1.0 - result.score  # Invert: high score = more toxic
        elif category == ContentCategory.SEXUAL:
            result = check_sexual_content(text)
            scores[category] = 1.0 - result.score
        elif category == ContentCategory.VIOLENCE:
            result = check_violence(text)
            scores[category] = 1.0 - result.score
        elif category == ContentCategory.HATE_SPEECH:
            result = check_hate_speech(text)
            scores[category] = 1.0 - result.score
        elif category == ContentCategory.PROFANITY:
            result = check_profanity(text)
            scores[category] = 1.0 - result.score
        else:
            scores[category] = 0.0

    return scores




[docs]
def score_content(
    text: str, weights: Optional[Dict[ContentCategory, float]] = None
) -> float:
    """Score content for safety risk.

    Args:
        text: Text to score
        weights: Category weights (defaults to equal weight)

    Returns:
        Overall safety risk score (0.0 = safe, 1.0 = very unsafe)

    Examples:
        >>> score = score_content("This is a normal message")
        >>> print(score)  # Close to 0.0 (safe)

        >>> score = score_content("I hate you stupid idiot")
        >>> print(score)  # Higher value (unsafe)
    """
    category_scores = classify_content(text)

    if weights is None:
        # Equal weights
        weights = {cat: 1.0 for cat in category_scores.keys()}

    # Weighted average
    total_weight = sum(weights.get(cat, 1.0) for cat in category_scores.keys())
    weighted_sum = sum(
        score * weights.get(cat, 1.0) for cat, score in category_scores.items()
    )

    return weighted_sum / total_weight if total_weight > 0 else 0.0




[docs]
def extract_entities(
    text: str, entity_types: Optional[List[str]] = None
) -> Dict[str, List[str]]:
    """Extract sensitive entities from text.

    Args:
        text: Text to extract from
        entity_types: Types of entities to extract (None = common types)

    Returns:
        Dictionary mapping entity types to lists of extracted entities

    Examples:
        >>> entities = extract_entities("Email john@example.com at 555-1234")
        >>> print(entities)
        {'email': ['john@example.com'], 'phone': ['555-1234']}
    """
    if entity_types is None:
        entity_types = ["email", "phone", "url", "ip_address"]

    entities = {}

    if "email" in entity_types:
        matches = [m.text for m in detect_email(text)]
        if matches:
            entities["email"] = matches

    if "phone" in entity_types:
        matches = [m.text for m in detect_phone(text)]
        if matches:
            entities["phone"] = matches

    if "url" in entity_types:
        matches = [m.text for m in detect_url(text)]
        if matches:
            entities["url"] = matches

    if "ip_address" in entity_types:
        matches = [m.text for m in detect_ip_address(text)]
        if matches:
            entities["ip_address"] = matches

    if "ssn" in entity_types:
        matches = [m.text for m in detect_ssn(text)]
        if matches:
            entities["ssn"] = matches

    if "credit_card" in entity_types:
        matches = [m.text for m in detect_credit_card(text)]
        if matches:
            entities["credit_card"] = matches

    return entities