Source code for kerb.preprocessing.analysis

"""Content analysis and classification."""

import re
from typing import List

from .enums import ContentType


def classify_content_type(text: str) -> ContentType:
    """Classify text content type.

    Checks run from most to least specific: JSON, code, HTML, Markdown,
    and finally plain text.

    Args:
        text: Input text

    Returns:
        ContentType enum value (``ContentType.UNKNOWN`` for empty input)

    Examples:
        >>> classify_content_type("def foo():\\n    pass")
        <ContentType.CODE: 'code'>
    """
    if not text:
        return ContentType.UNKNOWN

    # JSON is tested before code: a pretty-printed JSON document (an opening
    # brace followed by an indented line) also matches the generic
    # brace-block pattern used by detect_code and would be reported as CODE.
    if text.strip().startswith(("{", "[")):
        import json

        try:
            json.loads(text)
        except (ValueError, RecursionError):
            # Not parseable as JSON (json.JSONDecodeError is a ValueError) or
            # nested too deeply to parse; fall through to the other checks.
            pass
        else:
            return ContentType.JSON

    # Check for code
    if detect_code(text):
        return ContentType.CODE

    # Check for HTML: an opening angle bracket directly followed by a letter,
    # with a closing angle bracket somewhere later in the text
    if re.search(r"<[a-z][\s\S]*>", text, re.IGNORECASE):
        return ContentType.HTML

    # Check for Markdown: ATX heading, code fence, or inline link
    if re.search(r"^#{1,6}\s|```|\[.+\]\(.+\)", text, re.MULTILINE):
        return ContentType.MARKDOWN

    # Default to plain text
    return ContentType.PLAIN_TEXT
def detect_code(text: str) -> bool:
    """Report whether the text looks like source code.

    Args:
        text: Input text

    Returns:
        True when at least one known code pattern occurs in the text

    Examples:
        >>> detect_code("def foo(): return True")
        True
    """
    if not text:
        return False

    # Regex signatures of common programming constructs
    signatures = (
        r"\bdef\s+\w+\s*\(",  # Python function definition
        r"\bclass\s+\w+",  # Class definition
        r"\bimport\s+\w+",  # Import statement
        r"\bfunction\s+\w+\s*\(",  # JavaScript function definition
        r"=>",  # Arrow function
        r"{\s*\n\s+",  # Opening brace followed by an indented line
        r";\s*\n",  # Semicolon at end of line
    )
    return any(re.search(signature, text) is not None for signature in signatures)
def detect_sentiment(text: str) -> str:
    """Guess the sentiment of a text from a small keyword list.

    Each keyword counts at most once, and matching is a case-insensitive
    substring test against the whole text.

    Args:
        text: Input text

    Returns:
        Sentiment: "positive", "negative", or "neutral"

    Examples:
        >>> detect_sentiment("I love this!")
        'positive'
    """
    if not text:
        return "neutral"

    lowered = text.lower()

    upbeat = ("love", "great", "excellent", "awesome", "wonderful", "good", "happy")
    downbeat = ("hate", "bad", "terrible", "awful", "horrible", "poor", "sad")

    upbeat_hits = [keyword for keyword in upbeat if keyword in lowered]
    downbeat_hits = [keyword for keyword in downbeat if keyword in lowered]

    # Positive balance means more distinct upbeat keywords were found
    balance = len(upbeat_hits) - len(downbeat_hits)
    if balance > 0:
        return "positive"
    if balance < 0:
        return "negative"
    return "neutral"
def measure_readability(text: str) -> float:
    """Calculate readability score (0-1, higher is more readable).

    The score is the mean of two components, each clamped to [0, 1]: one
    penalizes an average word length above 5 characters, the other an
    average sentence length above 15 words.

    Args:
        text: Input text

    Returns:
        Readability score in the range 0.0-1.0; 0.0 for empty text, text
        shorter than 10 characters, or text with no words or sentences

    Examples:
        >>> score = measure_readability("This is simple text.")
        >>> score > 0.5
        True
    """
    if not text or len(text) < 10:
        return 0.0

    words = count_words(text)
    sentences = count_sentences(text)

    if sentences == 0 or words == 0:
        return 0.0

    # Average word length: strip every whitespace character (not only
    # spaces) so newlines and tabs are not counted as part of a word.
    avg_word_length = len(re.sub(r"\s+", "", text)) / words

    # Average sentence length
    avg_sentence_length = words / sentences

    # Penalize long words and long sentences. Each component is capped at 1
    # so short words/sentences cannot push the result above the documented
    # 0-1 range.
    word_score = min(1.0, max(0.0, 1 - (avg_word_length - 5) / 10))
    sentence_score = min(1.0, max(0.0, 1 - (avg_sentence_length - 15) / 20))

    return (word_score + sentence_score) / 2
def count_words(text: str) -> int:
    """Count whitespace-separated words.

    Args:
        text: Input text

    Returns:
        Word count

    Examples:
        >>> count_words("Hello world, this is a test")
        6
    """
    if not text:
        return 0

    # Trim first so leading/trailing whitespace yields no empty tokens,
    # then count the non-empty pieces between whitespace runs
    tokens = re.split(r"\s+", text.strip())
    return sum(1 for token in tokens if token)
def count_sentences(text: str) -> int:
    """Count sentences delimited by '.', '!' or '?'.

    Args:
        text: Input text

    Returns:
        Sentence count

    Examples:
        >>> count_sentences("Hello. World! How are you?")
        3
    """
    if not text:
        return 0

    # A run of terminators acts as one separator; fragments that are empty
    # or whitespace-only are not counted
    fragments = re.split(r"[.!?]+", text)
    return sum(1 for fragment in fragments if fragment.strip())
def count_paragraphs(text: str) -> int:
    """Count blank-line-separated paragraphs.

    Args:
        text: Input text

    Returns:
        Paragraph count

    Examples:
        >>> count_paragraphs("Para 1\\n\\nPara 2\\n\\nPara 3")
        3
    """
    if not text:
        return 0

    # Paragraph breaks are two newlines with optional whitespace between;
    # chunks that are empty or whitespace-only are not counted
    chunks = re.split(r"\n\s*\n", text.strip())
    return sum(1 for chunk in chunks if chunk.strip())