""" Content Analysis for Dataset Quality Assessment =============================================== =============================================== This example demonstrates content analysis techniques to assess and improve training data quality through various metrics and classifications. Main concepts: - Content type classification - Quality metrics calculation - Readability assessment - Sentiment analysis - Code detection - Statistical analysis (word/sentence counts) Use case: Analyzing and validating dataset quality before LLM training """ from kerb.preprocessing import ( classify_content_type, detect_code, detect_sentiment, measure_readability, count_words, count_sentences, count_paragraphs, ContentType ) def main(): """Run content analysis examples.""" print("="*80) print("CONTENT ANALYSIS FOR DATASET QUALITY ASSESSMENT") print("="*80) # Example 1: Content type classification print("\n" + "-"*80) print("Example 1: Content Type Classification") print("-"*80) diverse_samples = [ "This is regular plain text content for training.", "def hello_world():\n print('Hello, World!')\n return True", '{"name": "John", "age": 30, "city": "New York"}', "

Title

Content

", "# Markdown Header\n\nThis is **bold** text with a [link](url).", ] print("\nClassifying content types:") for i, sample in enumerate(diverse_samples, 1): content_type = classify_content_type(sample) preview = sample.replace('\n', ' ')[:50] print(f"\n{i}. Type: {content_type.value}") print(f" Content: {preview}...") # Example 2: Code detection print("\n" + "-"*80) print("Example 2: Code Detection") print("-"*80) mixed_content = [ "Machine learning is a subset of artificial intelligence.", "def train_model(data):\n return model.fit(data)", "The function process() takes two arguments.", "import numpy as np\nimport pandas as pd", "We use Python for data science projects.", "class NeuralNetwork:\n def __init__(self):\n pass", ] print("\nDetecting code in mixed content:") for text in mixed_content: is_code = detect_code(text) status = "CODE" if is_code else "TEXT" preview = text.replace('\n', ' ')[:50] print(f"\n[{status:4s}] {preview}...") # Separate code from text code_samples = [text for text in mixed_content if detect_code(text)] text_samples = [text for text in mixed_content if not detect_code(text)] print(f"\nSeparation results:") print(f" Code samples: {len(code_samples)}") print(f" Text samples: {len(text_samples)}") # Example 3: Sentiment analysis print("\n" + "-"*80) print("Example 3: Sentiment Analysis") print("-"*80) review_dataset = [ "I absolutely love this product! It's amazing and wonderful!", "This is the worst experience I've ever had. Terrible service.", "The item is okay, nothing special.", "Great quality and excellent customer support. Very happy!", "Disappointed with the poor quality and bad performance.", "It works as expected. Average product.", ] print("\nAnalyzing sentiment:") sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0} for text in review_dataset: sentiment = detect_sentiment(text) sentiment_counts[sentiment] += 1 print(f"\n[{sentiment.upper():8s}] {text}") print(f"\nSentiment distribution:") for sentiment, count in sentiment_counts.items(): percentage = (count / len(review_dataset)) * 100 print(f" {sentiment.capitalize():8s}: {count} ({percentage:.1f}%)") # Example 4: Readability assessment print("\n" + "-"*80) print("Example 4: Readability Assessment") print("-"*80) text_samples_readability = [ "AI is great.", # Very simple "Machine learning algorithms analyze data patterns.", # Medium "The implementation of sophisticated neural network architectures necessitates comprehensive understanding of backpropagation mechanisms.", # Complex "Deep learning is cool.", # Simple "Natural language processing encompasses various computational techniques for analyzing textual information.", # Medium-complex ] print("\nAssessing readability:") for text in text_samples_readability: score = measure_readability(text) if score > 0.7: level = "Easy" elif score > 0.4: level = "Medium" else: level = "Complex" print(f"\n[{level:7s}] Score: {score:.2f}") print(f" {text}") # Example 5: Statistical analysis print("\n" + "-"*80) print("Example 5: Statistical Text Analysis") print("-"*80) analysis_samples = [ "Short text.", "This is a medium-length sentence with several words.", "First sentence here. Second sentence follows. Third sentence concludes.", "Paragraph one has content.\n\nParagraph two has more content.\n\nParagraph three wraps up.", ] print("\nText statistics:") for i, text in enumerate(analysis_samples, 1): words = count_words(text) sentences = count_sentences(text) paragraphs = count_paragraphs(text) preview = text.replace('\n', ' ')[:40] print(f"\n{i}. {preview}...") print(f" Words: {words}, Sentences: {sentences}, Paragraphs: {paragraphs}") # Example 6: Dataset quality metrics print("\n" + "-"*80) print("Example 6: Dataset Quality Metrics") print("-"*80) training_dataset = [ "Machine learning enables computers to learn from data.", "def func(): pass", # Code "x", # Too short "Deep learning models require large amounts of training data.", "asdfjkl qwerty", # Low quality "Natural language processing helps computers understand text.", "", # Empty "AI is transforming industries worldwide.", "The quick brown fox jumps over the lazy dog.", ] print(f"\nAnalyzing dataset quality ({len(training_dataset)} samples):") # Compute metrics metrics = { "total": len(training_dataset), "empty": 0, "too_short": 0, "code": 0, "low_readability": 0, "good_quality": 0, } good_samples = [] for text in training_dataset: # Empty check if not text.strip(): metrics["empty"] += 1 continue # Length check if len(text) < 10: metrics["too_short"] += 1 continue # Code check if detect_code(text): metrics["code"] += 1 continue # Readability check readability = measure_readability(text) if readability < 0.3: metrics["low_readability"] += 1 continue # Good quality metrics["good_quality"] += 1 good_samples.append(text) print("\nQuality metrics:") print(f" Total samples: {metrics['total']}") print(f" Empty: {metrics['empty']}") print(f" Too short: {metrics['too_short']}") print(f" Code: {metrics['code']}") print(f" Low readability: {metrics['low_readability']}") print(f" Good quality: {metrics['good_quality']}") print(f"\nQuality rate: {metrics['good_quality'] / metrics['total'] * 100:.1f}%") print("\nGood quality samples:") for i, text in enumerate(good_samples, 1): print(f"{i}. {text}") # Example 7: Content distribution analysis print("\n" + "-"*80) print("Example 7: Content Distribution Analysis") print("-"*80) large_dataset = [] # Generate diverse content text_templates = [ "Machine learning is used in {}.", "Deep learning models can {}.", "Natural language processing helps {}.", ] code_templates = [ "def {}():\n pass", "class {}:\n def __init__(self):\n pass", ] applications = ["healthcare", "finance", "robotics", "education"] actions = ["classify images", "generate text", "translate languages"] helps = ["analyze sentiment", "extract entities", "summarize documents"] names = ["process", "analyze", "transform"] classes = ["Model", "Processor", "Analyzer"] for template in text_templates: if "{}" in template: items = applications if "used in" in template else actions if "can" in template else helps for item in items: large_dataset.append(template.format(item)) for template in code_templates: items = names if "def" in template else classes for item in items: large_dataset.append(template.format(item)) print(f"\nAnalyzing dataset ({len(large_dataset)} samples):") # Classify all content content_types = {} for text in large_dataset: ctype = classify_content_type(text) content_types[ctype] = content_types.get(ctype, 0) + 1 print("\nContent type distribution:") for ctype, count in sorted(content_types.items(), key=lambda x: -x[1]): percentage = (count / len(large_dataset)) * 100 print(f" {ctype.value:12s}: {count:2d} ({percentage:5.1f}%)") # Word count statistics word_counts = [count_words(text) for text in large_dataset] avg_words = sum(word_counts) / len(word_counts) min_words = min(word_counts) max_words = max(word_counts) print(f"\nWord count statistics:") print(f" Average: {avg_words:.1f} words") print(f" Min: {min_words} words") print(f" Max: {max_words} words") print("\n" + "="*80) print("CONTENT ANALYSIS COMPLETE") print("="*80) print("\nKey Takeaways:") print("1. Classify content types to ensure dataset composition") print("2. Detect and filter code from text datasets") print("3. Analyze sentiment for balanced training data") print("4. Assess readability for appropriate complexity") print("5. Count statistics for dataset characterization") print("6. Calculate quality metrics to guide filtering") print("7. Analyze distribution to ensure dataset diversity") if __name__ == "__main__": main()