Source code for kerb.document.metadata

"""Metadata extraction utilities.

This module provides functions for extracting metadata and structured information:
- File metadata extraction
- Document statistics
- URL extraction
- Email extraction
- Date extraction
- Phone number extraction
"""

import re
from pathlib import Path
from typing import Any, Dict, List


def extract_metadata(file_path: str) -> Dict[str, Any]:
    """Extract basic filesystem metadata from a file.

    Args:
        file_path (str): Path to file

    Returns:
        Dict[str, Any]: Extracted metadata with the keys:
            - ``filename``: final path component, including the suffix
            - ``extension``: final suffix without the leading dot
              (``"gz"`` for ``archive.tar.gz``, ``""`` when there is none)
            - ``size``: file size in bytes
            - ``created``: ``st_ctime`` as a POSIX timestamp
            - ``modified``: ``st_mtime`` as a POSIX timestamp

    Raises:
        OSError: If the file cannot be stat'ed (for example
            ``FileNotFoundError`` when it does not exist).

    Examples:
        >>> metadata = extract_metadata("document.pdf")
        >>> print(metadata['size'], metadata['created'])
    """
    path = Path(file_path)

    # Stat once: a single syscall, and size/timestamps are guaranteed to come
    # from the same snapshot even if the file is modified concurrently.
    file_stat = path.stat()

    return {
        "filename": path.name,
        "extension": path.suffix.lstrip("."),
        "size": file_stat.st_size,
        # NOTE: st_ctime is the creation time only on Windows; on Unix it is
        # the time of the last metadata change.
        "created": file_stat.st_ctime,
        "modified": file_stat.st_mtime,
    }
def extract_document_stats(text: str) -> Dict[str, int]:
    """Extract statistics from document text.

    Args:
        text (str): Document text

    Returns:
        Dict[str, int]: Document statistics with the keys:
            - ``char_count``: number of characters in ``text``
            - ``word_count``: number of whitespace-separated tokens
            - ``sentence_count``: non-blank segments between runs of
              ``.``, ``!`` or ``?``
            - ``paragraph_count``: non-blank segments separated by at least
              one blank (empty or whitespace-only) line
            - ``line_count``: number of newline characters plus one, or ``0``
              for empty text

    Examples:
        >>> stats = extract_document_stats("Hello world. This is a test.")
        >>> print(stats['word_count'], stats['sentence_count'])
        6 2
    """
    words = text.split()
    sentences = re.split(r"[.!?]+", text)
    # Split on a blank line that may contain whitespace, so CRLF documents
    # ("\r\n\r\n") and lines holding only spaces/tabs still end a paragraph.
    paragraphs = re.split(r"\n\s*\n", text)

    return {
        "char_count": len(text),
        "word_count": len(words),
        "sentence_count": sum(1 for s in sentences if s.strip()),
        "paragraph_count": sum(1 for p in paragraphs if p.strip()),
        # An empty document has no lines; keep it consistent with the other
        # counters, which are all 0 for "".
        "line_count": text.count("\n") + 1 if text else 0,
    }
def extract_urls(text: str) -> List[str]:
    """Extract URLs from text.

    A URL is any run of non-whitespace characters starting with ``http://``,
    ``https://`` or ``www.``. Trailing sentence punctuation and unbalanced
    closing brackets are trimmed, so ``"see https://example.com."`` yields
    ``"https://example.com"``.

    Args:
        text (str): Text to extract URLs from

    Returns:
        List[str]: List of URLs, in the order they appear in ``text``

    Examples:
        >>> extract_urls("Visit https://example.com and www.test.com")
        ['https://example.com', 'www.test.com']
        >>> extract_urls("Docs (https://example.com/a), then www.test.com.")
        ['https://example.com/a', 'www.test.com']
    """
    url_pattern = r"https?://\S+|www\.\S+"
    trailing_punctuation = ".,;:!?\"'"
    bracket_pairs = {")": "(", "]": "[", "}": "{", ">": "<"}

    urls = []
    for candidate in re.findall(url_pattern, text):
        # \S+ is greedy and picks up punctuation that belongs to the
        # surrounding sentence; peel it off from the right.
        while candidate:
            last = candidate[-1]
            if last in trailing_punctuation:
                candidate = candidate[:-1]
            elif (
                last in bracket_pairs
                and candidate.count(last) > candidate.count(bracket_pairs[last])
            ):
                # Only drop a closer with no matching opener inside the URL,
                # so ".../wiki/Foo_(bar)" is kept intact.
                candidate = candidate[:-1]
            else:
                break
        if candidate:
            urls.append(candidate)
    return urls
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    Matches ``local@domain.tld`` where the top-level domain is at least two
    ASCII letters. This is a pragmatic pattern, not a full RFC 5322 parser.

    Args:
        text (str): Text to extract emails from

    Returns:
        List[str]: List of email addresses, in the order they appear

    Examples:
        >>> extract_emails("Contact us at info@example.com or sales@test.org")
        ['info@example.com', 'sales@test.org']
    """
    # The TLD class is [A-Za-z]; inside a character class "|" is a literal
    # pipe, not alternation, so it must not appear here.
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    return re.findall(email_pattern, text)
def extract_dates(text: str) -> List[str]:
    """Extract dates from text (simple patterns).

    Recognised shapes are ``YYYY-MM-DD``, ``MM/DD/YYYY`` and ``DD-MM-YYYY``.
    Only the digit layout is checked; values are not validated as real
    calendar dates. A candidate is rejected when it is directly adjacent to
    another digit, so fragments of longer numbers are not reported.

    Args:
        text (str): Text to extract dates from

    Returns:
        List[str]: List of potential date strings, in the order they appear
            in ``text`` and without overlapping matches

    Examples:
        >>> extract_dates("Meeting on 2024-01-15 and 01/20/2024")
        ['2024-01-15', '01/20/2024']
    """
    # One alternation scanned left to right keeps results in document order.
    # The digit lookarounds stop matches inside longer digit runs
    # (e.g. "12345-67-890" must not yield "2345-67-89").
    date_pattern = (
        r"(?<!\d)(?:"
        r"\d{4}-\d{2}-\d{2}"  # YYYY-MM-DD
        r"|\d{2}/\d{2}/\d{4}"  # MM/DD/YYYY
        r"|\d{2}-\d{2}-\d{4}"  # DD-MM-YYYY
        r")(?!\d)"
    )
    return re.findall(date_pattern, text)
def extract_phone_numbers(text: str) -> List[str]:
    """Extract phone numbers from text (US format).

    Recognised shapes are ``(555) 123-4567``, ``555-123-4567`` and
    ``555.123.4567``. A candidate is rejected when it is directly adjacent to
    another digit, so fragments of longer numbers are not reported.

    Args:
        text (str): Text to extract phone numbers from

    Returns:
        List[str]: List of phone numbers, in the order they appear in
            ``text``

    Examples:
        >>> extract_phone_numbers("Call (555) 123-4567 or 555-987-6543")
        ['(555) 123-4567', '555-987-6543']
    """
    # One alternation scanned left to right keeps results in document order.
    # The digit lookarounds stop matches inside longer digit runs
    # (e.g. "1234-567-89012" must not yield "234-567-8901").
    phone_pattern = (
        r"\(\d{3}\)\s*\d{3}-\d{4}(?!\d)"  # (555) 123-4567
        r"|(?<!\d)\d{3}-\d{3}-\d{4}(?!\d)"  # 555-123-4567
        r"|(?<!\d)\d{3}\.\d{3}\.\d{4}(?!\d)"  # 555.123.4567
    )
    return re.findall(phone_pattern, text)