# Source code for kerb.document.metadata
"""Metadata extraction utilities.
This module provides functions for extracting metadata and structured information:
- File metadata extraction
- Document statistics
- URL extraction
- Email extraction
- Date extraction
- Phone number extraction
"""
import re
from pathlib import Path
from typing import Any, Dict, List
[docs]
def extract_metadata(file_path: str) -> Dict[str, Any]:
    """Extract basic filesystem metadata for a file.

    Args:
        file_path (str): Path to file

    Returns:
        Dict[str, Any]: Mapping with the keys ``filename`` (final path
            component), ``extension`` (last suffix without the leading dot,
            empty string when there is none), ``size`` (bytes), ``created``
            and ``modified`` (timestamps in seconds since the epoch).

    Raises:
        OSError: If the file cannot be stat'ed, e.g. ``FileNotFoundError``
            when ``file_path`` does not exist.

    Note:
        ``created`` is read from ``st_ctime``. On Windows that is the
        creation time; on most Unix systems it is the time of the last
        metadata change, not the creation time.

    Examples:
        >>> metadata = extract_metadata("document.pdf")
        >>> print(metadata['size'], metadata['created'])
    """
    path = Path(file_path)
    # Stat exactly once: all values then come from one consistent snapshot
    # (the file may change between calls) and we avoid redundant syscalls.
    file_stat = path.stat()
    return {
        "filename": path.name,
        "extension": path.suffix.lstrip("."),
        "size": file_stat.st_size,
        "created": file_stat.st_ctime,
        "modified": file_stat.st_mtime,
    }
[docs]
def extract_document_stats(text: str) -> Dict[str, int]:
    """Extract simple size statistics from document text.

    All counts are heuristic:

    - words are whitespace-separated tokens;
    - sentences are the non-blank pieces between runs of ``.``, ``!``, ``?``
      (so abbreviations and decimals such as ``3.14`` are over-counted);
    - paragraphs are non-blank pieces separated by at least one blank line;
    - lines are newline-separated, and empty text has zero lines.

    Args:
        text (str): Document text

    Returns:
        Dict[str, int]: Document statistics with the keys ``char_count``,
            ``word_count``, ``sentence_count``, ``paragraph_count`` and
            ``line_count``.

    Examples:
        >>> stats = extract_document_stats("Hello world. This is a test.")
        >>> print(stats['word_count'], stats['sentence_count'])
        6 2
    """
    words = text.split()
    sentences = re.split(r"[.!?]+", text)
    # A blank line may contain spaces/tabs or a carriage return (CRLF files),
    # so split on "newline, optional whitespace, newline" rather than on the
    # literal "\n\n".
    paragraphs = re.split(r"\n\s*\n", text)
    return {
        "char_count": len(text),
        "word_count": len(words),
        "sentence_count": sum(1 for s in sentences if s.strip()),
        "paragraph_count": sum(1 for p in paragraphs if p.strip()),
        # Empty text contains no lines at all; otherwise N newlines delimit
        # N + 1 lines.
        "line_count": text.count("\n") + 1 if text else 0,
    }
[docs]
def extract_urls(text: str) -> List[str]:
    """Extract URLs from text.

    A URL candidate is any run of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``. Trailing sentence punctuation and
    unbalanced closing brackets are trimmed, so ``"(see https://example.com)."``
    yields ``https://example.com``. Brackets that are balanced inside the URL
    (e.g. ``.../Python_(language)``) are kept.

    Args:
        text (str): Text to extract URLs from

    Returns:
        List[str]: List of URLs, in order of appearance

    Examples:
        >>> extract_urls("Visit https://example.com and www.test.com")
        ['https://example.com', 'www.test.com']
        >>> extract_urls("Docs: https://example.com/guide.")
        ['https://example.com/guide']
    """
    url_pattern = r"https?://\S+|www\.\S+"
    trailing_punctuation = ".,;:!?'\""
    bracket_pairs = {")": "(", "]": "[", "}": "{", ">": "<"}

    urls = []
    for candidate in re.findall(url_pattern, text):
        # Peel off characters that belong to the surrounding prose, one at a
        # time, until the candidate ends in something that is part of the URL.
        while candidate:
            last = candidate[-1]
            if last in trailing_punctuation:
                candidate = candidate[:-1]
            elif (
                last in bracket_pairs
                and candidate.count(last) > candidate.count(bracket_pairs[last])
            ):
                # Closing bracket without a matching opener inside the URL:
                # it closes a bracket opened in the surrounding text.
                candidate = candidate[:-1]
            else:
                break
        # Trimming can leave a bare scheme/prefix (e.g. "https://..." ->
        # "https://"); keep only what still looks like a URL.
        if re.fullmatch(url_pattern, candidate):
            urls.append(candidate)
    return urls
[docs]
def extract_emails(text: str) -> List[str]:
    """Extract email addresses from text.

    Matching is a pragmatic approximation, not full RFC 5322 validation: a
    local part of letters, digits and ``._%+-``, an ``@``, a domain of
    letters, digits, dots and hyphens, and a top-level domain of at least two
    ASCII letters.

    Args:
        text (str): Text to extract emails from

    Returns:
        List[str]: List of email addresses, in order of appearance

    Examples:
        >>> extract_emails("Contact us at info@example.com or sales@test.org")
        ['info@example.com', 'sales@test.org']
    """
    # The TLD class is [A-Za-z]; inside a character class "|" is a literal
    # pipe, not alternation, so it must not appear there or addresses
    # separated by "|" would be glued together.
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    return re.findall(email_pattern, text)
[docs]
def extract_dates(text: str) -> List[str]:
    """Extract dates from text (simple patterns).

    Recognised shapes are ``YYYY-MM-DD``, ``MM/DD/YYYY`` and ``DD-MM-YYYY``.
    Only the digit layout is checked; values are not validated, so
    ``9999-99-99`` is returned as a potential date. A match must not be
    embedded in a longer run of digits, which keeps fragments of other
    identifiers (e.g. ``23-45-6789`` inside ``123-45-6789``) out of the result.

    Args:
        text (str): Text to extract dates from

    Returns:
        List[str]: List of potential date strings, grouped by pattern in the
            order listed above (all ``YYYY-MM-DD`` matches first, and so on),
            each group in order of appearance.

    Examples:
        >>> extract_dates("Meeting on 2024-01-15 and 01/20/2024")
        ['2024-01-15', '01/20/2024']
    """
    date_patterns = [
        r"\d{4}-\d{2}-\d{2}",  # YYYY-MM-DD
        r"\d{2}/\d{2}/\d{4}",  # MM/DD/YYYY
        r"\d{2}-\d{2}-\d{4}",  # DD-MM-YYYY
    ]
    dates = []
    for pattern in date_patterns:
        # The lookarounds reject matches that start or end in the middle of a
        # longer digit sequence.
        bounded = r"(?<!\d)" + pattern + r"(?!\d)"
        dates.extend(re.findall(bounded, text))
    return dates
[docs]
def extract_phone_numbers(text: str) -> List[str]:
    """Extract phone numbers from text (US format).

    Recognised shapes are ``(555) 123-4567``, ``555-123-4567`` and
    ``555.123.4567``. A match must not be embedded in a longer run of digits,
    so a fragment such as ``234-567-8901`` inside ``1234-567-89012`` is not
    reported.

    Args:
        text (str): Text to extract phone numbers from

    Returns:
        List[str]: List of phone numbers, grouped by pattern in the order
            listed above, each group in order of appearance.

    Examples:
        >>> extract_phone_numbers("Call (555) 123-4567 or 555-987-6543")
        ['(555) 123-4567', '555-987-6543']
    """
    phone_patterns = [
        r"\(\d{3}\)\s*\d{3}-\d{4}",  # (555) 123-4567
        r"\d{3}-\d{3}-\d{4}",  # 555-123-4567
        r"\d{3}\.\d{3}\.\d{4}",  # 555.123.4567
    ]
    phones = []
    for pattern in phone_patterns:
        # The lookarounds reject matches that start or end in the middle of a
        # longer digit sequence.
        bounded = r"(?<!\d)" + pattern + r"(?!\d)"
        phones.extend(re.findall(bounded, text))
    return phones