Source code for kerb.document.loaders

"""Document loaders for various file formats.

This module provides format-specific document loaders for:
- Text files (TXT)
- Markdown files (MD)
- JSON files
- CSV files
- XML files
- HTML files
- PDF files
- DOCX files

Each loader returns a Document object with content and metadata.
"""

import json
import os
import re
from pathlib import Path
from typing import Any, Dict

from kerb.core.types import Document, DocumentFormat


def load_document(file_path: str, **kwargs) -> Document:
    """Load a document, choosing the loader from the detected file format.

    This is the general-purpose entry point: the format is determined with
    ``detect_format`` and the call is forwarded to the matching
    format-specific loader defined in this module.

    Args:
        file_path (str): Path to the document file.
        **kwargs: Extra keyword arguments, forwarded unchanged to the
            format-specific loader.

    Returns:
        Document: The loaded document, with ``format`` set to the detected
        format and ``source`` set to ``file_path``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If no loader is registered for the detected format.

    Examples:
        >>> doc = load_document("report.pdf")
        >>> print(doc.content[:100])

        >>> doc = load_document("data.csv", parse_as_dict=True)
        >>> print(doc.metadata['rows'])
    """
    from .utils import detect_format

    # Fail early with a clear message before any format detection happens.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    detected = detect_format(file_path)

    # Dispatch table: one loader per supported format.
    dispatch = {
        DocumentFormat.TXT: load_text,
        DocumentFormat.MARKDOWN: load_markdown,
        DocumentFormat.JSON: load_json,
        DocumentFormat.CSV: load_csv,
        DocumentFormat.XML: load_xml,
        DocumentFormat.HTML: load_html,
        DocumentFormat.PDF: load_pdf,
        DocumentFormat.DOCX: load_docx,
    }

    handler = dispatch.get(detected)
    if handler is None:
        raise ValueError(f"Unsupported format: {detected.value}")

    document = handler(file_path, **kwargs)
    # Stamp provenance on the result regardless of which loader produced it.
    document.format = detected
    document.source = file_path
    return document
def load_text(file_path: str, encoding: str = "utf-8") -> Document:
    """Read a plain text file into a Document.

    Args:
        file_path (str): Path to the text file.
        encoding (str): Encoding used to decode the file. Defaults to
            'utf-8'.

    Returns:
        Document: Document whose content is the full file text. Metadata
        holds ``encoding``, ``size`` (as reported by ``os.path.getsize``)
        and ``lines`` (number of newline characters plus one).

    Examples:
        >>> doc = load_text("notes.txt")
        >>> print(doc.content)
    """
    with open(file_path, "r", encoding=encoding) as handle:
        text = handle.read()

    newline_total = text.count("\n")

    return Document(
        content=text,
        metadata={
            "encoding": encoding,
            "size": os.path.getsize(file_path),
            # One more than the newline count, so an empty file reports 1.
            "lines": newline_total + 1,
        },
    )
def _extract_markdown_headings(text: str) -> list:
    """Return the text of ATX (``#``-style) headings outside fenced code blocks."""
    fence_pattern = re.compile(r"^ {0,3}(`{3,}|~{3,})")
    # [ \t]+ rather than \s+: the separator must stay on the heading's own
    # line, otherwise an empty "#" line would capture the next paragraph.
    heading_pattern = re.compile(r"^#{1,6}[ \t]+(.+)$")

    headings = []
    open_fence = None  # marker (e.g. "```") that opened the current code block
    for line in text.split("\n"):
        fence = fence_pattern.match(line)

        if open_fence is not None:
            # Inside a code block: only a matching closing fence ends it.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and not line[fence.end():].strip()
            ):
                open_fence = None
            continue

        if fence:
            marker = fence.group(1)
            # A backtick fence may not have backticks in its info string;
            # such a line is inline code (e.g. ```x``` text), not a fence.
            if marker[0] != "`" or "`" not in line[fence.end():]:
                open_fence = marker
            continue

        heading = heading_pattern.match(line)
        if heading:
            headings.append(heading.group(1))
    return headings


def load_markdown(file_path: str, extract_frontmatter: bool = True) -> Document:
    """Load a Markdown file.

    The file is read as UTF-8. If requested, a leading ``---`` delimited
    frontmatter block is parsed into metadata and removed from the content.
    Headings are then collected from the remaining content.

    Args:
        file_path (str): Path to markdown file
        extract_frontmatter (bool): Extract YAML frontmatter if present

    Returns:
        Document: Loaded document. Metadata holds ``size``, ``headings``
        (text of each ``#``-style heading that is not inside a fenced code
        block) and, when a frontmatter block was found, ``frontmatter``
        (a dict of string keys to string values).

    Examples:
        >>> doc = load_markdown("README.md")
        >>> if 'frontmatter' in doc.metadata:
        ...     print(doc.metadata['frontmatter'])
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    metadata = {
        "size": os.path.getsize(file_path),
    }

    # Extract frontmatter if present
    if extract_frontmatter:
        # The second alternative accepts a closing '---' that is the very last
        # line of the file (no trailing newline); the first alternative is
        # tried first so ordinary files are matched as before.
        frontmatter_pattern = r"^---\s*\n(.*?)\n---(?:\s*\n|[ \t]*\Z)"
        match = re.match(frontmatter_pattern, content, re.DOTALL)

        if match:
            frontmatter = {}
            # Simple YAML parsing (flat "key: value" pairs only; values are
            # kept as raw strings, nested structures are not interpreted)
            for line in match.group(1).split("\n"):
                if ":" in line:
                    key, value = line.split(":", 1)
                    frontmatter[key.strip()] = value.strip()
            metadata["frontmatter"] = frontmatter

            # Remove frontmatter from content
            content = content[match.end():]

    # Extract headings, ignoring '#' lines that live inside code fences
    metadata["headings"] = _extract_markdown_headings(content)

    return Document(content=content, metadata=metadata)
def load_json(file_path: str, as_string: bool = False) -> Document:
    """Read and parse a JSON file into a Document.

    The parsed object is always stored in ``metadata['json_data']``;
    ``as_string`` only controls how the document content is rendered.

    Args:
        file_path (str): Path to the JSON file (read as UTF-8).
        as_string (bool): If True, content is the data re-serialised as
            JSON with a two-space indent. If False, content is ``str()``
            of the parsed Python object.

    Returns:
        Document: Document with the rendered content and metadata keys
        ``json_data`` and ``size``.

    Examples:
        >>> doc = load_json("data.json", as_string=True)
        >>> print(doc.content)

        >>> doc = load_json("config.json")
        >>> print(doc.metadata['json_data'])
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)

    rendered = json.dumps(parsed, indent=2) if as_string else str(parsed)

    return Document(
        content=rendered,
        metadata={
            "json_data": parsed,
            "size": os.path.getsize(file_path),
        },
    )
def load_csv(
    file_path: str, parse_as_dict: bool = True, encoding: str = "utf-8"
) -> Document:
    """Load a CSV file.

    The raw file text becomes the document content. When ``parse_as_dict``
    is true the text is additionally parsed with the standard ``csv``
    reader, so quoted fields containing commas or line breaks are handled
    correctly.

    Args:
        file_path (str): Path to CSV file
        parse_as_dict (bool): Parse CSV and store structured data in metadata
        encoding (str): Text encoding

    Returns:
        Document: Loaded document. Metadata always holds ``size`` and
        ``encoding``. With ``parse_as_dict`` it also holds ``headers``
        (stripped cells of the first non-blank record, or ``[]`` for an
        empty file), ``rows`` (one dict per data record) and ``num_rows``.
        Blank lines, and records whose cell count differs from the header
        count, are left out of ``rows``.

    Examples:
        >>> doc = load_csv("data.csv")
        >>> rows = doc.metadata['rows']
        >>> headers = doc.metadata['headers']
    """
    # Imported locally, like the other loader-specific dependencies in this
    # module, so the block does not rely on the file-level import list.
    import csv
    import io

    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    metadata = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
    }

    if parse_as_dict:
        # skipinitialspace lets a quoted field follow ", " and still be
        # recognised as quoted; cells are stripped afterwards anyway.
        reader = csv.reader(
            io.StringIO(content.strip(), newline=""), skipinitialspace=True
        )
        # The reader yields [] for blank lines; those carry no record.
        records = [[cell.strip() for cell in record] for record in reader if record]

        headers = records[0] if records else []
        rows = [
            dict(zip(headers, values))
            for values in records[1:]
            if len(values) == len(headers)
        ]

        metadata["headers"] = headers
        metadata["rows"] = rows
        metadata["num_rows"] = len(rows)

    return Document(content=content, metadata=metadata)
def load_xml(file_path: str, encoding: str = "utf-8") -> Document:
    """Load an XML file.

    The file is not parsed as XML; its text is returned as-is and the root
    tag is located with a lightweight pattern scan.

    Args:
        file_path (str): Path to XML file
        encoding (str): Text encoding

    Returns:
        Document: Loaded document whose content is the raw file text.
        Metadata holds ``size``, ``encoding`` and ``root_tag`` (the name of
        the first element start tag found outside comments, including any
        namespace prefix, or None if there is none).

    Examples:
        >>> doc = load_xml("data.xml")
        >>> print(doc.content)
    """
    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    # Extract root tag.
    # The first alternative consumes whole comments so that markup inside
    # them ("<!-- <old> -->") is never mistaken for the root element. The
    # second captures a full XML name: it may contain '.', '-' and a ':'
    # namespace separator, which a plain \w+ would cut short. Declarations
    # ("<?xml", "<!DOCTYPE") do not match because of the '?' / '!'.
    tag_scanner = re.compile(r"<!--.*?-->|<([^\W\d][\w.:\-]*)", re.DOTALL)
    root_tag = None
    for found in tag_scanner.finditer(content):
        if found.group(1):
            root_tag = found.group(1)
            break

    metadata = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
        "root_tag": root_tag,
    }

    return Document(content=content, metadata=metadata)
def load_html(
    file_path: str, extract_text: bool = True, encoding: str = "utf-8"
) -> Document:
    """Load an HTML file.

    Args:
        file_path (str): Path to HTML file
        extract_text (bool): If True, the document content is the result of
            ``extract_text_from_html``; if False it is the raw HTML
        encoding (str): Text encoding

    Returns:
        Document: Loaded document. Metadata holds ``size``, ``encoding`` and
        ``raw_html`` (the unmodified markup, stored in both modes), plus
        ``extracted_text`` when text extraction ran and ``title`` when a
        ``<title>`` element was found.

    Examples:
        >>> doc = load_html("page.html", extract_text=True)
        >>> print(doc.content)  # Plain text without HTML tags
    """
    with open(file_path, "r", encoding=encoding) as f:
        html_content = f.read()

    content = html_content
    metadata = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
        "raw_html": html_content,
    }

    if extract_text:
        # Imported only when needed so raw loading does not depend on the
        # extractor module.
        from .extractors import extract_text_from_html

        content = extract_text_from_html(html_content)
        metadata["extracted_text"] = True

    # Extract title. The opening tag may carry attributes (e.g.
    # <title lang="en">); \b keeps other tag names that merely start with
    # "title" from matching.
    title_match = re.search(
        r"<title\b[^>]*>(.*?)</title\s*>", html_content, re.IGNORECASE | re.DOTALL
    )
    if title_match:
        metadata["title"] = title_match.group(1).strip()

    return Document(content=content, metadata=metadata)
def load_pdf(file_path: str, extract_images: bool = False) -> Document:
    """Read a PDF file into a Document, page by page.

    Requires: pypdf or PyPDF2 package (pypdf is tried first).

    Args:
        file_path (str): Path to the PDF file.
        extract_images (bool): Accepted as part of the signature; this
            implementation does not read it.

    Returns:
        Document: Document whose content is every page's extracted text
        joined by blank lines, with the per-page texts passed as
        ``page_content``. Metadata holds ``num_pages`` and ``size``, plus
        ``pdf_metadata`` when any of Title/Author/Subject/Creator/Producer
        is present in the file's document information.

    Raises:
        ImportError: If neither pypdf nor PyPDF2 can be imported.

    Examples:
        >>> doc = load_pdf("report.pdf")
        >>> print(f"Pages: {doc.metadata['num_pages']}")
        >>> print(doc.content)  # All pages concatenated
    """
    # Prefer pypdf; fall back to PyPDF2, which exposes the same reader name.
    try:
        import pypdf as backend

        reader_cls = backend.PdfReader
    except ImportError:
        try:
            import PyPDF2 as backend

            reader_cls = backend.PdfReader
        except ImportError:
            raise ImportError(
                "PDF support requires pypdf or PyPDF2. "
                "Install with: pip install pypdf"
            )

    pdf = reader_cls(file_path)

    page_texts = [page.extract_text() for page in pdf.pages]
    joined_text = "\n\n".join(page_texts)

    metadata = {
        "num_pages": len(page_texts),
        "size": os.path.getsize(file_path),
    }

    # Copy the well-known document-information entries, dropping the
    # leading "/" from each key.
    info = pdf.metadata
    if info:
        wanted = ("/Title", "/Author", "/Subject", "/Creator", "/Producer")
        found = {key.lstrip("/"): info[key] for key in wanted if key in info}
        if found:
            metadata["pdf_metadata"] = found

    return Document(content=joined_text, metadata=metadata, page_content=page_texts)
def load_docx(file_path: str) -> Document:
    """Read a DOCX file into a Document.

    Requires: python-docx package

    Args:
        file_path (str): Path to the DOCX file.

    Returns:
        Document: Document whose content is the text of every non-blank
        paragraph joined by blank lines. Metadata holds ``num_paragraphs``
        and ``size``, plus ``document_properties`` (stringified non-empty
        core properties) and ``num_tables`` when the file has any.

    Raises:
        ImportError: If python-docx cannot be imported.

    Examples:
        >>> doc = load_docx("report.docx")
        >>> print(doc.content)
    """
    try:
        import docx
    except ImportError:
        raise ImportError(
            "DOCX support requires python-docx. "
            "Install with: pip install python-docx"
        )

    word_file = docx.Document(file_path)

    # Keep only paragraphs that contain something other than whitespace.
    texts = []
    for paragraph in word_file.paragraphs:
        if paragraph.text.strip():
            texts.append(paragraph.text)

    body = "\n\n".join(texts)

    metadata = {
        "num_paragraphs": len(texts),
        "size": os.path.getsize(file_path),
    }

    # Collect the core properties that are present and non-empty.
    core = word_file.core_properties
    properties = {}
    for name in ("title", "author", "subject", "keywords", "created", "modified"):
        value = getattr(core, name, None)
        if value:
            properties[name] = str(value)

    if properties:
        metadata["document_properties"] = properties

    # Only the number of tables is recorded; their text is not extracted.
    tables = word_file.tables
    if tables:
        metadata["num_tables"] = len(tables)

    return Document(content=body, metadata=metadata)