# Source code for kerb.document.loaders

"""Document loaders for various file formats.

This module provides format-specific document loaders for:
- Text files (TXT)
- Markdown files (MD)
- JSON files
- CSV files
- XML files
- HTML files
- PDF files
- DOCX files

Each loader returns a Document object with content and metadata.
"""

import csv
import io
import json
import os
import re
from pathlib import Path
from typing import Any, Dict

from kerb.core.types import Document, DocumentFormat



[docs]
def load_document(file_path: str, **kwargs) -> Document:
    """Load a file as a Document, picking the loader from its detected format.

    This is the generic entry point: it verifies the path exists, asks
    ``detect_format`` which format the file is, and hands the path to the
    matching format-specific loader in this module.

    Args:
        file_path (str): Path to the document file
        **kwargs: Forwarded unchanged to the selected format-specific loader

    Returns:
        Document: The loader's result, with ``format`` set to the detected
            format and ``source`` set to ``file_path``

    Raises:
        FileNotFoundError: If ``file_path`` does not exist
        ValueError: If no loader is registered for the detected format

    Examples:
        >>> doc = load_document("report.pdf")
        >>> print(doc.content[:100])

        >>> doc = load_document("data.csv", parse_as_dict=True)
        >>> print(doc.metadata['rows'])
    """
    from .utils import detect_format

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    detected = detect_format(file_path)

    # Dispatch table: exactly one loader per supported format.
    dispatch = {
        DocumentFormat.TXT: load_text,
        DocumentFormat.MARKDOWN: load_markdown,
        DocumentFormat.JSON: load_json,
        DocumentFormat.CSV: load_csv,
        DocumentFormat.XML: load_xml,
        DocumentFormat.HTML: load_html,
        DocumentFormat.PDF: load_pdf,
        DocumentFormat.DOCX: load_docx,
    }

    if detected not in dispatch:
        raise ValueError(f"Unsupported format: {detected.value}")

    loaded = dispatch[detected](file_path, **kwargs)

    # The individual loaders do not record these, so stamp them here.
    loaded.format = detected
    loaded.source = file_path
    return loaded




[docs]
def load_text(file_path: str, encoding: str = "utf-8") -> Document:
    """Load a plain text file.

    Args:
        file_path (str): Path to text file
        encoding (str): Text encoding. Defaults to 'utf-8'.

    Returns:
        Document: Document whose content is the file's text and whose
            metadata holds ``encoding``, ``size`` (bytes on disk) and
            ``lines`` (number of lines in the text)

    Examples:
        >>> doc = load_text("notes.txt")
        >>> print(doc.content)
    """
    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    # Every "\n" terminates one line; a non-empty tail without a trailing
    # newline is one more line. This way an empty file reports 0 lines and
    # "a\nb\n" reports 2 rather than 3.
    line_count = content.count("\n")
    if content and not content.endswith("\n"):
        line_count += 1

    metadata = {
        "encoding": encoding,
        "size": os.path.getsize(file_path),
        "lines": line_count,
    }

    return Document(content=content, metadata=metadata)




[docs]
def load_markdown(
    file_path: str, extract_frontmatter: bool = True, encoding: str = "utf-8"
) -> Document:
    """Load a Markdown file.

    Args:
        file_path (str): Path to markdown file
        extract_frontmatter (bool): Extract YAML frontmatter if present
        encoding (str): Text encoding. Defaults to 'utf-8'.

    Returns:
        Document: Loaded document. Metadata holds ``size`` and ``headings``
            (the text of every ATX ``#`` heading), plus ``frontmatter`` when
            a frontmatter block was found and extracted; in that case the
            block is removed from the content.

    Examples:
        >>> doc = load_markdown("README.md")
        >>> if 'frontmatter' in doc.metadata:
        ...     print(doc.metadata['frontmatter'])
    """
    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    metadata: Dict[str, Any] = {
        "size": os.path.getsize(file_path),
    }

    # Extract frontmatter if present
    if extract_frontmatter:
        # The body group is optional and tried last so that an empty block
        # ("---\n---") is recognised, and the closing fence may be the very
        # last thing in the file (no trailing newline required).
        frontmatter_pattern = r"^---\s*\n(?:(.*?)\n)??---\s*(?:\n|\Z)"
        match = re.match(frontmatter_pattern, content, re.DOTALL)
        if match:
            frontmatter_text = match.group(1) or ""
            frontmatter: Dict[str, str] = {}

            # Simple YAML parsing (for basic key-value pairs): split each
            # line on its first colon; lines without a colon are ignored.
            for line in frontmatter_text.split("\n"):
                key, separator, value = line.partition(":")
                if separator:
                    frontmatter[key.strip()] = value.strip()

            metadata["frontmatter"] = frontmatter

            # Remove frontmatter from content
            content = content[match.end() :]

    # Extract headings. Only spaces/tabs may separate the hashes from the
    # text; matching any whitespace would let a bare "#" line swallow the
    # newline and report the following line as a heading.
    metadata["headings"] = re.findall(
        r"^#{1,6}[ \t]+(.+)$", content, re.MULTILINE
    )

    return Document(content=content, metadata=metadata)




[docs]
def load_json(
    file_path: str, as_string: bool = False, encoding: str = "utf-8"
) -> Document:
    """Load a JSON file.

    The parsed object is always stored in ``metadata['json_data']``;
    ``as_string`` only controls how the document content is rendered.

    Args:
        file_path (str): Path to JSON file
        as_string (bool): If True, content is the data re-serialized as
                         JSON with 2-space indentation. If False, content
                         is ``str()`` of the parsed Python object.
        encoding (str): Text encoding. Defaults to 'utf-8'.

    Returns:
        Document: Loaded document with ``json_data`` and ``size`` metadata

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON

    Examples:
        >>> doc = load_json("data.json", as_string=True)
        >>> print(doc.content)

        >>> doc = load_json("config.json")
        >>> print(doc.metadata['json_data'])
    """
    with open(file_path, "r", encoding=encoding) as f:
        data = json.load(f)

    content = json.dumps(data, indent=2) if as_string else str(data)

    metadata = {
        "json_data": data,
        "size": os.path.getsize(file_path),
    }

    return Document(content=content, metadata=metadata)




[docs]
def load_csv(
    file_path: str, parse_as_dict: bool = True, encoding: str = "utf-8"
) -> Document:
    """Load a CSV file.

    The raw file text becomes the document content. When ``parse_as_dict``
    is True the text is also parsed with the standard ``csv`` module, so
    quoted fields may contain commas and line breaks.

    Args:
        file_path (str): Path to CSV file
        parse_as_dict (bool): Parse CSV and store structured data in metadata
        encoding (str): Text encoding

    Returns:
        Document: Loaded document. Metadata always holds ``size`` and
            ``encoding``; with ``parse_as_dict`` it also holds ``headers``
            (first record), ``rows`` (list of header->value dicts) and
            ``num_rows``. Cell values are whitespace-stripped. Records whose
            field count differs from the header are left out of ``rows``.

    Raises:
        csv.Error: If the csv module cannot parse the text (for example a
            field exceeding the module's field size limit)

    Examples:
        >>> doc = load_csv("data.csv")
        >>> rows = doc.metadata['rows']
        >>> headers = doc.metadata['headers']
    """
    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    metadata: Dict[str, Any] = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
    }

    if parse_as_dict:
        # Parse from the already-read text so content stays untouched.
        # skipinitialspace lets a quoted field follow ", " and still be
        # treated as quoted.
        reader = csv.reader(io.StringIO(content.strip()), skipinitialspace=True)

        # Completely blank lines come back as empty records; skip them.
        records = [
            [cell.strip() for cell in record] for record in reader if record
        ]

        headers = records[0] if records else []

        # Best-effort: ragged records cannot be mapped onto the header, so
        # they are dropped rather than failing the whole load.
        rows = [
            dict(zip(headers, record))
            for record in records[1:]
            if len(record) == len(headers)
        ]

        metadata["headers"] = headers
        metadata["rows"] = rows
        metadata["num_rows"] = len(rows)

    return Document(content=content, metadata=metadata)




[docs]
def load_xml(file_path: str, encoding: str = "utf-8") -> Document:
    """Load an XML file.

    The file is read as text and is not validated or parsed as XML; the
    root tag is found with a regular expression.

    Args:
        file_path (str): Path to XML file
        encoding (str): Text encoding

    Returns:
        Document: Loaded document whose content is the raw XML text.
            Metadata holds ``size``, ``encoding`` and ``root_tag`` (name of
            the first element, including any namespace prefix, or None if
            no element is found).

    Examples:
        >>> doc = load_xml("data.xml")
        >>> print(doc.content)
    """
    with open(file_path, "r", encoding=encoding) as f:
        content = f.read()

    # Extract root tag.
    # Comments are removed first so a tag quoted inside a leading comment is
    # not mistaken for the root. "<?xml ...?>" and "<!DOCTYPE ...>" are
    # skipped naturally because "?" and "!" cannot start a name.
    without_comments = re.sub(r"<!--.*?-->", "", content, flags=re.DOTALL)

    # A name starts with a letter or underscore and may contain ".", "-"
    # and one ":"-separated namespace prefix (e.g. "soap:Envelope").
    root_match = re.search(
        r"<([^\W\d][\w.\-]*(?::[^\W\d][\w.\-]*)?)", without_comments
    )
    root_tag = root_match.group(1) if root_match else None

    metadata = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
        "root_tag": root_tag,
    }

    return Document(content=content, metadata=metadata)




[docs]
def load_html(
    file_path: str, extract_text: bool = True, encoding: str = "utf-8"
) -> Document:
    """Load an HTML file.

    Args:
        file_path (str): Path to HTML file
        extract_text (bool): If True, content is the result of passing the
            HTML through ``extract_text_from_html``; if False, content is
            the raw HTML
        encoding (str): Text encoding

    Returns:
        Document: Loaded document. Metadata holds ``size``, ``encoding`` and
            ``raw_html`` (the unmodified file text), ``extracted_text`` set
            to True when text extraction ran, and ``title`` when a
            ``<title>`` element is found.

    Examples:
        >>> doc = load_html("page.html", extract_text=True)
        >>> print(doc.content)  # Plain text without HTML tags
    """
    from .extractors import extract_text_from_html

    with open(file_path, "r", encoding=encoding) as f:
        html_content = f.read()

    content = html_content
    metadata = {
        "size": os.path.getsize(file_path),
        "encoding": encoding,
        "raw_html": html_content,
    }

    if extract_text:
        # Basic HTML text extraction
        content = extract_text_from_html(html_content)
        metadata["extracted_text"] = True

    # Extract title. The opening tag may carry attributes
    # (e.g. <title lang="en">) and the closing tag may contain whitespace.
    title_match = re.search(
        r"<title\b[^>]*>(.*?)</title\s*>",
        html_content,
        re.IGNORECASE | re.DOTALL,
    )
    if title_match:
        metadata["title"] = title_match.group(1).strip()

    return Document(content=content, metadata=metadata)




[docs]
def load_pdf(file_path: str, extract_images: bool = False) -> Document:
    """Read a PDF and return its text, page by page and concatenated.

    Requires: pypdf or PyPDF2 package (pypdf is tried first)

    Args:
        file_path (str): Path to PDF file
        extract_images (bool): Accepted for API compatibility; this
            function does not currently read it

    Returns:
        Document: Content is every page's text joined by blank lines;
            ``page_content`` holds the per-page texts. Metadata holds
            ``num_pages`` and ``size``, plus ``pdf_metadata`` when any of
            Title/Author/Subject/Creator/Producer is present in the file.

    Raises:
        ImportError: If neither pypdf nor PyPDF2 is installed

    Examples:
        >>> doc = load_pdf("report.pdf")
        >>> print(f"Pages: {doc.metadata['num_pages']}")
        >>> print(doc.content)  # All pages concatenated
    """
    # Prefer pypdf and fall back to the PyPDF2 distribution.
    try:
        import pypdf as pdf_backend
    except ImportError:
        try:
            import PyPDF2 as pdf_backend
        except ImportError:
            raise ImportError(
                "PDF support requires pypdf or PyPDF2. "
                "Install with: pip install pypdf"
            )

    reader = pdf_backend.PdfReader(file_path)

    page_texts = [page.extract_text() for page in reader.pages]
    full_text = "\n\n".join(page_texts)

    metadata = {
        "num_pages": len(page_texts),
        "size": os.path.getsize(file_path),
    }

    # Copy the standard info-dictionary entries, dropping the leading "/".
    info = reader.metadata
    if info:
        wanted_keys = ("/Title", "/Author", "/Subject", "/Creator", "/Producer")
        selected = {
            entry.lstrip("/"): info[entry] for entry in wanted_keys if entry in info
        }
        if selected:
            metadata["pdf_metadata"] = selected

    return Document(content=full_text, metadata=metadata, page_content=page_texts)




[docs]
def load_docx(file_path: str) -> Document:
    """Read a DOCX file and return its paragraph text.

    Requires: python-docx package

    Args:
        file_path (str): Path to DOCX file

    Returns:
        Document: Content is the non-blank paragraphs joined by blank
            lines. Metadata holds ``num_paragraphs`` and ``size``, plus
            ``document_properties`` (non-empty core properties as strings)
            and ``num_tables`` when the file has any.

    Raises:
        ImportError: If python-docx is not installed

    Examples:
        >>> doc = load_docx("report.docx")
        >>> print(doc.content)
    """
    try:
        import docx
    except ImportError:
        raise ImportError(
            "DOCX support requires python-docx. "
            "Install with: pip install python-docx"
        )

    word_file = docx.Document(file_path)

    # Keep only paragraphs that contain something besides whitespace.
    texts = [item.text for item in word_file.paragraphs if item.text.strip()]

    metadata = {
        "num_paragraphs": len(texts),
        "size": os.path.getsize(file_path),
    }

    # Collect whichever core properties are set, rendered as strings.
    core = word_file.core_properties
    property_names = ("title", "author", "subject", "keywords", "created", "modified")
    found = {}
    for name in property_names:
        value = getattr(core, name, None)
        if value:
            found[name] = str(value)
    if found:
        metadata["document_properties"] = found

    # Only the table count is recorded; table text is not extracted.
    tables = word_file.tables
    if tables:
        metadata["num_tables"] = len(tables)

    return Document(content="\n\n".join(texts), metadata=metadata)