Source code for kerb.parsing.json

"""JSON parsing and extraction utilities.

This module provides functions for extracting and parsing JSON from LLM outputs,
including automatic fixing of common formatting issues.
"""

import json
import re
from typing import Any, Dict, List, Optional

from .types import ParseMode, ParseResult


[docs] def extract_json(text: str, mode: ParseMode = ParseMode.LENIENT) -> ParseResult: """Extract JSON from text that may contain additional content. This function intelligently extracts JSON objects or arrays from LLM outputs that may include markdown formatting, explanatory text, or other artifacts. Args: text (str): Text containing JSON (may have markdown, explanations, etc.) mode (ParseMode): Parsing mode - strict, lenient, or best_effort Returns: ParseResult: Parsed JSON data and metadata Examples: >>> extract_json('Here is the data: {"name": "John", "age": 30}') ParseResult(success=True, data={'name': 'John', 'age': 30}, ...) >>> extract_json('```json\\n{"key": "value"}\\n```') ParseResult(success=True, data={'key': 'value'}, ...) """ original = text warnings = [] # Try direct parsing first try: data = json.loads(text) return ParseResult(success=True, data=data, original=original) except json.JSONDecodeError: pass # Extract from markdown code blocks json_pattern = r"```(?:json)?\s*\n?(.*?)\n?```" matches = re.findall(json_pattern, text, re.DOTALL) if matches: for match in matches: try: data = json.loads(match) warnings.append("Extracted JSON from markdown code block") return ParseResult( success=True, data=data, original=original, warnings=warnings ) except json.JSONDecodeError: continue # Try to find JSON object or array in text # Look for outermost { } or [ ] json_patterns = [ r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}", # Nested objects r"\[[^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*\]", # Nested arrays ] for pattern in json_patterns: matches = re.findall(pattern, text, re.DOTALL) for match in matches: try: data = json.loads(match) warnings.append("Extracted JSON from surrounding text") return ParseResult( success=True, data=data, original=original, warnings=warnings ) except json.JSONDecodeError: continue if mode == ParseMode.STRICT: return ParseResult( success=False, error="No valid JSON found in text", original=original ) # Try fixing common issues if mode in [ParseMode.LENIENT, ParseMode.BEST_EFFORT]: fixed_result = fix_json(text) if fixed_result.success: return fixed_result return ParseResult( success=False, error="Could not extract or parse JSON from text", original=original, warnings=warnings, )
[docs] def parse_json(text: str, mode: ParseMode = ParseMode.LENIENT) -> ParseResult: """Parse JSON with automatic fixing for common LLM output issues. Args: text (str): JSON text to parse mode (ParseMode): Parsing mode - strict, lenient, or best_effort Returns: ParseResult: Parsed JSON data and metadata """ return extract_json(text, mode)
[docs] def fix_json(text: str) -> ParseResult: """Attempt to fix common JSON formatting issues in LLM outputs. Common fixes: - Remove trailing commas - Fix single quotes to double quotes - Remove comments - Fix missing/extra brackets - Handle truncated JSON Args: text (str): Potentially malformed JSON text Returns: ParseResult: Fixed and parsed JSON if successful """ original = text fixed = text fixes_applied = [] # Remove markdown formatting fixed = re.sub(r"```(?:json)?\s*\n?", "", fixed) # Remove comments (// style and /* */ style) fixed = re.sub(r"//.*?$", "", fixed, flags=re.MULTILINE) fixed = re.sub(r"/\*.*?\*/", "", fixed, flags=re.DOTALL) # Fix single quotes to double quotes (be careful with apostrophes) # Only replace quotes that are likely JSON delimiters fixed = re.sub(r"(?<=[{\[,:])\s*'([^']*)'(?=\s*[,:\]}])", r'"\1"', fixed) # Remove trailing commas before closing brackets fixed = re.sub(r",(\s*[}\]])", r"\1", fixed) # Try parsing try: data = json.loads(fixed) if fixed != original: fixes_applied.append("Applied automatic JSON fixes") return ParseResult( success=True, data=data, fixed=fixed != original, original=original, warnings=fixes_applied, ) except json.JSONDecodeError as e: pass # Try to complete truncated JSON if text.count("{") > text.count("}"): fixed = fixed + "}" * (text.count("{") - text.count("}")) fixes_applied.append("Added missing closing braces") if text.count("[") > text.count("]"): fixed = fixed + "]" * (text.count("[") - text.count("]")) fixes_applied.append("Added missing closing brackets") try: data = json.loads(fixed) return ParseResult( success=True, data=data, fixed=True, original=original, warnings=fixes_applied, ) except json.JSONDecodeError as e: return ParseResult( success=False, error=f"Could not fix JSON: {str(e)}", original=original, warnings=fixes_applied, )
[docs] def extract_json_array(text: str, mode: ParseMode = ParseMode.LENIENT) -> ParseResult: """Extract a JSON array from text. Args: text (str): Text containing JSON array mode (ParseMode): Parsing mode Returns: ParseResult: Parsed JSON array """ result = extract_json(text, mode) if result.success and not isinstance(result.data, list): return ParseResult( success=False, error="Extracted JSON is not an array", original=text ) return result
[docs] def extract_json_object(text: str, mode: ParseMode = ParseMode.LENIENT) -> ParseResult: """Extract a JSON object from text. Args: text (str): Text containing JSON object mode (ParseMode): Parsing mode Returns: ParseResult: Parsed JSON object """ result = extract_json(text, mode) if result.success and not isinstance(result.data, dict): return ParseResult( success=False, error="Extracted JSON is not an object", original=text ) return result
[docs] def ensure_json_output(text: str, default: Any = None) -> Any: """Extract JSON from text, returning default if parsing fails. Args: text (str): Text containing JSON default: Default value if parsing fails Returns: Parsed JSON or default value """ result = extract_json(text, mode=ParseMode.BEST_EFFORT) return result.data if result.success else default
[docs] def ensure_list_output(text: str, default: Optional[List] = None) -> List: """Extract JSON array from text, returning default if parsing fails. Args: text (str): Text containing JSON array default (List, optional): Default value if parsing fails Returns: Parsed list or default value """ result = extract_json_array(text, mode=ParseMode.BEST_EFFORT) return result.data if result.success else (default or [])
[docs] def ensure_dict_output(text: str, default: Optional[Dict] = None) -> Dict: """Extract JSON object from text, returning default if parsing fails. Args: text (str): Text containing JSON object default (Dict, optional): Default value if parsing fails Returns: Parsed dict or default value """ result = extract_json_object(text, mode=ParseMode.BEST_EFFORT) return result.data if result.success else (default or {})