# app/utils/json_utils.py - Robust JSON parsing for LLM responses
import json
import re
from typing import Dict, Any, Optional, Tuple, Union

from app.utils.logger import AgentLogger

logger = AgentLogger("json_utils")

# Opening bracket of a candidate JSON object or array.
_JSON_START_RE = re.compile(r"[{\[]")

# A comma followed only by whitespace and a closing bracket, e.g. `[1, 2,]`.
_TRAILING_COMMA_RE = re.compile(r",\s*([\}\]])")

# First alternative: a standalone number, optionally signed and optionally
# written with a leading dot (".75").  The lookbehind keeps a hyphen or dot
# that is glued to a word ("item-3", "a.5") from being read as part of the
# number.  Second alternative: plain digits embedded in other text.
_NUMBER_RE = re.compile(
    r"(?<![\w.])[-+]?(?:\d+(?:\.\d+)?|\.\d+)"
    r"|\d+(?:\.\d+)?"
)


def _strip_markdown_fence(text: str) -> str:
    """Return the body of the first Markdown code fence, or *text* unchanged.

    A fence opened with ```json takes priority over a bare ``` fence.  An
    unterminated fence yields everything after the opening marker.
    """
    for marker in ("```json", "```"):
        _, found, rest = text.partition(marker)
        if found:
            return rest.partition("```")[0].strip()
    return text


def _scan_for_json(
    text: str,
) -> Tuple[Optional[Union[Dict, list]], Optional[Exception]]:
    """Decode the first JSON object/array embedded anywhere in *text*.

    Each `{` or `[` is tried in order of appearance.  At every candidate the
    untouched text is decoded first; only if that fails is a trailing-comma
    repair attempted from the same position.

    Returns:
        ``(value, None)`` on success, otherwise ``(None, last_error)`` where
        ``last_error`` is the most recent decode error, or ``None`` when the
        text contained no opening bracket at all.
    """
    decoder = json.JSONDecoder()
    # Skip the (linear-time) repair attempts entirely when nothing in the
    # text looks like a trailing comma.
    try_repair = _TRAILING_COMMA_RE.search(text) is not None
    last_error: Optional[Exception] = None
    pos = 0

    while True:
        opener = _JSON_START_RE.search(text, pos)
        if opener is None:
            return None, last_error
        start = opener.start()

        try:
            value, _ = decoder.raw_decode(text, start)
            return value, None
        except json.JSONDecodeError as exc:
            last_error = exc
            # Everything before exc.pos was consumed as part of this broken
            # candidate.  Resuming there (rather than at start + 1) avoids
            # returning a nested fragment of a truncated outer structure.
            pos = max(start + 1, exc.pos)

        if try_repair:
            # NOTE: the repair is textual and also rewrites ",}" / ",]"
            # occurring inside string values; it is a last resort only.
            repaired = _TRAILING_COMMA_RE.sub(r"\1", text[start:])
            try:
                value, _ = decoder.raw_decode(repaired)
                return value, None
            except json.JSONDecodeError as exc:
                last_error = exc
                # exc.pos is relative to the (shorter) repaired slice, so
                # this is a conservative lower bound on the original offset.
                pos = max(pos, start + exc.pos)


def robust_json_loads(text: str) -> Optional[Union[Dict, list]]:
    """Parse JSON out of a raw LLM response, tolerating common formatting noise.

    Handles, in order:
    1. Empty or whitespace-only responses (returns ``None``).
    2. Markdown code fences (```json ... ``` or ``` ... ```).
    3. Leading/trailing non-JSON text around the first object or array.
    4. Trailing commas before a closing ``}`` or ``]`` (best-effort repair).

    Args:
        text: Raw model output.

    Returns:
        The decoded value, or ``None`` when nothing could be parsed.  When the
        cleaned text is itself valid JSON the result of ``json.loads`` is
        returned unchanged, so a bare scalar such as ``"42"`` yields ``42``
        and ``"null"`` yields ``None`` (indistinguishable from failure).
    """
    if not text or not text.strip():
        logger.warning("robust_json_loads received empty/whitespace text")
        return None

    cleaned = _strip_markdown_fence(text.strip())

    # Fast path: the cleaned text is already valid JSON.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Slow path: look for the first decodable object/array inside the text.
    try:
        value, error = _scan_for_json(cleaned)
    except RecursionError as exc:
        # Pathologically deep nesting; treat it as an ordinary parse failure.
        value, error = None, exc

    if value is not None:
        return value

    if error is not None:
        logger.warning(
            "Robust-Regex JSON parsing failed",
            error=str(error),
            partial=cleaned[:200],
        )

    logger.error("All JSON parsing attempts failed", text_preview=text[:200])
    return None


def extract_json_with_fallback(text: str, fallback_value: Any) -> Any:
    """Return the JSON parsed from *text*, or *fallback_value* on failure.

    "Failure" means ``robust_json_loads`` returned ``None``; a response that
    legitimately decodes to JSON ``null`` therefore also yields the fallback.
    """
    parsed = robust_json_loads(text)
    if parsed is None:
        return fallback_value
    return parsed


def parse_llm_number(text: str, fallback: float = 0.0) -> float:
    """Extract the first number from an LLM response.

    Examples: ``'0.75'`` -> 0.75, ``'Score: 0.75'`` -> 0.75,
    ``'-0.5'`` -> -0.5, ``'.75'`` -> 0.75.

    A sign or leading dot is honoured only when it is not attached to a
    preceding word character, so ``'item-3'`` still yields 3.0 and
    ``'2023-01-15'`` yields 2023.0.  Exponents and thousands separators are
    not interpreted (``'1,000'`` -> 1.0).

    Args:
        text: Raw model output.
        fallback: Value returned when *text* is empty, is not a string, or
            contains no number.

    Returns:
        The first number found, as a float, or *fallback*.
    """
    if not text:
        return fallback

    try:
        match = _NUMBER_RE.search(text)
        if match:
            return float(match.group(0))
    except (TypeError, ValueError):
        # TypeError: *text* is not a str (e.g. bytes or a number was passed).
        # ValueError: defensive; the pattern should only match float()-able text.
        pass

    return fallback