"""File loading and validation utilities for LEGATO OMR (images and PDFs)."""

import os

import gradio as gr
from PIL import Image

# Maximum allowed height/width ratio of the final (possibly stacked) image.
MAX_ASPECT_RATIO = 10
# Maximum number of PDF pages accepted by default.
MAX_PDF_PAGES = 3
# Lower-case file extensions accepted as raster images.
ALLOWED_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp")


def check_aspect_ratio(image: Image.Image, max_ratio: float = MAX_ASPECT_RATIO) -> None:
    """Validate that an image is not excessively tall.

    Args:
        image: Image whose ``size`` (width, height) is inspected.
        max_ratio: Largest accepted height/width ratio.

    Raises:
        ValueError: If the width is not positive, or if height/width
            exceeds ``max_ratio``.
    """
    width, height = image.size
    if width <= 0:
        raise ValueError("Invalid image width.")
    if height / width > max_ratio:
        raise ValueError("Too long image or too many PDF pages.")


def load_pdf_as_image(path: str, max_pages: int = MAX_PDF_PAGES) -> Image.Image:
    """Load a PDF and concatenate its pages vertically into one PIL Image.

    Args:
        path: Filesystem path of the PDF.
        max_pages: Maximum number of pages the PDF may contain.

    Returns:
        A single RGB image with all pages stacked top to bottom on a white
        canvas as wide as the widest page.

    Raises:
        ValueError: If pymupdf is not installed, the PDF cannot be opened or
            rendered, it has no pages, or it has more than ``max_pages`` pages.
    """
    # Imported lazily so the module stays usable without PDF support.
    try:
        import fitz
    except ImportError as err:
        raise ValueError(
            "PDF support is not available (pymupdf not installed)."
        ) from err

    try:
        doc = fitz.open(path)
    except Exception as err:
        raise ValueError("Invalid or unsupported PDF.") from err

    try:
        page_count = len(doc)
        if page_count == 0:
            raise ValueError("PDF has no pages.")
        if page_count > max_pages:
            # Report the actual limit rather than a hard-coded number.
            raise ValueError(f"PDF has more than {max_pages} pages.")

        images = []
        for index in range(page_count):
            page = doc.load_page(index)
            pix = page.get_pixmap(alpha=False)
            images.append(
                Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            )
    except ValueError:
        # Validation errors above already carry a user-facing message.
        raise
    except Exception as err:
        raise ValueError("Invalid or unsupported PDF.") from err
    finally:
        doc.close()

    # Concatenate vertically: same width (use max width), stack heights.
    width = max(im.width for im in images)
    total_height = sum(im.height for im in images)
    out = Image.new("RGB", (width, total_height), (255, 255, 255))
    y = 0
    for im in images:
        if im.width != width:
            # Narrower pages are stretched horizontally to the common width;
            # their height is left unchanged.
            im = im.resize((width, im.height), Image.Resampling.LANCZOS)
        out.paste(im, (0, y))
        y += im.height
    return out


def file_to_pil_image(file_input):
    """Convert an uploaded file (image or PDF) to a single validated PIL Image.

    Args:
        file_input: Path of the uploaded file; may be empty or ``None``.

    Returns:
        An RGB ``PIL.Image.Image``, or ``None`` if ``file_input`` is empty or
        does not point to an existing file.

    Raises:
        gr.Error: If the file type is unsupported, the file cannot be loaded,
            or the resulting image fails the aspect-ratio check.
    """
    path = file_input
    if not path or not os.path.isfile(path):
        return None

    ext = os.path.splitext(path)[1].lower()
    try:
        if ext == ".pdf":
            image = load_pdf_as_image(path, max_pages=MAX_PDF_PAGES)
        elif ext in ALLOWED_IMAGE_EXTENSIONS:
            # Image.open is lazy and keeps the file open; convert() loads the
            # pixel data into a new image, so the source can be closed here.
            with Image.open(path) as opened:
                image = opened.convert("RGB")
        else:
            raise gr.Error("Unsupported file type. Please upload an image or a PDF.")
        check_aspect_ratio(image, max_ratio=MAX_ASPECT_RATIO)
        return image
    except gr.Error:
        raise
    except ValueError as err:
        # Validation messages are written for end users; surface them as-is.
        raise gr.Error(str(err)) from err
    except Exception as err:
        if ext == ".pdf":
            raise gr.Error("Invalid or unsupported PDF.") from err
        raise gr.Error("Failed to load image.") from err