Spaces:

Kapanther
/

manga-translator

Running

App Files Files Community

Kapanther committed on Jan 10

Commit

4c962fa

1 Parent(s): 63cf941

Initial deployment with LFS for fonts

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +19 -5
app.py +35 -0
core/__init__.py +43 -0
core/__pycache__/__init__.cpython-311.pyc +0 -0
core/__pycache__/__init__.cpython-314.pyc +0 -0
core/__pycache__/caching.cpython-311.pyc +0 -0
core/__pycache__/caching.cpython-314.pyc +0 -0
core/__pycache__/config.cpython-311.pyc +0 -0
core/__pycache__/llm_defaults.cpython-311.pyc +0 -0
core/__pycache__/outside_text_processor.cpython-311.pyc +0 -0
core/__pycache__/pipeline.cpython-311.pyc +0 -0
core/__pycache__/scaling.cpython-311.pyc +0 -0
core/__pycache__/scaling.cpython-314.pyc +0 -0
core/__pycache__/validation.cpython-311.pyc +0 -0
core/caching.py +584 -0
core/config.py +240 -0
core/image/__init__.py +42 -0
core/image/__pycache__/__init__.cpython-311.pyc +0 -0
core/image/__pycache__/__init__.cpython-314.pyc +0 -0
core/image/__pycache__/cleaning.cpython-311.pyc +0 -0
core/image/__pycache__/cleaning.cpython-314.pyc +0 -0
core/image/__pycache__/detection.cpython-311.pyc +0 -0
core/image/__pycache__/detection.cpython-314.pyc +0 -0
core/image/__pycache__/image_utils.cpython-311.pyc +0 -0
core/image/__pycache__/image_utils.cpython-314.pyc +0 -0
core/image/__pycache__/inpainting.cpython-311.pyc +0 -0
core/image/__pycache__/ocr_detection.cpython-311.pyc +0 -0
core/image/__pycache__/sorting.cpython-311.pyc +0 -0
core/image/cleaning.py +849 -0
core/image/detection.py +914 -0
core/image/image_utils.py +779 -0
core/image/inpainting.py +773 -0
core/image/ocr_detection.py +730 -0
core/image/sorting.py +359 -0
core/llm_defaults.py +31 -0
core/ml/__init__.py +14 -0
core/ml/__pycache__/__init__.cpython-311.pyc +0 -0
core/ml/__pycache__/__init__.cpython-314.pyc +0 -0
core/ml/__pycache__/model_manager.cpython-311.pyc +0 -0
core/ml/__pycache__/model_manager.cpython-314.pyc +0 -0
core/ml/model_manager.py +854 -0
core/outside_text_processor.py +638 -0
core/pipeline.py +1295 -0
core/scaling.py +109 -0
core/services/__init__.py +20 -0
core/services/__pycache__/__init__.cpython-311.pyc +0 -0
core/services/__pycache__/translation.cpython-311.pyc +0 -0
core/services/translation.py +1385 -0
core/text/__init__.py +49 -0
core/text/__pycache__/__init__.cpython-311.pyc +0 -0

README.md CHANGED Viewed

@@ -1,13 +1,27 @@
 ---
 title: Manga Translator
-emoji: 🦀
 colorFrom: blue
-colorTo: red
 sdk: gradio
-sdk_version: 6.3.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Manga Translator
+emoji: 📖
 colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 5.44.1
 app_file: app.py
 pinned: false
 ---
+# MangaTranslator
+AI-powered manga/comic translation tool. Upload manga pages and get them translated automatically!
+## Features
+- Speech bubble detection and cleaning
+- LLM-powered OCR and translation (54 languages)
+- Automatic text rendering with custom fonts
+## Usage
+1. Go to the **Config** tab and enter your LLM API key (Google, OpenRouter, etc.)
+2. Upload a manga image
+3. Click **Translate**
+## Note
+This Space runs on the free CPU-only tier, so translating a page may take 1-3 minutes.

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import os
+from pathlib import Path
+# Set environment before importing torch
+os.environ["PYTORCH_ALLOC_CONF"] = "max_split_size_mb:512"
+import gradio as gr
+import torch
+import core
+from ui import layout
+# Working directories for model files and fonts, created on startup if missing
+MODELS_DIR = Path("./models")
+FONTS_BASE_DIR = Path("./fonts")
+MODELS_DIR.mkdir(parents=True, exist_ok=True)
+FONTS_BASE_DIR.mkdir(parents=True, exist_ok=True)
+# Force CPU for HF Spaces free tier
+target_device = torch.device("cpu")
+print("Using device: CPU")
+print("PyTorch version:", torch.__version__)
+print("MangaTranslator version:", core.__version__)
+# Build the Gradio UI, enable its request queue, then start serving
+app = layout.create_layout(
+    models_dir=MODELS_DIR, fonts_base_dir=FONTS_BASE_DIR, target_device=target_device
+)
+app.queue()
+app.launch()

core/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+"""
+MangaTranslator Core Package
+This package contains the core functionality for translating manga/comic speech bubbles.
+It uses YOLO for speech bubble detection and LLMs for text translation.
+"""
+from .caching import UnifiedCache, get_cache
+from .image.cleaning import clean_speech_bubbles
+from .image.detection import detect_speech_bubbles
+from .image.image_utils import cv2_to_pil, pil_to_cv2, save_image_with_compression
+from .image.inpainting import FluxKontextInpainter
+from .image.ocr_detection import OutsideTextDetector
+from .ml.model_manager import ModelManager, get_model_manager
+from .pipeline import batch_translate_images, translate_and_render
+from .services.translation import call_translation_api_batch
+from .image.sorting import sort_bubbles_by_reading_order
+from .text.text_renderer import render_text_skia
+# Package metadata
+__version__ = "1.10.5"
+# Numeric form of __version__, derived from it so the two stay in sync
+__version_info__ = tuple(int(part) for part in __version__.split("."))
+__author__ = "grinnch"
+__copyright__ = "Copyright 2025-present grinnch"
+__license__ = "Apache-2.0"
+__description__ = "A tool for translating manga pages using AI"
+# Names re-exported as the package's public API
+__all__ = [
+    "get_cache",
+    "UnifiedCache",
+    "translate_and_render",
+    "batch_translate_images",
+    "render_text_skia",
+    "detect_speech_bubbles",
+    "clean_speech_bubbles",
+    "call_translation_api_batch",
+    "sort_bubbles_by_reading_order",
+    "pil_to_cv2",
+    "cv2_to_pil",
+    "save_image_with_compression",
+    "get_model_manager",
+    "ModelManager",
+    "OutsideTextDetector",
+    "FluxKontextInpainter",
+]

core/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (1.77 kB). View file

core/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (1.55 kB). View file

core/__pycache__/caching.cpython-311.pyc ADDED Viewed

Binary file (28.9 kB). View file

core/__pycache__/caching.cpython-314.pyc ADDED Viewed

Binary file (31.7 kB). View file

core/__pycache__/config.cpython-311.pyc ADDED Viewed

Binary file (13.2 kB). View file

core/__pycache__/llm_defaults.cpython-311.pyc ADDED Viewed

Binary file (1.53 kB). View file

core/__pycache__/outside_text_processor.cpython-311.pyc ADDED Viewed

Binary file (25.1 kB). View file

core/__pycache__/pipeline.cpython-311.pyc ADDED Viewed

Binary file (44.3 kB). View file

core/__pycache__/scaling.cpython-311.pyc ADDED Viewed

Binary file (4.56 kB). View file

core/__pycache__/scaling.cpython-314.pyc ADDED Viewed

Binary file (5.54 kB). View file

core/__pycache__/validation.cpython-311.pyc ADDED Viewed

Binary file (15.4 kB). View file

core/caching.py ADDED Viewed

	@@ -0,0 +1,584 @@

+import hashlib
+import pickle
+from typing import Any, Dict, List, Optional
+import numpy as np
+from PIL import Image
+from utils.logging import log_message
+class UnifiedCache:
+    """Unified cache for the expensive stages of the MangaTranslator pipeline.
+    One LRU cache is kept per stage (YOLO detection, SAM masks, LLM translation,
+    manga-ocr, upscaling, inpainting). Cache keys are SHA-256 hex digests built
+    from the stage inputs. ``set_current_image`` clears every cache when it is
+    given an image whose hash differs from the previous one.
+    """
+    def __init__(self):
+        """Create the empty per-stage caches."""
+        # Function-scope import; presumably avoids a circular import - TODO confirm
+        from core.text.font_manager import LRUCache
+        # Stages that keep only their most recent result
+        self._yolo_cache = LRUCache(max_size=1)
+        self._sam_cache = LRUCache(max_size=1)
+        self._translation_cache = LRUCache(max_size=1)
+        # Stages that keep up to 20 results
+        self._manga_ocr_cache = LRUCache(max_size=20)
+        self._upscale_cache = LRUCache(max_size=20)
+        self._inpaint_cache = LRUCache(max_size=20)
+        # Hash of the image last passed to set_current_image (None until then)
+        self._current_image_hash = None
+    @staticmethod
+    def _hash_strings(items: List[str]) -> str:
+        """Hash a list of strings so that element boundaries affect the digest.
+        Every element is fed to the hasher prefixed with its encoded length, so
+        ``["ab", "c"]`` and ``["a", "bc"]`` hash differently. Hashing the plain
+        concatenation would make those two lists collide.
+        Args:
+            items: Strings to hash, in order
+        Returns:
+            str: Hash string (16 chars)
+        """
+        hasher = hashlib.sha256()
+        for item in items:
+            encoded = item.encode()
+            hasher.update(f"{len(encoded)}:".encode())
+            hasher.update(encoded)
+        return hasher.hexdigest()[:16]
+    def _store(
+        self, cache: Any, cache_key: str, value: Any, label: str, verbose: bool
+    ) -> None:
+        """Put ``value`` into ``cache`` and log the resulting cache size.
+        Args:
+            cache: One of the per-stage LRU caches
+            cache_key: Cache key
+            value: Result to cache
+            label: Human-readable name of the cached item, used in the log line
+            verbose: Whether to print verbose logging
+        """
+        cache.put(cache_key, value)
+        log_message(
+            f"  - Cached {label} (cache size: {len(cache.cache)})",
+            verbose=verbose,
+        )
+    def _clear(self, cache: Any, message: str, verbose: bool) -> None:
+        """Empty ``cache`` and log ``message``.
+        Args:
+            cache: One of the per-stage LRU caches
+            message: Log line to emit after clearing
+            verbose: Whether to print verbose logging
+        """
+        cache.cache.clear()
+        log_message(message, verbose=verbose)
+    def _hash_image(self, image: Image.Image) -> str:
+        """Compute a truncated SHA256 hash of a PIL image's mode, size and pixels.
+        RGBA images are first composited onto a white RGB background using their
+        alpha channel; images in any other mode are hashed unchanged.
+        Args:
+            image: PIL Image to hash
+        Returns:
+            str: Hash string (16 chars)
+        """
+        data_image = image
+        if image.mode == "RGBA":
+            data_image = Image.new("RGB", image.size, (255, 255, 255))
+            data_image.paste(image, mask=image.split()[-1])
+        width, height = data_image.size
+        metadata = f"{data_image.mode}_{width}_{height}".encode()
+        return hashlib.sha256(metadata + data_image.tobytes()).hexdigest()[:16]
+    def _hash_numpy(self, array: np.ndarray) -> str:
+        """Compute a truncated SHA256 hash of a numpy array's shape, dtype and bytes.
+        Every empty array maps to the same fixed hash, whatever its shape or dtype.
+        Args:
+            array: Numpy array to hash
+        Returns:
+            str: Hash string (16 chars)
+        """
+        if array.size == 0:
+            return hashlib.sha256(b"empty_array").hexdigest()[:16]
+        header = f"{array.shape}_{array.dtype}".encode()
+        return hashlib.sha256(header + array.tobytes()).hexdigest()[:16]
+    def _hash_dict(self, data: Dict) -> str:
+        """Compute a truncated SHA256 hash of a dictionary's pickled form.
+        Args:
+            data: Dictionary to hash
+        Returns:
+            str: Hash string (16 chars)
+        """
+        # NOTE(review): pickle output is not a canonical encoding (it depends on
+        # key insertion order and on object identity), so equal dicts can hash
+        # differently; that would cause a cache miss rather than a wrong hit.
+        serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)
+        return hashlib.sha256(serialized).hexdigest()[:16]
+    def get_yolo_cache_key(
+        self, image: Image.Image, model_path: str, confidence: float
+    ) -> str:
+        """Compute cache key for YOLO detection.
+        Args:
+            image: Input image
+            model_path: Path to YOLO model (only the path string is hashed)
+            confidence: Confidence threshold (formatted to 3 decimals)
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        model_hash = hashlib.sha256(model_path.encode()).hexdigest()[:16]
+        raw_key = f"yolo_{image_hash}_{model_hash}_conf{confidence:.3f}"
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_yolo_detection(self, cache_key: str) -> Optional[Any]:
+        """Get cached YOLO detection result.
+        Args:
+            cache_key: Cache key
+        Returns:
+            Whatever the YOLO cache's ``get`` returns for this key
+        """
+        return self._yolo_cache.get(cache_key)
+    def set_yolo_detection(
+        self, cache_key: str, results: Any, verbose: bool = False
+    ) -> None:
+        """Cache YOLO detection result.
+        Args:
+            cache_key: Cache key
+            results: YOLO detection results to cache
+            verbose: Whether to print verbose logging
+        """
+        self._store(self._yolo_cache, cache_key, results, "YOLO detection", verbose)
+    def get_sam_cache_key(
+        self,
+        image: Image.Image,
+        yolo_boxes: Any,
+        use_sam2: bool = True,
+        conjoined_detection: bool = True,
+        conjoined_confidence: float = 0.35,
+    ) -> str:
+        """Compute cache key for SAM segmentation.
+        Args:
+            image: Input image
+            yolo_boxes: YOLO detection boxes; objects with a ``cpu`` attribute are
+                converted via ``.cpu().numpy()``, anything else via ``np.array``
+            use_sam2: Whether SAM2 is enabled
+            conjoined_detection: Whether conjoined detection is enabled
+            conjoined_confidence: Confidence threshold for conjoined detection
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        boxes_np = (
+            yolo_boxes.cpu().numpy()
+            if hasattr(yolo_boxes, "cpu")
+            else np.array(yolo_boxes)
+        )
+        boxes_hash = self._hash_numpy(boxes_np)
+        # The model id is a fixed string; it is part of the key in every case
+        sam_model_id = "facebook/sam2.1-hiera-large"
+        model_hash = hashlib.sha256(sam_model_id.encode()).hexdigest()[:8]
+        raw_key = (
+            f"sam_{image_hash}_{boxes_hash}_{model_hash}_sam2{int(use_sam2)}"
+            f"_conjoined{int(conjoined_detection)}"
+            f"_conf{conjoined_confidence:.3f}"
+        )
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_sam_masks(self, cache_key: str) -> Optional[Any]:
+        """Get cached SAM masks.
+        Args:
+            cache_key: Cache key
+        Returns:
+            Whatever the SAM cache's ``get`` returns for this key
+        """
+        return self._sam_cache.get(cache_key)
+    def set_sam_masks(self, cache_key: str, masks: Any, verbose: bool = False) -> None:
+        """Cache SAM masks.
+        Args:
+            cache_key: Cache key
+            masks: SAM masks to cache
+            verbose: Whether to print verbose logging
+        """
+        self._store(self._sam_cache, cache_key, masks, "SAM masks", verbose)
+    def _is_deterministic(self, config) -> bool:
+        """Check if translation config is treated as deterministic.
+        Args:
+            config: TranslationConfig object
+        Returns:
+            bool: True if temperature is 0.0, top_k is 1, or top_p is 0.0
+        """
+        return config.temperature == 0.0 or config.top_k == 1 or config.top_p == 0.0
+    def get_translation_cache_key(
+        self,
+        images_b64: list,
+        full_image_b64: str,
+        config,
+    ) -> Optional[str]:
+        """Compute cache key for LLM translation.
+        Only returns a key if the config is deterministic.
+        Args:
+            images_b64: List of base64 encoded bubble images
+            full_image_b64: Base64 encoded full page image
+            config: TranslationConfig object
+        Returns:
+            str: Cache key, or None if not deterministic
+        """
+        if not self._is_deterministic(config):
+            return None
+        # Length-prefixed hashing keeps bubble boundaries part of the key
+        images_hash = self._hash_strings(images_b64)
+        full_hash = hashlib.sha256(full_image_b64.encode()).hexdigest()[:16]
+        instructions = config.special_instructions
+        cache_params = {
+            "provider": config.provider,
+            "model_name": config.model_name,
+            "input_language": config.input_language,
+            "output_language": config.output_language,
+            "reading_direction": config.reading_direction,
+            "translation_mode": config.translation_mode,
+            "send_full_page_context": config.send_full_page_context,
+            "temperature": config.temperature,
+            "top_k": config.top_k,
+            "top_p": config.top_p,
+            "ocr_method": config.ocr_method,
+            "special_instructions": instructions.strip() if instructions else None,
+            "max_tokens": config.max_tokens,
+            "reasoning_effort": config.reasoning_effort,
+            "effort": config.effort,
+        }
+        # These settings are read defensively: a config lacking one contributes None
+        for optional_name in (
+            "media_resolution",
+            "media_resolution_bubbles",
+            "media_resolution_context",
+            "enable_web_search",
+            "upscale_method",
+            "bubble_min_side_pixels",
+            "context_image_max_side_pixels",
+        ):
+            cache_params[optional_name] = getattr(config, optional_name, None)
+        config_hash = self._hash_dict(cache_params)
+        raw_key = f"trans_{images_hash}_{full_hash}_{config_hash}"
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_translation(self, cache_key: Optional[str]) -> Optional[list]:
+        """Get cached translation results.
+        Args:
+            cache_key: Cache key (can be None if not deterministic)
+        Returns:
+            None when ``cache_key`` is None, otherwise whatever the translation
+            cache's ``get`` returns for this key
+        """
+        if cache_key is None:
+            return None
+        return self._translation_cache.get(cache_key)
+    def set_translation(
+        self, cache_key: Optional[str], translations: list, verbose: bool = False
+    ) -> None:
+        """Cache translation results; does nothing when ``cache_key`` is None.
+        Args:
+            cache_key: Cache key (can be None if not deterministic)
+            translations: Translation results to cache
+            verbose: Whether to print verbose logging
+        """
+        if cache_key is None:
+            return
+        self._store(
+            self._translation_cache, cache_key, translations, "translation", verbose
+        )
+    def get_manga_ocr_cache_key(
+        self, images_b64: List[str], total_elements: int
+    ) -> Optional[str]:
+        """Compute cache key for manga-ocr results.
+        Args:
+            images_b64: List of base64-encoded cropped images.
+            total_elements: Expected number of OCR outputs.
+        Returns:
+            str: Cache key (this method never returns None)
+        """
+        # Length-prefixed hashing keeps crop boundaries part of the key
+        images_hash = self._hash_strings(images_b64)
+        raw_key = f"mocr_{images_hash}_n{total_elements}"
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_manga_ocr_result(self, cache_key: Optional[str]) -> Optional[list]:
+        """Get cached manga-ocr results (None when ``cache_key`` is None)."""
+        if cache_key is None:
+            return None
+        return self._manga_ocr_cache.get(cache_key)
+    def set_manga_ocr_result(
+        self, cache_key: Optional[str], results: list, verbose: bool = False
+    ) -> None:
+        """Cache manga-ocr results (including failure markers).
+        Does nothing when ``cache_key`` is None.
+        """
+        if cache_key is None:
+            return
+        self._store(
+            self._manga_ocr_cache, cache_key, results, "manga-ocr result", verbose
+        )
+    def get_upscale_cache_key(
+        self, image: Image.Image, factor: float, model_type: str = "model"
+    ) -> str:
+        """Compute cache key for image upscaling by a factor.
+        Args:
+            image: Input image
+            factor: Upscaling factor (formatted to 3 decimals)
+            model_type: Model type identifier ("model" or "model_lite")
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        raw_key = f"upscale_{image_hash}_factor{factor:.3f}_model{model_type}"
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_upscale_dimension_cache_key(
+        self, image: Image.Image, target: int, mode: str, model_type: str = "model"
+    ) -> str:
+        """Compute cache key for image upscaling to dimension.
+        Args:
+            image: Input image
+            target: Target dimension
+            mode: Upscaling mode ('max' or 'min')
+            model_type: Model type identifier ("model" or "model_lite")
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        raw_key = (
+            f"upscale_dim_{image_hash}_target{target}_mode{mode}_model{model_type}"
+        )
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_bubble_processing_cache_key(
+        self, image: Image.Image, target: int, mode: str, model_type: str = "model"
+    ) -> str:
+        """Compute cache key for complete bubble processing (upscale + color match).
+        Args:
+            image: Input image
+            target: Target dimension
+            mode: Upscaling mode ('max' or 'min')
+            model_type: Model type identifier ("model" or "model_lite")
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        raw_key = (
+            f"bubble_proc_{image_hash}_target{target}_mode{mode}_model{model_type}"
+        )
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def get_upscaled_image(self, cache_key: str) -> Optional[Image.Image]:
+        """Get cached upscaled image.
+        Args:
+            cache_key: Cache key
+        Returns:
+            Whatever the upscale cache's ``get`` returns for this key
+        """
+        return self._upscale_cache.get(cache_key)
+    def set_upscaled_image(
+        self, cache_key: str, image: Image.Image, verbose: bool = False
+    ) -> None:
+        """Cache upscaled image.
+        Args:
+            cache_key: Cache key
+            image: Upscaled image to cache
+            verbose: Whether to print verbose logging
+        """
+        self._store(self._upscale_cache, cache_key, image, "upscaled image", verbose)
+    def get_inpaint_cache_key(
+        self,
+        image: Image.Image,
+        mask: np.ndarray,
+        seed: int,
+        num_inference_steps: int,
+        residual_diff_threshold: float,
+        guidance_scale: float,
+        prompt: str,
+        ocr_params: Optional[Dict] = None,
+    ) -> str:
+        """Compute cache key for Flux inpainting.
+        Args:
+            image: Input image
+            mask: Mask array
+            seed: Random seed
+            num_inference_steps: Number of inference steps
+            residual_diff_threshold: Residual diff threshold (3 decimals)
+            guidance_scale: Guidance scale (2 decimals)
+            prompt: Inpainting prompt
+            ocr_params: Optional OCR parameters dict (e.g., {'min_size': 200})
+        Returns:
+            str: Cache key
+        """
+        image_hash = self._hash_image(image)
+        mask_hash = self._hash_numpy(mask)
+        # The prompt is hashed on its own so free text cannot imitate the
+        # OCR-parameter suffix that follows it in the key string
+        prompt_hash = hashlib.sha256(prompt.encode()).hexdigest()[:16]
+        # Include OCR parameters in cache key if provided
+        ocr_params_str = ""
+        if ocr_params:
+            ocr_params_str = "_" + repr(sorted(ocr_params.items()))
+        raw_key = (
+            f"inpaint_{image_hash}_{mask_hash}_"
+            f"seed{seed}_steps{num_inference_steps}_"
+            f"thresh{residual_diff_threshold:.3f}_"
+            f"guide{guidance_scale:.2f}_"
+            f"{prompt_hash}{ocr_params_str}"
+        )
+        return hashlib.sha256(raw_key.encode()).hexdigest()
+    def should_use_inpaint_cache(self, seed: int) -> bool:
+        """Determine if inpainting caching should be used.
+        Args:
+            seed: Random seed value
+        Returns:
+            bool: True if caching is enabled (seed != -1)
+        """
+        return seed != -1
+    def get_inpainted_image(self, cache_key: str) -> Optional[Image.Image]:
+        """Get cached inpainted image.
+        Args:
+            cache_key: Cache key
+        Returns:
+            Whatever the inpaint cache's ``get`` returns for this key
+        """
+        return self._inpaint_cache.get(cache_key)
+    def set_inpainted_image(
+        self, cache_key: str, image: Image.Image, verbose: bool = False
+    ) -> None:
+        """Cache inpainted image.
+        Args:
+            cache_key: Cache key
+            image: Inpainted image to cache
+            verbose: Whether to print verbose logging
+        """
+        self._store(self._inpaint_cache, cache_key, image, "inpainted image", verbose)
+    def clear_yolo_cache(self, verbose: bool = False) -> None:
+        """Clear YOLO detection cache."""
+        self._clear(self._yolo_cache, "YOLO cache cleared", verbose)
+    def clear_sam_cache(self, verbose: bool = False) -> None:
+        """Clear SAM masks cache."""
+        self._clear(self._sam_cache, "SAM cache cleared", verbose)
+    def clear_translation_cache(self, verbose: bool = False) -> None:
+        """Clear translation cache."""
+        self._clear(self._translation_cache, "Translation cache cleared", verbose)
+    def clear_manga_ocr_cache(self, verbose: bool = False) -> None:
+        """Clear manga-ocr cache."""
+        self._clear(self._manga_ocr_cache, "manga-ocr cache cleared", verbose)
+    def clear_upscale_cache(self, verbose: bool = False) -> None:
+        """Clear upscaling cache."""
+        self._clear(self._upscale_cache, "Upscale cache cleared", verbose)
+    def clear_inpaint_cache(self, verbose: bool = False) -> None:
+        """Clear inpainting cache."""
+        self._clear(self._inpaint_cache, "Inpaint cache cleared", verbose)
+    def clear_all(self) -> None:
+        """Clear all caches and log a single summary line."""
+        # Goes through the public clear_* methods, with their own logging off
+        self.clear_yolo_cache(verbose=False)
+        self.clear_sam_cache(verbose=False)
+        self.clear_translation_cache(verbose=False)
+        self.clear_manga_ocr_cache(verbose=False)
+        self.clear_upscale_cache(verbose=False)
+        self.clear_inpaint_cache(verbose=False)
+        log_message("All caches cleared", always_print=True)
+    def set_current_image(self, image: Image.Image, verbose: bool = False) -> None:
+        """Set the current image being processed and clear caches if different.
+        Args:
+            image: The current image being processed
+            verbose: Whether to print verbose logging
+        """
+        image_hash = self._hash_image(image)
+        previous_hash = self._current_image_hash
+        if previous_hash == image_hash:
+            # Same image - no action needed
+            log_message("Same image detected - reusing caches", verbose=verbose)
+            return
+        if previous_hash is None:
+            # First image
+            log_message("Cache initialized for new image", verbose=verbose)
+        else:
+            # Different image detected - clear all caches
+            log_message(
+                "Different image detected - clearing all caches", verbose=verbose
+            )
+            self.clear_all()
+        self._current_image_hash = image_hash
+    def get_cache_stats(self) -> dict:
+        """Get statistics about cache sizes.
+        Returns:
+            dict: Number of entries per cache, keyed by stage name
+        """
+        caches = (
+            ("yolo", self._yolo_cache),
+            ("sam", self._sam_cache),
+            ("translation", self._translation_cache),
+            ("manga_ocr", self._manga_ocr_cache),
+            ("upscale", self._upscale_cache),
+            ("inpaint", self._inpaint_cache),
+        )
+        return {name: len(cache.cache) for name, cache in caches}
+# Module-level singleton, created on the first get_cache() call
+_global_cache = None
+def get_cache() -> UnifiedCache:
+    """Return the shared UnifiedCache, constructing it on first use.
+    Returns:
+        UnifiedCache: The global cache instance
+    """
+    global _global_cache
+    if _global_cache is not None:
+        return _global_cache
+    _global_cache = UnifiedCache()
+    return _global_cache

core/config.py ADDED Viewed

	@@ -0,0 +1,240 @@

+from dataclasses import dataclass, field
+from typing import Optional
+import torch
+from core.llm_defaults import DEFAULT_LLM_PROVIDER, get_provider_sampling_defaults
+@dataclass
+class DetectionConfig:
+    """Configuration for speech bubble detection.
+    Every field has a default, so ``DetectionConfig()`` is a usable configuration.
+    """
+    # Confidence settings; presumably detector score thresholds - TODO confirm
+    confidence: float = 0.6
+    conjoined_confidence: float = 0.35
+    panel_confidence: float = 0.25
+    # Feature switches, all enabled by default
+    use_sam2: bool = True
+    conjoined_detection: bool = True
+    use_panel_sorting: bool = True
+    use_osb_text_verification: bool = True
+@dataclass
+class CleaningConfig:
+    """Configuration for speech bubble cleaning.
+    Every field has a default, so ``CleaningConfig()`` is a usable configuration.
+    """
+    # Thresholding settings; Otsu thresholding is off by default
+    thresholding_value: int = 190
+    use_otsu_threshold: bool = False
+    # NOTE(review): presumably pixels trimmed from each bubble ROI edge - confirm
+    roi_shrink_px: int = 4
+    inpaint_colored_bubbles: bool = True
+# Provider and sampling defaults are resolved once, at import time, from llm_defaults
+_DEFAULT_TRANSLATION_PROVIDER = DEFAULT_LLM_PROVIDER
+_DEFAULT_SAMPLING = get_provider_sampling_defaults(_DEFAULT_TRANSLATION_PROVIDER)
+@dataclass
+class TranslationConfig:
+    """Configuration for text translation.
+    Every field has a default. The sampling defaults (temperature, top_p, top_k)
+    come from ``get_provider_sampling_defaults`` for the default provider.
+    """
+    provider: str = _DEFAULT_TRANSLATION_PROVIDER
+    # Per-provider API keys; empty string by default
+    google_api_key: str = ""
+    openai_api_key: str = ""
+    anthropic_api_key: str = ""
+    xai_api_key: str = ""
+    deepseek_api_key: str = ""
+    zai_api_key: str = ""
+    moonshot_api_key: str = ""
+    openrouter_api_key: str = ""
+    # OpenAI-compatible endpoint; the default URL points at localhost
+    openai_compatible_url: str = "http://localhost:1234/v1"
+    openai_compatible_api_key: Optional[str] = ""
+    model_name: str = "gemini-2.5-flash"
+    # NOTE(review): presumably maps provider name -> selected model - confirm
+    provider_models: dict[str, Optional[str]] = field(default_factory=dict)
+    # Sampling parameters, coerced to float/int from the provider defaults
+    temperature: float = float(_DEFAULT_SAMPLING["temperature"])
+    top_p: float = float(_DEFAULT_SAMPLING["top_p"])
+    top_k: int = int(_DEFAULT_SAMPLING["top_k"])
+    max_tokens: Optional[int] = (
+        None  # None = use default logic (16384 for reasoning, 4096 otherwise)
+    )
+    input_language: str = "Japanese"
+    output_language: str = "English"
+    reading_direction: str = "rtl"
+    translation_mode: str = "one-step"
+    reasoning_effort: Optional[str] = (
+        None  # Default: Google uses "auto", Anthropic uses "none", others use "medium"
+    )
+    effort: Optional[str] = (
+        None  # Claude Opus 4.5 only: Controls token spending eagerness (high/medium/low)
+    )
+    send_full_page_context: bool = True
+    upscale_method: str = "model_lite"  # "model", "model_lite", "lanczos", or "none"
+    enable_web_search: bool = (
+        False  # Enable model's built-in web search for up-to-date information. OpenRouter uses its own web search tool.
+    )
+    media_resolution: str = (
+        "auto"  # Only available via Google provider (auto/high/medium/low)
+    )
+    media_resolution_bubbles: str = "auto"  # Gemini 3 models
+    media_resolution_context: str = "auto"  # Gemini 3 models
+    # Pixel-size settings for images sent to the model
+    bubble_min_side_pixels: int = 128
+    context_image_max_side_pixels: int = 1024
+    osb_min_side_pixels: int = 128
+    special_instructions: Optional[str] = None
+    ocr_method: str = "LLM"  # "LLM" or "manga-ocr"
+@dataclass
+class RenderingConfig:
+    """Configuration for rendering translated text.
+    Every field has a default, so ``RenderingConfig()`` is a usable configuration.
+    """
+    font_dir: str = "./fonts"
+    # Font size bounds
+    max_font_size: int = 16
+    min_font_size: int = 8
+    line_spacing_mult: float = 1.0
+    # Glyph rendering options; subpixel rendering, hinting and ligatures default off
+    use_subpixel_rendering: bool = False
+    font_hinting: str = "none"
+    use_ligatures: bool = False
+    # Hyphenation / line-breaking parameters
+    hyphenate_before_scaling: bool = True
+    hyphen_penalty: float = 1000.0
+    hyphenation_min_word_length: int = 8
+    badness_exponent: float = 3.0
+    padding_pixels: float = 5.0
+    outline_width: float = 0.0
+    supersampling_factor: int = 4
+@dataclass
+class OutsideTextConfig:
+    """Configuration for outside speech bubble text detection and removal.
+    The feature is disabled by default (``enabled = False``). Fields prefixed
+    ``osb_`` presumably apply to outside-speech-bubble text - TODO confirm.
+    """
+    enabled: bool = False
+    # Page-number filtering settings; filtering is off by default
+    enable_page_number_filtering: bool = False
+    page_filter_margin_threshold: float = 0.1
+    page_filter_min_area_ratio: float = 0.05
+    seed: int = 1  # -1 = random
+    huggingface_token: str = ""  # Required for Flux Kontext model downloads
+    force_cv2_inpainting: bool = False
+    flux_num_inference_steps: int = 8
+    flux_residual_diff_threshold: float = 0.15
+    osb_confidence: float = 0.6
+    osb_font_name: Optional[str] = None  # None = use main font as fallback
+    osb_max_font_size: int = 64
+    osb_min_font_size: int = 12
+    osb_use_ligatures: bool = False
+    osb_outline_width: float = 3.0
+    osb_line_spacing: float = 1.0
+    osb_use_subpixel_rendering: bool = False
+    osb_font_hinting: str = "none"
+    bbox_expansion_percent: float = 0.1
+    text_box_proximity_ratio: float = 0.02  # 2% of image dimension
+    flux_guidance_scale: float = 2.5
+    flux_prompt: str = "Remove all text."
+@dataclass
+class OutputConfig:
+    """Configuration for saving output images.
+    Every field has a default, so ``OutputConfig()`` is a usable configuration.
+    """
+    # Encoder settings for the saved file
+    jpeg_quality: int = 95
+    png_compression: int = 2
+    output_format: str = "auto"
+    # Final-image upscaling; off by default
+    upscale_final_image: bool = False
+    image_upscale_factor: float = 2.0
+    image_upscale_model: str = "model_lite"  # "model" or "model_lite"
+@dataclass
+class MangaTranslatorConfig:
+    """Main configuration for the MangaTranslator pipeline."""
+    yolo_model_path: str
+    detection: DetectionConfig = field(default_factory=DetectionConfig)
+    cleaning: CleaningConfig = field(default_factory=CleaningConfig)
+    translation: TranslationConfig = field(default_factory=TranslationConfig)
+    rendering: RenderingConfig = field(default_factory=RenderingConfig)
+    output: OutputConfig = field(default_factory=OutputConfig)
+    outside_text: OutsideTextConfig = field(default_factory=OutsideTextConfig)
+    preprocessing: "PreprocessingConfig" = field(
+        default_factory=lambda: PreprocessingConfig()
+    )
+    verbose: bool = False
+    device: Optional[torch.device] = None
+    cleaning_only: bool = False
+    upscaling_only: bool = False
+    test_mode: bool = False
+    processing_scale: float = 1.0
+    def __post_init__(self):
+        # Load API keys from environment variables if not already set
+        import os
+        if not self.translation.google_api_key:
+            self.translation.google_api_key = os.environ.get("GOOGLE_API_KEY", "")
+        if not self.translation.openai_api_key:
+            self.translation.openai_api_key = os.environ.get("OPENAI_API_KEY", "")
+        if not self.translation.anthropic_api_key:
+            self.translation.anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "")
+        if not self.translation.xai_api_key:
+            self.translation.xai_api_key = os.environ.get("XAI_API_KEY", "")
+        if not self.translation.deepseek_api_key:
+            self.translation.deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY", "")
+        if not self.translation.moonshot_api_key:
+            self.translation.moonshot_api_key = os.environ.get("MOONSHOT_API_KEY", "")
+        if not self.translation.openrouter_api_key:
+            self.translation.openrouter_api_key = os.environ.get(
+                "OPENROUTER_API_KEY", ""
+            )
+        if (
+            not self.translation.openai_compatible_api_key
+        ):  # Check if it's None or empty string
+            self.translation.openai_compatible_api_key = os.environ.get(
+                "OPENAI_COMPATIBLE_API_KEY", ""
+            )
+        # Autodetect device if not specified
+        if self.device is None:
+            if torch.cuda.is_available():
+                self.device = torch.device("cuda")
+            elif torch.backends.mps.is_available():
+                self.device = torch.device("mps")
+            else:
+                self.device = torch.device("cpu")
+        pass
+@dataclass
+class PreprocessingConfig:
+    """Configuration for image preprocessing before detection/cleaning."""
+    # This class is declared after MangaTranslatorConfig, which refers to it via a
+    # string annotation and a lambda default_factory.
+    enabled: bool = False
+    factor: float = 2.0  # presumably a scale multiplier - TODO confirm against the preprocessing code
+    auto_scale: bool = True  # NOTE(review): interaction with `factor` is not visible from here
+def calculate_reasoning_budget(total_tokens: int, effort_level: str) -> int:
+    """
+    Calculate reasoning token budget based on effort level.
+    Args:
+        total_tokens: Total available tokens (typically max_tokens)
+        effort_level: Reasoning effort level ("high", "medium", "low", "minimal", "auto", or "none")
+    Returns:
+        int: Calculated budget in tokens
+        - "high": 80% of total_tokens
+        - "medium": 50% of total_tokens
+        - "low": 20% of total_tokens
+        - "minimal": 10% of total_tokens
+        - "auto" or "none": Returns 0 (caller should handle these cases separately)
+    """
+    if effort_level == "high":
+        return int(total_tokens * 0.8)
+    elif effort_level == "medium":
+        return int(total_tokens * 0.5)
+    elif effort_level == "low":
+        return int(total_tokens * 0.2)
+    elif effort_level == "minimal":
+        return int(total_tokens * 0.1)
+    else:
+        # "auto" or "none" - return 0, caller should handle these cases
+        return 0

core/image/__init__.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+Image processing and analysis modules for MangaTranslator.
+This subpackage contains modules for:
+- Speech bubble detection (YOLO, SAM)
+- OCR text detection outside bubbles
+- Image cleaning and preprocessing
+- Inpainting for text removal
+- General image utilities
+"""
+from .cleaning import clean_speech_bubbles
+from .detection import detect_speech_bubbles
+from .image_utils import (
+    calculate_centroid_expansion_box,
+    convert_image_to_target_mode,
+    cv2_to_pil,
+    pil_to_cv2,
+    process_bubble_image_cached,
+    resize_to_max_side,
+    save_image_with_compression,
+    upscale_image,
+    upscale_image_to_dimension,
+)
+from .inpainting import FluxKontextInpainter
+from .ocr_detection import OutsideTextDetector
+# Public API of the subpackage: exactly the names imported above, in import order.
+__all__ = [
+    "clean_speech_bubbles",
+    "detect_speech_bubbles",
+    "calculate_centroid_expansion_box",
+    "convert_image_to_target_mode",
+    "cv2_to_pil",
+    "pil_to_cv2",
+    "process_bubble_image_cached",
+    "resize_to_max_side",
+    "save_image_with_compression",
+    "upscale_image",
+    "upscale_image_to_dimension",
+    "FluxKontextInpainter",
+    "OutsideTextDetector",
+]

core/image/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (1.29 kB). View file

core/image/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (1.11 kB). View file

core/image/__pycache__/cleaning.cpython-311.pyc ADDED Viewed

Binary file (32 kB). View file

core/image/__pycache__/cleaning.cpython-314.pyc ADDED Viewed

Binary file (30.5 kB). View file

core/image/__pycache__/detection.cpython-311.pyc ADDED Viewed

Binary file (40.2 kB). View file

core/image/__pycache__/detection.cpython-314.pyc ADDED Viewed

Binary file (37.5 kB). View file

core/image/__pycache__/image_utils.cpython-311.pyc ADDED Viewed

Binary file (36.2 kB). View file

core/image/__pycache__/image_utils.cpython-314.pyc ADDED Viewed

Binary file (35.2 kB). View file

core/image/__pycache__/inpainting.cpython-311.pyc ADDED Viewed

Binary file (32.5 kB). View file

core/image/__pycache__/ocr_detection.cpython-311.pyc ADDED Viewed

Binary file (32.4 kB). View file

core/image/__pycache__/sorting.cpython-311.pyc ADDED Viewed

Binary file (17.6 kB). View file

core/image/cleaning.py ADDED Viewed

	@@ -0,0 +1,849 @@

+import gc
+import os
+import random
+import tempfile
+from pathlib import Path
+from typing import Optional, Union
+import cv2
+import numpy as np
+from PIL import Image
+from core.scaling import scale_area, scale_kernel, scale_scalar
+from utils.exceptions import CleaningError, ImageProcessingError, ValidationError
+from utils.logging import log_message
+from .detection import detect_speech_bubbles
+from .image_utils import pil_to_cv2
+from .inpainting import FluxKontextInpainter
+# Cleaning parameters
+GRAYSCALE_MIDPOINT = 128  # Threshold for determining black vs white bubbles
+MIN_CONTOUR_AREA = 50  # Minimum area threshold for filtering small contours
+DILATION_KERNEL_SIZE = (7, 7)  # Kernel size for morphological dilation
+EROSION_KERNEL_SIZE = (5, 5)  # Kernel size for morphological erosion
+DISTANCE_TRANSFORM_MASK_SIZE = 5  # Mask size for distance transform
+# Classification thresholds for colored bubbles
+# In process_single_bubble, "bright" means a sampled gray value >= 245 and
+# "dark" means a sampled gray value <= 15; all ratios are fractions of the
+# sampled interior pixels.
+BRIGHT_RATIO_THRESHOLD = 0.50  # bright share at or above this => white bubble
+DARK_RATIO_THRESHOLD = 0.50  # dark share at or above this => black bubble
+BRIGHT_DOM_RATIO_MIN = 0.30  # alt. white rule: min share of the (bright) histogram mode
+DARK_DOM_RATIO_MIN = 0.30  # alt. black rule: min share of the (dark) histogram mode
+BRIGHT_DARK_RATIO_MAX = 0.10  # alt. white rule: max dark share still tolerated
+DARK_BRIGHT_RATIO_MAX = 0.10  # alt. black rule: max bright share still tolerated
+def _normalize_mask(mask: np.ndarray) -> np.ndarray:
+    """
+    Ensure mask is uint8 binary (0/255).
+    """
+    if mask.dtype != np.uint8:
+        mask = mask.astype(np.uint8)
+    return np.where(mask > 0, 255, 0).astype(np.uint8)
+def process_single_bubble(
+    base_mask,
+    img_gray,
+    img_height,
+    img_width,
+    thresholding_value,
+    use_otsu_threshold,
+    roi_shrink_px,
+    verbose,
+    detection_bbox=None,
+    is_sam=False,
+    dilation_kernel=None,
+    constraint_erosion_kernel=None,
+    min_contour_area: float = MIN_CONTOUR_AREA,
+    classify_colored: bool = False,
+):
+    """
+    Process a single speech bubble mask to build its cleaning mask and determine fill color.
+    The bubble is classified black or white from its mean brightness, thresholded
+    inside a slightly dilated ROI, and the largest thresholded contour whose
+    centroid lies inside the eroded bubble mask is filled to form the result mask.
+    Args:
+        base_mask (numpy.ndarray): The base mask (SAM or YOLO) for the bubble
+        img_gray (numpy.ndarray): Grayscale image
+        img_height (int): Image height
+        img_width (int): Image width
+        thresholding_value (int): Fixed threshold value for text detection
+        use_otsu_threshold (bool): Whether to use Otsu's method for thresholding
+        roi_shrink_px (int): Pixels to shrink ROI inwards
+        verbose (bool): Whether to print verbose messages
+        detection_bbox: Bounding box for logging (optional)
+        is_sam (bool): Whether this is a SAM mask (for logging)
+        dilation_kernel (numpy.ndarray, optional): Structuring element used to dilate
+            base_mask into the ROI. Defaults to an ellipse of DILATION_KERNEL_SIZE.
+        constraint_erosion_kernel (numpy.ndarray, optional): Structuring element used to
+            erode base_mask for contour validation and color sampling. Defaults to an
+            ellipse of EROSION_KERNEL_SIZE.
+        min_contour_area (float): Contours whose area is at or below this value are ignored.
+        classify_colored (bool): When True, sample the bubble interior to classify it as
+            white, black or colored. When False, the fill color comes from the mean
+            brightness only and is_colored is always False.
+    Returns:
+        tuple: (final_mask, fill_color_bgr, is_colored, sample_color_bgr, text_bbox)
+            - final_mask: uint8 image-sized mask with the largest validated contour filled
+            - fill_color_bgr: (0, 0, 0) or (255, 255, 255)
+            - is_colored: True only when classify_colored is set and neither the white
+              nor the black rule matched
+            - sample_color_bgr: equals fill_color_bgr, or a gray triple built from the
+              histogram mode for colored bubbles
+            - text_bbox: (x1, y1, x2, y2) bounding rectangle of the filled contour
+    Raises:
+        CleaningError: If the mask is empty, no contour passes validation, or any
+            other exception occurs (the original exception is chained as the cause)
+    """
+    try:
+        base_mask = _normalize_mask(base_mask)
+        if dilation_kernel is None:
+            dilation_kernel = cv2.getStructuringElement(
+                cv2.MORPH_ELLIPSE, DILATION_KERNEL_SIZE
+            )
+        if constraint_erosion_kernel is None:
+            constraint_erosion_kernel = cv2.getStructuringElement(
+                cv2.MORPH_ELLIPSE, EROSION_KERNEL_SIZE
+            )
+        masked_pixels = img_gray[base_mask == 255]
+        if masked_pixels.size == 0:
+            log_message(
+                f"{'[SAM]' if is_sam else ''}Skipping detection {detection_bbox}: empty mask",
+                verbose=verbose,
+            )
+            raise CleaningError(f"Empty mask for detection {detection_bbox}")
+        # Mean brightness inside the mask decides the provisional black/white fill
+        mean_pixel_value = np.mean(masked_pixels)
+        is_black_bubble = mean_pixel_value < GRAYSCALE_MIDPOINT
+        fill_color_bgr = (0, 0, 0) if is_black_bubble else (255, 255, 255)
+        is_colored_bubble = False
+        sample_color_bgr: tuple[int, int, int] = fill_color_bgr
+        log_message(
+            f"{'[SAM]' if is_sam else ''}Detection {detection_bbox}: "
+            f"{'Black' if is_black_bubble else 'White'} bubble (mean={mean_pixel_value:.1f})",
+            verbose=verbose,
+        )
+        # Work on a dilated copy of the mask; pixels outside the ROI stay 0
+        roi_mask = cv2.dilate(base_mask, dilation_kernel, iterations=1)
+        roi_gray = np.zeros_like(img_gray)
+        roi_indices = roi_mask == 255
+        roi_gray[roi_indices] = img_gray[roi_indices]
+        # Invert for black bubbles to detect text properly
+        roi_for_thresholding = (
+            cv2.bitwise_not(roi_gray) if is_black_bubble else roi_gray
+        )
+        thresholded_roi = np.zeros_like(img_gray)
+        if use_otsu_threshold:
+            # Derive the threshold from ROI pixels only, then apply it to the full-size ROI image
+            roi_pixels_for_otsu = roi_for_thresholding[roi_indices]
+            thresh_val, _ = cv2.threshold(
+                roi_pixels_for_otsu, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
+            )
+            log_message(
+                f"{'[SAM]' if is_sam else ''}  Otsu threshold: {thresh_val}",
+                verbose=verbose,
+            )
+            _, thresholded_roi = cv2.threshold(
+                roi_for_thresholding, thresh_val, 255, cv2.THRESH_BINARY
+            )
+        else:
+            _, thresholded_roi = cv2.threshold(
+                roi_for_thresholding, thresholding_value, 255, cv2.THRESH_BINARY
+            )
+        thresholded_roi = cv2.bitwise_and(thresholded_roi, roi_mask)
+        # Shrink ROI to avoid border artifacts
+        dist_map = cv2.distanceTransform(
+            roi_mask, cv2.DIST_L2, DISTANCE_TRANSFORM_MASK_SIZE
+        )
+        shrunk_roi_mask = np.where(dist_map >= float(roi_shrink_px), 255, 0).astype(
+            np.uint8
+        )
+        thresholded_roi = cv2.bitwise_and(thresholded_roi, shrunk_roi_mask)
+        # Use eroded mask to avoid erasing bubble outlines
+        eroded_constraint_mask = cv2.erode(
+            base_mask, constraint_erosion_kernel, iterations=1
+        )
+        contours, _ = cv2.findContours(
+            thresholded_roi, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+        )
+        # Keep a contour only if it is large enough and its centroid lies inside
+        # the eroded bubble mask
+        valid_contours = []
+        for cnt in contours:
+            area = cv2.contourArea(cnt)
+            if area <= min_contour_area:
+                continue
+            m = cv2.moments(cnt)
+            if m["m00"] == 0:
+                continue
+            cx = int(m["m10"] / m["m00"])
+            cy = int(m["m01"] / m["m00"])
+            if (
+                0 <= cx < img_width
+                and 0 <= cy < img_height
+                and eroded_constraint_mask[cy, cx] == 255
+            ):
+                valid_contours.append(cnt)
+        log_message(
+            f"{'[SAM]' if is_sam else ''}Detection {detection_bbox}: {len(valid_contours)} text fragments found",
+            verbose=verbose,
+        )
+        text_bbox = None
+        if valid_contours:
+            validated_mask = np.zeros((img_height, img_width), dtype=np.uint8)
+            cv2.drawContours(
+                validated_mask, valid_contours, -1, 255, thickness=cv2.FILLED
+            )
+            # Re-contour to get clean boundary from validated mask
+            boundary_contours, _ = cv2.findContours(
+                validated_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+            )
+            if boundary_contours:
+                # Only the single largest filled region becomes the final mask
+                largest_contour = max(boundary_contours, key=cv2.contourArea)
+                final_mask = np.zeros((img_height, img_width), dtype=np.uint8)
+                cv2.drawContours(
+                    final_mask, [largest_contour], -1, 255, thickness=cv2.FILLED
+                )
+                x, y, w, h = cv2.boundingRect(largest_contour)
+                text_bbox = (x, y, x + w, y + h)
+                if classify_colored:
+                    # Sample bubble interior excluding text box and outline to determine if colored
+                    sampling_mask = cv2.erode(
+                        base_mask, constraint_erosion_kernel, iterations=2
+                    )
+                    if text_bbox:
+                        x1, y1, x2, y2 = text_bbox
+                        x1 = max(0, x1)
+                        y1 = max(0, y1)
+                        x2 = min(img_width, x2)
+                        y2 = min(img_height, y2)
+                        sampling_mask[y1:y2, x1:x2] = 0
+                    sample_pixels = img_gray[sampling_mask == 255]
+                    # Fall back to the whole bubble when nothing is left to sample
+                    if sample_pixels.size == 0:
+                        sample_pixels = masked_pixels
+                    sample_values = sample_pixels.astype(np.uint8).flatten()
+                    # Histogram mode and its share, plus the near-white (>= 245)
+                    # and near-black (<= 15) shares, drive the classification
+                    hist = np.bincount(sample_values, minlength=256)
+                    dominant_val = (
+                        int(hist.argmax()) if hist.size > 0 else int(mean_pixel_value)
+                    )
+                    dominant_count = int(hist.max()) if hist.size > 0 else 0
+                    total_count = max(int(sample_values.size), 1)
+                    dominant_ratio = dominant_count / float(total_count)
+                    bright_ratio = float(
+                        np.count_nonzero(sample_values >= 245)
+                    ) / float(total_count)
+                    dark_ratio = float(np.count_nonzero(sample_values <= 15)) / float(
+                        total_count
+                    )
+                    log_prefix = "[SAM] " if is_sam else ""
+                    if bright_ratio >= BRIGHT_RATIO_THRESHOLD or (
+                        dominant_val >= 245
+                        and dominant_ratio >= BRIGHT_DOM_RATIO_MIN
+                        and dark_ratio <= BRIGHT_DARK_RATIO_MAX
+                    ):
+                        is_colored_bubble = False
+                        fill_color_bgr = (255, 255, 255)
+                        sample_color_bgr = (255, 255, 255)
+                        log_message(
+                            f"{log_prefix}Detection {detection_bbox}: white "
+                            f"(mode={dominant_val}, dom_ratio={dominant_ratio:.2f}, "
+                            f"bright_ratio={bright_ratio:.2f}, dark_ratio={dark_ratio:.2f})",
+                            verbose=verbose,
+                        )
+                    elif dark_ratio >= DARK_RATIO_THRESHOLD or (
+                        dominant_val <= 15
+                        and dominant_ratio >= DARK_DOM_RATIO_MIN
+                        and bright_ratio <= DARK_BRIGHT_RATIO_MAX
+                    ):
+                        is_colored_bubble = False
+                        fill_color_bgr = (0, 0, 0)
+                        sample_color_bgr = (0, 0, 0)
+                        log_message(
+                            f"{log_prefix}Detection {detection_bbox}: black "
+                            f"(mode={dominant_val}, dom_ratio={dominant_ratio:.2f}, "
+                            f"bright_ratio={bright_ratio:.2f}, dark_ratio={dark_ratio:.2f})",
+                            verbose=verbose,
+                        )
+                    else:
+                        # Neither rule matched: report the gray mode as the sample
+                        # color; fill_color_bgr keeps its mean-based value
+                        is_colored_bubble = True
+                        sample_color_bgr = (dominant_val, dominant_val, dominant_val)
+                        log_message(
+                            f"{log_prefix}Detection {detection_bbox}: "
+                            f"colored/gradient (mode={dominant_val}, "
+                            f"dom_ratio={dominant_ratio:.2f}, "
+                            f"bright_ratio={bright_ratio:.2f}, "
+                            f"dark_ratio={dark_ratio:.2f})",
+                            verbose=verbose,
+                        )
+                return (
+                    final_mask,
+                    fill_color_bgr,
+                    is_colored_bubble,
+                    sample_color_bgr,
+                    text_bbox,
+                )
+        # Reached when no contour survived validation
+        raise CleaningError("Failed to process bubble mask")
+    except Exception as e:
+        # Every failure, including the CleaningErrors raised above, is logged and
+        # re-raised as a generic CleaningError with the original as its cause
+        log_message(
+            f"Failed to process {'SAM' if is_sam else 'YOLO'} mask for {detection_bbox}",
+            always_print=True,
+        )
+        raise CleaningError("Failed to process bubble mask") from e
+def clean_speech_bubbles(
+    image_input: Union[str, Path, Image.Image],
+    model_path,
+    confidence=0.6,
+    pre_computed_detections=None,
+    device=None,
+    thresholding_value: int = 190,
+    use_otsu_threshold: bool = False,
+    roi_shrink_px: int = 4,
+    verbose: bool = False,
+    processing_scale: float = 1.0,
+    conjoined_confidence=0.35,
+    inpaint_colored_bubbles: bool = False,
+    flux_hf_token: str = "",
+    flux_num_inference_steps: int = 10,
+    flux_residual_diff_threshold: float = 0.15,
+    flux_seed: int = 1,
+    osb_text_verification: bool = False,
+    osb_text_hf_token: str = "",
+    force_cv2_inpainting: bool = False,
+):
+    """
+    Clean speech bubbles using YOLO/SAM masks and optional Flux inpainting for colored bubbles.
+    Args:
+        image_input (str, Path, or PIL.Image.Image): Path to input image or a PIL Image object.
+        model_path (str): Path to YOLO model.
+        confidence (float): Confidence threshold for detections.
+        pre_computed_detections (list, optional): Pre-computed detections from previous call.
+        device (torch.device, optional): The device to run detection model on if needed.
+        thresholding_value (int): Fixed threshold value for text detection (0-255). Lower values (e.g., 190)
+                                 are useful for uncleaned text close to bubble's edges.
+        use_otsu_threshold (bool): If True, use Otsu's method for thresholding instead of the fixed value.
+        roi_shrink_px (int): Number of pixels to shrink the ROI inwards before identification/fill.
+        inpaint_colored_bubbles (bool): If True, detect non-white/black bubbles and inpaint text with Flux.
+        flux_hf_token (str): Hugging Face token for Flux downloads (shared with outside-text removal).
+        flux_num_inference_steps (int): Flux denoising steps for colored bubble inpainting.
+        flux_residual_diff_threshold (float): Flux residual diff threshold for caching.
+        flux_seed (int): Seed for Flux; -1 enables random per run.
+        osb_text_verification (bool): When True, expand bubble boxes to fully cover OSB text detections.
+        osb_text_hf_token (str): Optional token for OSB text model downloads.
+        force_cv2_inpainting (bool): If True, skip Flux inpainting even for colored bubbles and use standard fill.
+    Returns:
+        numpy.ndarray: Cleaned image with text removed.
+        list[dict]: A list of dictionaries per bubble containing:
+                    - 'mask' (np.ndarray): validated text mask (0/255)
+                    - 'base_mask' (np.ndarray): normalized detection mask used for processing
+                    - 'color' (tuple BGR): sampled bubble color
+                    - 'bbox' (tuple): detection bounding box
+                    - 'is_colored' (bool): whether bubble interior was classified colored
+                    - 'text_bbox' (tuple|None): bounding box of detected text mask
+                    - 'is_sam' (bool): whether detection originated from SAM
+    Raises:
+        ValueError: If the image cannot be loaded or if an image object is passed without pre-computed detections.
+        RuntimeError: If model loading or bubble detection fails.
+    """
+    try:
+        if isinstance(image_input, (str, Path)):
+            pil_image = Image.open(image_input)
+            image_path = image_input
+        else:
+            pil_image = image_input
+            image_path = None  # In-memory image has no path
+        image = pil_to_cv2(pil_image)
+        img_height, img_width = image.shape[:2]
+        img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        cleaned_image = image.copy()
+        if pre_computed_detections is not None:
+            detections = pre_computed_detections
+        elif image_path is not None:
+            detection_result = detect_speech_bubbles(
+                image_path,
+                model_path,
+                confidence,
+                device=device,
+                conjoined_confidence=conjoined_confidence,
+                osb_text_verification=osb_text_verification,
+                osb_text_hf_token=osb_text_hf_token,
+            )
+            detections = (
+                detection_result[0]
+                if isinstance(detection_result, tuple)
+                else detection_result
+            )
+        else:
+            raise ValidationError(
+                "Bubble detection requires an image path, but an image object "
+                "was provided without pre-computed detections."
+            )
+        processed_bubbles = []
+        effective_roi_shrink_px = float(
+            scale_scalar(
+                roi_shrink_px,
+                processing_scale,
+                minimum=0.0,
+                maximum=64.0,
+            )
+        )
+        dilation_kernel = cv2.getStructuringElement(
+            cv2.MORPH_ELLIPSE, scale_kernel(DILATION_KERNEL_SIZE, processing_scale)
+        )
+        constraint_erosion_kernel = cv2.getStructuringElement(
+            cv2.MORPH_ELLIPSE, scale_kernel(EROSION_KERNEL_SIZE, processing_scale)
+        )
+        min_contour_area = scale_area(
+            MIN_CONTOUR_AREA,
+            processing_scale,
+            minimum=MIN_CONTOUR_AREA,
+            maximum=5000,
+        )
+        for detection in detections:
+            final_mask = None
+            fill_color_bgr = None
+            is_colored_bubble = False
+            sample_color_bgr: Optional[tuple[int, int, int]] = None
+            text_bbox: Optional[tuple[int, int, int, int]] = None
+            base_mask = None
+            is_sam_mask = False
+            sam_mask = detection.get("sam_mask")
+            if sam_mask is not None:
+                base_mask = _normalize_mask(sam_mask)
+                is_sam_mask = True
+                try:
+                    (
+                        final_mask,
+                        fill_color_bgr,
+                        is_colored_bubble,
+                        sample_color_bgr,
+                        text_bbox,
+                    ) = process_single_bubble(
+                        base_mask,
+                        img_gray,
+                        img_height,
+                        img_width,
+                        thresholding_value,
+                        use_otsu_threshold,
+                        effective_roi_shrink_px,
+                        verbose,
+                        detection.get("bbox"),
+                        is_sam=True,
+                        dilation_kernel=dilation_kernel,
+                        constraint_erosion_kernel=constraint_erosion_kernel,
+                        min_contour_area=min_contour_area,
+                        classify_colored=inpaint_colored_bubbles,
+                    )
+                except Exception as e:
+                    retry_success = False
+                    if not use_otsu_threshold and base_mask is not None:
+                        log_message(
+                            f"Standard cleaning failed for {detection.get('bbox')}, retrying with Otsu...",
+                            verbose=verbose,
+                        )
+                        retry_res = retry_cleaning_with_otsu(
+                            image,
+                            {
+                                "base_mask": base_mask,
+                                "bbox": detection.get("bbox"),
+                                "is_sam": True,
+                            },
+                            thresholding_value,
+                            roi_shrink_px,
+                            processing_scale,
+                            verbose,
+                            inpaint_colored_bubbles,
+                        )
+                        if retry_res:
+                            final_mask = retry_res["mask"]
+                            fill_color_bgr = retry_res["color"]
+                            sample_color_bgr = retry_res["color"]
+                            is_colored_bubble = retry_res["is_colored"]
+                            text_bbox = retry_res["text_bbox"]
+                            retry_success = True
+                            log_message(
+                                f"Otsu retry successful for {detection.get('bbox')}",
+                                verbose=verbose,
+                            )
+                        else:
+                            log_message(
+                                f"Otsu retry failed for {detection.get('bbox')}",
+                                verbose=verbose,
+                            )
+                    if not retry_success:
+                        error_msg = f"Error processing SAM mask for detection {detection.get('bbox')}: {e}"
+                        log_message(error_msg, always_print=True)
+                        continue
+            else:
+                if "mask_points" not in detection or not detection["mask_points"]:
+                    log_message(
+                        f"Skipping detection {detection.get('bbox')}: no mask points",
+                        verbose=verbose,
+                    )
+                    continue
+                try:
+                    points_list = detection["mask_points"]
+                    points = np.array(points_list, dtype=np.float32)
+                    if len(points.shape) == 3 and points.shape[1] == 1:
+                        points_int = np.round(points).astype(int)
+                    elif len(points.shape) == 2 and points.shape[1] == 2:
+                        points_int = np.round(points).astype(int).reshape((-1, 1, 2))
+                    else:
+                        log_message(
+                            f"Skipping detection {detection.get('bbox')}: invalid mask format",
+                            verbose=verbose,
+                        )
+                        continue
+                    yolo_mask = np.zeros((img_height, img_width), dtype=np.uint8)
+                    cv2.fillPoly(yolo_mask, [points_int], 255)
+                    base_mask = _normalize_mask(yolo_mask)
+                    (
+                        final_mask,
+                        fill_color_bgr,
+                        is_colored_bubble,
+                        sample_color_bgr,
+                        text_bbox,
+                    ) = process_single_bubble(
+                        base_mask,
+                        img_gray,
+                        img_height,
+                        img_width,
+                        thresholding_value,
+                        use_otsu_threshold,
+                        effective_roi_shrink_px,
+                        verbose,
+                        detection.get("bbox"),
+                        is_sam=False,
+                        dilation_kernel=dilation_kernel,
+                        constraint_erosion_kernel=constraint_erosion_kernel,
+                        min_contour_area=min_contour_area,
+                        classify_colored=inpaint_colored_bubbles,
+                    )
+                except Exception as e:
+                    retry_success = False
+                    if not use_otsu_threshold and base_mask is not None:
+                        log_message(
+                            f"Standard cleaning failed for {detection.get('bbox')}, retrying with Otsu...",
+                            verbose=verbose,
+                        )
+                        retry_res = retry_cleaning_with_otsu(
+                            image,
+                            {
+                                "base_mask": base_mask,
+                                "bbox": detection.get("bbox"),
+                                "is_sam": False,
+                            },
+                            thresholding_value,
+                            roi_shrink_px,
+                            processing_scale,
+                            verbose,
+                            inpaint_colored_bubbles,
+                        )
+                        if retry_res:
+                            final_mask = retry_res["mask"]
+                            fill_color_bgr = retry_res["color"]
+                            sample_color_bgr = retry_res["color"]
+                            is_colored_bubble = retry_res["is_colored"]
+                            text_bbox = retry_res["text_bbox"]
+                            retry_success = True
+                            log_message(
+                                f"Otsu retry successful for {detection.get('bbox')}",
+                                verbose=verbose,
+                            )
+                        else:
+                            log_message(
+                                f"Otsu retry failed for {detection.get('bbox')}",
+                                verbose=verbose,
+                            )
+                    if not retry_success:
+                        error_msg = f"Error processing YOLO mask for detection {detection.get('bbox')}: {e}"
+                        log_message(error_msg, always_print=True)
+                        continue
+            if final_mask is not None and fill_color_bgr is not None:
+                processed_bubbles.append(
+                    {
+                        "mask": final_mask,
+                        "base_mask": base_mask,
+                        "color": (
+                            sample_color_bgr if sample_color_bgr else fill_color_bgr
+                        ),
+                        "bbox": detection.get("bbox"),
+                        "is_colored": is_colored_bubble,
+                        "text_bbox": text_bbox,
+                        "is_sam": is_sam_mask,
+                        "inpainted": False,
+                    }
+                )
+                log_message(
+                    f"Detection {detection.get('bbox')}: processed successfully",
+                    verbose=verbose,
+                )
+        # Optional Flux inpainting for colored bubbles (text-only mask)
+        if inpaint_colored_bubbles:
+            colored_bubbles = [
+                b for b in processed_bubbles if b.get("is_colored", False)
+            ]
+            if colored_bubbles and flux_hf_token and not force_cv2_inpainting:
+                log_message(
+                    f"Inpainting {len(colored_bubbles)} colored bubbles with Flux",
+                    always_print=True,
+                )
+                pil_working = Image.fromarray(
+                    cv2.cvtColor(cleaned_image, cv2.COLOR_BGR2RGB)
+                )
+                base_seed = (
+                    random.randint(1, 999999)
+                    if flux_seed == -1
+                    else max(0, int(flux_seed))
+                )
+                temp_files = []
+                try:
+                    inpainter = FluxKontextInpainter(
+                        device=device,
+                        huggingface_token=flux_hf_token,
+                        num_inference_steps=int(flux_num_inference_steps),
+                        residual_diff_threshold=float(flux_residual_diff_threshold),
+                    )
+                    for idx, bubble_info in enumerate(colored_bubbles):
+                        mask_np = bubble_info["mask"]
+                        mask_bool = mask_np.astype(bool)
+                        region_seed = base_seed + idx if base_seed > 0 else base_seed
+                        bbox_tuple = bubble_info.get("bbox")
+                        ocr_params = {"type": "colored_bubble", "bbox": bbox_tuple}
+                        try:
+                            pil_working = inpainter.inpaint_mask(
+                                pil_working,
+                                mask_bool,
+                                seed=region_seed,
+                                verbose=verbose,
+                                ocr_params=ocr_params,
+                            )
+                            bubble_info["inpainted"] = True
+                            # Re-sample background brightness after inpaint for accurate text contrast
+                            cv_after = cv2.cvtColor(
+                                np.array(pil_working.convert("RGB")), cv2.COLOR_RGB2BGR
+                            )
+                            masked_after = cv_after[mask_bool]
+                            if masked_after.size > 0:
+                                mean_val = int(np.clip(np.mean(masked_after), 0, 255))
+                                bubble_info["color"] = (mean_val, mean_val, mean_val)
+                        except Exception as e:
+                            log_message(
+                                f"Flux inpainting failed for bubble {bbox_tuple}: {e}; falling back to standard fill",
+                                always_print=True,
+                            )
+                            continue
+                        # Save intermediate result to disk to free memory when multiple regions
+                        if idx < len(colored_bubbles) - 1:
+                            temp_file = None
+                            try:
+                                temp_fd, temp_file = tempfile.mkstemp(suffix=".png")
+                                os.close(temp_fd)
+                                pil_working.save(temp_file, format="PNG")
+                                log_message(
+                                    "Saved intermediate inpainting result to disk",
+                                    verbose=verbose,
+                                )
+                                temp_files.append(temp_file)
+                                with Image.open(temp_file) as img_tmp:
+                                    img_tmp.load()
+                                    pil_working = img_tmp.copy()
+                                gc.collect()
+                            except Exception as e:
+                                log_message(
+                                    f"Warning: Failed to save intermediate inpainting result: {e}",
+                                    verbose=verbose,
+                                )
+                                if temp_file and temp_file in temp_files:
+                                    temp_files.remove(temp_file)
+                                # fall through with in-memory image
+                    cleaned_image = cv2.cvtColor(
+                        np.array(pil_working.convert("RGB")), cv2.COLOR_RGB2BGR
+                    )
+                except Exception as e:
+                    log_message(
+                        f"Flux inpainting aborted; falling back to standard fill: {e}",
+                        always_print=True,
+                    )
+                finally:
+                    for temp_file in temp_files:
+                        if temp_file and os.path.exists(temp_file):
+                            try:
+                                os.remove(temp_file)
+                            except Exception:
+                                pass
+            elif colored_bubbles:
+                reason = (
+                    "forced CV2 inpainting"
+                    if force_cv2_inpainting
+                    else "missing Hugging Face token"
+                )
+                log_message(
+                    f"Colored bubbles detected but Flux inpainting skipped ({reason}); "
+                    "falling back to standard fill",
+                    always_print=True,
+                )
+        # Group masks by color for efficient batch processing (skip already inpainted regions)
+        if processed_bubbles:
+            color_groups = {}
+            for bubble_info in processed_bubbles:
+                if bubble_info.get("inpainted", False):
+                    continue
+                color_key = bubble_info["color"]
+                if color_key not in color_groups:
+                    color_groups[color_key] = []
+                color_groups[color_key].append(bubble_info["mask"])
+            for color_bgr, masks in color_groups.items():
+                combined_mask = np.bitwise_or.reduce(masks)
+                if cleaned_image.shape[2] == 4:
+                    cleaned_image[combined_mask == 255, :3] = (
+                        color_bgr  # Preserve alpha channel
+                    )
+                else:
+                    cleaned_image[combined_mask == 255] = color_bgr
+        log_message(
+            f"Cleaned {len(processed_bubbles)} speech bubbles", always_print=True
+        )
+        return cleaned_image, processed_bubbles
+    except IOError as e:
+        raise ImageProcessingError(f"Error loading image {image_input}: {str(e)}")
+    except Exception as e:
+        raise CleaningError(f"Error cleaning speech bubbles: {str(e)}")
+def retry_cleaning_with_otsu(
+    image_bgr: np.ndarray,
+    bubble_info: dict,
+    thresholding_value: int,
+    roi_shrink_px: int,
+    processing_scale: float = 1.0,
+    verbose: bool = False,
+    classify_colored: bool = False,
+) -> Optional[dict]:
+    """
+    Retry cleaning for a single bubble using Otsu thresholding.
+    Returns a bubble-info dict compatible with clean_speech_bubbles output,
+    or None if retry fails.
+    """
+    base_mask = bubble_info.get("base_mask")
+    if base_mask is None:
+        log_message(
+            f"Otsu retry skipped for {bubble_info.get('bbox')}: missing base_mask",
+            verbose=verbose,
+        )
+        return None
+    try:
+        if len(image_bgr.shape) == 3 and image_bgr.shape[2] == 4:
+            img_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGRA2GRAY)
+        else:
+            img_gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)
+    except Exception as e:
+        log_message(
+            f"Otsu retry failed to convert image to grayscale: {e}",
+            always_print=True,
+        )
+        return None
+    img_height, img_width = img_gray.shape[:2]
+    effective_roi_shrink_px = float(
+        scale_scalar(
+            roi_shrink_px,
+            processing_scale,
+            minimum=0.0,
+            maximum=64.0,
+        )
+    )
+    dilation_kernel = cv2.getStructuringElement(
+        cv2.MORPH_ELLIPSE, scale_kernel(DILATION_KERNEL_SIZE, processing_scale)
+    )
+    constraint_erosion_kernel = cv2.getStructuringElement(
+        cv2.MORPH_ELLIPSE, scale_kernel(EROSION_KERNEL_SIZE, processing_scale)
+    )
+    min_contour_area = scale_area(
+        MIN_CONTOUR_AREA,
+        processing_scale,
+        minimum=MIN_CONTOUR_AREA,
+        maximum=5000,
+    )
+    try:
+        result = process_single_bubble(
+            base_mask,
+            img_gray,
+            img_height,
+            img_width,
+            thresholding_value,
+            True,  # force Otsu
+            effective_roi_shrink_px,
+            verbose,
+            bubble_info.get("bbox"),
+            bubble_info.get("is_sam", False),
+            dilation_kernel=dilation_kernel,
+            constraint_erosion_kernel=constraint_erosion_kernel,
+            min_contour_area=min_contour_area,
+            classify_colored=classify_colored,
+        )
+    except CleaningError as e:
+        log_message(
+            f"Otsu retry cleaning failed for {bubble_info.get('bbox')}: {e}",
+            always_print=True,
+        )
+        return None
+    except Exception as e:
+        log_message(
+            f"Otsu retry cleaning unexpected error for {bubble_info.get('bbox')}: {e}",
+            always_print=True,
+        )
+        return None
+    if not result:
+        return None
+    (
+        final_mask,
+        fill_color_bgr,
+        is_colored_bubble,
+        sample_color_bgr,
+        text_bbox,
+    ) = result
+    bubble_color = sample_color_bgr if sample_color_bgr else fill_color_bgr
+    log_message(
+        f"Otsu retry succeeded for {bubble_info.get('bbox')}",
+        verbose=verbose,
+    )
+    return {
+        "mask": final_mask,
+        "base_mask": _normalize_mask(base_mask),
+        "color": bubble_color,
+        "bbox": bubble_info.get("bbox"),
+        "is_colored": is_colored_bubble,
+        "text_bbox": text_bbox,
+        "is_sam": bubble_info.get("is_sam", False),
+    }

core/image/detection.py ADDED Viewed

	@@ -0,0 +1,914 @@

+from pathlib import Path
+from typing import List, Optional, Tuple
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from core.caching import get_cache
+from core.ml.model_manager import ModelType, get_model_manager
+from utils.exceptions import ImageProcessingError, ModelError
+from utils.logging import log_message
+# Detection Parameters
+IOA_THRESHOLD = 0.50  # Default IoA cutoff in _categorize_detections for treating a secondary box as contained in a primary box
+SAM_MASK_THRESHOLD = 0.5  # Cutoff used to binarize SAM2 masks and resized YOLO fallback masks
+IOA_OVERLAP_THRESHOLD = 0.5  # IoA cutoff for general overlap checks between boxes (either direction)
+IOU_DUPLICATE_THRESHOLD = 0.7  # IoU above which two primary detections are treated as duplicates
+def _box_contains(inner, outer) -> bool:
+    """Return True if inner box is fully contained in outer box."""
+    ix0, iy0, ix1, iy1 = inner
+    ox0, oy0, ox1, oy1 = outer
+    return ix0 >= ox0 and iy0 >= oy0 and ix1 <= ox1 and iy1 <= oy1
+def _expand_boxes_with_osb_text(
+    image_cv,
+    image_pil,
+    primary_boxes: torch.Tensor,
+    cache,
+    model_manager,
+    device,
+    confidence: float,
+    hf_token: str,
+    verbose: bool,
+):
+    """Expand speech-bubble boxes to fully contain detected OSB text boxes."""
+    if primary_boxes is None or len(primary_boxes) == 0:
+        return primary_boxes
+    try:
+        model_path = str(model_manager.model_paths[ModelType.YOLO_OSBTEXT])
+        cache_key = cache.get_yolo_cache_key(image_pil, model_path, confidence)
+        cached = cache.get_yolo_detection(cache_key)
+        if cached is not None:
+            _, osb_boxes, _ = cached
+        else:
+            osb_model = model_manager.load_yolo_osbtext(token=hf_token)
+            osb_results = osb_model(
+                image_cv, conf=confidence, device=device, verbose=False
+            )[0]
+            osb_boxes = (
+                osb_results.boxes.xyxy
+                if osb_results.boxes is not None
+                else torch.tensor([])
+            )
+            osb_confs = (
+                osb_results.boxes.conf
+                if osb_results.boxes is not None
+                else torch.tensor([])
+            )
+            cache.set_yolo_detection(cache_key, (osb_results, osb_boxes, osb_confs))
+        if osb_boxes is None or len(osb_boxes) == 0:
+            return primary_boxes
+        pb_np = primary_boxes.detach().cpu().numpy()
+        osb_np = osb_boxes.detach().cpu().numpy()
+        for t_box in osb_np:
+            tx0, ty0, tx1, ty1 = t_box
+            best_idx = None
+            best_intersection = 0.0
+            for i, b_box in enumerate(pb_np):
+                bx0, by0, bx1, by1 = b_box
+                inter_x0 = max(bx0, tx0)
+                inter_y0 = max(by0, ty0)
+                inter_x1 = min(bx1, tx1)
+                inter_y1 = min(by1, ty1)
+                inter_w = max(0.0, inter_x1 - inter_x0)
+                inter_h = max(0.0, inter_y1 - inter_y0)
+                intersection = inter_w * inter_h
+                if intersection > best_intersection:
+                    best_intersection = intersection
+                    best_idx = i
+            if best_idx is None or best_intersection <= 0.0:
+                continue
+            if _box_contains(t_box, pb_np[best_idx]):
+                continue
+            bx0, by0, bx1, by1 = pb_np[best_idx]
+            pb_np[best_idx] = [
+                min(bx0, tx0),
+                min(by0, ty0),
+                max(bx1, tx1),
+                max(by1, ty1),
+            ]
+        return torch.tensor(
+            pb_np, device=primary_boxes.device, dtype=primary_boxes.dtype
+        )
+    except Exception as e:
+        log_message(f"OSB text verification skipped: {e}", verbose=verbose)
+        return primary_boxes
+def _calculate_ioa(box_inner, box_outer):
+    """Calculate Intersection over Area (IoA) for two bounding boxes.
+    IoA = intersection_area / area_of_inner_box
+    Args:
+        box_inner: Tuple or list of (x0, y0, x1, y1) for the inner box
+        box_outer: Tuple or list of (x0, y0, x1, y1) for the outer box
+    Returns:
+        float: IoA value between 0 and 1
+    """
+    x_inner_min, y_inner_min, x_inner_max, y_inner_max = box_inner
+    x_outer_min, y_outer_min, x_outer_max, y_outer_max = box_outer
+    inter_x_min = max(x_inner_min, x_outer_min)
+    inter_y_min = max(y_inner_min, y_outer_min)
+    inter_x_max = min(x_inner_max, x_outer_max)
+    inter_y_max = min(y_inner_max, y_outer_max)
+    inter_w = max(0, inter_x_max - inter_x_min)
+    inter_h = max(0, inter_y_max - inter_y_min)
+    intersection = inter_w * inter_h
+    area_inner = (x_inner_max - x_inner_min) * (y_inner_max - y_inner_min)
+    return intersection / area_inner if area_inner > 0 else 0.0
+def _calculate_iou(box_a, box_b):
+    """Calculate Intersection over Union (IoU) for two bounding boxes.
+    IoU = intersection_area / union_area
+    Args:
+        box_a: Tuple of (x0, y0, x1, y1)
+        box_b: Tuple of (x0, y0, x1, y1)
+    Returns:
+        float: IoU value between 0 and 1
+    """
+    inter_x_min = max(box_a[0], box_b[0])
+    inter_y_min = max(box_a[1], box_b[1])
+    inter_x_max = min(box_a[2], box_b[2])
+    inter_y_max = min(box_a[3], box_b[3])
+    inter_w = max(0, inter_x_max - inter_x_min)
+    inter_h = max(0, inter_y_max - inter_y_min)
+    intersection = inter_w * inter_h
+    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+    union = area_a + area_b - intersection
+    return intersection / union if union > 0 else 0.0
+def _deduplicate_primary_boxes(
+    boxes: torch.Tensor, confidences: torch.Tensor, threshold: float
+) -> Tuple[torch.Tensor, List[int]]:
+    """Remove duplicate primary detections using IoU-based NMS.
+    When two boxes have IoU > threshold, keeps the one with higher confidence.
+    Args:
+        boxes: Tensor of bounding boxes (N, 4)
+        confidences: Tensor of confidence scores (N,)
+        threshold: IoU threshold above which boxes are considered duplicates
+    Returns:
+        Tuple of (deduplicated boxes tensor, indices of kept boxes)
+    """
+    if len(boxes) <= 1:
+        return boxes, list(range(len(boxes)))
+    boxes_list = boxes.tolist()
+    confs_list = confidences.tolist()
+    n = len(boxes_list)
+    # Sort by confidence (descending)
+    indices = sorted(range(n), key=lambda i: confs_list[i], reverse=True)
+    keep = []
+    for i in indices:
+        is_duplicate = False
+        for k in keep:
+            if _calculate_iou(boxes_list[i], boxes_list[k]) > threshold:
+                is_duplicate = True
+                break
+        if not is_duplicate:
+            keep.append(i)
+    return boxes[keep], keep
+def _categorize_detections(primary_boxes, secondary_boxes, ioa_threshold=IOA_THRESHOLD):
+    """Categorize detections into simple and conjoined bubbles.
+    Args:
+        primary_boxes: Tensor of primary YOLO detection boxes (N, 4)
+        secondary_boxes: Tensor of secondary YOLO detection boxes (M, 4)
+        ioa_threshold: Threshold for determining if a secondary box is contained in a primary box
+    Returns:
+        tuple: (conjoined_indices, simple_indices)
+            - conjoined_indices: List of tuples (primary_idx, [secondary_indices])
+            - simple_indices: List of primary indices that are simple bubbles
+    """
+    # Handle cases where one bubble is detected on the page and is conjoined
+    if primary_boxes.ndim == 1 and primary_boxes.numel() == 4:
+        primary_boxes = primary_boxes.unsqueeze(0)
+    if secondary_boxes.ndim == 1 and secondary_boxes.numel() == 4:
+        secondary_boxes = secondary_boxes.unsqueeze(0)
+    conjoined_indices = []
+    processed_secondary_indices = set()
+    for i, p_box in enumerate(primary_boxes):
+        contained_indices = []
+        for j, s_box in enumerate(secondary_boxes):
+            if j in processed_secondary_indices:
+                continue
+            ioa = _calculate_ioa(s_box.tolist(), p_box.tolist())
+            if ioa > ioa_threshold:
+                contained_indices.append(j)
+        if len(contained_indices) >= 2:
+            conjoined_indices.append((i, contained_indices))
+            processed_secondary_indices.update(contained_indices)
+    primary_simple_indices = []
+    conjoined_primary_indices = {c[0] for c in conjoined_indices}
+    for i in range(len(primary_boxes)):
+        if i in conjoined_primary_indices:
+            continue
+        # Check for duplication against processed secondary bubbles
+        is_duplicate = False
+        p_box_list = primary_boxes[i].tolist()
+        for s_idx in processed_secondary_indices:
+            s_box_list = secondary_boxes[s_idx].tolist()
+            if _calculate_ioa(s_box_list, p_box_list) > ioa_threshold:
+                is_duplicate = True
+                break
+        if not is_duplicate:
+            primary_simple_indices.append(i)
+    return conjoined_indices, primary_simple_indices
+def _process_simple_bubbles(
+    image, primary_boxes, simple_indices, processor, sam_model, device
+):
+    """Process simple (non-conjoined) speech bubbles using SAM2.
+    Args:
+        image: PIL Image
+        primary_boxes: Tensor of primary YOLO detection boxes
+        simple_indices: List of indices for simple bubbles
+        processor: SAM2 processor
+        sam_model: SAM2 model
+        device: PyTorch device
+    Returns:
+        list: List of numpy boolean masks for simple bubbles
+    """
+    if not simple_indices:
+        return []
+    simple_boxes_to_sam = primary_boxes[simple_indices].unsqueeze(0).cpu()
+    inputs = processor(image, input_boxes=simple_boxes_to_sam, return_tensors="pt")
+    # Cast floating point tensors to model's dtype before moving to device
+    for key in inputs:
+        if isinstance(inputs[key], torch.Tensor) and inputs[key].is_floating_point():
+            inputs[key] = inputs[key].to(sam_model.dtype)
+    inputs = inputs.to(device)
+    with torch.no_grad():
+        outputs = sam_model(multimask_output=False, **inputs)
+    masks_tensor = processor.post_process_masks(
+        outputs.pred_masks, inputs["original_sizes"]
+    )[0][:, 0]
+    simple_masks_np = (masks_tensor > SAM_MASK_THRESHOLD).cpu().numpy()
+    return [mask for mask in simple_masks_np]
+def _fallback_to_yolo_mask(primary_results, i, mask_type="points"):
+    """Extract YOLO mask as fallback when SAM2 fails.
+    Args:
+        primary_results: YOLO detection results
+        i: Detection index
+        mask_type: Type of mask to extract ("points" or "binary")
+    Returns:
+        Mask data or None if extraction fails
+    """
+    if getattr(primary_results, "masks", None) is None:
+        return None
+    try:
+        masks = primary_results.masks
+        if len(masks) <= i:
+            return None
+        if mask_type == "points":
+            mask_points = masks[i].xy[0]
+            return (
+                mask_points.tolist() if hasattr(mask_points, "tolist") else mask_points
+            )
+        elif mask_type == "binary":
+            mask_tensor = masks.data[i]
+            orig_h, orig_w = primary_results.orig_shape
+            mask_resized = torch.nn.functional.interpolate(
+                mask_tensor.float().unsqueeze(0).unsqueeze(0),
+                size=(orig_h, orig_w),
+                mode="bilinear",
+                align_corners=False,
+            ).squeeze()
+            binary_mask = (mask_resized > SAM_MASK_THRESHOLD).cpu().numpy()
+            return binary_mask.astype(np.uint8) * 255
+        else:
+            return None
+    except (IndexError, AttributeError) as e:
+        log_message(
+            f"Could not extract YOLO mask for detection {i}: {e}",
+            always_print=True,
+        )
+        return None
+def detect_speech_bubbles(
+    image_path: Path,
+    model_path,
+    confidence=0.6,
+    verbose=False,
+    device=None,
+    use_sam2: bool = True,
+    conjoined_detection: bool = True,
+    conjoined_confidence=0.35,
+    image_override: Optional[Image.Image] = None,
+    osb_enabled: bool = False,
+    osb_text_verification: bool = False,
+    osb_text_hf_token: str = "",
+):
+    """Detect speech bubbles using dual YOLO models and SAM2.
+    For conjoined bubbles detected by the secondary model, uses the inner bounding boxes
+    directly and processes each as a separate simple bubble through SAM2.
+    Args:
+        image_path (Path): Path to the input image
+        model_path (str): Path to the primary YOLO segmentation model
+        confidence (float): Confidence threshold for primary YOLO model detections
+        verbose (bool): Whether to show detailed processing information
+        device (torch.device, optional): The device to run the model on. Autodetects if None.
+        use_sam2 (bool): Whether to use SAM2.1 for enhanced segmentation
+        conjoined_detection (bool): Whether to enable conjoined bubble detection using secondary YOLO model
+        conjoined_confidence (float): Confidence threshold for secondary YOLO model (conjoined bubble detection)
+        osb_text_verification (bool): When True, expand bubble boxes to fully cover OSB text detections
+        osb_text_hf_token (str): Optional token for gated OSB text model downloads
+    Returns:
+        tuple[list, list]: (speech bubble detections, text_free boxes from secondary model)
+    """
+    detections = []
+    text_free_boxes: List[List[float]] = []
+    _device = (
+        device
+        if device is not None
+        else torch.device(
+            "cuda"
+            if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available() else "cpu"
+        )
+    )
+    try:
+        if image_override is not None:
+            image_pil = (
+                image_override
+                if image_override.mode == "RGB"
+                else image_override.convert("RGB")
+            )
+            image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+        else:
+            image_cv = cv2.imread(str(image_path))
+            if image_cv is None:
+                raise ImageProcessingError(f"Could not read image at {image_path}")
+            image_pil = Image.fromarray(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))
+        log_message(
+            f"Processing image: {image_path.name} ({image_cv.shape[1]}x{image_cv.shape[0]})",
+            verbose=verbose,
+        )
+    except Exception as e:
+        raise ImageProcessingError(f"Error loading image: {e}")
+    model_manager = get_model_manager()
+    cache = get_cache()
+    try:
+        primary_model = model_manager.load_yolo_speech_bubble(model_path)
+        log_message(f"Loaded primary YOLO model: {model_path}", verbose=verbose)
+    except Exception as e:
+        raise ModelError(f"Error loading primary model: {e}")
+    yolo_cache_key = cache.get_yolo_cache_key(image_pil, model_path, confidence)
+    cached_yolo = cache.get_yolo_detection(yolo_cache_key)
+    if cached_yolo is not None:
+        log_message("Using cached YOLO detections", verbose=verbose)
+        primary_results, primary_boxes = cached_yolo
+    else:
+        primary_results = primary_model(
+            image_cv, conf=confidence, device=_device, verbose=False
+        )[0]
+        primary_boxes = (
+            primary_results.boxes.xyxy
+            if primary_results.boxes is not None
+            else torch.tensor([])
+        )
+        cache.set_yolo_detection(yolo_cache_key, (primary_results, primary_boxes))
+    # Remove duplicate primary detections using IoU-based NMS
+    if len(primary_boxes) > 1:
+        original_count = len(primary_boxes)
+        primary_boxes, _ = _deduplicate_primary_boxes(
+            primary_boxes, primary_results.boxes.conf, IOU_DUPLICATE_THRESHOLD
+        )
+        if len(primary_boxes) < original_count:
+            log_message(
+                f"Removed {original_count - len(primary_boxes)} duplicate detections",
+                verbose=verbose,
+            )
+    if len(primary_boxes) == 0:
+        log_message("No detections found", verbose=verbose)
+        return detections, text_free_boxes
+    log_message(
+        f"Detected {len(primary_boxes)} speech bubbles with YOLO", always_print=True
+    )
+    secondary_boxes = torch.tensor([])
+    if use_sam2:
+        try:
+            secondary_model = model_manager.load_yolo_conjoined_bubble()
+            log_message(
+                "Loaded secondary YOLO model for conjoined/fallback detection",
+                verbose=verbose,
+            )
+            secondary_results = secondary_model(
+                image_cv, conf=conjoined_confidence, device=_device, verbose=False
+            )[0]
+            secondary_boxes = (
+                secondary_results.boxes.xyxy
+                if secondary_results.boxes is not None
+                else torch.tensor([])
+            )
+            # Fallback: Add bubbles detected by secondary model but missed by primary
+            if len(secondary_boxes) > 0 and hasattr(secondary_model, "names"):
+                text_bubble_id = None
+                text_free_id = None
+                for cid, cname in secondary_model.names.items():
+                    if cname == "text_bubble":
+                        text_bubble_id = cid
+                    elif cname == "text_free":
+                        text_free_id = cid
+                secondary_cls = secondary_results.boxes.cls
+                # Collect text_free boxes regardless of OSB setting
+                if text_free_id is not None:
+                    for i, s_box in enumerate(secondary_boxes):
+                        if int(secondary_cls[i]) == text_free_id:
+                            text_free_boxes.append(s_box.tolist())
+                if text_bubble_id is not None:
+                    new_boxes = []
+                    primary_boxes_list = (
+                        primary_boxes.tolist() if len(primary_boxes) > 0 else []
+                    )
+                    for i, s_box in enumerate(secondary_boxes):
+                        if int(secondary_cls[i]) != text_bubble_id:
+                            continue
+                        s_box_list = s_box.tolist()
+                        is_covered = False
+                        for p_box_list in primary_boxes_list:
+                            ioa_s_in_p = _calculate_ioa(s_box_list, p_box_list)
+                            ioa_p_in_s = _calculate_ioa(p_box_list, s_box_list)
+                            if (
+                                ioa_s_in_p > IOA_OVERLAP_THRESHOLD
+                                or ioa_p_in_s > IOA_OVERLAP_THRESHOLD
+                            ):
+                                is_covered = True
+                                break
+                        if not is_covered:
+                            new_boxes.append(s_box)
+                    if new_boxes:
+                        log_message(
+                            f"Found {len(new_boxes)} missed bubbles from secondary model",
+                            always_print=True,
+                        )
+                        new_boxes_tensor = torch.stack(new_boxes)
+                        if len(primary_boxes) > 0:
+                            primary_boxes = torch.cat(
+                                (primary_boxes, new_boxes_tensor), dim=0
+                            )
+                        else:
+                            primary_boxes = new_boxes_tensor
+            # Remove text_free detections (route to OSB if enabled, discard otherwise)
+            if text_free_boxes and len(primary_boxes) > 0:
+                indices_to_remove = []
+                primary_boxes_list = primary_boxes.tolist()
+                for i, p_box in enumerate(primary_boxes_list):
+                    overlaps_text_free = False
+                    for tf_box in text_free_boxes:
+                        if (
+                            _calculate_ioa(p_box, tf_box) > IOA_OVERLAP_THRESHOLD
+                            or _calculate_ioa(tf_box, p_box) > IOA_OVERLAP_THRESHOLD
+                        ):
+                            overlaps_text_free = True
+                            break
+                    if overlaps_text_free:
+                        indices_to_remove.append(i)
+                if indices_to_remove:
+                    action = (
+                        "routing to OSB pipeline"
+                        if osb_enabled
+                        else "discarding (OSB disabled)"
+                    )
+                    log_message(
+                        f"Removing {len(indices_to_remove)} bubbles marked text_free ({action})",
+                        always_print=True,
+                    )
+                    keep_indices = [
+                        i
+                        for i in range(len(primary_boxes))
+                        if i not in indices_to_remove
+                    ]
+                    if keep_indices:
+                        primary_boxes = primary_boxes[keep_indices]
+                    else:
+                        primary_boxes = torch.tensor([])
+        except Exception as e:
+            log_message(
+                f"Warning: Could not load/run secondary YOLO model: {e}. "
+                "Proceeding without conjoined/fallback detection.",
+                verbose=verbose,
+            )
+            secondary_boxes = torch.tensor([])
+    if osb_text_verification and len(primary_boxes) > 0:
+        primary_boxes = _expand_boxes_with_osb_text(
+            image_cv,
+            image_pil,
+            primary_boxes,
+            cache,
+            model_manager,
+            _device,
+            confidence,
+            osb_text_hf_token,
+            verbose,
+        )
+    if not use_sam2:
+        log_message("SAM2 disabled, using YOLO segmentation masks", verbose=verbose)
+        for i, box in enumerate(primary_boxes):
+            x0_f, y0_f, x1_f, y1_f = box.tolist()
+            conf = float(primary_results.boxes.conf[i])
+            cls_id = int(primary_results.boxes.cls[i])
+            cls_name = primary_model.names[cls_id]
+            detection = {
+                "bbox": (
+                    int(round(x0_f)),
+                    int(round(y0_f)),
+                    int(round(x1_f)),
+                    int(round(y1_f)),
+                ),
+                "confidence": conf,
+                "class": cls_name,
+            }
+            detection["sam_mask"] = _fallback_to_yolo_mask(primary_results, i, "binary")
+            detections.append(detection)
+        return detections, text_free_boxes
+    conjoined_indices = []
+    simple_indices = list(range(len(primary_boxes)))
+    try:
+        log_message("Applying SAM2.1 segmentation refinement", verbose=verbose)
+        sam_cache_key = cache.get_sam_cache_key(
+            image_pil,
+            primary_boxes,
+            use_sam2,
+            conjoined_detection,
+            conjoined_confidence,
+        )
+        cached_sam = cache.get_sam_masks(sam_cache_key)
+        if cached_sam is not None:
+            log_message("Using cached SAM masks", verbose=verbose)
+            detections = cached_sam
+            return detections, text_free_boxes
+        processor, sam_model = model_manager.load_sam2()
+        if len(secondary_boxes) > 0 and conjoined_detection:
+            log_message(
+                "Categorizing detections (simple vs conjoined)...", verbose=verbose
+            )
+            conjoined_indices, simple_indices = _categorize_detections(
+                primary_boxes, secondary_boxes, ioa_threshold=IOA_THRESHOLD
+            )
+            log_message(
+                f"Found {len(simple_indices)} simple bubbles and {len(conjoined_indices)} conjoined groups",
+                verbose=verbose,
+            )
+            if len(conjoined_indices) > 0:
+                log_message(
+                    f"Detected {len(conjoined_indices)} conjoined speech bubbles with second YOLO",
+                    always_print=True,
+                )
+        else:
+            conjoined_indices = []
+            simple_indices = list(range(len(primary_boxes)))
+            log_message(
+                f"No secondary detections, processing all {len(simple_indices)} as simple bubbles",
+                verbose=verbose,
+            )
+        boxes_to_process = []
+        for idx in simple_indices:
+            boxes_to_process.append(primary_boxes[idx])
+        for _, s_indices in conjoined_indices:
+            for s_idx in s_indices:
+                boxes_to_process.append(secondary_boxes[s_idx])
+        if boxes_to_process:
+            all_boxes_tensor = torch.stack(boxes_to_process)
+            all_masks = _process_simple_bubbles(
+                image_pil,
+                all_boxes_tensor,
+                list(range(len(boxes_to_process))),
+                processor,
+                sam_model,
+                _device,
+            )
+            all_boxes = boxes_to_process
+            total_boxes = len(boxes_to_process)
+            simple_count = len(simple_indices)
+            conjoined_count = sum(len(s_indices) for _, s_indices in conjoined_indices)
+            if conjoined_indices:
+                log_message(
+                    f"Processing {total_boxes} bubbles ({simple_count} simple + "
+                    f"{conjoined_count} from conjoined groups)...",
+                    verbose=verbose,
+                )
+            else:
+                log_message(
+                    f"Processing {total_boxes} simple bubbles...", verbose=verbose
+                )
+        else:
+            all_masks = []
+            all_boxes = []
+        log_message(f"Refined {len(all_masks)} masks with SAM2", always_print=True)
+        log_message(f"Total masks generated: {len(all_masks)}", verbose=verbose)
+        img_h, img_w = image_cv.shape[:2]
+        for i, (mask, box) in enumerate(zip(all_masks, all_boxes)):
+            x0_f, y0_f, x1_f, y1_f = box.tolist()
+            x0 = int(np.floor(max(0, min(x0_f, img_w))))
+            y0 = int(np.floor(max(0, min(y0_f, img_h))))
+            x1 = int(np.ceil(max(0, min(x1_f, img_w))))
+            y1 = int(np.ceil(max(0, min(y1_f, img_h))))
+            if x1 <= x0 or y1 <= y0:
+                continue
+            bbox_mask = np.zeros((img_h, img_w), dtype=bool)
+            bbox_mask[y0:y1, x0:x1] = True
+            clipped_mask = np.logical_and(mask, bbox_mask)
+            detection = {
+                "bbox": (x0, y0, x1, y1),
+                "confidence": 1.0,  # Masks from SAM are high confidence
+                "class": "speech bubble",
+                "sam_mask": clipped_mask.astype(np.uint8) * 255,
+            }
+            detections.append(detection)
+        log_message("SAM2.1 segmentation completed successfully", verbose=verbose)
+        cache.set_sam_masks(sam_cache_key, detections)
+    except Exception as e:
+        log_message(
+            f"SAM2.1 segmentation failed: {e}. Falling back to YOLO segmentation masks.",
+            always_print=True,
+        )
+        detections = []
+        # Process primary boxes first in fallback to avoid duplicating secondary splits
+        fallback_boxes = []
+        if conjoined_detection and len(secondary_boxes) > 0 and conjoined_indices:
+            for idx in simple_indices:
+                fallback_boxes.append(("primary", idx, primary_boxes[idx]))
+            for _, s_indices in conjoined_indices:
+                for s_idx in s_indices:
+                    fallback_boxes.append(("secondary", s_idx, secondary_boxes[s_idx]))
+        elif len(primary_boxes) > 0:
+            for idx in range(len(primary_boxes)):
+                fallback_boxes.append(("primary", idx, primary_boxes[idx]))
+        img_h, img_w = image_cv.shape[:2]
+        primary_fallback_count = 0
+        secondary_fallback_count = 0
+        for _, (source, orig_idx, box) in enumerate(fallback_boxes):
+            x0_f, y0_f, x1_f, y1_f = box.tolist()
+            if source == "primary" and len(primary_results.boxes) > 0:
+                safe_idx = min(orig_idx, len(primary_results.boxes.conf) - 1)
+                conf = float(primary_results.boxes.conf[safe_idx])
+                cls_id = int(primary_results.boxes.cls[safe_idx])
+                cls_name = primary_model.names[cls_id]
+                sam_mask = _fallback_to_yolo_mask(primary_results, safe_idx, "binary")
+                primary_fallback_count += 1
+            elif source == "secondary" and "secondary_results" in locals():
+                try:
+                    safe_idx = min(orig_idx, len(secondary_results.boxes.conf) - 1)
+                    conf = float(secondary_results.boxes.conf[safe_idx])
+                except Exception:
+                    conf = conjoined_confidence
+                cls_name = "speech_bubble"
+                x0 = int(max(0, min(x0_f, img_w)))
+                y0 = int(max(0, min(y0_f, img_h)))
+                x1 = int(max(0, min(x1_f, img_w)))
+                y1 = int(max(0, min(y1_f, img_h)))
+                mask = np.zeros((img_h, img_w), dtype=np.uint8)
+                mask[y0:y1, x0:x1] = 255
+                sam_mask = mask
+                secondary_fallback_count += 1
+            else:
+                conf = conjoined_confidence
+                cls_name = "speech_bubble"
+                x0 = int(max(0, min(x0_f, img_w)))
+                y0 = int(max(0, min(y0_f, img_h)))
+                x1 = int(max(0, min(x1_f, img_w)))
+                y1 = int(max(0, min(y1_f, img_h)))
+                mask = np.zeros((img_h, img_w), dtype=np.uint8)
+                mask[y0:y1, x0:x1] = 255
+                sam_mask = mask
+            detection = {
+                "bbox": (
+                    int(round(x0_f)),
+                    int(round(y0_f)),
+                    int(round(x1_f)),
+                    int(round(y1_f)),
+                ),
+                "confidence": conf,
+                "class": cls_name,
+            }
+            detection["sam_mask"] = sam_mask
+            detections.append(detection)
+        log_message(
+            f"Fallback segmentation used {len(detections)} boxes "
+            f"(primary: {primary_fallback_count}, secondary splits: {secondary_fallback_count})",
+            verbose=verbose,
+        )
+        return detections, text_free_boxes
+    return detections, text_free_boxes
+def detect_panels(
+    image_path: Path,
+    confidence: float = 0.25,
+    device=None,
+    verbose=False,
+    image_override: Optional[Image.Image] = None,
+) -> List[Tuple[int, int, int, int]]:
+    """Detect manga/comic panels using YOLO model.
+    Args:
+        image_path (Path): Path to the input image
+        confidence (float): Confidence threshold for panel YOLO detections
+        device (torch.device, optional): The device to run the model on. Autodetects if None.
+        verbose (bool): Whether to show detailed processing information
+        image_override (Image.Image, optional): PIL Image to use instead of loading from path
+    Returns:
+        list: List of tuples (x1, y1, x2, y2) representing panel bounding boxes.
+              Only includes detections with class "frame".
+    """
+    _device = (
+        device
+        if device is not None
+        else torch.device(
+            "cuda"
+            if torch.cuda.is_available()
+            else "mps" if torch.backends.mps.is_available() else "cpu"
+        )
+    )
+    try:
+        if image_override is not None:
+            image_pil = (
+                image_override
+                if image_override.mode == "RGB"
+                else image_override.convert("RGB")
+            )
+            image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+        else:
+            image_cv = cv2.imread(str(image_path))
+            if image_cv is None:
+                raise ImageProcessingError(f"Could not read image at {image_path}")
+            image_pil = Image.fromarray(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))
+        log_message(
+            f"Processing image for panel detection: {image_path.name if image_path else 'override'} "
+            f"({image_cv.shape[1]}x{image_cv.shape[0]})",
+            verbose=verbose,
+        )
+    except Exception as e:
+        raise ImageProcessingError(f"Error loading image: {e}")
+    model_manager = get_model_manager()
+    try:
+        panel_model = model_manager.load_yolo_panel(verbose=verbose)
+    except Exception as e:
+        raise ModelError(f"Error loading panel model: {e}")
+    try:
+        results = panel_model(image_cv, conf=confidence, device=_device, verbose=False)[
+            0
+        ]
+        boxes = results.boxes.xyxy if results.boxes is not None else torch.tensor([])
+        classes = results.boxes.cls if results.boxes is not None else torch.tensor([])
+        if len(boxes) == 0:
+            log_message("No panels detected", verbose=verbose)
+            return []
+        # Filter for "frame" class (panel class)
+        frame_class_id = None
+        if hasattr(panel_model, "names"):
+            for class_id, class_name in panel_model.names.items():
+                if class_name.lower() == "frame":
+                    frame_class_id = class_id
+                    break
+        panel_boxes = []
+        for i, box in enumerate(boxes):
+            # If we found a frame class ID, only include detections of that class
+            # Otherwise, include all detections (fallback)
+            if frame_class_id is not None:
+                if int(classes[i]) != frame_class_id:
+                    continue
+            x0_f, y0_f, x1_f, y1_f = box.tolist()
+            panel_boxes.append(
+                (
+                    int(round(x0_f)),
+                    int(round(y0_f)),
+                    int(round(x1_f)),
+                    int(round(y1_f)),
+                )
+            )
+        return panel_boxes
+    except Exception as e:
+        log_message(
+            f"Panel detection failed: {e}. Proceeding without panel information.",
+            always_print=True,
+        )
+        return []

core/image/image_utils.py ADDED Viewed

	@@ -0,0 +1,779 @@

+import gc
+import io
+import os
+import tempfile
+from pathlib import Path
+from typing import Tuple
+import cv2
+import numpy as np
+try:
+    import oxipng
+    OXIPNG_AVAILABLE = True
+except ImportError:
+    oxipng = None
+    OXIPNG_AVAILABLE = False
+import torch
+from PIL import Image
+from core.caching import get_cache
+from core.ml.model_manager import get_model_manager
+from utils.exceptions import ImageProcessingError
+from utils.logging import log_message
+def pil_to_cv2(pil_image):
+    """
+    Convert PIL Image to OpenCV format (numpy array)
+    Args:
+        pil_image (PIL.Image): PIL Image object
+    Returns:
+        numpy.ndarray: OpenCV image in BGR format
+    """
+    rgb_image = np.array(pil_image)
+    if len(rgb_image.shape) == 3:
+        if rgb_image.shape[2] == 3:  # RGB
+            return cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)
+        elif rgb_image.shape[2] == 4:  # RGBA
+            return cv2.cvtColor(rgb_image, cv2.COLOR_RGBA2BGRA)
+    return rgb_image
+def cv2_to_pil(cv2_image):
+    """
+    Convert OpenCV image to PIL Image
+    Args:
+        cv2_image (numpy.ndarray): OpenCV image in BGR or BGRA format
+    Returns:
+        PIL.Image: PIL Image object
+    """
+    if len(cv2_image.shape) == 3:
+        if cv2_image.shape[2] == 3:  # BGR
+            rgb_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
+            return Image.fromarray(rgb_image)
+        elif cv2_image.shape[2] == 4:  # BGRA
+            rgba_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGRA2RGBA)
+            return Image.fromarray(rgba_image)
+    return Image.fromarray(cv2_image)
+def save_image_with_compression(
+    image, output_path, jpeg_quality=95, png_compression=2, verbose=False
+):
+    """
+    Save an image with specified compression settings.
+    Args:
+        image (PIL.Image): Image to save
+        output_path (str or Path): Path to save the image
+        jpeg_quality (int): JPEG quality (1-100, higher is better quality)
+        png_compression (int): PNG compression level (0-6, higher is more compression)
+        verbose (bool): Whether to print verbose logging
+    Raises:
+        ImageProcessingError: If image saving fails
+    """
+    output_path = (
+        Path(output_path) if not isinstance(output_path, Path) else output_path
+    )
+    extension = output_path.suffix.lower()
+    output_format = None
+    save_options = {}
+    if extension in [".jpg", ".jpeg"]:
+        output_format = "JPEG"
+        # JPEG doesn't support transparency - composite on white background
+        if image.mode in ["RGBA", "LA"]:
+            log_message(
+                f"Converting {image.mode} to RGB for JPEG output", verbose=verbose
+            )
+            background = Image.new("RGB", image.size, (255, 255, 255))
+            alpha_channel = image.split()[-1] if image.mode in ["RGBA", "LA"] else None
+            background.paste(image, mask=alpha_channel)
+            image = background
+        elif image.mode == "P":  # Handle Palette mode
+            log_message("Converting P mode to RGB for JPEG output", verbose=verbose)
+            image = image.convert("RGB")
+        elif image.mode != "RGB":
+            log_message(
+                f"Converting {image.mode} mode to RGB for JPEG output", verbose=verbose
+            )
+            image = image.convert("RGB")
+        save_options["quality"] = max(1, min(jpeg_quality, 100))
+        log_message(
+            f"Saving JPEG image with quality {save_options['quality']} to {output_path}",
+            verbose=verbose,
+        )
+    elif extension == ".png":
+        output_format = "PNG"
+        oxipng_level = min(6, max(0, int(png_compression)))
+        log_message(
+            f"Saving PNG image with compression level {oxipng_level} to {output_path}",
+            verbose=verbose,
+        )
+    elif extension == ".webp":
+        output_format = "WEBP"
+        save_options["lossless"] = True
+        log_message(
+            f"Saving WEBP image with lossless quality to {output_path}", verbose=verbose
+        )
+    else:
+        log_message(
+            f"Warning: Unknown output extension '{extension}'. Saving as PNG.",
+            verbose=verbose,
+            always_print=True,
+        )
+        output_format = "PNG"
+        output_path = output_path.with_suffix(".png")
+        oxipng_level = min(6, max(0, int(png_compression)))
+        log_message(
+            f"Saving PNG image with compression level {oxipng_level} to {output_path}",
+            verbose=verbose,
+        )
+    try:
+        os.makedirs(output_path.parent, exist_ok=True)
+        if output_format == "PNG":
+            if OXIPNG_AVAILABLE:
+                buffer = io.BytesIO()
+                image.save(buffer, format="PNG")
+                png_data = buffer.getvalue()
+                try:
+                    optimized_data = oxipng.optimize_from_memory(
+                        png_data, level=oxipng_level, optimize_alpha=True
+                    )
+                    with open(output_path, "wb") as f:
+                        f.write(optimized_data)
+                except oxipng.PngError as e:
+                    log_message(
+                        f"oxipng optimization failed: {e}. Falling back to Pillow save.",
+                        verbose=verbose,
+                        always_print=True,
+                    )
+                    # Fallback to Pillow if oxipng fails
+                    image.save(
+                        str(output_path),
+                        format="PNG",
+                        compress_level=max(0, min(png_compression, 6)),
+                        optimize=True,
+                    )
+            else:
+                # oxipng not available, use Pillow directly
+                image.save(
+                    str(output_path),
+                    format="PNG",
+                    compress_level=max(0, min(png_compression, 6)),
+                    optimize=True,
+                )
+        else:
+            # Use Pillow for non-PNG formats
+            image.save(str(output_path), format=output_format, **save_options)
+        return True
+    except Exception as e:
+        log_message(f"Error saving image to {output_path}: {e}", always_print=True)
+        raise ImageProcessingError(f"Failed to save image to {output_path}") from e
+def calculate_centroid_expansion_box(
+    cleaned_mask: np.ndarray, padding_pixels: float = 5.0, verbose: bool = False
+) -> Tuple[Tuple[int, int, int, int], Tuple[float, float]]:
+    """
+    Calculates guaranteed safe rendering box using the 5-step Distance Transform Insetting Method.
+    This function implements a sophisticated algorithm to find the optimal text placement area
+    within a speech bubble, ensuring text never touches the bubble boundaries. The method uses
+    computer vision techniques to create a safe zone for text rendering.
+    Algorithm Overview:
+    The 5-step Distance Transform Insetting Method works as follows:
+    1. Establish Safe Zone:
+       - Uses cv2.distanceTransform() to compute the distance from each pixel to the nearest
+         bubble edge (0 pixels)
+       - Creates a safe_area_mask where distance >= padding_pixels
+       - This ensures all pixels in the safe zone are at least padding_pixels away from edges
+    2. Find Unbiased Anchor:
+       - Calculates the centroid (geometric center) of the safe_area_mask using cv2.moments()
+       - This provides an unbiased starting point for text placement
+       - The centroid represents the "center of mass" of the safe area
+    3. Measure Available Space:
+       - Performs ray-casting from the centroid in four cardinal directions (left, right, up, down)
+       - Measures distances to the nearest safe area boundary in each direction
+       - Uses numpy array operations for efficient distance calculation
+    4. Calculate Symmetrical Dimensions:
+       - Takes the minimum distance in each axis to ensure the box fits in all directions
+       - Multiplies by 2 to create symmetrical width and height around the centroid
+       - Subtracts 1 pixel margin for safety
+    5. Construct Final Box:
+       - Creates a centered rectangle within the safe zone
+       - Ensures the box is completely contained within the original mask bounds
+       - Returns both the box coordinates and the true centroid for precise text positioning
+    Why This Approach Works:
+    - Distance Transform provides accurate edge detection and safe zone calculation
+    - Ray-casting ensures the text box never touches bubble boundaries
+    - Centroid-based approach provides natural, visually appealing text placement
+    - Symmetrical dimensions prevent text from appearing off-center
+    - The method handles complex bubble shapes (ovals, irregular polygons, etc.)
+    Args:
+        cleaned_mask: Binary mask (0/255) of the cleaned speech bubble where 255 represents
+                     the bubble interior and 0 represents the background
+        padding_pixels: Minimum distance in pixels that text must maintain from bubble edges.
+                       Higher values create more padding but smaller text areas.
+        verbose: Whether to print detailed processing information for debugging
+    Returns:
+        Tuple containing:
+        - Tuple[int, int, int, int]: Safe box coordinates as [x, y, width, height] where
+          (x, y) is the top-left corner.
+        - Tuple[float, float]: True geometric center (centroid) of the safe area as (cx, cy).
+    Raises:
+        ImageProcessingError: If mask is invalid or calculation fails
+    Example:
+        >>> mask = np.zeros((100, 100), dtype=np.uint8)
+        >>> cv2.ellipse(mask, (50, 50), (40, 30), 0, 0, 360, 255, -1)
+        >>> box, centroid = calculate_centroid_expansion_box(mask, padding_pixels=10.0)
+        >>> log_message(f"Safe box: {box}, Centroid: {centroid}", verbose=True)
+        Safe box: (20, 30, 60, 40), Centroid: (50.0, 50.0)
+    """
+    if cleaned_mask is None or not np.any(cleaned_mask):
+        raise ImageProcessingError("Invalid or empty mask provided")
+    try:
+        # Create safe area using distance transform
+        distance_map = cv2.distanceTransform(
+            cleaned_mask, cv2.DIST_L2, cv2.DIST_MASK_PRECISE
+        )
+        safe_area_mask = (distance_map >= padding_pixels).astype(np.uint8) * 255
+        if not np.any(safe_area_mask):
+            log_message(
+                f"Safe area calculation failed: padding {padding_pixels:.0f}px too large",
+                verbose=verbose,
+                always_print=True,
+            )
+            raise ImageProcessingError("Failed to create safe area mask")
+        # Find centroid of safe area
+        moments = cv2.moments(safe_area_mask)
+        if moments["m00"] == 0:
+            raise ImageProcessingError("Safe area mask has no area")
+        centroid_x = moments["m10"] / moments["m00"]
+        centroid_y = moments["m01"] / moments["m00"]
+        # Check if centroid is in a constricted region (dual/conjoined bubbles)
+        _, max_val, _, max_loc = cv2.minMaxLoc(distance_map)
+        cx_int, cy_int = int(round(centroid_x)), int(round(centroid_y))
+        mask_h, mask_w = safe_area_mask.shape
+        cx_int = max(0, min(cx_int, mask_w - 1))
+        cy_int = max(0, min(cy_int, mask_h - 1))
+        dist_at_centroid = distance_map[cy_int, cx_int]
+        if dist_at_centroid < max_val * 0.70:
+            log_message(
+                f"Centroid in constricted region (dist={dist_at_centroid:.1f} vs max={max_val:.1f}). "
+                "Moving anchor to pole of inaccessibility.",
+                verbose=verbose,
+            )
+            centroid_x, centroid_y = float(max_loc[0]), float(max_loc[1])
+        centroid = (centroid_x, centroid_y)
+        # Ray-cast from centroid to find maximum safe dimensions
+        cx, cy = int(round(centroid_x)), int(round(centroid_y))
+        mask_h, mask_w = safe_area_mask.shape
+        # Verify centroid is within safe area, adjust if needed
+        if (
+            cy < 0
+            or cy >= mask_h
+            or cx < 0
+            or cx >= mask_w
+            or safe_area_mask[cy, cx] != 255
+        ):
+            # Centroid is outside safe area, find nearest safe pixel
+            safe_pixels = np.argwhere(safe_area_mask == 255)
+            if safe_pixels.size == 0:
+                raise ImageProcessingError("No safe pixels found in safe_area_mask")
+            # Find nearest safe pixel to calculated centroid
+            distances = np.sqrt(
+                (safe_pixels[:, 0] - centroid_y) ** 2
+                + (safe_pixels[:, 1] - centroid_x) ** 2
+            )
+            nearest_idx = np.argmin(distances)
+            cy, cx = safe_pixels[nearest_idx]
+            # Update centroid to the adjusted position
+            centroid_x, centroid_y = float(cx), float(cy)
+            centroid = (centroid_x, centroid_y)
+        left_zeros = np.where(safe_area_mask[cy, 0:cx] == 0)[0]
+        dist_to_left_edge = cx - (left_zeros.max() if left_zeros.size > 0 else 0)
+        right_zeros = np.where(safe_area_mask[cy, cx:] == 0)[0]
+        dist_to_right_edge = right_zeros.min() if right_zeros.size > 0 else mask_w - cx
+        up_zeros = np.where(safe_area_mask[0:cy, cx] == 0)[0]
+        dist_to_top_edge = cy - (up_zeros.max() if up_zeros.size > 0 else 0)
+        down_zeros = np.where(safe_area_mask[cy:, cx] == 0)[0]
+        dist_to_bottom_edge = down_zeros.min() if down_zeros.size > 0 else mask_h - cy
+        # Only subtract 1 if distance > 1, otherwise use the distance directly
+        # This prevents collapsing 1-pixel safe areas to 0x0
+        min_width_dist = min(dist_to_left_edge, dist_to_right_edge)
+        min_height_dist = min(dist_to_top_edge, dist_to_bottom_edge)
+        safe_width_base = min_width_dist - 1 if min_width_dist > 1 else min_width_dist
+        safe_height_base = (
+            min_height_dist - 1 if min_height_dist > 1 else min_height_dist
+        )
+        max_safe_width = 2 * max(0, safe_width_base)
+        max_safe_height = 2 * max(0, safe_height_base)
+        if max_safe_width <= 0 or max_safe_height <= 0:
+            log_message(
+                f"Invalid safe area dimensions: {max_safe_width:.0f}x{max_safe_height:.0f}",
+                verbose=verbose,
+                always_print=True,
+            )
+            raise ImageProcessingError("Failed to create safe area mask")
+        box_x_float = centroid_x - max_safe_width / 2.0
+        box_y_float = centroid_y - max_safe_height / 2.0
+        box_x = int(round(box_x_float))
+        box_y = int(round(box_y_float))
+        guaranteed_box = (box_x, box_y, max_safe_width, max_safe_height)
+        if (
+            box_x >= 0
+            and box_y >= 0
+            and box_x + max_safe_width <= mask_w
+            and box_y + max_safe_height <= mask_h
+        ):
+            log_message(
+                f"Safe area: {max_safe_width:.0f}x{max_safe_height:.0f} at ({centroid_x:.0f}, {centroid_y:.0f})",
+                verbose=verbose,
+            )
+            return guaranteed_box, centroid
+        else:
+            log_message(
+                f"Safe area validation failed: exceeds bounds {mask_w}x{mask_h}",
+                verbose=verbose,
+                always_print=True,
+            )
+            raise ImageProcessingError("Failed to create safe area mask")
+    except (cv2.error, ValueError, IndexError, ZeroDivisionError, OverflowError) as e:
+        log_message(
+            f"Safe area calculation error: {e}", verbose=verbose, always_print=True
+        )
+    except Exception as e:
+        log_message(
+            f"Safe area calculation failed: {e}", verbose=verbose, always_print=True
+        )
+    raise ImageProcessingError("Safe area calculation failed")
+def image_to_tensor(image: Image.Image, device: torch.device) -> torch.Tensor:
+    """Converts a PIL Image to a PyTorch tensor."""
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    img_np = np.array(image).astype(np.float32) / 255.0
+    if img_np.ndim == 2:  # Grayscale to RGB
+        img_np = np.stack((img_np,) * 3, axis=-1)
+    return torch.from_numpy(img_np).permute(2, 0, 1).unsqueeze(0).to(device)
+def tensor_to_image(tensor: torch.Tensor) -> Image.Image:
+    """Converts a PyTorch tensor to a PIL Image."""
+    img_np = (
+        tensor.squeeze(0).permute(1, 2, 0).clamp(0, 1).cpu().numpy() * 255
+    ).astype(np.uint8)
+    return Image.fromarray(img_np)
+def _upscale_image(model, image: Image.Image, device: torch.device) -> Image.Image:
+    """Upscales a PIL image using the provided model."""
+    tensor_in = image_to_tensor(image, device)
+    with torch.no_grad():
+        tensor_out = model(tensor_in)
+    return tensor_to_image(tensor_out)
+def upscale_image_to_dimension(
+    model,
+    image: Image.Image,
+    target: int,
+    device: torch.device,
+    mode: str,
+    model_type: str = "model",
+    verbose: bool = False,
+) -> Image.Image:
+    """
+    Upscale until a dimensional target is reached.
+    Args:
+        mode: 'max' ensures max(width, height) >= target, 'min' ensures min(width, height) >= target
+        model_type: Model type identifier ("model" or "model_lite")
+    """
+    if mode not in {"max", "min"}:
+        raise ImageProcessingError("mode must be 'max' or 'min'")
+    # Validate input image dimensions
+    if image.width <= 0 or image.height <= 0:
+        log_message(
+            f"Invalid image dimensions: {image.width}x{image.height}. Cannot upscale 0x0 images.",
+            always_print=True,
+        )
+        raise ImageProcessingError(
+            f"Invalid image dimensions: {image.width}x{image.height}. Cannot upscale 0x0 images."
+        )
+    cache = get_cache()
+    cache_key = cache.get_upscale_dimension_cache_key(image, target, mode, model_type)
+    cached_result = cache.get_upscaled_image(cache_key)
+    if cached_result is not None:
+        log_message("  - Using cached upscaled image", verbose=verbose)
+        return cached_result
+    current_image = image
+    def met(w: int, h: int) -> bool:
+        return (max(w, h) >= target) if mode == "max" else (min(w, h) >= target)
+    if met(current_image.width, current_image.height):
+        cache.set_upscaled_image(cache_key, current_image, verbose)
+        return current_image
+    log_message(
+        f"Upscaling from {current_image.width}x{current_image.height}...",
+        verbose=verbose,
+    )
+    current_image = _upscale_image(model, current_image, device)
+    log_message(f"...to {current_image.width}x{current_image.height}", verbose=verbose)
+    # Save intermediate image to disk if more passes will be needed
+    if not met(current_image.width, current_image.height):
+        temp_file = None
+        try:
+            temp_fd, temp_file = tempfile.mkstemp(suffix=".png")
+            os.close(temp_fd)
+            current_image.save(temp_file, format="PNG")
+            with Image.open(temp_file) as img_tmp:
+                img_tmp.load()
+                new_image = img_tmp.copy()
+            del current_image
+            gc.collect()
+            current_image = new_image
+            log_message(
+                "Saved and reloaded intermediate image before additional passes",
+                verbose=verbose,
+            )
+        except Exception as e:
+            log_message(
+                f"Warning: Failed to save intermediate image to disk: {e}. Continuing with in-memory processing.",
+                verbose=verbose,
+            )
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception:
+                    pass
+    while not met(current_image.width, current_image.height):
+        log_message(
+            f"Upscaling from {current_image.width}x{current_image.height} (additional pass)...",
+            verbose=verbose,
+        )
+        current_image = _upscale_image(model, current_image, device)
+        log_message(
+            f"...to {current_image.width}x{current_image.height}", verbose=verbose
+        )
+        # Save intermediate image to disk to free memory
+        temp_file = None
+        try:
+            temp_fd, temp_file = tempfile.mkstemp(suffix=".png")
+            os.close(temp_fd)
+            current_image.save(temp_file, format="PNG")
+            del current_image
+            gc.collect()
+            with Image.open(temp_file) as img_tmp:
+                img_tmp.load()
+                current_image = img_tmp.copy()
+            log_message(
+                "Saved and reloaded intermediate image to free memory",
+                verbose=verbose,
+            )
+        except Exception as e:
+            log_message(
+                f"Warning: Failed to save intermediate image to disk: {e}. Continuing with in-memory processing.",
+                verbose=verbose,
+            )
+        finally:
+            if temp_file and os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception:
+                    pass  # Ignore errors during cleanup
+    cache.set_upscaled_image(cache_key, current_image, verbose)
+    return current_image
+def upscale_image(
+    image: Image.Image, factor: float, model_type: str = "model", verbose: bool = False
+) -> Image.Image:
+    """Upscales an image by a given factor.
+    Args:
+        image: Image to upscale
+        factor: Upscaling factor
+        model_type: Model type to use - "model" or "model_lite"
+        verbose: Whether to print verbose logging
+    """
+    if factor == 1.0:
+        return image
+    cache = get_cache()
+    cache_key = cache.get_upscale_cache_key(image, factor, model_type)
+    cached_upscale = cache.get_upscaled_image(cache_key)
+    if cached_upscale is not None:
+        log_message("  - Using cached upscaled image", verbose=verbose)
+        return cached_upscale
+    model_manager = get_model_manager()
+    if model_type == "model_lite":
+        upscale_model = model_manager.load_upscale_lite()
+        log_message(f"Upscaling image by {factor}x with lite model...", verbose=verbose)
+    else:
+        upscale_model = model_manager.load_upscale()
+        log_message(f"Upscaling image by {factor}x...", verbose=verbose)
+    device = model_manager.device
+    target_width = int(image.width * factor)
+    target_height = int(image.height * factor)
+    upscaled_image = upscale_image_to_dimension(
+        upscale_model,
+        image,
+        max(target_width, target_height),
+        device,
+        "max",
+        model_type,
+        verbose,
+    )
+    result = upscaled_image.resize((target_width, target_height), Image.LANCZOS)
+    cache.set_upscaled_image(cache_key, result)
+    return result
+def resize_to_max_side(
+    image: Image.Image, max_side: int, verbose: bool = False
+) -> Image.Image:
+    """Resize so that the largest side equals max_side (aspect ratio preserved)."""
+    width, height = image.size
+    current_max = max(width, height)
+    if current_max == max_side:
+        return image
+    scale = max_side / current_max
+    new_width = max(1, int(round(width * scale)))
+    new_height = max(1, int(round(height * scale)))
+    log_message(
+        f"Resizing to max-side {max_side}: {width}x{height} -> {new_width}x{new_height}",
+        verbose=verbose,
+    )
+    return image.resize((new_width, new_height), Image.LANCZOS)
+def resize_to_min_side(
+    image: Image.Image, min_side: int, verbose: bool = False
+) -> Image.Image:
+    """Resize so that the smallest side equals min_side (aspect ratio preserved)."""
+    width, height = image.size
+    # Validate input image dimensions
+    if width <= 0 or height <= 0:
+        log_message(
+            f"Invalid image dimensions: {width}x{height}. Cannot resize 0x0 images.",
+            always_print=True,
+        )
+        raise ImageProcessingError(
+            f"Invalid image dimensions: {width}x{height}. Cannot resize 0x0 images."
+        )
+    current_min = min(width, height)
+    if current_min == min_side:
+        return image
+    scale = min_side / current_min
+    new_width = max(1, int(round(width * scale)))
+    new_height = max(1, int(round(height * scale)))
+    log_message(
+        f"Resizing to min-side {min_side}: {width}x{height} -> {new_width}x{new_height}",
+        verbose=verbose,
+    )
+    return image.resize((new_width, new_height), Image.LANCZOS)
+def convert_image_to_target_mode(
+    pil_image: Image.Image, target_mode: str, verbose: bool = False
+) -> Image.Image:
+    """
+    Convert a PIL image to the target color mode (RGB or RGBA).
+    Handles complex transparency flattening and mode conversion with multiple
+    fallback strategies to ensure robust image processing.
+    Args:
+        pil_image: The PIL image to convert
+        target_mode: Target mode ("RGB" or "RGBA")
+        verbose: Whether to print detailed logging
+    Returns:
+        PIL.Image: The converted image in the target mode
+    """
+    if pil_image.mode == target_mode:
+        return pil_image
+    if target_mode == "RGB":
+        if (
+            pil_image.mode == "RGBA"
+            or pil_image.mode == "LA"
+            or (pil_image.mode == "P" and "transparency" in pil_image.info)
+        ):
+            log_message(
+                f"Converting {pil_image.mode} to RGB (flattening transparency)",
+                verbose=verbose,
+            )
+            background = Image.new("RGB", pil_image.size, (255, 255, 255))
+            try:
+                mask = None
+                if pil_image.mode == "RGBA":
+                    mask = pil_image.split()[3]
+                elif pil_image.mode == "LA":
+                    mask = pil_image.split()[1]
+                elif pil_image.mode == "P" and "transparency" in pil_image.info:
+                    temp_rgba = pil_image.convert("RGBA")
+                    mask = temp_rgba.split()[3]
+                if mask:
+                    background.paste(pil_image, mask=mask)
+                    pil_image = background
+                else:
+                    pil_image = pil_image.convert("RGB")
+            except Exception as paste_err:
+                log_message(
+                    f"Warning: Paste failed, trying alpha_composite: {paste_err}",
+                    verbose=verbose,
+                )
+                try:
+                    background_comp = Image.new("RGB", pil_image.size, (255, 255, 255))
+                    img_rgba_for_composite = (
+                        pil_image
+                        if pil_image.mode == "RGBA"
+                        else pil_image.convert("RGBA")
+                    )
+                    pil_image = Image.alpha_composite(
+                        background_comp.convert("RGBA"), img_rgba_for_composite
+                    ).convert("RGB")
+                    log_message(
+                        "Alpha composite conversion successful", verbose=verbose
+                    )
+                except Exception as composite_err:
+                    log_message(
+                        f"Warning: Alpha composite failed, using simple convert: {composite_err}",
+                        verbose=verbose,
+                    )
+                    pil_image = pil_image.convert("RGB")  # Final fallback conversion
+        else:  # Non-transparent conversion to RGB
+            log_message(f"Converting {pil_image.mode} to RGB", verbose=verbose)
+            pil_image = pil_image.convert("RGB")
+    elif target_mode == "RGBA":
+        log_message(f"Converting {pil_image.mode} to RGBA", verbose=verbose)
+        pil_image = pil_image.convert("RGBA")
+    return pil_image
+def process_bubble_image_cached(
+    bubble_image_pil: Image.Image,
+    upscale_model,
+    device: torch.device,
+    target_min_side: int = 200,
+    mode: str = "min",
+    model_type: str = "model",
+    verbose: bool = False,
+) -> Image.Image:
+    """
+    Process a bubble image with upscaling, using cache for the complete pipeline.
+    This function handles the complete bubble processing pipeline:
+    1. Upscales the bubble to meet minimum size requirements
+    2. Resizes to exact minimum side length
+    3. Caches the final result
+    Args:
+        bubble_image_pil: The bubble image to process
+        upscale_model: The upscaling model to use
+        device: PyTorch device for model inference
+        target_min_side: Target minimum side length
+        mode: Upscaling mode ('max' or 'min')
+        model_type: Model type identifier ("model" or "model_lite")
+        verbose: Whether to print detailed logging
+    Returns:
+        Image.Image: The processed bubble image
+    """
+    cache = get_cache()
+    cache_key = cache.get_bubble_processing_cache_key(
+        bubble_image_pil, target_min_side, mode, model_type
+    )
+    cached_result = cache.get_upscaled_image(cache_key)
+    if cached_result is not None:
+        log_message("  - Using cached bubble processing result", verbose=verbose)
+        return cached_result
+    upscaled_bubble = upscale_image_to_dimension(
+        upscale_model,
+        bubble_image_pil,
+        target_min_side,
+        device,
+        mode,
+        model_type,
+        verbose,
+    )
+    resized_bubble = resize_to_min_side(upscaled_bubble, target_min_side, verbose)
+    cache.set_upscaled_image(cache_key, resized_bubble, verbose)
+    return resized_bubble

core/image/inpainting.py ADDED Viewed

	@@ -0,0 +1,773 @@

+import gc
+import math
+from typing import Dict, Optional, Tuple
+import numpy as np
+import torch
+from PIL import Image
+from scipy.ndimage import distance_transform_edt
+from core.caching import get_cache
+from core.ml.model_manager import get_model_manager
+from utils.logging import log_message
+# Blur parameters: the mask edge-blur radius is derived from the larger side of the mask's bounding box and then clamped
+BLUR_SCALE_FACTOR = (
+    0.1  # Fraction of max(bbox width, bbox height) used as the blur radius
+)
+MIN_BLUR_RADIUS = 1  # Lower clamp for the computed blur radius, in pixels
+MAX_BLUR_RADIUS = 10  # Upper clamp for the computed blur radius, in pixels
+# Inpainting parameters: defaults copied onto each FluxKontextInpainter instance
+FLUX_GUIDANCE_SCALE = 2.5  # Guidance scale stored as the inpainter's guidance_scale
+CONTEXT_PADDING_RATIO = 0.5  # Context padding is 50% of the larger mask-bbox side
+MAX_CONTEXT_PADDING = 80  # Context padding is capped at 80 pixels
+class FluxKontextInpainter:
+    """Inpainter using Flux Kontext models for text removal."""
+    def __init__(
+        self,
+        device: Optional[torch.device] = None,
+        huggingface_token: str = "",
+        num_inference_steps: int = 15,
+        residual_diff_threshold: float = 0.15,
+    ):
+        """Initialize the Flux Kontext Inpaint class.
+        Args:
+            device: PyTorch device to use. Auto-detects if None.
+            huggingface_token: HuggingFace token for model downloads.
+            num_inference_steps: Number of denoising steps for inference.
+            residual_diff_threshold: Residual diff threshold for Flux caching (0.0-1.0).
+        """
+        self.DEVICE = (
+            device
+            if device is not None
+            else torch.device(
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps"
+                if torch.backends.mps.is_available()
+                else "cpu"
+            )
+        )
+        self.DTYPE = (
+            torch.bfloat16
+            if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+            else torch.float16
+            if self.DEVICE.type == "mps"
+            else torch.float32
+        )
+        self.huggingface_token = huggingface_token
+        self.num_inference_steps = num_inference_steps
+        self.residual_diff_threshold = residual_diff_threshold
+        self.manager = get_model_manager()
+        self.cache = get_cache()
+        # Preferred resolutions for optimal Flux performance
+        self.PREFERED_KONTEXT_RESOLUTIONS = [
+            (672, 1568),
+            (688, 1504),
+            (720, 1456),
+            (752, 1392),
+            (800, 1328),
+            (832, 1248),
+            (880, 1184),
+            (944, 1104),
+            (1024, 1024),
+            (1104, 944),
+            (1184, 880),
+            (1248, 832),
+            (1328, 800),
+            (1392, 752),
+            (1456, 720),
+            (1504, 688),
+            (1568, 672),
+        ]
+        self.pipeline = None
+        self.transformer = None
+        self.text_encoder_2 = None
+        # Fixed parameters optimized for text removal
+        self.guidance_scale = FLUX_GUIDANCE_SCALE
+        self.prompt = "Remove all text."
+        self.context_padding_ratio = CONTEXT_PADDING_RATIO
+        self.max_context_padding = MAX_CONTEXT_PADDING
+    def load_models(self):
+        """Load Flux Kontext models via model manager."""
+        if self.pipeline is not None:
+            return
+        if self.huggingface_token:
+            self.manager.set_flux_hf_token(self.huggingface_token)
+        self.manager.set_flux_residual_diff_threshold(self.residual_diff_threshold)
+        self.transformer, self.text_encoder_2, self.pipeline = (
+            self.manager.load_flux_models()
+        )
+    def unload_models(self):
+        """Unload Flux models via model manager to free up memory."""
+        self.pipeline = None
+        self.transformer = None
+        self.text_encoder_2 = None
+        self.manager.unload_flux_models()
+    def convert_mask_to_tensor(self, mask_np):
+        """Convert a numpy mask to the tensor format expected by the pipeline.
+        Args:
+            mask_np: Numpy mask array (H, W) with True/False values
+        Returns:
+            torch.Tensor: Mask tensor in CHW format (1.0 for areas to keep, 0.0 for areas to inpaint)
+        """
+        # Invert mask: True = inpaint (0.0), False = keep (1.0)
+        mask_float = mask_np.astype(np.float32)
+        mask_inverted = 1.0 - mask_float
+        mask_tensor = torch.from_numpy(mask_inverted).unsqueeze(0)
+        return mask_tensor
+    def flux_kontext_image_scale(self, image_pil):
+        """Find the closest preferred resolution and resize the image.
+        Args:
+            image_pil (PIL.Image): Input image to scale
+        Returns:
+            PIL.Image: Scaled image at the closest preferred resolution
+        """
+        w_in, h_in = image_pil.size
+        if w_in == 0 or h_in == 0:
+            return image_pil
+        ar = w_in / h_in
+        # Find resolution with minimum aspect ratio difference
+        _, w_opt, h_opt = min(
+            (abs(ar - w / h), w, h) for (w, h) in self.PREFERED_KONTEXT_RESOLUTIONS
+        )
+        log_message(
+            f"  - Original image size: {w_in}x{h_in} (AR: {ar:.2f})", always_print=True
+        )
+        log_message(
+            f"  - Scaling to nearest preferred resolution: {w_opt}x{h_opt}",
+            always_print=True,
+        )
+        if (w_in, h_in) == (w_opt, h_opt):
+            return image_pil
+        # Use LANCZOS for high-quality downscaling
+        image_scaled = image_pil.resize((w_opt, h_opt), Image.Resampling.LANCZOS)
+        return image_scaled
+    def compute_mask_bbox_aspect_ratio(
+        self,
+        mask_chw,
+        padding,
+        blur_radius,
+        target_ar=None,
+        transpose=False,
+        preferred_resolutions=None,
+        verbose=False,
+    ):
+        """Compute an optimized bounding box for the mask with aspect ratio adjustment.
+        Args:
+            mask_chw (torch.Tensor): Input mask tensor in CHW format
+            padding (int): Padding around the mask bounding box
+            blur_radius (int): Radius for edge blur effect
+            target_ar (float, optional): Target aspect ratio
+            transpose (bool): Whether to transpose the aspect ratio logic
+            preferred_resolutions (list, optional): List of preferred resolutions
+            verbose (bool): Whether to print verbose output
+        Returns:
+            tuple: (mask_for_composite, x, y, width, height)
+        """
+        if mask_chw.dim() == 4:
+            mask = mask_chw[0, 0]
+        else:
+            mask = mask_chw[0]
+        H, W = mask.shape[0], mask.shape[1]
+        hard = mask.clone().unsqueeze(0)
+        if blur_radius > 0:
+            # Create smooth falloff at mask edges for better blending
+            m_bool = hard[0].cpu().to(torch.float32).numpy().astype(bool)
+            d_out = distance_transform_edt(~m_bool)
+            d_in = distance_transform_edt(m_bool)
+            alpha = np.zeros_like(d_out, np.float32)
+            alpha[d_in > 0] = 1.0
+            ramp = np.clip(1.0 - (d_out / blur_radius), 0.0, 1.0)
+            alpha[d_out > 0] = ramp[d_out > 0]
+            mask_blur_full = torch.from_numpy(alpha)[None, ...].to(hard.device)
+        else:
+            mask_blur_full = hard.clone()
+        ys, xs = torch.where(hard[0] > 0)
+        if len(ys) == 0:
+            return (
+                torch.zeros((1, H, W), device=mask_chw.device, dtype=mask_chw.dtype),
+                0,
+                0,
+                W,
+                H,
+            )
+        x1 = max(0, int(xs.min()) - padding)
+        x2 = min(W, int(xs.max()) + 1 + padding)
+        y1 = max(0, int(ys.min()) - padding)
+        y2 = min(H, int(ys.max()) + 1 + padding)
+        w0 = x2 - x1
+        h0 = y2 - y1
+        if preferred_resolutions:
+            if h0 == 0:
+                initial_ar = W / H
+            else:
+                initial_ar = w0 / h0
+            log_message(
+                f"  - Initial mask bounding box AR: {initial_ar:.2f}",
+                verbose=verbose,
+            )
+            # Snap to closest preferred aspect ratio
+            _, w_opt, h_opt = min(
+                (abs(initial_ar - w / h), w, h) for (w, h) in preferred_resolutions
+            )
+            ar = w_opt / h_opt
+            log_message(
+                f"  - Snapping to closest preferred AR: {ar:.2f} ({w_opt}x{h_opt})",
+                verbose=verbose,
+            )
+        else:
+            ar = target_ar
+        req_w = math.ceil(h0 * ar)
+        req_h = math.floor(w0 / ar)
+        new_x1, new_x2 = x1, x2
+        new_y1, new_y2 = y1, y2
+        flush_left = x1 == 0
+        flush_right = x2 == W
+        flush_top = y1 == 0
+        flush_bot = y2 == H
+        if not transpose:
+            if req_w > w0:
+                target_w = min(W, req_w)
+                delta = target_w - w0
+                if flush_right:
+                    new_x1, new_x2 = W - target_w, W
+                elif flush_left:
+                    new_x1, new_x2 = 0, target_w
+                else:
+                    off = delta // 2
+                    new_x1 = max(0, x1 - off)
+                    new_x2 = new_x1 + target_w
+                    if new_x2 > W:
+                        new_x2 = W
+                        new_x1 = W - target_w
+            elif req_h > h0:
+                target_h = min(H, req_h)
+                delta = target_h - h0
+                if flush_bot:
+                    new_y1, new_y2 = H - target_h, H
+                elif flush_top:
+                    new_y1, new_y2 = 0, target_h
+                else:
+                    off = delta // 2
+                    new_y1 = max(0, y1 - off)
+                    new_y2 = new_y1 + target_h
+                    if new_y2 > H:
+                        new_y2 = H
+                        new_y1 = H - target_h
+        else:  # Transpose logic
+            if req_h > h0:
+                target_h = min(H, req_h)
+                delta = target_h - h0
+                if flush_bot:
+                    new_y1, new_y2 = H - target_h, H
+                elif flush_top:
+                    new_y1, new_y2 = 0, target_h
+                else:
+                    off = delta // 2
+                    new_y1 = max(0, y1 - off)
+                    new_y2 = new_y1 + target_h
+                    if new_y2 > H:
+                        new_y2 = H
+                        new_y1 = H - target_h
+            elif req_w > w0:
+                target_w = min(W, req_w)
+                delta = target_w - w0
+                if flush_right:
+                    new_x1, new_x2 = W - target_w, W
+                elif flush_left:
+                    new_x1, new_x2 = 0, target_w
+                else:
+                    off = delta // 2
+                    new_x1 = max(0, x1 - off)
+                    new_x2 = new_x1 + target_w
+                    if new_x2 > W:
+                        new_x2 = W
+                        new_x1 = W - target_w
+        final_w = new_x2 - new_x1
+        final_h = new_y2 - new_y1
+        # Return cropped mask for compositing
+        mask_for_composite = mask_blur_full[:, new_y1:new_y2, new_x1:new_x2]
+        return (
+            mask_for_composite.to(mask_chw.device, dtype=mask_chw.dtype),
+            int(new_x1),
+            int(new_y1),
+            int(final_w),
+            int(final_h),
+        )
+    def image_alpha_fix(self, destination, source):
+        """Ensure destination and source tensors have compatible channel dimensions.
+        Args:
+            destination (torch.Tensor): Destination tensor
+            source (torch.Tensor): Source tensor
+        Returns:
+            tuple: (destination, source) with compatible dimensions
+        """
+        dest_channels = destination.shape[-1]
+        source_channels = source.shape[-1]
+        if dest_channels == source_channels:
+            return destination, source
+        if dest_channels > source_channels:
+            # Pad source to match destination's channel count
+            padding = torch.ones(
+                (*source.shape[:-1], dest_channels - source_channels),
+                device=source.device,
+                dtype=source.dtype,
+            )
+            source = torch.cat([source, padding], dim=-1)
+        else:  # source_channels > dest_channels
+            # Truncate source to match destination's channel count
+            source = source[..., :dest_channels]
+        return destination, source
+    def repeat_to_batch_size(self, tensor, batch_size):
+        """Adjust tensor batch size by repeating or truncating as needed.
+        Args:
+            tensor (torch.Tensor): Input tensor
+            batch_size (int): Target batch size
+        Returns:
+            torch.Tensor: Tensor with the specified batch size
+        """
+        if tensor.shape[0] > batch_size:
+            return tensor[:batch_size]
+        elif tensor.shape[0] < batch_size:
+            return tensor.repeat(batch_size, 1, 1, 1)
+        return tensor
+    def composite(
+        self, destination, source, x, y, mask=None, multiplier=1, resize_source=False
+    ):
+        """Composite source image onto destination at specified coordinates.
+        Args:
+            destination (torch.Tensor): Destination image tensor
+            source (torch.Tensor): Source image tensor
+            x (int): X coordinate for placement
+            y (int): Y coordinate for placement
+            mask (torch.Tensor, optional): Alpha mask for blending
+            multiplier (int): Coordinate multiplier
+            resize_source (bool): Whether to resize source to match destination
+        Returns:
+            torch.Tensor: Composited image tensor
+        """
+        source = source.to(destination.device)
+        if resize_source:
+            source = torch.nn.functional.interpolate(
+                source,
+                size=(destination.shape[2], destination.shape[3]),
+                mode="bilinear",
+            )
+        source = self.repeat_to_batch_size(source, destination.shape[0])
+        x = max(
+            -source.shape[3] * multiplier, min(x, destination.shape[3] * multiplier)
+        )
+        y = max(
+            -source.shape[2] * multiplier, min(y, destination.shape[2] * multiplier)
+        )
+        left, top = (x // multiplier, y // multiplier)
+        if mask is None:
+            mask = torch.ones_like(source)
+        else:
+            mask = mask.to(destination.device, copy=True)
+            mask = torch.nn.functional.interpolate(
+                mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])),
+                size=(source.shape[2], source.shape[3]),
+                mode="bilinear",
+            )
+            mask = self.repeat_to_batch_size(mask, source.shape[0])
+        visible_width = max(0, min(source.shape[3], destination.shape[3] - left))
+        visible_height = max(0, min(source.shape[2], destination.shape[2] - top))
+        if visible_width == 0 or visible_height == 0:
+            return destination
+        source_portion = source[:, :, :visible_height, :visible_width]
+        mask_portion = mask[:, :, :visible_height, :visible_width]
+        inverse_mask_portion = torch.ones_like(mask_portion) - mask_portion
+        destination_portion = destination[
+            :, :, top : top + visible_height, left : left + visible_width
+        ]
+        # Alpha blend source and destination using mask
+        blended_portion = (source_portion * mask_portion) + (
+            destination_portion * inverse_mask_portion
+        )
+        destination[:, :, top : top + visible_height, left : left + visible_width] = (
+            blended_portion
+        )
+        return destination
+    def image_composite_masked(
+        self, destination, source, x, y, resize_source, mask=None
+    ):
+        """Wrapper function that handles channel dimension compatibility.
+        Args:
+            destination (torch.Tensor): Destination image tensor
+            source (torch.Tensor): Source image tensor
+            x (int): X coordinate for placement
+            y (int): Y coordinate for placement
+            resize_source (bool): Whether to resize source to match destination
+            mask (torch.Tensor, optional): Alpha mask for blending
+        Returns:
+            torch.Tensor: Composited image tensor
+        """
+        destination, source = self.image_alpha_fix(destination, source)
+        destination = destination.clone().movedim(-1, 1)
+        output = self.composite(
+            destination, source.movedim(-1, 1), x, y, mask, 1, resize_source
+        ).movedim(1, -1)
+        return output
+    def inpaint_mask(
+        self,
+        image_pil: Image.Image,
+        mask_np: np.ndarray,
+        seed: int = 1,
+        verbose: bool = False,
+        ocr_params: Optional[Dict] = None,
+        strict_mask_clipping: bool = False,
+        composite_clip_bbox: Optional[Tuple[int, int, int, int]] = None,
+    ) -> Image.Image:
+        """Inpaint a specific mask region in the image.
+        Args:
+            image_pil: PIL Image to inpaint
+            mask_np: Numpy mask array (H, W) with True for areas to inpaint
+            seed: Random seed for inference
+            verbose: Whether to print verbose output
+            ocr_params: Optional OCR parameters dict for cache key generation
+            strict_mask_clipping: When True, ensure compositing is limited to the
+                original mask extent (no bleed from padding/blur)
+            composite_clip_bbox: Optional (x1, y1, x2, y2) bbox to clip the final
+                composite mask to, in original image coordinates.
+        Returns:
+            PIL.Image: The inpainted image
+        """
+        mask_np = np.asarray(mask_np)
+        if mask_np.dtype != bool:
+            mask_np = mask_np.astype(bool)
+        if not np.any(mask_np):
+            return image_pil
+        log_message(
+            "  - Computing optimized mask bounding box with blur and aspect ratio...",
+            verbose=verbose,
+        )
+        ys, xs = np.where(mask_np)
+        if len(ys) == 0 or len(xs) == 0:
+            return image_pil
+        x_min, x_max = int(xs.min()), int(xs.max())
+        y_min, y_max = int(ys.min()), int(ys.max())
+        bbox_width = x_max - x_min
+        bbox_height = y_max - y_min
+        padding_pixels = int(max(bbox_width, bbox_height) * self.context_padding_ratio)
+        padding = min(padding_pixels, self.max_context_padding)
+        log_message(
+            f"  - Proportional context padding: {padding_pixels}px, capped to: {padding}px",
+            verbose=verbose,
+        )
+        blur_radius = int(max(bbox_width, bbox_height) * BLUR_SCALE_FACTOR)
+        blur_radius = max(
+            MIN_BLUR_RADIUS, min(blur_radius, MAX_BLUR_RADIUS)
+        )  # clamp between MIN and MAX
+        log_message(f"  - Dynamic blur radius set to: {blur_radius}", verbose=verbose)
+        mask_tensor = (
+            torch.from_numpy(mask_np.astype(np.float32)).unsqueeze(0).unsqueeze(0)
+        )
+        mask_for_composite, x, y, width, height = self.compute_mask_bbox_aspect_ratio(
+            mask_chw=mask_tensor,
+            padding=padding,
+            blur_radius=blur_radius,
+            preferred_resolutions=self.PREFERED_KONTEXT_RESOLUTIONS,
+            transpose=False,
+            verbose=verbose,
+        )
+        # Quantize bbox to improve cache stability against minor detection jitter
+        quant = 2
+        img_h, img_w = mask_np.shape
+        qx1 = max(0, min(img_w, int(round(x / quant) * quant)))
+        qy1 = max(0, min(img_h, int(round(y / quant) * quant)))
+        qx2 = max(qx1 + 1, min(img_w, int(round((x + width) / quant) * quant)))
+        qy2 = max(qy1 + 1, min(img_h, int(round((y + height) / quant) * quant)))
+        qwidth = max(1, qx2 - qx1)
+        qheight = max(1, qy2 - qy1)
+        # Adjust mask_for_composite to the quantized bbox via pad/crop
+        dx_left = x - qx1
+        dy_top = y - qy1
+        dx_right = (qx1 + qwidth) - (x + width)
+        dy_bottom = (qy1 + qheight) - (y + height)
+        if dx_left > 0 or dx_right > 0 or dy_top > 0 or dy_bottom > 0:
+            pad_l = max(dx_left, 0)
+            pad_r = max(dx_right, 0)
+            pad_t = max(dy_top, 0)
+            pad_b = max(dy_bottom, 0)
+            mask_for_composite = torch.nn.functional.pad(
+                mask_for_composite, (pad_l, pad_r, pad_t, pad_b)
+            )
+        if dx_left < 0:
+            mask_for_composite = mask_for_composite[:, :, -dx_left:]
+        if dy_top < 0:
+            mask_for_composite = mask_for_composite[:, -dy_top:, :]
+        if mask_for_composite.shape[-1] > qwidth:
+            mask_for_composite = mask_for_composite[:, :, :qwidth]
+        if mask_for_composite.shape[-2] > qheight:
+            mask_for_composite = mask_for_composite[:, :qheight, :]
+        x, y, width, height = qx1, qy1, qwidth, qheight
+        if strict_mask_clipping:
+            original_mask_crop = mask_tensor[0, 0, y : y + height, x : x + width]
+            mask_for_composite = mask_for_composite * original_mask_crop
+        if composite_clip_bbox is not None:
+            clip_x1, clip_y1, clip_x2, clip_y2 = composite_clip_bbox
+            img_h, img_w = mask_np.shape
+            clip_x1 = max(0, min(img_w, clip_x1))
+            clip_x2 = max(0, min(img_w, clip_x2))
+            clip_y1 = max(0, min(img_h, clip_y1))
+            clip_y2 = max(0, min(img_h, clip_y2))
+            start_x = max(0, clip_x1 - x)
+            end_x = min(width, clip_x2 - x)
+            start_y = max(0, clip_y1 - y)
+            end_y = min(height, clip_y2 - y)
+            if end_x <= start_x or end_y <= start_y:
+                mask_for_composite = torch.zeros_like(mask_for_composite)
+            else:
+                clipped_mask = torch.zeros_like(mask_for_composite)
+                clipped_mask[:, start_y:end_y, start_x:end_x] = mask_for_composite[
+                    :, start_y:end_y, start_x:end_x
+                ]
+                mask_for_composite = clipped_mask
+        log_message(
+            f"  - Optimized bbox found at ({x}, {y}) with size {width}x{height}",
+            verbose=verbose,
+        )
+        image_cropped_pil = image_pil.crop((x, y, x + width, y + height))
+        mask_crop_np = mask_np[y : y + height, x : x + width]
+        cache_params = {
+            "bbox": (x, y, width, height),
+            "padding": padding,
+            "blur": blur_radius,
+        }
+        if strict_mask_clipping:
+            cache_params["strict_clip"] = True
+        if composite_clip_bbox is not None:
+            cache_params["clip_bbox"] = tuple(composite_clip_bbox)
+        if ocr_params:
+            cache_params.update(ocr_params)
+        cache_key = None
+        cached_patch = None
+        if self.cache.should_use_inpaint_cache(seed):
+            # Downsample mask signature to reduce sensitivity to minor jitter
+            if mask_crop_np.size > 0:
+                sig_h = min(64, max(4, mask_crop_np.shape[0]))
+                sig_w = min(64, max(4, mask_crop_np.shape[1]))
+                mask_sig = (
+                    torch.from_numpy(mask_crop_np.astype(np.float32))
+                    .unsqueeze(0)
+                    .unsqueeze(0)
+                )
+                mask_sig = torch.nn.functional.interpolate(
+                    mask_sig, size=(sig_h, sig_w), mode="bilinear", align_corners=False
+                )
+                mask_sig_np = (mask_sig > 0.5).cpu().numpy().astype(np.uint8)[0, 0]
+            else:
+                mask_sig_np = mask_crop_np
+            cache_key = self.cache.get_inpaint_cache_key(
+                image_cropped_pil,
+                mask_sig_np,
+                seed,
+                self.num_inference_steps,
+                self.residual_diff_threshold,
+                self.guidance_scale,
+                self.prompt,
+                cache_params,
+            )
+            cached_patch = self.cache.get_inpainted_image(cache_key)
+            if cached_patch is not None:
+                log_message("  - Using cached inpainting patch", verbose=verbose)
+        patch_pil = cached_patch
+        if patch_pil is None:
+            self.load_models()
+            if self.pipeline is None:
+                log_message(
+                    "Warning: Flux Kontext pipeline not available. Skipping inpainting.",
+                    always_print=True,
+                )
+                return image_pil
+            image_scaled_for_inference_pil = self.flux_kontext_image_scale(
+                image_cropped_pil
+            )
+            inference_width, inference_height = image_scaled_for_inference_pil.size
+            if image_scaled_for_inference_pil.mode == "RGBA":
+                image_scaled_for_inference_pil = image_scaled_for_inference_pil.convert(
+                    "RGB"
+                )
+            log_message("  - Running inference...", verbose=verbose)
+            self.pipeline.text_encoder_2.to(self.DEVICE)
+            prompt_embeds, pooled_prompt_embeds, _ = self.pipeline.encode_prompt(
+                prompt=self.prompt,
+                prompt_2=None,
+                device=self.DEVICE,
+            )
+            self.pipeline.text_encoder_2.to("cpu")
+            gc.collect()
+            torch.cuda.empty_cache()
+            self.pipeline.transformer.to(self.DEVICE)
+            required_area = inference_width * inference_height
+            with torch.inference_mode():
+                gen = torch.Generator(device=self.DEVICE).manual_seed(seed)
+                out = self.pipeline(
+                    prompt_embeds=prompt_embeds,
+                    pooled_prompt_embeds=pooled_prompt_embeds,
+                    image=image_scaled_for_inference_pil,
+                    width=inference_width,
+                    height=inference_height,
+                    num_inference_steps=self.num_inference_steps,
+                    guidance_scale=self.guidance_scale,
+                    generator=gen,
+                    output_type="pt",
+                    max_area=required_area,
+                )
+                img = out.images[0]
+                torch.nan_to_num_(img, nan=0.0, posinf=1.0, neginf=0.0)
+                img.clamp_(0, 1)
+                generated_patch_pil = Image.fromarray(
+                    (
+                        img.mul(255)
+                        .round()
+                        .to(torch.uint8)
+                        .permute(1, 2, 0)
+                        .cpu()
+                        .numpy()
+                    )
+                )
+            self.pipeline.transformer.to("cpu")
+            gc.collect()
+            torch.cuda.empty_cache()
+            patch_pil = generated_patch_pil.resize(
+                (width, height), Image.Resampling.LANCZOS
+            )
+        dest_tensor = torch.from_numpy(
+            np.asarray(image_pil, dtype=np.float32) / 255.0
+        ).unsqueeze(0)
+        src_tensor = torch.from_numpy(
+            np.asarray(patch_pil, dtype=np.float32) / 255.0
+        ).unsqueeze(0)
+        composited_tensor = self.image_composite_masked(
+            destination=dest_tensor,
+            source=src_tensor,
+            x=x,
+            y=y,
+            resize_source=False,
+            mask=mask_for_composite,
+        )
+        composited_pil = Image.fromarray(
+            (composited_tensor[0].cpu().numpy() * 255).astype("uint8")
+        )
+        if (
+            self.cache.should_use_inpaint_cache(seed)
+            and cache_key is not None
+            and cached_patch is None
+        ):
+            self.cache.set_inpainted_image(cache_key, patch_pil)
+        return composited_pil

core/image/ocr_detection.py ADDED Viewed

	@@ -0,0 +1,730 @@

+import os
+from typing import List, Optional, Tuple
+import cv2
+import numpy as np
+import torch
+from PIL import Image
+from core.caching import get_cache
+from core.ml.model_manager import ModelType, get_model_manager
+from utils.exceptions import ImageProcessingError
+from utils.logging import log_message
+class OutsideTextDetector:
+    """Detects text outside speech bubbles to isolate SFX/captions from dialogue."""
+    def __init__(
+        self,
+        device: Optional[torch.device] = None,
+        hf_token: Optional[str] = None,
+    ):
+        """Initialize the outside text detector.
+        Args:
+            device: PyTorch device to use. Auto-detects if None.
+            hf_token: Hugging Face token for gated repo access.
+        """
+        self.device = (
+            device
+            if device is not None
+            else torch.device(
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps"
+                if torch.backends.mps.is_available()
+                else "cpu"
+            )
+        )
+        self.hf_token = hf_token
+        self.manager = get_model_manager()
+        self.cache = get_cache()
+    def boxes_overlap(self, box1, box2):
+        """Check if two bounding boxes overlap (have non-zero intersection).
+        Args:
+            box1: Bounding box in [x_min, y_min, x_max, y_max] format.
+            box2: Bounding box in YOLO format [x_min, y_min, x_max, y_max].
+        Returns:
+            bool: True if boxes overlap, False otherwise.
+        """
+        x1_min, y1_min, x1_max, y1_max = box1
+        x2_min, y2_min, x2_max, y2_max = box2
+        return not (
+            x1_max <= x2_min or x2_max <= x1_min or y1_max <= y2_min or y2_max <= y1_min
+        )
+    def box_is_inside(self, box1, box2):
+        """Check if box1 is completely inside box2.
+        Args:
+            box1: Bounding box in [x1, y1, x2, y2] format.
+            box2: Bounding box in [x1, y1, x2, y2] format.
+        Returns:
+            bool: True if box1 is completely inside box2, False otherwise.
+        """
+        x1_min, y1_min, x1_max, y1_max = box1
+        x2_min, y2_min, x2_max, y2_max = box2
+        return (
+            x1_min >= x2_min
+            and x1_max <= x2_max
+            and y1_min >= y2_min
+            and y1_max <= y2_max
+        )
+    def filter_nested_detections(self, results):
+        """Remove detections fully contained in larger ones to avoid duplicates.
+        Args:
+            results: List of detection results (bbox, text, confidence).
+        Returns:
+            list: Filtered results with nested detections removed.
+        """
+        if len(results) <= 1:
+            return results
+        # Prioritize larger detections to avoid removing important text
+        def get_area(result):
+            bbox = result[0]
+            x_min, y_min, x_max, y_max = bbox
+            return (x_max - x_min) * (y_max - y_min)
+        sorted_results = sorted(results, key=get_area, reverse=True)
+        filtered_results = []
+        for i, current_result in enumerate(sorted_results):
+            is_nested = False
+            current_bbox = current_result[0]
+            for kept_result in filtered_results:
+                kept_bbox = kept_result[0]
+                if self.box_is_inside(current_bbox, kept_bbox):
+                    is_nested = True
+                    break
+            if not is_nested:
+                filtered_results.append(current_result)
+        return filtered_results
+    def unload_models(self):
+        """Unload OCR models via model manager to free GPU/CPU memory."""
+        self.manager.unload_ocr_models()
+    def detect_outside_text(
+        self,
+        image_path: str,
+        yolo_model_path: Optional[str] = None,
+        confidence: float = 0.6,
+        conjoined_confidence: float = 0.35,
+        verbose: bool = False,
+        image_override: Optional[Image.Image] = None,
+        existing_bubbles: Optional[List] = None,
+        text_free_boxes: Optional[List] = None,
+    ):
+        """Detect non-dialogue text by subtracting YOLO speech bubbles from OCR results.
+        Args:
+            image_path: Path to the input image.
+            yolo_model_path: Optional custom YOLO model path.
+            confidence: Confidence threshold for primary YOLO model detections.
+            conjoined_confidence: Confidence threshold for secondary YOLO model (conjoined bubble detection).
+            verbose: If True, logs intermediate steps.
+            text_free_boxes: Optional list of text_free regions to use as fallback OSB detections.
+        Returns:
+            list: Detected regions outside bubbles as (bbox, confidence).
+        """
+        if image_override is None and not os.path.exists(image_path):
+            raise FileNotFoundError(f"Error: The file '{image_path}' was not found.")
+        try:
+            if image_override is not None:
+                image_pil = (
+                    image_override
+                    if image_override.mode == "RGB"
+                    else image_override.convert("RGB")
+                )
+                image_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+            else:
+                image_cv = cv2.imread(str(image_path))
+                if image_cv is None:
+                    raise ImageProcessingError(f"Could not read image at {image_path}")
+                image_pil = Image.fromarray(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))
+            image_name = image_path if image_override is None else "override"
+            log_message(
+                f"Processing image: {image_name} "
+                f"({image_cv.shape[1]}x{image_cv.shape[0]})",
+                verbose=verbose,
+            )
+        except Exception as e:
+            raise ImageProcessingError(f"Error loading image: {e}")
+        provided_bubble_boxes = None
+        if existing_bubbles is not None:
+            try:
+                provided_bubble_boxes = []
+                for b in existing_bubbles:
+                    bbox = b.get("bbox") if isinstance(b, dict) else b
+                    if bbox is None or len(bbox) != 4:
+                        continue
+                    x0, y0, x1, y1 = bbox
+                    provided_bubble_boxes.append(
+                        [float(x0), float(y0), float(x1), float(y1)]
+                    )
+                if provided_bubble_boxes:
+                    log_message(
+                        f"Using {len(provided_bubble_boxes)} provided bubble boxes for OSB filtering",
+                        verbose=verbose,
+                    )
+            except Exception as e:
+                log_message(
+                    f"Warning: Failed to parse provided bubbles: {e}. Falling back to YOLO.",
+                    always_print=True,
+                )
+                provided_bubble_boxes = None
+        text_free_boxes = list(text_free_boxes) if text_free_boxes else []
+        if provided_bubble_boxes:
+            yolo_boxes = torch.tensor(
+                provided_bubble_boxes, device=self.device, dtype=torch.float32
+            )
+            num_yolo_boxes = len(yolo_boxes)
+            log_message(
+                f"Skipping YOLO; using provided bubbles ({num_yolo_boxes})",
+                verbose=verbose,
+            )
+        else:
+            log_message("Running YOLO detection for speech bubbles...", verbose=verbose)
+            sb_model_path = (
+                str(self.manager.model_paths[ModelType.YOLO_SPEECH_BUBBLE])
+                if yolo_model_path is None
+                else yolo_model_path
+            )
+            sb_cache_key = self.cache.get_yolo_cache_key(
+                image_pil, sb_model_path, confidence
+            )
+            cached_sb = self.cache.get_yolo_detection(sb_cache_key)
+            if cached_sb is not None:
+                log_message("Using cached Speech Bubble detections", verbose=verbose)
+                yolo_results, yolo_boxes = cached_sb
+            else:
+                yolo_model = self.manager.load_yolo_speech_bubble(yolo_model_path)
+                yolo_results = yolo_model(
+                    image_cv, conf=confidence, device=self.device, verbose=False
+                )[0]
+                yolo_boxes = (
+                    yolo_results.boxes.xyxy
+                    if yolo_results.boxes is not None
+                    else torch.tensor([])
+                )
+                self.cache.set_yolo_detection(sb_cache_key, (yolo_results, yolo_boxes))
+            num_yolo_boxes = len(yolo_boxes) if yolo_boxes.nelement() > 0 else 0
+            log_message(
+                f"YOLO detected {num_yolo_boxes} speech bubbles", verbose=verbose
+            )
+            log_message(
+                "Running Secondary YOLO to catch missed bubbles...", verbose=verbose
+            )
+            try:
+                sec_model = self.manager.load_yolo_conjoined_bubble()
+                sec_results = sec_model(
+                    image_cv,
+                    conf=conjoined_confidence,
+                    device=self.device,
+                    verbose=False,
+                )[0]
+                sec_boxes = (
+                    sec_results.boxes.xyxy
+                    if sec_results.boxes is not None
+                    else torch.tensor([])
+                )
+                sec_cls = (
+                    sec_results.boxes.cls
+                    if sec_results.boxes is not None
+                    else torch.tensor([])
+                )
+                # Find text_bubble and text_free classes
+                tb_id = None
+                tf_id = None
+                if hasattr(sec_model, "names"):
+                    for cid, cname in sec_model.names.items():
+                        if cname == "text_bubble":
+                            tb_id = cid
+                        elif cname == "text_free":
+                            tf_id = cid
+                if tf_id is not None and len(sec_boxes) > 0:
+                    for i, cls_id in enumerate(sec_cls):
+                        if int(cls_id) == tf_id:
+                            text_free_boxes.append(sec_boxes[i].detach().cpu().numpy())
+                if tb_id is not None and len(sec_boxes) > 0:
+                    boxes_to_add = []
+                    for i, cls_id in enumerate(sec_cls):
+                        if int(cls_id) == tb_id:
+                            boxes_to_add.append(sec_boxes[i])
+                    if boxes_to_add:
+                        log_message(
+                            f"Secondary YOLO found {len(boxes_to_add)} potential bubbles",
+                            verbose=verbose,
+                        )
+                        boxes_to_add_tensor = torch.stack(boxes_to_add)
+                        if yolo_boxes.nelement() > 0:
+                            yolo_boxes = torch.cat(
+                                (yolo_boxes, boxes_to_add_tensor), dim=0
+                            )
+                        else:
+                            yolo_boxes = boxes_to_add_tensor
+            except Exception as e:
+                log_message(f"Secondary YOLO failed: {e}", verbose=verbose)
+        log_message("Running YOLO OSB Text...", always_print=True)
+        osbtext_boxes = None
+        osbtext_confs = None
+        try:
+            osbtext_model_path = str(self.manager.model_paths[ModelType.YOLO_OSBTEXT])
+            osbtext_cache_key = self.cache.get_yolo_cache_key(
+                image_pil, osbtext_model_path, confidence
+            )
+            cached_osbtext = self.cache.get_yolo_detection(osbtext_cache_key)
+            if cached_osbtext is not None:
+                log_message("Using cached OSBText detections", verbose=verbose)
+                osbtext_results, osbtext_boxes, osbtext_confs = cached_osbtext
+            else:
+                osbtext_model = self.manager.load_yolo_osbtext(token=self.hf_token)
+                osbtext_results = osbtext_model(
+                    image_cv, conf=confidence, device=self.device, verbose=False
+                )[0]
+                osbtext_boxes = (
+                    osbtext_results.boxes.xyxy
+                    if osbtext_results.boxes is not None
+                    else None
+                )
+                osbtext_confs = (
+                    osbtext_results.boxes.conf
+                    if osbtext_results.boxes is not None
+                    else None
+                )
+                self.cache.set_yolo_detection(
+                    osbtext_cache_key, (osbtext_results, osbtext_boxes, osbtext_confs)
+                )
+        except Exception as e:
+            log_message(
+                f"OSB text model unavailable: {e}. Using text_free fallback if available.",
+                always_print=True,
+            )
+            if text_free_boxes:
+                log_message(
+                    f"Using {len(text_free_boxes)} text_free boxes as OSB fallback",
+                    always_print=True,
+                )
+                osbtext_boxes = torch.tensor(
+                    text_free_boxes, device=self.device, dtype=torch.float32
+                )
+                osbtext_confs = torch.ones(
+                    len(text_free_boxes), device=self.device, dtype=torch.float32
+                )
+            else:
+                log_message(
+                    "No text_free fallback available; skipping OSB text detections",
+                    always_print=True,
+                )
+        base_results = []
+        if osbtext_boxes is not None:
+            boxes_np = osbtext_boxes.detach().cpu().numpy()
+            confs_np = osbtext_confs.detach().cpu().numpy()
+            for i, box in enumerate(boxes_np):
+                conf = confs_np[i]
+                base_results.append((box, float(conf)))
+        final_results = list(base_results)
+        log_message("Filtering out nested detections...", verbose=verbose)
+        before_nested_filter = len(final_results)
+        final_results = self.filter_nested_detections(final_results)
+        after_nested_filter = len(final_results)
+        nested_removed = before_nested_filter - after_nested_filter
+        log_message(
+            f"Nested detections removed: {nested_removed}. Remaining detections: {after_nested_filter}.",
+            verbose=verbose,
+        )
+        if yolo_boxes is not None and yolo_boxes.nelement() > 0:
+            log_message(
+                "Filtering OCR results to keep text outside speech bubbles...",
+                verbose=verbose,
+            )
+            filtered_results = []
+            yolo_boxes_np = yolo_boxes.detach().cpu().numpy()
+            for ocr_result in final_results:
+                bbox, _ = ocr_result
+                overlaps_any_bubble = False
+                for yolo_box in yolo_boxes_np:
+                    if self.boxes_overlap(bbox, yolo_box):
+                        # Check if this bubble is actually a text_free region
+                        is_text_free_bubble = False
+                        if text_free_boxes:
+                            for tf_box in text_free_boxes:
+                                # We check if the YOLO bubble overlaps with a text_free detection
+                                if self.boxes_overlap(yolo_box, tf_box):
+                                    is_text_free_bubble = True
+                                    break
+                        if not is_text_free_bubble:
+                            overlaps_any_bubble = True
+                            break
+                if not overlaps_any_bubble:
+                    filtered_results.append(ocr_result)
+            filtered_out = len(final_results) - len(filtered_results)
+            log_message(
+                f"Filtered out {filtered_out} OCR results that overlapped with speech bubbles",
+                verbose=verbose,
+            )
+            final_results = filtered_results
+        log_message(
+            f"Found {len(final_results)} outside text regions", always_print=True
+        )
+        return final_results
+    def get_text_masks(
+        self,
+        image_path: str,
+        bbox_expansion_percent: float = 0.0,
+        text_box_proximity_ratio: float = 0.02,
+        verbose: bool = False,
+        image_override: Optional[Image.Image] = None,
+        existing_results: Optional[List] = None,
+    ) -> Tuple[Optional[List], Optional[Image.Image]]:
+        """Create rectangular masks from OCR bounding boxes for inpainting.
+        Args:
+            image_path: Path to the input image.
+            bbox_expansion_percent: Percentage to expand bounding boxes.
+            text_box_proximity_ratio: Ratio for grouping nearby text boxes (as fraction of image dimension).
+            verbose: Whether to print verbose output.
+        Returns:
+            tuple: (groups, image_pil) where groups is a list of dicts with:
+                {
+                    'combined_mask': np.array[H,W,bool],
+                    'bbox': dict,
+                    'individual_masks': [np.array],
+                    'mask_indices': [int],
+                    'confidence': float,
+                }.
+        """
+        results = (
+            existing_results
+            if existing_results is not None
+            else self.detect_outside_text(
+                image_path,
+                verbose=verbose,
+                image_override=image_override,
+            )
+        )
+        if not results:
+            return None, None
+        if image_override is not None:
+            image_pil = (
+                image_override.convert("RGB")
+                if image_override.mode != "RGB"
+                else image_override
+            )
+        else:
+            image_pil = Image.open(image_path).convert("RGB")
+        img_w, img_h = image_pil.size
+        log_message("Converting OCR results to axis-aligned boxes...", verbose=verbose)
+        boxes = [[int(c) for c in result[0]] for result in results]
+        expanded_boxes = []
+        for box in boxes:
+            x0, y0, x1, y1 = box
+            width = x1 - x0
+            height = y1 - y0
+            expand_x = width * bbox_expansion_percent
+            expand_y = height * bbox_expansion_percent
+            x0e = int(np.floor(max(0, x0 - expand_x)))
+            y0e = int(np.floor(max(0, y0 - expand_y)))
+            x1e = int(np.ceil(min(img_w, x1 + expand_x)))
+            y1e = int(np.ceil(min(img_h, y1 + expand_y)))
+            if x1e > x0e and y1e > y0e:
+                expanded_boxes.append([x0e, y0e, x1e, y1e])
+        log_message(
+            f"Grouping {len(expanded_boxes)} text boxes spatially...",
+            verbose=verbose,
+        )
+        grouped_boxes = self._group_text_boxes_spatially(
+            expanded_boxes, results, img_w, img_h, text_box_proximity_ratio, verbose
+        )
+        groups = []
+        for group_boxes, group_results in grouped_boxes:
+            combined_mask = np.zeros((img_h, img_w), dtype=bool)
+            individual_masks = []
+            mask_indices = []
+            avg_confidence = 0.0
+            min_x = min(box[0] for box in group_boxes)
+            min_y = min(box[1] for box in group_boxes)
+            max_x = max(box[2] for box in group_boxes)
+            max_y = max(box[3] for box in group_boxes)
+            # Ensure combined region doesn't exceed Flux Kontext preferred resolutions
+            max_dimension = 1568
+            if max_x - min_x > max_dimension or max_y - min_y > max_dimension:
+                log_message(
+                    f"  - Group too large ({max_x - min_x}x{max_y - min_y}), splitting...",
+                    verbose=verbose,
+                )
+                for i, (box, result) in enumerate(zip(group_boxes, group_results)):
+                    x0, y0, x1, y1 = box
+                    mask = np.zeros((img_h, img_w), dtype=bool)
+                    mask[y0:y1, x0:x1] = True
+                    bbox = {
+                        "x": int(x0),
+                        "y": int(y0),
+                        "width": int(x1 - x0),
+                        "height": int(y1 - y0),
+                    }
+                    raw_box = [int(c) for c in result[0]]
+                    raw_x0, raw_y0, raw_x1, raw_y1 = raw_box
+                    original_bbox = {
+                        "x": raw_x0,
+                        "y": raw_y0,
+                        "width": raw_x1 - raw_x0,
+                        "height": raw_y1 - raw_y0,
+                    }
+                    _, conf = result
+                    groups.append(
+                        {
+                            "combined_mask": mask,
+                            "bbox": bbox,
+                            "original_bbox": original_bbox,
+                            "individual_masks": [mask],
+                            "mask_indices": [i],
+                            "confidence": conf,
+                        }
+                    )
+                continue
+            raw_boxes = [[int(c) for c in res[0]] for res in group_results]
+            for i, (box, result, raw_box) in enumerate(
+                zip(group_boxes, group_results, raw_boxes)
+            ):
+                x0, y0, x1, y1 = box
+                mask = np.zeros((img_h, img_w), dtype=bool)
+                mask[y0:y1, x0:x1] = True
+                combined_mask |= mask
+                individual_masks.append(mask)
+                mask_indices.append(i)
+                _, conf = result
+                avg_confidence += conf
+            raw_min_x = min(box[0] for box in raw_boxes)
+            raw_min_y = min(box[1] for box in raw_boxes)
+            raw_max_x = max(box[2] for box in raw_boxes)
+            raw_max_y = max(box[3] for box in raw_boxes)
+            bbox = {
+                "x": int(min_x),
+                "y": int(min_y),
+                "width": int(max_x - min_x),
+                "height": int(max_y - min_y),
+            }
+            original_bbox = {
+                "x": int(raw_min_x),
+                "y": int(raw_min_y),
+                "width": int(raw_max_x - raw_min_x),
+                "height": int(raw_max_y - raw_min_y),
+            }
+            groups.append(
+                {
+                    "combined_mask": combined_mask,
+                    "bbox": bbox,
+                    "original_bbox": original_bbox,
+                    "individual_masks": individual_masks,
+                    "mask_indices": mask_indices,
+                    "confidence": avg_confidence / len(group_results),
+                }
+            )
+        log_message(
+            f"Created {len(groups)} grouped text regions for inpainting",
+            verbose=verbose,
+        )
+        return groups, image_pil
+    def _group_text_boxes_spatially(
+        self, boxes, results, img_w, img_h, text_box_proximity_ratio=0.02, verbose=False
+    ):
+        """
+        Group nearby text boxes based on spatial proximity.
+        Args:
+            boxes: List of bounding boxes [x0, y0, x1, y1]
+            results: List of OCR results corresponding to boxes
+            img_w: Image width
+            img_h: Image height
+            text_box_proximity_ratio: Ratio for grouping nearby text boxes (as fraction of image dimension).
+            verbose: Whether to print detailed logs
+        Returns:
+            List of tuples (group_boxes, group_results) where each group contains
+            spatially related text boxes
+        """
+        if not boxes:
+            return []
+        proximity_threshold = min(img_w, img_h) * text_box_proximity_ratio
+        parent = list(range(len(boxes)))
+        def find(x):
+            if parent[x] != x:
+                parent[x] = find(parent[x])
+            return parent[x]
+        def union(x, y):
+            px, py = find(x), find(y)
+            if px != py:
+                parent[px] = py
+        for i in range(len(boxes)):
+            for j in range(i + 1, len(boxes)):
+                if self._boxes_are_nearby(boxes[i], boxes[j], proximity_threshold):
+                    union(i, j)
+        groups = {}
+        for i in range(len(boxes)):
+            root = find(i)
+            if root not in groups:
+                groups[root] = ([], [])
+            groups[root][0].append(boxes[i])
+            groups[root][1].append(results[i])
+        grouped_boxes = list(groups.values())
+        log_message(
+            f"  - Grouped {len(boxes)} boxes into {len(grouped_boxes)} spatial groups",
+            verbose=verbose,
+        )
+        return grouped_boxes
+    def _boxes_are_nearby(self, box1, box2, threshold):
+        """
+        Check if two bounding boxes are spatially close enough to be grouped.
+        Args:
+            box1: First bounding box [x0, y0, x1, y1]
+            box2: Second bounding box [x0, y0, x1, y1]
+            threshold: Maximum distance for boxes to be considered nearby
+        Returns:
+            True if boxes are nearby, False otherwise
+        """
+        x1_min, y1_min, x1_max, y1_max = box1
+        x2_min, y2_min, x2_max, y2_max = box2
+        cx1 = (x1_min + x1_max) / 2
+        cy1 = (y1_min + y1_max) / 2
+        cx2 = (x2_min + x2_max) / 2
+        cy2 = (y2_min + y2_max) / 2
+        distance = np.sqrt((cx1 - cx2) ** 2 + (cy1 - cy2) ** 2)
+        return distance <= threshold
+def extract_text_with_manga_ocr(
+    images: List[Image.Image], verbose: bool = False
+) -> List[str]:
+    """Extract text from images using manga-ocr library.
+    Args:
+        images: List of PIL Images to process
+        verbose: Whether to print verbose output
+    Returns:
+        List of extracted text strings (one per image). Returns [OCR FAILED] on errors.
+    """
+    if not images:
+        return []
+    try:
+        model_manager = get_model_manager()
+        manga_ocr_instance = model_manager.get_manga_ocr(verbose=verbose)
+        extracted_texts = []
+        for i, img in enumerate(images):
+            try:
+                if img is None:
+                    log_message(
+                        f"Image {i + 1} is None (decode failure), skipping",
+                        always_print=True,
+                    )
+                    extracted_texts.append("[OCR FAILED]")
+                    continue
+                log_message(
+                    f"Processing image {i + 1}/{len(images)} with manga-ocr",
+                    verbose=verbose,
+                )
+                text = manga_ocr_instance(img)
+                extracted_texts.append(text.strip() if text else "")
+            except Exception as e:
+                log_message(
+                    f"manga-ocr failed for image {i + 1}: {e}", always_print=True
+                )
+                extracted_texts.append("[OCR FAILED]")
+        return extracted_texts
+    except Exception as e:
+        log_message(f"Error with manga-ocr: {e}", always_print=True)
+        return ["[OCR FAILED]"] * len(images)

core/image/sorting.py ADDED Viewed

	@@ -0,0 +1,359 @@

+from typing import Any, Dict, List
+def sort_bubbles_by_reading_order(detections, reading_direction="rtl", panels=None):
+    """
+    Order speech-bubble detections in reading order.
+    Hybrid Algorithm (veto system):
+    - Macro: graph sort with ceiling + right-neighbor veto to enforce Z flow.
+    - Micro: tuned spatial banding with looser thresholds for offset bubbles.
+    Args:
+        detections: List of dicts, each with a "bbox" entry (x1, y1, x2, y2).
+        reading_direction: "rtl" (case-insensitive) for right-to-left; any other
+            value is treated as left-to-right. None or "" falls back to "rtl".
+        panels: Optional list of panel boxes (x1, y1, x2, y2). When empty or None,
+            only the micro spatial sort is applied to all detections.
+    Returns:
+        The same detection dicts in reading order. When panels are given, each
+        dict is also mutated to carry a "panel_id" key (index into ``panels``,
+        or None when no panel could be assigned).
+    """
+    if not detections:
+        return []
+    rtl = (reading_direction or "rtl").lower() == "rtl"
+    # Micro layout: keep slightly offset bubbles grouped into lines/columns.
+    def _get_features(bbox):
+        # Returns corners, width/height (each clamped to at least 1.0 so later
+        # divisions cannot hit zero) and the box center.
+        x1, y1, x2, y2 = bbox
+        w = max(1.0, float(x2 - x1))
+        h = max(1.0, float(y2 - y1))
+        cx = (x1 + x2) / 2.0
+        cy = (y1 + y2) / 2.0
+        return x1, y1, x2, y2, w, h, cx, cy
+    def _spatial_sort(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Robust spatial sort for bubbles (vertical columns + horizontal rows)."""
+        if not items:
+            return []
+        # Tuned thresholds to keep slightly offset bubbles in the same line.
+        # An item joins a band/column when EITHER the overlap ratio OR the
+        # center-distance test below passes.
+        y_overlap_ratio_threshold = 0.25
+        y_center_band_factor = 0.5
+        x_overlap_ratio_threshold = 0.2
+        x_center_band_factor = 0.5
+        # Precompute geometry once per item; "item" keeps the original dict.
+        enriched = []
+        for item in items:
+            x1, y1, x2, y2, w, h, cx, cy = _get_features(item["bbox"])
+            enriched.append(
+                {
+                    "item": item,
+                    "x1": x1,
+                    "y1": y1,
+                    "x2": x2,
+                    "y2": y2,
+                    "w": w,
+                    "h": h,
+                    "cx": cx,
+                    "cy": cy,
+                }
+            )
+        # Process top-to-bottom so bands grow greedily in vertical order.
+        enriched.sort(key=lambda e: e["cy"])
+        # Step 1: group items into horizontal bands (rows).
+        bands = []
+        for e in enriched:
+            y1, y2, h = e["y1"], e["y2"], e["h"]
+            best_band_idx = -1
+            best_score = -1.0
+            for i, band in enumerate(bands):
+                band_h = max(1.0, float(band["y_max"] - band["y_min"]))
+                overlap_v = max(0.0, min(y2, band["y_max"]) - max(y1, band["y_min"]))
+                # Overlap is normalized by the smaller of the two heights.
+                overlap_ratio = overlap_v / min(h, band_h)
+                center_delta_y = abs(e["cy"] - (band["y_min"] + band["y_max"]) / 2.0)
+                same_row = (overlap_ratio >= y_overlap_ratio_threshold) or (
+                    center_delta_y <= y_center_band_factor * min(h, band_h)
+                )
+                if same_row:
+                    # Prefer more overlap; center distance is a small tie-breaker.
+                    score = overlap_ratio - (center_delta_y / (h + band_h)) * 0.1
+                    if score > best_score:
+                        best_score = score
+                        best_band_idx = i
+            if best_band_idx == -1:
+                # No compatible band: start a new one from this item.
+                bands.append({"y_min": y1, "y_max": y2, "items": [e]})
+            else:
+                # Join the best band and widen its vertical extent.
+                band = bands[best_band_idx]
+                band["items"].append(e)
+                band["y_min"] = min(band["y_min"], y1)
+                band["y_max"] = max(band["y_max"], y2)
+        bands.sort(key=lambda b: b["y_min"])
+        # Step 2: inside each band, group items into vertical columns using
+        # the same overlap-or-center rule applied along the x axis.
+        ordered_items = []
+        for band in bands:
+            items_in_band = band["items"]
+            columns = []
+            for e in items_in_band:
+                x1, x2, w = e["x1"], e["x2"], e["w"]
+                best_col_idx = -1
+                best_score = -1.0
+                for i, col in enumerate(columns):
+                    col_w = max(1.0, float(col["x_max"] - col["x_min"]))
+                    overlap_h = max(0.0, min(x2, col["x_max"]) - max(x1, col["x_min"]))
+                    overlap_ratio = overlap_h / min(w, col_w)
+                    col_center_x = (col["x_min"] + col["x_max"]) / 2.0
+                    center_delta_x = abs(e["cx"] - col_center_x)
+                    same_col = (overlap_ratio >= x_overlap_ratio_threshold) or (
+                        center_delta_x <= x_center_band_factor * min(w, col_w)
+                    )
+                    if same_col:
+                        score = overlap_ratio - (center_delta_x / (w + col_w)) * 0.1
+                        if score > best_score:
+                            best_score = score
+                            best_col_idx = i
+                if best_col_idx == -1:
+                    columns.append({"x_min": x1, "x_max": x2, "items": [e]})
+                else:
+                    col = columns[best_col_idx]
+                    col["items"].append(e)
+                    col["x_min"] = min(col["x_min"], x1)
+                    col["x_max"] = max(col["x_max"], x2)
+            # Column order follows the reading direction (by column center x):
+            # right-to-left for rtl, left-to-right otherwise.
+            if rtl:
+                columns.sort(key=lambda c: -((c["x_min"] + c["x_max"]) / 2.0))
+            else:
+                columns.sort(key=lambda c: ((c["x_min"] + c["x_max"]) / 2.0))
+            # Within a column, items are read top-to-bottom.
+            for col in columns:
+                col["items"].sort(key=lambda e: e["cy"])
+                ordered_items.extend([e["item"] for e in col["items"]])
+        return ordered_items
+    # Macro layout: panel graph with root detection and dual veto for Z-flow.
+    def _iou_x(boxA, boxB):
+        # 1-D intersection-over-union of the two boxes' horizontal extents.
+        xa1, _, xa2, _ = boxA
+        xb1, _, xb2, _ = boxB
+        inter = max(0, min(xa2, xb2) - max(xa1, xb1))
+        union = (xa2 - xa1) + (xb2 - xb1) - inter
+        return inter / union if union > 0 else 0
+    def _iou_y_overlap(boxA, boxB):
+        # Vertical overlap divided by the SMALLER height (despite the name this
+        # is an overlap ratio, not an intersection-over-union).
+        _, ya1, _, ya2 = boxA
+        _, yb1, _, yb2 = boxB
+        inter = max(0, min(ya2, yb2) - max(ya1, yb1))
+        min_h = min(ya2 - ya1, yb2 - yb1)
+        return inter / min_h if min_h > 0 else 0
+    def sort_panels_strict(panels_list, rtl=True):
+        # Returns the indices of panels_list in reading order.
+        # Note: the ``rtl`` parameter shadows the outer ``rtl`` variable.
+        # The fixed offsets used below (50, 20) are in bbox coordinate units,
+        # presumably pixels - TODO confirm against the panel detector output.
+        if not panels_list:
+            return []
+        nodes = []
+        for i, bbox in enumerate(panels_list):
+            nodes.append(
+                {
+                    "id": i,
+                    "bbox": bbox,
+                    "center": ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2),
+                    "visited": False,
+                }
+            )
+        sorted_indices = []
+        # Roots: panels with no panel above in the same column.
+        # "Above" tolerates 50 units of vertical slack; "same column" means a
+        # horizontal IoU greater than 0.2.
+        root_nodes = []
+        for n in nodes:
+            is_root = True
+            for parent in nodes:
+                if n["id"] == parent["id"]:
+                    continue
+                is_above = parent["bbox"][3] <= (n["bbox"][1] + 50)
+                x_overlap = _iou_x(parent["bbox"], n["bbox"])
+                if is_above and x_overlap > 0.2:
+                    is_root = False
+                    break
+            if is_root:
+                root_nodes.append(n)
+        # Start at the root on the reading-start side (rightmost right edge for
+        # rtl, leftmost left edge otherwise); with no roots, use the topmost panel.
+        if root_nodes:
+            start_node = (
+                max(root_nodes, key=lambda n: n["bbox"][2])
+                if rtl
+                else min(root_nodes, key=lambda n: n["bbox"][0])
+            )
+        else:
+            start_node = min(nodes, key=lambda n: n["bbox"][1])
+        current = start_node
+        current["visited"] = True
+        sorted_indices.append(current["id"])
+        # Greedy walk: from the current panel choose either the next panel in
+        # the same row or the next one below, until every panel is visited.
+        while len(sorted_indices) < len(nodes):
+            c_box = current["bbox"]
+            candidates = [n for n in nodes if not n["visited"]]
+            if not candidates:
+                break
+            # Column candidate: horizontally overlapping panel whose top lies at
+            # or below the vertical midpoint of the current panel.
+            col_cand = None
+            col_candidates = []
+            for cand in candidates:
+                cand_box = cand["bbox"]
+                overlap = _iou_x(c_box, cand_box)
+                is_below = cand_box[1] >= (c_box[1] + (c_box[3] - c_box[1]) * 0.5)
+                if overlap > 0.2 and is_below:
+                    dist_y = max(0, cand_box[1] - c_box[3])
+                    col_candidates.append((dist_y, cand))
+            if col_candidates:
+                # Vertical gaps are bucketed in steps of 50 so near-equal gaps
+                # tie; ties are broken toward the reading-start side.
+                col_candidates.sort(
+                    key=lambda x: (
+                        int(x[0] / 50),
+                        -x[1]["center"][0] if rtl else x[1]["center"][0],
+                    )
+                )
+                col_cand = col_candidates[0][1]
+            # Row candidate: nearest panel on the reading-forward side (left of
+            # the current panel for rtl, right of it otherwise) that shares some
+            # vertical extent with the current panel.
+            row_cand = None
+            row_candidates = []
+            for cand in candidates:
+                cand_box = cand["bbox"]
+                if rtl:
+                    is_row_neighbor = cand_box[2] <= (c_box[0] + 50)
+                    dist_x = c_box[0] - cand_box[2]
+                else:
+                    is_row_neighbor = cand_box[0] >= (c_box[2] - 50)
+                    dist_x = cand_box[0] - c_box[2]
+                if is_row_neighbor:
+                    y_inter = max(
+                        0, min(c_box[3], cand_box[3]) - max(c_box[1], cand_box[1])
+                    )
+                    if y_inter > 0:
+                        row_candidates.append((dist_x, cand))
+            if row_candidates:
+                row_candidates.sort(key=lambda x: x[0])
+                row_cand = row_candidates[0][1]
+            # Dual veto: ceiling (topological) + right-neighbor (row start).
+            # The column candidate is rejected when another unvisited panel
+            # (a) sits above it in the same column, or (b) lies on the
+            # reading-start side of it while overlapping it vertically.
+            if col_cand:
+                is_blocked = False
+                for other in candidates:
+                    if other["id"] == col_cand["id"]:
+                        continue
+                    is_above = other["bbox"][3] <= (col_cand["bbox"][1] + 50)
+                    x_overlap = _iou_x(other["bbox"], col_cand["bbox"])
+                    if is_above and x_overlap > 0.2:
+                        is_blocked = True
+                        break
+                    if rtl:
+                        has_block_neighbor = other["bbox"][0] > (
+                            col_cand["bbox"][0] + 20
+                        )
+                    else:
+                        has_block_neighbor = other["bbox"][2] < (
+                            col_cand["bbox"][2] - 20
+                        )
+                    y_overlap_ratio = _iou_y_overlap(col_cand["bbox"], other["bbox"])
+                    if has_block_neighbor and y_overlap_ratio > 0.3:
+                        is_blocked = True
+                        break
+                if is_blocked:
+                    col_cand = None
+            # Pick the next panel. With both candidates available, the row
+            # neighbor wins only if its bottom edge is within 25% of the current
+            # panel's height of the current bottom edge.
+            next_node = None
+            if row_cand and not col_cand:
+                next_node = row_cand
+            elif col_cand and not row_cand:
+                next_node = col_cand
+            elif row_cand and col_cand:
+                curr_h = c_box[3] - c_box[1]
+                bottom_diff = abs(c_box[3] - row_cand["bbox"][3])
+                is_row_aligned = bottom_diff < (curr_h * 0.25)
+                next_node = row_cand if is_row_aligned else col_cand
+            if not next_node:
+                # Recompute roots among remaining nodes to find a new entry.
+                sub_roots = []
+                for n in candidates:
+                    is_root = True
+                    for parent in candidates:
+                        if n["id"] == parent["id"]:
+                            continue
+                        is_above = parent["bbox"][3] <= (n["bbox"][1] + 50)
+                        x_overlap = _iou_x(parent["bbox"], n["bbox"])
+                        if is_above and x_overlap > 0.2:
+                            is_root = False
+                            break
+                    if is_root:
+                        sub_roots.append(n)
+                if sub_roots:
+                    next_node = (
+                        max(sub_roots, key=lambda n: n["bbox"][2])
+                        if rtl
+                        else min(sub_roots, key=lambda n: n["bbox"][0])
+                    )
+                else:
+                    next_node = min(candidates, key=lambda n: n["bbox"][1])
+            current = next_node
+            current["visited"] = True
+            sorted_indices.append(current["id"])
+        return sorted_indices
+    # Without panel information fall back to the purely spatial ordering.
+    if not panels:
+        return _spatial_sort(detections)
+    sorted_panel_indices = sort_panels_strict(panels, rtl)
+    if not sorted_panel_indices:
+        sorted_panel_indices = list(range(len(panels)))
+    # Assign every detection to a panel, keyed by the panel's index in ``panels``.
+    panel_bins = {pid: [] for pid in sorted_panel_indices}
+    unassigned = []
+    for detection in detections:
+        bx1, by1, bx2, by2 = detection["bbox"]
+        bcx, bcy = (bx1 + bx2) / 2.0, (by1 + by2) / 2.0
+        assigned = False
+        # First choice: the first panel (in input order) containing the center.
+        for i, pbbox in enumerate(panels):
+            px1, py1, px2, py2 = pbbox
+            if px1 <= bcx <= px2 and py1 <= bcy <= py2:
+                panel_bins.setdefault(i, []).append(detection)
+                detection["panel_id"] = i
+                assigned = True
+                break
+        if not assigned:
+            # Second choice: nearest panel by point-to-rectangle distance from
+            # the detection center.
+            best_dist = float("inf")
+            best_pid = -1
+            for i, pbbox in enumerate(panels):
+                px1, py1, px2, py2 = pbbox
+                dx = max(px1 - bcx, 0, bcx - px2)
+                dy = max(py1 - bcy, 0, bcy - py2)
+                dist = (dx**2 + dy**2) ** 0.5
+                if dist < best_dist:
+                    best_dist = dist
+                    best_pid = i
+            # Only accept the nearest panel when it is closer than 300 units
+            # (bbox coordinate units; presumably pixels - TODO confirm).
+            if best_dist < 300:
+                panel_bins.setdefault(best_pid, []).append(detection)
+                detection["panel_id"] = best_pid
+                assigned = True
+        if not assigned:
+            detection["panel_id"] = None
+            unassigned.append(detection)
+    # Emit panels in macro order, each panel's bubbles in micro order;
+    # detections without a panel are appended at the very end.
+    final_order = []
+    for pid in sorted_panel_indices:
+        final_order.extend(_spatial_sort(panel_bins.get(pid, [])))
+    if unassigned:
+        final_order.extend(_spatial_sort(unassigned))
+    return final_order

core/llm_defaults.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""Provider-specific default sampling parameters."""
+from __future__ import annotations
+from typing import Dict, Optional
+# Canonical provider names used across the app
+# DEFAULT_LLM_PROVIDER is also the fallback key used by
+# get_provider_sampling_defaults() for unknown or missing providers.
+DEFAULT_LLM_PROVIDER = "Google"
+# Per-provider default sampling parameters (temperature, top_p, top_k),
+# keyed by provider name.
+# NOTE(review): several providers use top_k = 0; presumably this means
+# "top-k not applied" for those APIs - confirm against the provider clients.
+_PROVIDER_SAMPLING_DEFAULTS: Dict[str, Dict[str, float | int]] = {
+    "Google": {"temperature": 0.1, "top_p": 0.95, "top_k": 64},
+    "OpenAI": {"temperature": 0.1, "top_p": 1.0, "top_k": 0},
+    "Anthropic": {"temperature": 0.1, "top_p": 1.0, "top_k": 0},
+    "xAI": {"temperature": 0.1, "top_p": 1.0, "top_k": 0},
+    "DeepSeek": {"temperature": 0.1, "top_p": 0.95, "top_k": 0},
+    "Z.ai": {"temperature": 0.1, "top_p": 0.95, "top_k": 40},
+    "Moonshot AI": {"temperature": 0.1, "top_p": 1.0, "top_k": 0},
+    "OpenRouter": {"temperature": 0.1, "top_p": 0.95, "top_k": 64},
+    "OpenAI-Compatible": {"temperature": 0.1, "top_p": 0.95, "top_k": 40},
+}
+def get_provider_sampling_defaults(provider: Optional[str]) -> Dict[str, float | int]:
+    """Return a copy of the sampling defaults for the specified provider."""
+    fallback = _PROVIDER_SAMPLING_DEFAULTS[DEFAULT_LLM_PROVIDER]
+    if not provider:
+        return fallback.copy()
+    return _PROVIDER_SAMPLING_DEFAULTS.get(provider, fallback).copy()
+# Public API of this module; the per-provider table itself stays private.
+__all__ = ["DEFAULT_LLM_PROVIDER", "get_provider_sampling_defaults"]

core/ml/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""
+Machine learning model management for MangaTranslator.
+This subpackage contains modules for:
+- Centralized ML model loading and caching
+- Model manager for YOLO, SAM, Flux, and other models
+"""
+from .model_manager import ModelManager, get_model_manager
+# Names re-exported from .model_manager as this subpackage's public API.
+__all__ = [
+    "ModelManager",
+    "get_model_manager",
+]

core/ml/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (542 Bytes). View file

core/ml/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (507 Bytes). View file

core/ml/__pycache__/model_manager.cpython-311.pyc ADDED Viewed

Binary file (44.7 kB). View file

core/ml/__pycache__/model_manager.cpython-314.pyc ADDED Viewed

Binary file (45.6 kB). View file

core/ml/model_manager.py ADDED Viewed

	@@ -0,0 +1,854 @@

+import gc
+import shutil
+import threading
+import urllib.request
+from contextlib import contextmanager
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+import torch
+from huggingface_hub import hf_hub_download, snapshot_download
+from spandrel import ModelLoader
+from transformers import Sam2Model, Sam2Processor
+from ultralytics import YOLO
+from utils.exceptions import ModelError
+from utils.logging import log_message
+class ModelType(Enum):
+    """Enumeration of available model types.
+    Members are used as keys for the ModelManager lookup tables
+    (loaded models, local paths, download URLs, Hugging Face repos).
+    """
+    # Upscalers (2x-AnimeSharpV4 RCAN and its "Fast RCAN PU" variant).
+    UPSCALE = "upscale"
+    UPSCALE_LITE = "upscale_lite"
+    # YOLO weights stored under models/yolo.
+    YOLO_SPEECH_BUBBLE = "yolo_speech_bubble"
+    YOLO_CONJOINED_BUBBLE = "yolo_conjoined_bubble"
+    YOLO_OSBTEXT = "yolo_osbtext"
+    YOLO_PANEL = "yolo_panel"
+    # Hugging Face repo facebook/sam2.1-hiera-large.
+    SAM2 = "sam2"
+    # Hugging Face repo kha-white/manga-ocr-base.
+    MANGA_OCR = "manga_ocr"
+    # Flux components (transformer, text encoder, full pipeline).
+    FLUX_TRANSFORMER = "flux_transformer"
+    FLUX_TEXT_ENCODER = "flux_text_encoder"
+    FLUX_PIPELINE = "flux_pipeline"
+class ModelManager:
+    """Singleton model manager for MangaTranslator."""
+    _instance = None
+    _lock = threading.RLock()
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+    def __init__(self):
+        """Initialize the model manager (only once due to singleton pattern)."""
+        with self._lock:
+            if self._initialized:
+                return
+            self.device = torch.device(
+                "cuda"
+                if torch.cuda.is_available()
+                else "mps" if torch.backends.mps.is_available() else "cpu"
+            )
+            self.dtype = (
+                torch.bfloat16
+                if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+                else torch.float16 if self.device.type == "mps" else torch.float32
+            )
+            # Model storage
+            self.models = {}
+            self.model_paths = self._init_model_paths()
+            self.model_urls = self._init_model_urls()
+            self.model_hf_repos = self._init_hf_repos()
+            # Flux-specific configuration
+            self.flux_cache_dir = Path("./models/flux")
+            self.flux_hf_token = None
+            self.flux_residual_diff_threshold = 0.15
+            self._initialized = True
+            log_message(
+                f"Model Manager initialized on device: {self.device}", always_print=True
+            )
+    def _init_model_paths(self):
+        """Initialize model file paths."""
+        model_dir = Path("./models").resolve()
+        return {
+            ModelType.UPSCALE: (
+                model_dir / "upscale" / "2x-AnimeSharpV4_RCAN.safetensors"
+            ),
+            ModelType.UPSCALE_LITE: (
+                model_dir / "upscale" / "2x-AnimeSharpV4_Fast_RCAN_PU.safetensors"
+            ),
+            ModelType.YOLO_SPEECH_BUBBLE: (
+                model_dir / "yolo" / "yolov8m_seg-speech-bubble.pt"
+            ),
+            ModelType.YOLO_CONJOINED_BUBBLE: (
+                model_dir / "yolo" / "comic-speech-bubble-detector-yolov8m.pt"
+            ),
+            ModelType.YOLO_OSBTEXT: (model_dir / "yolo" / "animetext_yolov12x.pt"),
+            ModelType.YOLO_PANEL: (
+                model_dir / "yolo" / "manga109_v2023.12.07_l_yolov11.pt"
+            ),
+            ModelType.MANGA_OCR: (model_dir / "manga-ocr-base"),
+        }
+    def _init_model_urls(self):
+        """Initialize model download URLs."""
+        return {
+            ModelType.UPSCALE: (
+                "https://huggingface.co/Kim2091/2x-AnimeSharpV4/resolve/main/"
+                "2x-AnimeSharpV4_RCAN.safetensors"
+            ),
+            ModelType.UPSCALE_LITE: (
+                "https://huggingface.co/Kim2091/2x-AnimeSharpV4/resolve/main/"
+                "2x-AnimeSharpV4_Fast_RCAN_PU.safetensors"
+            ),
+        }
+    def _init_hf_repos(self):
+        """Initialize Hugging Face repository information."""
+        repos = {
+            ModelType.UPSCALE: {
+                "repo_id": "Kim2091/2x-AnimeSharpV4",
+                "filename": "2x-AnimeSharpV4_RCAN.safetensors",
+            },
+            ModelType.UPSCALE_LITE: {
+                "repo_id": "Kim2091/2x-AnimeSharpV4",
+                "filename": "2x-AnimeSharpV4_Fast_RCAN_PU.safetensors",
+            },
+            ModelType.YOLO_SPEECH_BUBBLE: {
+                "repo_id": "kitsumed/yolov8m_seg-speech-bubble",
+                "filename": "model.pt",
+            },
+            ModelType.YOLO_CONJOINED_BUBBLE: {
+                "repo_id": "ogkalu/comic-speech-bubble-detector-yolov8m",
+                "filename": "comic-speech-bubble-detector.pt",
+            },
+            ModelType.YOLO_OSBTEXT: {
+                "repo_id": "deepghs/AnimeText_yolo",
+                "filename": "yolo12x_animetext/model.pt",
+            },
+            ModelType.YOLO_PANEL: {
+                "repo_id": "deepghs/manga109_yolo",
+                "filename": "v2023.12.07_l_yv11/model.pt",
+            },
+            ModelType.SAM2: {
+                "repo_id": "facebook/sam2.1-hiera-large",
+            },
+            ModelType.FLUX_PIPELINE: {
+                "repo_id": "black-forest-labs/FLUX.1-Kontext-dev",
+                "filename": None,  # Pipeline loaded via from_pretrained
+            },
+        }
+        repos[ModelType.FLUX_TRANSFORMER] = {
+            "repo_id": "nunchaku-tech/nunchaku-flux.1-kontext-dev",
+            "filename": None,  # Will be constructed dynamically in load_flux_models()
+        }
+        repos[ModelType.FLUX_TEXT_ENCODER] = {
+            "repo_id": "nunchaku-tech/nunchaku-t5",
+            "filename": "awq-int4-flux.1-t5xxl.safetensors",
+        }
+        repos[ModelType.MANGA_OCR] = {
+            "repo_id": "kha-white/manga-ocr-base",
+        }
+        return repos
+    def _ensure_file(self, path: Path, url: str, verbose: bool = False) -> None:
+        """Download file from URL if it doesn't exist.
+        Args:
+            path: Path where file should be saved
+            url: URL to download from
+            verbose: Whether to print verbose logging
+        """
+        if path.exists():
+            return
+        path.parent.mkdir(parents=True, exist_ok=True)
+        log_message(f"Downloading {path.name}...", verbose=verbose)
+        try:
+            req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
+            with urllib.request.urlopen(req) as response, open(path, "wb") as f:
+                shutil.copyfileobj(response, f)
+            log_message(f"Downloaded {path.name} successfully.", verbose=verbose)
+        except Exception as e:
+            if path.exists():
+                path.unlink()
+            raise ModelError(f"Failed to download {path.name}: {e}")
+    def _ensure_hf_file(
+        self,
+        repo_id: str,
+        filename: str,
+        target: Path,
+        token: Optional[str] = None,
+        verbose: bool = False,
+    ) -> Path:
+        """Download file from Hugging Face if it doesn't exist.
+        Args:
+            repo_id: Hugging Face repository ID
+            filename: Name of file to download
+            target: Path where file should be saved
+            token: Optional Hugging Face token
+            verbose: Whether to print verbose logging
+        """
+        if target.exists():
+            return target
+        target.parent.mkdir(parents=True, exist_ok=True)
+        log_message(
+            f"Downloading {target.name} from Hugging Face ({repo_id})...",
+            verbose=verbose,
+        )
+        downloaded = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=str(target.parent),
+            token=token,
+        )
+        downloaded_path = Path(downloaded)
+        if downloaded_path != target:
+            downloaded_parent = downloaded_path.parent
+            try:
+                downloaded_path.replace(target)
+            except Exception:
+                shutil.copyfile(downloaded_path, target)
+                try:
+                    downloaded_path.unlink()
+                except Exception:
+                    pass
+            # Clean up empty directory if it was created by hf_hub_download
+            if downloaded_parent != target.parent and downloaded_parent.exists():
+                try:
+                    if not any(downloaded_parent.iterdir()):
+                        downloaded_parent.rmdir()
+                except (OSError, PermissionError):
+                    pass
+        log_message(f"Downloaded {target.name} successfully.", verbose=verbose)
+        return target
+    def _ensure_hf_repo(
+        self,
+        repo_id: str,
+        target_dir: Path,
+        token: Optional[str] = None,
+        verbose: bool = False,
+    ) -> Path:
+        """Download entire repository from Hugging Face if it doesn't exist.
+        Args:
+            repo_id: Hugging Face repository ID
+            target_dir: Directory where repository should be saved
+            token: Optional Hugging Face token
+            verbose: Whether to print verbose logging
+        Returns:
+            Path to the downloaded repository directory
+        """
+        # Check for larger model file to ensure download is complete
+        critical_file = target_dir / "pytorch_model.bin"
+        if target_dir.exists() and critical_file.exists():
+            return target_dir
+        target_dir.mkdir(parents=True, exist_ok=True)
+        log_message(
+            f"Downloading repository {repo_id} from Hugging Face...",
+            verbose=verbose,
+        )
+        try:
+            snapshot_download(
+                repo_id=repo_id,
+                local_dir=str(target_dir),
+                token=token,
+            )
+            log_message(
+                f"Downloaded repository {repo_id} successfully.", verbose=verbose
+            )
+        except Exception as e:
+            if target_dir.exists():
+                models_dir = Path("./models").resolve()
+                target_resolved = target_dir.resolve()
+                try:
+                    target_resolved.relative_to(models_dir)
+                    # Safe to delete
+                    try:
+                        shutil.rmtree(target_dir)
+                    except Exception:
+                        pass
+                except ValueError:
+                    # target_dir is not within models/, skip deletion for safety
+                    log_message(
+                        f"Warning: Skipping deletion of {target_dir} as it is outside models/ directory",
+                        always_print=True,
+                    )
+            raise ModelError(f"Failed to download repository {repo_id}: {e}") from e
+        return target_dir
+    def is_loaded(self, model_type: ModelType) -> bool:
+        """Check if a model is currently loaded."""
+        with self._lock:
+            return model_type in self.models and self.models[model_type] is not None
+    def load_upscale(self, verbose: bool = False):
+        """Load upscale model (AnimeSharpV4 RCAN)."""
+        with self._lock:
+            if self.is_loaded(ModelType.UPSCALE):
+                return self.models[ModelType.UPSCALE]
+            log_message(
+                "Loading upscale model (2x-AnimeSharpV4_RCAN)...", verbose=verbose
+            )
+            path = self.model_paths[ModelType.UPSCALE]
+            # Try HF download first, fallback to direct URL
+            try:
+                hf_info = self.model_hf_repos[ModelType.UPSCALE]
+                self._ensure_hf_file(
+                    hf_info["repo_id"], hf_info["filename"], path, verbose=verbose
+                )
+            except Exception:
+                self._ensure_file(
+                    path, self.model_urls[ModelType.UPSCALE], verbose=verbose
+                )
+            # Load model
+            if path.suffix == ".safetensors":
+                from safetensors import safe_open
+                state_dict = {}
+                with safe_open(path, framework="pt", device=str(self.device)) as f:
+                    for key in f.keys():
+                        state_dict[key] = f.get_tensor(key)
+            else:
+                state_dict = torch.load(
+                    path, map_location=self.device, weights_only=False
+                )
+            model = (
+                ModelLoader().load_from_state_dict(state_dict).to(self.device).eval()
+            )
+            self.models[ModelType.UPSCALE] = model
+            log_message("Upscale model loaded.", verbose=verbose)
+            return model
+    def load_upscale_lite(self, verbose: bool = False):
+        """Load upscale lite model (AnimeSharpV4 Fast RCAN PU)."""
+        with self._lock:
+            if self.is_loaded(ModelType.UPSCALE_LITE):
+                return self.models[ModelType.UPSCALE_LITE]
+            log_message(
+                "Loading upscale lite model (2x-AnimeSharpV4_Fast_RCAN_PU)...",
+                verbose=verbose,
+            )
+            path = self.model_paths[ModelType.UPSCALE_LITE]
+            # Try HF download first, fallback to direct URL
+            try:
+                hf_info = self.model_hf_repos[ModelType.UPSCALE_LITE]
+                self._ensure_hf_file(
+                    hf_info["repo_id"], hf_info["filename"], path, verbose=verbose
+                )
+            except Exception:
+                self._ensure_file(
+                    path, self.model_urls[ModelType.UPSCALE_LITE], verbose=verbose
+                )
+            # Load model
+            if path.suffix == ".safetensors":
+                from safetensors import safe_open
+                state_dict = {}
+                with safe_open(path, framework="pt", device=str(self.device)) as f:
+                    for key in f.keys():
+                        state_dict[key] = f.get_tensor(key)
+            else:
+                state_dict = torch.load(
+                    path, map_location=self.device, weights_only=False
+                )
+            model = (
+                ModelLoader().load_from_state_dict(state_dict).to(self.device).eval()
+            )
+            self.models[ModelType.UPSCALE_LITE] = model
+            log_message("Upscale lite model loaded.", verbose=verbose)
+            return model
+    def load_yolo_speech_bubble(
+        self, model_path: Optional[str] = None, verbose: bool = False
+    ):
+        """Load YOLO model for speech bubble detection.
+        Args:
+            model_path: Optional custom path to YOLO model. If None, uses default.
+            verbose: Whether to print verbose logging
+        """
+        with self._lock:
+            if self.is_loaded(ModelType.YOLO_SPEECH_BUBBLE):
+                return self.models[ModelType.YOLO_SPEECH_BUBBLE]
+            log_message(
+                "Loading YOLO speech bubble detection model...", verbose=verbose
+            )
+            path = (
+                self.model_paths[ModelType.YOLO_SPEECH_BUBBLE]
+                if model_path is None
+                else Path(model_path)
+            )
+            if path == self.model_paths[ModelType.YOLO_SPEECH_BUBBLE]:
+                hf_info = self.model_hf_repos[ModelType.YOLO_SPEECH_BUBBLE]
+                self._ensure_hf_file(
+                    hf_info["repo_id"], hf_info["filename"], path, verbose=verbose
+                )
+            model = YOLO(str(path))
+            self.models[ModelType.YOLO_SPEECH_BUBBLE] = model
+            log_message("YOLO model loaded.", verbose=verbose)
+            return model
+    def load_yolo_conjoined_bubble(self, verbose: bool = False):
+        """Load YOLO model for conjoined speech bubble detection."""
+        with self._lock:
+            if self.is_loaded(ModelType.YOLO_CONJOINED_BUBBLE):
+                return self.models[ModelType.YOLO_CONJOINED_BUBBLE]
+            log_message(
+                "Loading YOLO conjoined bubble detection model...", verbose=verbose
+            )
+            path = self.model_paths[ModelType.YOLO_CONJOINED_BUBBLE]
+            # Try HF download
+            hf_info = self.model_hf_repos[ModelType.YOLO_CONJOINED_BUBBLE]
+            self._ensure_hf_file(
+                hf_info["repo_id"], hf_info["filename"], path, verbose=verbose
+            )
+            model = YOLO(str(path))
+            self.models[ModelType.YOLO_CONJOINED_BUBBLE] = model
+            log_message("YOLO conjoined bubble model loaded.", verbose=verbose)
+            return model
+    def load_yolo_osbtext(self, token: Optional[str] = None, verbose: bool = False):
+        """Load YOLO model for outside text detection.
+        Args:
+            token: Hugging Face token for gated repo access.
+            verbose: Whether to print verbose logging
+        """
+        with self._lock:
+            if self.is_loaded(ModelType.YOLO_OSBTEXT):
+                return self.models[ModelType.YOLO_OSBTEXT]
+            log_message("Loading YOLO OSB Text detection model...", verbose=verbose)
+            path = self.model_paths[ModelType.YOLO_OSBTEXT]
+            hf_info = self.model_hf_repos[ModelType.YOLO_OSBTEXT]
+            self._ensure_hf_file(
+                hf_info["repo_id"],
+                hf_info["filename"],
+                path,
+                token=token,
+                verbose=verbose,
+            )
+            model = YOLO(str(path))
+            self.models[ModelType.YOLO_OSBTEXT] = model
+            log_message("YOLO OSB Text model loaded.", verbose=verbose)
+            return model
+    def load_yolo_panel(self, verbose: bool = False):
+        """Load YOLO model for panel detection.
+        Args:
+            verbose: Whether to print verbose logging
+        """
+        with self._lock:
+            if self.is_loaded(ModelType.YOLO_PANEL):
+                return self.models[ModelType.YOLO_PANEL]
+            log_message("Loading YOLO panel detection model...", verbose=verbose)
+            path = self.model_paths[ModelType.YOLO_PANEL]
+            hf_info = self.model_hf_repos[ModelType.YOLO_PANEL]
+            self._ensure_hf_file(
+                hf_info["repo_id"],
+                hf_info["filename"],
+                path,
+                verbose=verbose,
+            )
+            model = YOLO(str(path))
+            self.models[ModelType.YOLO_PANEL] = model
+            log_message("YOLO panel model loaded.", verbose=verbose)
+            return model
+    def load_manga_ocr(self, verbose: bool = False) -> Path:
+        """Ensure manga-ocr model repository is downloaded.
+        Args:
+            verbose: Whether to print verbose logging
+        Returns:
+            Path to the downloaded manga-ocr model directory
+        """
+        with self._lock:
+            model_path = self.model_paths[ModelType.MANGA_OCR]
+            hf_info = self.model_hf_repos[ModelType.MANGA_OCR]
+            self._ensure_hf_repo(hf_info["repo_id"], model_path, verbose=verbose)
+            log_message("manga-ocr model repository ready.", verbose=verbose)
+            return model_path
+    def get_manga_ocr(self, verbose: bool = False):
+        """Get manga-ocr instance, loading it if necessary.
+        Args:
+            verbose: Whether to print verbose logging
+        Returns:
+            MangaOcr instance
+        """
+        with self._lock:
+            if self.is_loaded(ModelType.MANGA_OCR):
+                return self.models[ModelType.MANGA_OCR]
+            log_message("Initializing manga-ocr...", verbose=verbose)
+            # Fix for MeCab/Fugashi on non-Windows systems
+            try:
+                import os
+                import unidic_lite
+                os.environ["MECABRC"] = os.path.join(unidic_lite.DICDIR, "mecabrc")
+            except ImportError:
+                log_message(
+                    "Warning: unidic_lite not found, skipping MeCab fix",
+                    verbose=verbose,
+                )
+            except Exception as e:
+                log_message(f"Warning: Failed to apply MeCab fix: {e}", verbose=verbose)
+            from manga_ocr import MangaOcr
+            # Ensure model is downloaded
+            model_path = self.load_manga_ocr(verbose=verbose)
+            manga_ocr_instance = MangaOcr(pretrained_model_name_or_path=str(model_path))
+            self.models[ModelType.MANGA_OCR] = manga_ocr_instance
+            log_message("manga-ocr initialized", verbose=verbose)
+            return manga_ocr_instance
+    def load_sam2(self, verbose: bool = False):
+        """Load SAM 2.1 model and processor.
+        Returns:
+            tuple: (processor, model) - SAM2 processor and model instances
+        """
+        with self._lock:
+            if self.is_loaded(ModelType.SAM2):
+                return self.models[ModelType.SAM2]
+            log_message("Loading SAM 2.1 model...", verbose=verbose)
+            hf_info = self.model_hf_repos[ModelType.SAM2]
+            cache_dir = "models/sam"
+            processor = Sam2Processor.from_pretrained(
+                hf_info["repo_id"], cache_dir=cache_dir
+            )
+            model = Sam2Model.from_pretrained(
+                hf_info["repo_id"], torch_dtype=self.dtype, cache_dir=cache_dir
+            ).to(self.device)
+            model.eval()
+            # Store as tuple
+            self.models[ModelType.SAM2] = (processor, model)
+            log_message("SAM 2.1 model loaded.", verbose=verbose)
+            return self.models[ModelType.SAM2]
+    def set_flux_hf_token(self, token: str):
+        """Set the HuggingFace token for Flux model downloads.
+        Args:
+            token: HuggingFace API token
+        """
+        self.flux_hf_token = token if token else None
+    def set_flux_residual_diff_threshold(self, threshold: float):
+        """Set the residual diff threshold for Flux caching.
+        Args:
+            threshold: Residual diff threshold (0.0-1.0)
+        """
+        self.flux_residual_diff_threshold = max(0.0, min(1.0, threshold))
+    def load_flux_models(self, verbose: bool = False):
+        """Load all Flux Kontext inpainting models (transformer, text encoder, pipeline).
+        The three components are loaded in order (transformer, text encoder,
+        pipeline) and each is stored in ``self.models`` as soon as it is created.
+        If the pipeline is already loaded, the cached components are returned.
+        Args:
+            verbose: Whether to print verbose logging
+        Returns:
+            tuple: (transformer, text_encoder, pipeline)
+        Raises:
+            ModelError: If the Nunchaku/diffusers imports fail, or if any other
+                exception occurs while downloading or constructing the models.
+        """
+        with self._lock:
+            # The pipeline is stored last, so it is the one checked to decide
+            # whether the whole set is already available.
+            if self.is_loaded(ModelType.FLUX_PIPELINE):
+                return (
+                    self.models[ModelType.FLUX_TRANSFORMER],
+                    self.models[ModelType.FLUX_TEXT_ENCODER],
+                    self.models[ModelType.FLUX_PIPELINE],
+                )
+            log_message("Loading Flux Kontext inpainting models...", verbose=verbose)
+            try:
+                # Lazy imports for Nunchaku and diffusers
+                from diffusers import FluxKontextPipeline
+                from nunchaku.caching.diffusers_adapters import apply_cache_on_pipe
+                from nunchaku.models.text_encoders.t5_encoder import (
+                    NunchakuT5EncoderModel,
+                )
+                from nunchaku.models.transformers.transformer_flux import (
+                    NunchakuFluxTransformer2dModel,
+                )
+                from nunchaku.utils import get_precision
+                hf_info = self.model_hf_repos[ModelType.FLUX_TRANSFORMER]
+                # No filename configured: derive it from nunchaku's reported
+                # precision. The value is written back into the shared repo-info
+                # dict, so later calls see the derived name.
+                if hf_info["filename"] is None:
+                    hf_info["filename"] = (
+                        f"svdq-{get_precision()}_r32-flux.1-kontext-dev.safetensors"
+                    )
+                transformer_path = self._ensure_hf_file(
+                    hf_info["repo_id"],
+                    hf_info["filename"],
+                    self.flux_cache_dir / hf_info["filename"],
+                    verbose=verbose,
+                )
+                # NOTE(review): precision is hard-coded to "int4" here while the
+                # filename above follows get_precision() - confirm these agree.
+                transformer = NunchakuFluxTransformer2dModel.from_pretrained(
+                    str(transformer_path),
+                    torch_dtype=self.dtype,
+                    offload=True,
+                    precision="int4",
+                    set_attention_impl="nunchaku-fp16",
+                )
+                # NOTE(review): stored before the remaining steps run, so it stays
+                # in self.models if a later step raises - confirm this is intended.
+                self.models[ModelType.FLUX_TRANSFORMER] = transformer
+                # Load text encoder
+                hf_info = self.model_hf_repos[ModelType.FLUX_TEXT_ENCODER]
+                text_encoder_path = self._ensure_hf_file(
+                    hf_info["repo_id"],
+                    hf_info["filename"],
+                    self.flux_cache_dir / hf_info["filename"],
+                    verbose=verbose,
+                )
+                text_encoder = NunchakuT5EncoderModel.from_pretrained(
+                    str(text_encoder_path),
+                    torch_dtype=self.dtype,
+                )
+                self.models[ModelType.FLUX_TEXT_ENCODER] = text_encoder
+                # Load pipeline, injecting the transformer and text encoder built
+                # above; only this step receives the stored HF token.
+                pipeline_repo = self.model_hf_repos[ModelType.FLUX_PIPELINE]["repo_id"]
+                pipeline = FluxKontextPipeline.from_pretrained(
+                    pipeline_repo,
+                    transformer=transformer,
+                    text_encoder_2=text_encoder,
+                    torch_dtype=self.dtype,
+                    cache_dir=str(self.flux_cache_dir),
+                    token=self.flux_hf_token,
+                ).to(self.device)
+                # Apply caching for faster inference
+                apply_cache_on_pipe(
+                    pipeline, residual_diff_threshold=self.flux_residual_diff_threshold
+                )
+                self.models[ModelType.FLUX_PIPELINE] = pipeline
+                log_message("Flux Kontext models loaded successfully.", verbose=verbose)
+                return transformer, text_encoder, pipeline
+            except ImportError as e:
+                # Any ImportError raised inside the try block is reported as a
+                # missing or incompatible Nunchaku install.
+                raise ModelError(
+                    "Nunchaku not installed or incompatible. Inpainting requires Nunchaku."
+                ) from e
+            except Exception as e:
+                # Every other failure is wrapped with the original message attached
+                raise ModelError(
+                    f"Failed to load Flux/Nunchaku inpainting models: {e}"
+                ) from e
+    def unload_model(
+        self, model_type: ModelType, force_gc: bool = True, verbose: bool = False
+    ):
+        """Unload a specific model and free memory.
+        Args:
+            model_type: Type of model to unload
+            force_gc: Whether to force garbage collection
+            verbose: Whether to print verbose logging
+        """
+        with self._lock:
+            if not self.is_loaded(model_type):
+                return
+            log_message(f"Unloading {model_type.value}...", verbose=verbose)
+            del self.models[model_type]
+            self.models[model_type] = None
+            if force_gc and torch.cuda.is_available():
+                gc.collect()
+                torch.cuda.empty_cache()
+    def unload_upscale_models(self, verbose: bool = False):
+        """Unload upscale models (both regular and lite)."""
+        self.unload_model(ModelType.UPSCALE, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.UPSCALE_LITE, force_gc=False, verbose=verbose)
+        if torch.cuda.is_available():
+            gc.collect()
+            torch.cuda.empty_cache()
+        log_message("Upscale models unloaded.", verbose=verbose)
+    def unload_ocr_models(self, verbose: bool = False):
+        """Unload OCR-related models (YOLO, SAM2, and manga-ocr)."""
+        models_unloaded = []
+        if self.is_loaded(ModelType.YOLO_SPEECH_BUBBLE):
+            models_unloaded.append("yolo_speech_bubble")
+        if self.is_loaded(ModelType.YOLO_CONJOINED_BUBBLE):
+            models_unloaded.append("yolo_conjoined_bubble")
+        if self.is_loaded(ModelType.SAM2):
+            models_unloaded.append("sam2")
+        if self.is_loaded(ModelType.YOLO_OSBTEXT):
+            models_unloaded.append("yolo_osbtext")
+        if self.is_loaded(ModelType.YOLO_PANEL):
+            models_unloaded.append("yolo_panel")
+        if self.is_loaded(ModelType.MANGA_OCR):
+            models_unloaded.append("manga_ocr")
+        self.unload_model(ModelType.YOLO_SPEECH_BUBBLE, force_gc=False, verbose=verbose)
+        self.unload_model(
+            ModelType.YOLO_CONJOINED_BUBBLE, force_gc=False, verbose=verbose
+        )
+        self.unload_model(ModelType.SAM2, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.YOLO_OSBTEXT, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.YOLO_PANEL, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.MANGA_OCR, force_gc=True, verbose=verbose)
+        if models_unloaded:
+            log_message("OCR models unloaded.", verbose=verbose)
+    def unload_flux_models(self, verbose: bool = False):
+        """Unload all Flux Kontext models."""
+        models_unloaded = []
+        if self.is_loaded(ModelType.FLUX_TRANSFORMER):
+            models_unloaded.append("flux_transformer")
+        if self.is_loaded(ModelType.FLUX_TEXT_ENCODER):
+            models_unloaded.append("flux_text_encoder")
+        if self.is_loaded(ModelType.FLUX_PIPELINE):
+            models_unloaded.append("flux_pipeline")
+        self.unload_model(ModelType.FLUX_TRANSFORMER, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.FLUX_TEXT_ENCODER, force_gc=False, verbose=verbose)
+        self.unload_model(ModelType.FLUX_PIPELINE, force_gc=True, verbose=verbose)
+        if models_unloaded:
+            log_message("Flux Kontext models unloaded.", verbose=verbose)
+    def unload_all(self, verbose: bool = False):
+        """Unload all models and free all GPU memory."""
+        with self._lock:
+            log_message("Unloading all models...", verbose=verbose)
+            for model_type in list(self.models.keys()):
+                if self.is_loaded(model_type):
+                    del self.models[model_type]
+                    self.models[model_type] = None
+            if torch.cuda.is_available():
+                gc.collect()
+                torch.cuda.empty_cache()
+            log_message("All models unloaded.", verbose=verbose)
+    def get_memory_stats(self):
+        """Get current GPU memory usage statistics."""
+        if not torch.cuda.is_available():
+            return {"device": "cpu", "memory": "N/A"}
+        allocated = torch.cuda.memory_allocated() / 1024**3
+        reserved = torch.cuda.memory_reserved() / 1024**3
+        return {
+            "device": torch.cuda.get_device_name(0),
+            "allocated_gb": f"{allocated:.2f}",
+            "reserved_gb": f"{reserved:.2f}",
+        }
+    def print_memory_stats(self):
+        """Print current GPU memory usage."""
+        stats = self.get_memory_stats()
+        if stats["memory"] == "N/A":
+            log_message(f"Device: {stats['device']}", always_print=True)
+        else:
+            log_message(
+                f"GPU Memory - Allocated: {stats['allocated_gb']} GB, "
+                f"Reserved: {stats['reserved_gb']} GB",
+                always_print=True,
+            )
+    @contextmanager
+    def upscale_context(self, verbose: bool = False):
+        """Context manager for upscale model - auto-loads and unloads."""
+        try:
+            self.load_upscale(verbose=verbose)
+            yield self.models[ModelType.UPSCALE]
+        finally:
+            self.unload_upscale_models(verbose=verbose)
+    @contextmanager
+    def upscale_lite_context(self, verbose: bool = False):
+        """Context manager for upscale lite model - auto-loads and unloads."""
+        try:
+            self.load_upscale_lite(verbose=verbose)
+            yield self.models[ModelType.UPSCALE_LITE]
+        finally:
+            self.unload_upscale_models(verbose=verbose)
+    @contextmanager
+    def ocr_context(self, hf_token=None, verbose: bool = False):
+        """Context manager for OCR models - auto-loads and unloads.
+        Args:
+            hf_token: Hugging Face token for gated repo access
+            verbose: Whether to print verbose logging
+        """
+        try:
+            yolo = self.load_yolo_speech_bubble(verbose=verbose)
+            yolo_osbtext = self.load_yolo_osbtext(token=hf_token, verbose=verbose)
+            yield yolo, yolo_osbtext
+        finally:
+            self.unload_ocr_models(verbose=verbose)
+    @contextmanager
+    def flux_context(self, verbose: bool = False):
+        """Context manager for Flux models - auto-loads and unloads."""
+        try:
+            transformer, text_encoder, pipeline = self.load_flux_models(verbose=verbose)
+            yield transformer, text_encoder, pipeline
+        finally:
+            self.unload_flux_models(verbose=verbose)
+# Global singleton instance
+_model_manager = None
+def get_model_manager() -> ModelManager:
+    """Get the global model manager instance."""
+    global _model_manager
+    if _model_manager is None:
+        _model_manager = ModelManager()
+    return _model_manager

core/outside_text_processor.py ADDED Viewed

	@@ -0,0 +1,638 @@

+import base64
+import gc
+import os
+import random
+import re
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+import cv2
+import numpy as np
+from PIL import Image
+from sklearn.cluster import KMeans
+from core.config import MangaTranslatorConfig
+from core.image.image_utils import cv2_to_pil, pil_to_cv2, process_bubble_image_cached
+from core.image.inpainting import FluxKontextInpainter
+from core.image.ocr_detection import OutsideTextDetector, extract_text_with_manga_ocr
+from core.ml.model_manager import get_model_manager
+from utils.logging import log_message
+def process_outside_text(
+    pil_image: Image.Image,
+    config: MangaTranslatorConfig,
+    image_path: Union[str, Path],
+    image_format: Optional[str],
+    verbose: bool = False,
+    bubble_data: Optional[List[Dict[str, Any]]] = None,
+    text_free_boxes: Optional[List[List[float]]] = None,
+) -> Tuple[Image.Image, List[Dict[str, Any]]]:
+    """
+    Process outside text detection, inpainting, and prepare data for translation.
+    This function handles the complete outside text processing pipeline:
+    1. Detects text outside speech bubbles using OCR
+    2. Inpaints the detected text regions using FluxKontext
+    3. Prepares the outside text data for translation API calls
+    Args:
+        pil_image: The PIL image to process
+        config: MangaTranslatorConfig containing all settings
+        image_path: Path to the original image file
+        image_format: Original image format (PNG, JPEG, etc.)
+        processing_scale: The scale factor for image processing
+        verbose: Whether to print detailed logging
+    Returns:
+        Tuple containing:
+        - processed_pil_image: The image after outside text inpainting
+        - outside_text_data: List of dicts with outside text information for translation
+    """
+    if not config.outside_text.enabled:
+        return pil_image, []
+    log_message("Detecting text outside speech bubbles...", verbose=verbose)
+    try:
+        outside_detector = OutsideTextDetector(
+            device=config.device, hf_token=config.outside_text.huggingface_token
+        )
+        outside_text_results = outside_detector.detect_outside_text(
+            str(image_path),
+            yolo_model_path=config.yolo_model_path,
+            confidence=config.outside_text.osb_confidence,
+            conjoined_confidence=config.detection.conjoined_confidence,
+            verbose=verbose,
+            image_override=pil_image,
+            existing_bubbles=bubble_data,
+            text_free_boxes=text_free_boxes,
+        )
+        if not outside_text_results:
+            log_message("No outside text regions found", verbose=verbose)
+            outside_detector.unload_models()
+            return pil_image, []
+        img_w, img_h = pil_image.size
+        # Filter out probable page numbers
+        # Only run OCR on "suspicious" detections (small & in margin)
+        if config.outside_text.enable_page_number_filtering and outside_text_results:
+            suspicious_crops = []
+            suspicious_indices = []
+            safe_results = []
+            margin_threshold = max(
+                0.0, min(0.3, config.outside_text.page_filter_margin_threshold)
+            )
+            min_area_threshold = max(
+                0.0, min(0.2, config.outside_text.page_filter_min_area_ratio)
+            )
+            for i, res in enumerate(outside_text_results):
+                bbox, _ = res
+                x1, y1, x2, y2 = [int(c) for c in bbox]
+                cy = (y1 + y2) / 2
+                is_in_margin = (cy < img_h * margin_threshold) or (
+                    cy > img_h * (1 - margin_threshold)
+                )
+                area = (x2 - x1) * (y2 - y1)
+                is_small = area < (img_w * img_h * min_area_threshold)
+                if is_in_margin and is_small:
+                    suspicious_crops.append(pil_image.crop((x1, y1, x2, y2)))
+                    suspicious_indices.append(i)
+                else:
+                    safe_results.append(res)
+            if suspicious_crops:
+                log_message(
+                    f"Verifying {len(suspicious_crops)} suspicious OSB regions with OCR...",
+                    verbose=verbose,
+                )
+                suspicious_texts = extract_text_with_manga_ocr(
+                    suspicious_crops, verbose=verbose
+                )
+                kept_suspicious_count = 0
+                for i, text in enumerate(suspicious_texts):
+                    # Regex for page numbers: digits, "Page 20", "p. 20", etc.
+                    is_page_number = bool(
+                        re.match(
+                            r"^\s*(?:page\.?|p\.?)?\s*\d+\s*$", text, re.IGNORECASE
+                        )
+                    )
+                    if not is_page_number:
+                        safe_results.append(outside_text_results[suspicious_indices[i]])
+                        kept_suspicious_count += 1
+                    else:
+                        log_message(
+                            f"Filtered out page number: '{text}'", verbose=verbose
+                        )
+                outside_text_results = safe_results
+                log_message(
+                    f"Remaining OSB regions after filtering: {len(outside_text_results)}",
+                    verbose=verbose,
+                )
+        # Build a mask of all detected speech bubbles to prevent OSB inpainting overlap
+        total_bubble_mask = np.zeros((img_h, img_w), dtype=bool)
+        if bubble_data:
+            for bubble in bubble_data:
+                try:
+                    mask = bubble.get("sam_mask") if isinstance(bubble, dict) else None
+                    if mask is not None:
+                        mask_np = np.asarray(mask)
+                        if mask_np.ndim == 3:
+                            mask_np = mask_np[..., 0]
+                        mask_bool = mask_np > 0
+                        if mask_bool.shape[0] == img_h and mask_bool.shape[1] == img_w:
+                            total_bubble_mask |= mask_bool
+                            continue
+                    bbox = bubble.get("bbox") if isinstance(bubble, dict) else None
+                    if bbox and len(bbox) == 4:
+                        x0, y0, x1, y1 = [int(c) for c in bbox]
+                        x0 = max(0, min(img_w, x0))
+                        x1 = max(0, min(img_w, x1))
+                        y0 = max(0, min(img_h, y0))
+                        y1 = max(0, min(img_h, y1))
+                        if x1 > x0 and y1 > y0:
+                            total_bubble_mask[y0:y1, x0:x1] = True
+                except Exception as e:
+                    log_message(
+                        f"Warning: Failed to apply bubble mask for OSB exclusion: {e}",
+                        verbose=verbose,
+                    )
+        mime_type = (
+            "image/png"
+            if image_format and image_format.upper() == "PNG"
+            else "image/jpeg"
+        )
+        cv2_ext = ".png" if image_format and image_format.upper() == "PNG" else ".jpg"
+        # Probe original text color for OSB rendering
+        original_text_colors = {}
+        for ocr_result in outside_text_results:
+            bbox_coords, conf = ocr_result
+            x1, y1, x2, y2 = [int(c) for c in bbox_coords]
+            bbox_tuple = (x1, y1, x2, y2)
+            bbox_area_img = pil_image.crop((x1, y1, x2, y2))
+            bbox_array = np.array(bbox_area_img)
+            if bbox_array.shape[-1] == 4:
+                bbox_array = bbox_array[..., :3]
+            pixels = bbox_array.reshape(-1, 3)
+            # Use K-Means to find 2 dominant colors
+            kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
+            kmeans.fit(pixels)
+            labels = kmeans.labels_
+            centers = kmeans.cluster_centers_
+            unique, counts = np.unique(labels, return_counts=True)
+            dominant_cluster_idx = unique[np.argmax(counts)]
+            # Dominant cluster is usually the background (text pixels are sparse)
+            bg_color_rgb = centers[dominant_cluster_idx]
+            # Use proper luminance calculation (ITU-R BT.601)
+            bg_brightness = (
+                0.299 * bg_color_rgb[0]
+                + 0.587 * bg_color_rgb[1]
+                + 0.114 * bg_color_rgb[2]
+            )
+            is_dark_text = (
+                bg_brightness < 128
+            )  # passed downstream; renderer inverts for text color
+            original_text_colors[bbox_tuple] = is_dark_text
+            log_message(
+                f"OSB bbox {bbox_tuple}: "
+                f"{'Dark' if is_dark_text else 'Light'} background detected "
+                f"(luminance={bg_brightness:.1f})",
+                verbose=verbose,
+            )
+        log_message("Inpainting outside text regions...", verbose=verbose)
+        inpainter = (
+            None
+            if config.outside_text.force_cv2_inpainting
+            else FluxKontextInpainter(
+                device=config.device,
+                huggingface_token=config.outside_text.huggingface_token,
+                num_inference_steps=config.outside_text.flux_num_inference_steps,
+                residual_diff_threshold=config.outside_text.flux_residual_diff_threshold,
+            )
+        )
+        mask_groups, _ = outside_detector.get_text_masks(
+            str(image_path),
+            bbox_expansion_percent=config.outside_text.bbox_expansion_percent,
+            text_box_proximity_ratio=config.outside_text.text_box_proximity_ratio,
+            verbose=verbose,
+            image_override=pil_image,
+            existing_results=outside_text_results,
+        )
+        current_image = pil_image
+        temp_files = []
+        try:
+            if mask_groups:
+                base_seed = (
+                    random.randint(1, 999999)
+                    if config.outside_text.seed == -1
+                    else config.outside_text.seed
+                )
+                flux_inpaints = 0
+                cv2_inpaints = 0
+                for i, group in enumerate(mask_groups):
+                    log_message(
+                        f"Inpainting outside text region {i + 1}/{len(mask_groups)}",
+                        verbose=verbose,
+                    )
+                    combined_mask = group["combined_mask"]
+                    combined_mask = np.logical_and(
+                        combined_mask, np.logical_not(total_bubble_mask)
+                    )
+                    if not np.any(combined_mask):
+                        log_message(
+                            "Skipping outside text region after bubble masking (no remaining area)",
+                            verbose=verbose,
+                        )
+                        continue
+                    region_seed = base_seed + i if base_seed > 0 else base_seed
+                    original_bbox_dict = group.get("original_bbox")
+                    composite_clip_bbox = None
+                    fill_color = None
+                    fallback_fill_color = None
+                    ox0 = oy0 = ox1 = oy1 = None
+                    if original_bbox_dict:
+                        ox = int(original_bbox_dict.get("x", 0))
+                        oy = int(original_bbox_dict.get("y", 0))
+                        ow = int(original_bbox_dict.get("width", 0))
+                        oh = int(original_bbox_dict.get("height", 0))
+                        if ow > 0 and oh > 0:
+                            ox0 = max(0, min(img_w, ox))
+                            oy0 = max(0, min(img_h, oy))
+                            ox1 = max(0, min(img_w, ox + ow))
+                            oy1 = max(0, min(img_h, oy + oh))
+                            composite_clip_bbox = (ox, oy, ox + ow, oy + oh)
+                            # Determine detected text color for this region to ensure contrast
+                            group_bg_is_dark = None
+                            if original_text_colors:
+                                votes_dark = 0
+                                votes_light = 0
+                                gx1, gy1, gx2, gy2 = ox, oy, ox + ow, oy + oh
+                                for (
+                                    bx1,
+                                    by1,
+                                    bx2,
+                                    by2,
+                                ), t_dark in original_text_colors.items():
+                                    # Check if center of OCR box is inside group box
+                                    bcx = (bx1 + bx2) / 2
+                                    bcy = (by1 + by2) / 2
+                                    if (
+                                        bcx >= gx1
+                                        and bcx <= gx2
+                                        and bcy >= gy1
+                                        and bcy <= gy2
+                                    ):
+                                        if t_dark:
+                                            votes_dark += 1
+                                        else:
+                                            votes_light += 1
+                                if votes_dark > 0 or votes_light > 0:
+                                    group_bg_is_dark = votes_dark >= votes_light
+                                    # Detected value represents background brightness
+                                    fallback_fill_color = (
+                                        (0, 0, 0)
+                                        if group_bg_is_dark
+                                        else (255, 255, 255)
+                                    )
+                                    t_type = "Dark" if group_bg_is_dark else "Light"
+                                    f_col = (
+                                        "White"
+                                        if fallback_fill_color == (255, 255, 255)
+                                        else "Black"
+                                    )
+                                    log_message(
+                                        f"OSB Region {i + 1}: Detected {t_type} background. "
+                                        f"Fallback fill: {f_col}.",
+                                        verbose=verbose,
+                                    )
+                            # Expanded sampling around the original bbox to find background color
+                            expansion_px = 2
+                            sx1 = max(0, ox - expansion_px)
+                            sy1 = max(0, oy - expansion_px)
+                            sx2 = min(img_w, ox + ow + expansion_px)
+                            sy2 = min(img_h, oy + oh + expansion_px)
+                            if sx2 > sx1 and sy2 > sy1:
+                                mask_h, mask_w = sy2 - sy1, sx2 - sx1
+                                local_mask = np.ones((mask_h, mask_w), dtype=bool)
+                                lx0 = max(0, ox0 - sx1)
+                                ly0 = max(0, oy0 - sy1)
+                                lx1 = min(mask_w, ox1 - sx1)
+                                ly1 = min(mask_h, oy1 - sy1)
+                                if lx1 > lx0 and ly1 > ly0:
+                                    local_mask[ly0:ly1, lx0:lx1] = False
+                                border_pixels = None
+                                min_border_pixels = 20
+                                if np.count_nonzero(local_mask) >= min_border_pixels:
+                                    sampling_crop = current_image.crop(
+                                        (sx1, sy1, sx2, sy2)
+                                    )
+                                    crop_np = np.array(sampling_crop.convert("RGB"))
+                                    border_pixels = crop_np[local_mask]
+                                if border_pixels is not None and border_pixels.size > 0:
+                                    white_thresh = 250
+                                    black_thresh = 5
+                                    ratio_threshold = 0.95
+                                    white_ratio = np.mean(
+                                        np.all(border_pixels >= white_thresh, axis=1)
+                                    )
+                                    black_ratio = np.mean(
+                                        np.all(border_pixels <= black_thresh, axis=1)
+                                    )
+                                    if fallback_fill_color is None:
+                                        fallback_fill_color = (
+                                            (255, 255, 255)
+                                            if white_ratio >= black_ratio
+                                            else (0, 0, 0)
+                                        )
+                                    force_fill = (
+                                        config.outside_text.force_cv2_inpainting
+                                    )
+                                    should_simple_fill = (
+                                        white_ratio >= ratio_threshold
+                                        or black_ratio >= ratio_threshold
+                                        or force_fill
+                                    )
+                                    if should_simple_fill:
+                                        fill_color = fallback_fill_color
+                                        if force_fill and not (
+                                            white_ratio >= ratio_threshold
+                                            or black_ratio >= ratio_threshold
+                                        ):
+                                            log_message(
+                                                "Forcing CV2 fill: defaulting to "
+                                                f"{'white' if fill_color == (255, 255, 255) else 'black'} background",
+                                                verbose=verbose,
+                                            )
+                                        else:
+                                            log_message(
+                                                "Skipping Flux for OSB region: detected pure "
+                                                f"{'white' if fill_color == (255, 255, 255) else 'black'} background",
+                                                verbose=verbose,
+                                            )
+                    def apply_simple_fill(color_to_use):
+                        new_img = current_image.copy()
+                        if (
+                            original_bbox_dict
+                            and ox1 is not None
+                            and ox0 is not None
+                            and oy1 is not None
+                            and oy0 is not None
+                            and ox1 > ox0
+                            and oy1 > oy0
+                        ):
+                            # Restricted fill logic: Clip mask to bbox
+                            region_mask = combined_mask[oy0:oy1, ox0:ox1]
+                            if not np.any(region_mask):
+                                return new_img
+                            mask_pil = Image.fromarray(
+                                (region_mask * 255).astype(np.uint8), mode="L"
+                            )
+                            patch = Image.new(
+                                "RGB", (ox1 - ox0, oy1 - oy0), color_to_use
+                            )
+                            new_img.paste(patch, (ox0, oy0), mask=mask_pil)
+                        else:
+                            # Full mask fill
+                            mask_pil = Image.fromarray(
+                                (combined_mask * 255).astype(np.uint8), mode="L"
+                            )
+                            patch = Image.new("RGB", new_img.size, color_to_use)
+                            new_img.paste(patch, (0, 0), mask=mask_pil)
+                        return new_img
+                    if fill_color is not None:
+                        current_image = apply_simple_fill(fill_color)
+                        cv2_inpaints += 1
+                        continue
+                    flux_failed = False
+                    flux_fail_reason = None
+                    inpainted_image = None
+                    if inpainter is None:
+                        flux_failed = True
+                        flux_fail_reason = "Flux inpainter unavailable"
+                    else:
+                        try:
+                            inpainted_image = inpainter.inpaint_mask(
+                                current_image,
+                                combined_mask,
+                                seed=region_seed,
+                                verbose=verbose,
+                                strict_mask_clipping=True,
+                                composite_clip_bbox=composite_clip_bbox,
+                            )
+                            if inpainted_image is current_image:
+                                flux_failed = True
+                                flux_fail_reason = (
+                                    "Flux returned original image (no inpaint)"
+                                )
+                        except Exception as e:
+                            flux_failed = True
+                            flux_fail_reason = f"Flux inpainting error: {e}"
+                    if flux_failed:
+                        fallback_color_to_use = (
+                            fallback_fill_color
+                            if fallback_fill_color
+                            else (255, 255, 255)
+                        )
+                        log_message(
+                            f"Flux failed for OSB region {i + 1}"
+                            + (f" ({flux_fail_reason})" if flux_fail_reason else "")
+                            + f"; falling back to CV2 fill ({fallback_color_to_use})",
+                            always_print=True,
+                        )
+                        current_image = apply_simple_fill(fallback_color_to_use)
+                        cv2_inpaints += 1
+                        continue
+                    flux_inpaints += 1
+                    # Save to disk if more regions remain to reduce memory usage
+                    if i < len(mask_groups) - 1:
+                        temp_file = None
+                        try:
+                            temp_fd, temp_file = tempfile.mkstemp(suffix=".png")
+                            os.close(temp_fd)
+                            inpainted_image.save(temp_file, format="PNG")
+                            temp_files.append(temp_file)
+                            with Image.open(temp_file) as img_tmp:
+                                img_tmp.load()
+                                current_image = img_tmp.copy()
+                            del inpainted_image
+                            gc.collect()
+                            log_message(
+                                "Saved intermediate inpainting result to disk",
+                                verbose=verbose,
+                            )
+                        except Exception as e:
+                            log_message(
+                                "Warning: Failed to save intermediate image to disk: "
+                                f"{e}. Continuing with in-memory processing.",
+                                verbose=verbose,
+                            )
+                            # Fallback to in-memory if disk save fails
+                            current_image = inpainted_image
+                            if temp_file and temp_file in temp_files:
+                                temp_files.remove(temp_file)
+                    else:
+                        current_image = inpainted_image
+                log_message("Outside text inpainting completed", verbose=verbose)
+                log_message(
+                    f"Inpainted {len(mask_groups)} outside text regions (Flux: {flux_inpaints}, CV2: {cv2_inpaints})",
+                    always_print=True,
+                )
+        finally:
+            for temp_file in temp_files:
+                if temp_file and os.path.exists(temp_file):
+                    try:
+                        os.remove(temp_file)
+                    except Exception:
+                        pass
+        outside_text_data = []
+        original_cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
+        for ocr_result in outside_text_results:
+            bbox_coords, conf = ocr_result
+            x1, y1, x2, y2 = [int(c) for c in bbox_coords]
+            bbox_tuple = (x1, y1, x2, y2)
+            outside_text_image_cv = original_cv_image[y1:y2, x1:x2].copy()
+            outside_text_image_pil = cv2_to_pil(outside_text_image_cv)
+            original_crop_pil = outside_text_image_pil.copy()
+            # Disable upscaling in test_mode
+            osb_upscale_method = (
+                "none" if config.test_mode else config.translation.upscale_method
+            )
+            if osb_upscale_method == "model":
+                model_manager = get_model_manager()
+                with model_manager.upscale_context() as upscale_model:
+                    final_text_pil = process_bubble_image_cached(
+                        outside_text_image_pil,
+                        upscale_model,
+                        config.device,
+                        config.translation.osb_min_side_pixels,
+                        "min",
+                        "model",
+                        verbose,
+                    )
+            elif osb_upscale_method == "model_lite":
+                model_manager = get_model_manager()
+                with model_manager.upscale_lite_context() as upscale_model:
+                    final_text_pil = process_bubble_image_cached(
+                        outside_text_image_pil,
+                        upscale_model,
+                        config.device,
+                        config.translation.osb_min_side_pixels,
+                        "min",
+                        "model_lite",
+                        verbose,
+                    )
+            elif osb_upscale_method == "lanczos":
+                w, h = outside_text_image_pil.size
+                min_side = min(w, h)
+                if min_side < config.translation.osb_min_side_pixels:
+                    scale_factor = config.translation.osb_min_side_pixels / min_side
+                    new_w = int(w * scale_factor)
+                    new_h = int(h * scale_factor)
+                    resized_text = outside_text_image_pil.resize(
+                        (new_w, new_h), Image.LANCZOS
+                    )
+                else:
+                    resized_text = outside_text_image_pil
+                final_text_pil = resized_text
+            else:
+                final_text_pil = outside_text_image_pil
+            outside_text_image_cv = pil_to_cv2(final_text_pil)
+            w = max(1, x2 - x1)
+            h = max(1, y2 - y1)
+            aspect_ratio = float(h) / float(w)
+            try:
+                is_success, buffer = cv2.imencode(cv2_ext, outside_text_image_cv)
+                if is_success:
+                    image_b64 = base64.b64encode(buffer).decode("utf-8")
+                    outside_text_data.append(
+                        {
+                            "bbox": bbox_tuple,
+                            "confidence": conf,
+                            "is_outside_text": True,
+                            "image_b64": image_b64,
+                            "mime_type": mime_type,
+                            "is_dark_text": original_text_colors.get(bbox_tuple, True),
+                            "aspect_ratio": aspect_ratio,
+                            "original_crop_pil": original_crop_pil,
+                        }
+                    )
+            except Exception as e:
+                log_message(
+                    f"Error encoding outside text bbox {(x1, y1, x2, y2)}: {e}",
+                    verbose=verbose,
+                )
+        return current_image, outside_text_data
+    except Exception as e:
+        log_message(
+            f"Error during outside text detection/inpainting: {e}",
+            always_print=True,
+        )
+        return pil_image, []

core/pipeline.py ADDED Viewed

	@@ -0,0 +1,1295 @@

+import base64
+import math
+import os
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+import cv2
+from PIL import Image
+from core.caching import get_cache
+from core.config import MangaTranslatorConfig, PreprocessingConfig, RenderingConfig
+from core.scaling import scale_font_size, scale_length, scale_scalar
+from utils.exceptions import (
+    CancellationError,
+    CleaningError,
+    FontError,
+    ImageProcessingError,
+    RenderingError,
+    TranslationError,
+)
+from utils.logging import log_message
+from .image.cleaning import clean_speech_bubbles, retry_cleaning_with_otsu
+from .image.detection import detect_panels, detect_speech_bubbles
+from .image.image_utils import (
+    convert_image_to_target_mode,
+    cv2_to_pil,
+    pil_to_cv2,
+    resize_to_max_side,
+    save_image_with_compression,
+    upscale_image,
+    upscale_image_to_dimension,
+)
+from .image.sorting import sort_bubbles_by_reading_order
+from .ml.model_manager import get_model_manager
+from .outside_text_processor import process_outside_text
+from .services.translation import (
+    call_translation_api_batch,
+    prepare_bubble_images_for_translation,
+)
+from .text.text_processing import is_latin_style_language
+from .text.text_renderer import render_text_skia
+if TYPE_CHECKING:
+    from ui.cancellation import CancellationManager
+def get_image_encoding_params(pil_image_format: Optional[str]) -> Tuple[str, str]:
+    """Returns (mime_type, cv2_ext) for a given PIL image format."""
+    if pil_image_format and pil_image_format.upper() == "PNG":
+        return "image/png", ".png"
+    return "image/jpeg", ".jpg"
+def _resolve_pre_upscale_factor(
+    pre_cfg: Optional[PreprocessingConfig],
+    verbose: bool = False,
+) -> float:
+    if pre_cfg is None or not pre_cfg.enabled:
+        return 1.0
+    factor = max(1.0, min(float(pre_cfg.factor or 1.0), 8.0))
+    if factor <= 1.01:
+        return 1.0
+    log_message(f"Initial upscaling enabled: {factor:.2f}x", verbose=verbose)
+    return factor
+def _apply_pre_upscale_if_needed(
+    image: Image.Image,
+    config: MangaTranslatorConfig,
+    verbose: bool = False,
+) -> Tuple[Image.Image, float]:
+    factor = _resolve_pre_upscale_factor(
+        getattr(config, "preprocessing", None), verbose
+    )
+    if factor == 1.0:
+        return image, 1.0
+    # Use the output upscale model setting for initial upscaling as well
+    model_type = (
+        getattr(config.output, "image_upscale_model", "model_lite")
+        if hasattr(config, "output")
+        else "model_lite"
+    )
+    upscaled = upscale_image(image, factor, model_type=model_type, verbose=verbose)
+    return upscaled, factor
+def translate_and_render(
+    image_path: Union[str, Path],
+    config: MangaTranslatorConfig,
+    output_path: Optional[Union[str, Path]] = None,
+    cancellation_manager: Optional["CancellationManager"] = None,
+):
+    """
+    Main function to translate manga speech bubbles and render translations using a config object.
+    Args:
+        image_path (str or Path): Path to input image
+        config (MangaTranslatorConfig): Configuration object containing all settings.
+        output_path (str or Path, optional): Path to save the final image. If None, image is not saved.
+    Returns:
+        PIL.Image: Final translated image
+    """
+    start_time = time.time()
+    image_path = Path(image_path)
+    verbose = config.verbose
+    device = config.device
+    log_message(f"Using device: {device}", verbose=verbose)
+    try:
+        pil_original = Image.open(image_path)
+        image_format = pil_original.format
+        mime_type, cv2_ext = get_image_encoding_params(image_format)
+        log_message(
+            f"Original image format: {image_format} -> MIME: {mime_type}",
+            verbose=verbose,
+        )
+    except FileNotFoundError:
+        log_message(f"Error: Input image not found at {image_path}", always_print=True)
+        raise
+    except Exception as e:
+        log_message(f"Error opening image {image_path}: {e}", always_print=True)
+        raise
+    if cancellation_manager and cancellation_manager.is_cancelled():
+        raise TranslationError("Process cancelled by user.")
+    desired_format = config.output.output_format
+    output_ext_for_mode = (
+        Path(output_path).suffix.lower() if output_path else image_path.suffix.lower()
+    )
+    if desired_format == "jpeg" or (
+        desired_format == "auto" and output_ext_for_mode in [".jpg", ".jpeg"]
+    ):
+        target_mode = "RGB"
+    else:  # Default to RGBA for PNG, WEBP, or other formats in auto mode
+        target_mode = "RGBA"
+    log_message(f"Target mode: {target_mode}", verbose=verbose)
+    pil_image_processed = convert_image_to_target_mode(
+        pil_original, target_mode, verbose
+    )
+    pil_image_processed, _ = _apply_pre_upscale_if_needed(
+        pil_image_processed, config, verbose
+    )
+    # Check for Upscaling Only Mode (skip detection, cleaning, and translation)
+    if config.upscaling_only:
+        log_message(
+            "Upscaling only mode - skipping detection and translation",
+            always_print=True,
+        )
+        final_image_to_save = pil_image_processed
+        if config.output.upscale_final_image:
+            log_message("Upscaling final image...", verbose=verbose, always_print=True)
+            final_image_to_save = upscale_image(
+                final_image_to_save,
+                config.output.image_upscale_factor,
+                model_type=config.output.image_upscale_model,
+                verbose=verbose,
+            )
+        if output_path:
+            if final_image_to_save.mode != target_mode:
+                log_message(f"Converting final image to {target_mode}", verbose=verbose)
+                final_image_to_save = final_image_to_save.convert(target_mode)
+            try:
+                save_image_with_compression(
+                    final_image_to_save,
+                    output_path,
+                    jpeg_quality=config.output.jpeg_quality,
+                    png_compression=config.output.png_compression,
+                    verbose=verbose,
+                )
+            except ImageProcessingError as e:
+                log_message(f"Failed to save image: {e}", always_print=True)
+                raise
+        end_time = time.time()
+        processing_time = end_time - start_time
+        log_message(
+            f"Processing completed in {processing_time:.2f}s", always_print=True
+        )
+        return final_image_to_save
+    # Calculate dynamic processing scale based on image area relative to 1MP (if enabled)
+    if config.preprocessing.auto_scale:
+        width, height = pil_image_processed.size
+        processing_scale = math.sqrt((width * height) / 1_000_000)
+        log_message(
+            f"Dynamic processing scale: {processing_scale:.2f}x", verbose=verbose
+        )
+    else:
+        processing_scale = 1.0
+    get_cache().set_current_image(pil_image_processed, verbose)
+    original_cv_image = pil_to_cv2(pil_image_processed)
+    # Detect speech bubbles first so OSB processing can respect bubble regions
+    log_message("Detecting speech bubbles...", verbose=verbose)
+    try:
+        bubble_data, text_free_boxes = detect_speech_bubbles(
+            image_path,
+            config.yolo_model_path,
+            config.detection.confidence,
+            verbose=verbose,
+            device=device,
+            use_sam2=config.detection.use_sam2,
+            conjoined_detection=config.detection.conjoined_detection,
+            conjoined_confidence=config.detection.conjoined_confidence,
+            image_override=pil_image_processed,
+            osb_enabled=config.outside_text.enabled,
+            osb_text_verification=config.detection.use_osb_text_verification,
+            osb_text_hf_token=config.outside_text.huggingface_token,
+        )
+    except Exception as e:
+        log_message(f"Error during detection: {e}", always_print=True)
+        bubble_data = []
+        text_free_boxes = []
+    # Process outside text detection and inpainting (bubble-aware)
+    pil_image_processed, outside_text_data = process_outside_text(
+        pil_image_processed,
+        config,
+        image_path,
+        image_format,
+        verbose,
+        bubble_data=bubble_data,
+        text_free_boxes=text_free_boxes,
+    )
+    original_cv_image = pil_to_cv2(pil_image_processed)
+    full_image_b64 = None
+    full_image_mime_type = None
+    if config.translation.send_full_page_context:
+        try:
+            # processing_scale is intentionally not used for context_image_max_side_pixels
+            context_image_pil = cv2_to_pil(original_cv_image)
+            effective_context_max_side = scale_length(
+                config.translation.context_image_max_side_pixels,
+                None,
+                minimum=512,
+                maximum=4096,
+            )
+            # Disable upscaling in test_mode
+            context_upscale_method = (
+                "none" if config.test_mode else config.translation.upscale_method
+            )
+            if context_upscale_method == "model":
+                # Use upscaling model for full page context
+                model_manager = get_model_manager()
+                with model_manager.upscale_context() as upscale_model:
+                    context_image_pil = upscale_image_to_dimension(
+                        upscale_model,
+                        context_image_pil,
+                        effective_context_max_side,
+                        config.device,
+                        "max",
+                        "model",
+                        verbose,
+                    )
+                    # Resize to exact target dimension (downscale if needed)
+                    context_image_pil = resize_to_max_side(
+                        context_image_pil,
+                        effective_context_max_side,
+                        verbose=verbose,
+                    )
+                    log_message(
+                        "Upscaled full image for context with model", verbose=verbose
+                    )
+            elif context_upscale_method == "model_lite":
+                # Use upscaling lite model for full page context
+                model_manager = get_model_manager()
+                with model_manager.upscale_lite_context() as upscale_model:
+                    context_image_pil = upscale_image_to_dimension(
+                        upscale_model,
+                        context_image_pil,
+                        effective_context_max_side,
+                        config.device,
+                        "max",
+                        "model_lite",
+                        verbose,
+                    )
+                    # Resize to exact target dimension (downscale if needed)
+                    context_image_pil = resize_to_max_side(
+                        context_image_pil,
+                        effective_context_max_side,
+                        verbose=verbose,
+                    )
+                    log_message(
+                        "Upscaled full image for context with lite model",
+                        verbose=verbose,
+                    )
+            elif context_upscale_method == "lanczos":
+                # Use LANCZOS resampling
+                context_image_pil = resize_to_max_side(
+                    context_image_pil,
+                    effective_context_max_side,
+                    verbose=verbose,
+                )
+                log_message(
+                    "Resized full image for context with LANCZOS", verbose=verbose
+                )
+            else:  # upscale_method == "none"
+                # No resizing/upscaling
+                log_message(
+                    "Using full image for context without resizing", verbose=verbose
+                )
+            context_image_cv = pil_to_cv2(context_image_pil)
+            is_success, buffer = cv2.imencode(cv2_ext, context_image_cv)
+            if not is_success:
+                raise ImageProcessingError(f"Full image encoding to {cv2_ext} failed")
+            full_image_b64 = base64.b64encode(buffer).decode("utf-8")
+            full_image_mime_type = mime_type
+            log_message("Encoded full image for context", verbose=verbose)
+        except Exception as e:
+            log_message(
+                f"Warning: Failed to encode full image context: {e}", always_print=True
+            )
+    if cancellation_manager and cancellation_manager.is_cancelled():
+        raise CancellationError("Process cancelled by user.")
+    final_image_to_save = pil_image_processed
+    if not bubble_data and not outside_text_data:
+        log_message("No speech bubbles or outside text detected", always_print=True)
+    else:
+        if bubble_data:
+            log_message(f"Detected {len(bubble_data)} bubbles", verbose=verbose)
+        if outside_text_data:
+            log_message(
+                f"Detected {len(outside_text_data)} outside text regions",
+                verbose=verbose,
+            )
+        if cancellation_manager and cancellation_manager.is_cancelled():
+            raise CancellationError("Process cancelled by user.")
+        if bubble_data:
+            log_message("Cleaning speech bubbles...", verbose=verbose)
+            try:
+                use_otsu = config.cleaning.use_otsu_threshold
+                if config.cleaning.inpaint_colored_bubbles:
+                    log_message(
+                        "Flux inpainting enabled for colored bubbles",
+                        verbose=verbose,
+                    )
+                cleaned_image_cv, processed_bubbles_info = clean_speech_bubbles(
+                    pil_image_processed,
+                    config.yolo_model_path,
+                    config.detection.confidence,
+                    pre_computed_detections=bubble_data,
+                    device=device,
+                    thresholding_value=config.cleaning.thresholding_value,
+                    use_otsu_threshold=use_otsu,
+                    roi_shrink_px=config.cleaning.roi_shrink_px,
+                    verbose=verbose,
+                    processing_scale=processing_scale,
+                    conjoined_confidence=config.detection.conjoined_confidence,
+                    inpaint_colored_bubbles=config.cleaning.inpaint_colored_bubbles,
+                    flux_hf_token=config.outside_text.huggingface_token,
+                    flux_num_inference_steps=config.outside_text.flux_num_inference_steps,
+                    flux_residual_diff_threshold=config.outside_text.flux_residual_diff_threshold,
+                    flux_seed=config.outside_text.seed,
+                    osb_text_verification=config.detection.use_osb_text_verification,
+                    osb_text_hf_token=config.outside_text.huggingface_token,
+                    force_cv2_inpainting=config.outside_text.force_cv2_inpainting,
+                )
+            except CleaningError as e:
+                log_message(f"Cleaning failed: {e}", always_print=True)
+                cleaned_image_cv = original_cv_image.copy()
+                processed_bubbles_info = []
+            except Exception as e:
+                log_message(f"Error during cleaning: {e}", always_print=True)
+                cleaned_image_cv = original_cv_image.copy()
+                processed_bubbles_info = []
+            pil_cleaned_image = cv2_to_pil(cleaned_image_cv)
+            if pil_cleaned_image.mode != target_mode:
+                log_message(
+                    f"Converting cleaned image to {target_mode}", verbose=verbose
+                )
+                pil_cleaned_image = pil_cleaned_image.convert(target_mode)
+            final_image_to_save = pil_cleaned_image
+        else:
+            processed_bubbles_info = []
+            pil_cleaned_image = pil_image_processed
+            if pil_cleaned_image.mode != target_mode:
+                log_message(f"Converting image to {target_mode}", verbose=verbose)
+                pil_cleaned_image = pil_cleaned_image.convert(target_mode)
+            final_image_to_save = pil_cleaned_image
+        # Check for Cleaning Only Mode
+        if config.cleaning_only:
+            log_message("Cleaning only mode - skipping translation", always_print=True)
+        else:
+            main_min_font = scale_font_size(
+                config.rendering.min_font_size, processing_scale, minimum=4, maximum=256
+            )
+            main_max_font = scale_font_size(
+                config.rendering.max_font_size,
+                processing_scale,
+                minimum=main_min_font,
+                maximum=384,
+            )
+            padding_pixels = scale_scalar(
+                config.rendering.padding_pixels,
+                processing_scale,
+                minimum=1.0,
+                maximum=80.0,
+            )
+            osb_min_font = scale_font_size(
+                config.outside_text.osb_min_font_size,
+                processing_scale,
+                minimum=4,
+                maximum=512,
+            )
+            osb_max_font = scale_font_size(
+                config.outside_text.osb_max_font_size,
+                processing_scale,
+                minimum=osb_min_font,
+                maximum=640,
+            )
+            osb_outline_width = scale_scalar(
+                config.outside_text.osb_outline_width,
+                processing_scale,
+                minimum=0.0,
+                maximum=24.0,
+            )
+            # Prepare images for Translation
+            log_message("Preparing bubble images...", verbose=verbose)
+            # Disable upscaling in test_mode
+            bubble_upscale_method = (
+                "none" if config.test_mode else config.translation.upscale_method
+            )
+            model_manager = get_model_manager()
+            # Use appropriate context manager based on upscale_method
+            if bubble_upscale_method == "model":
+                context_manager = model_manager.upscale_context()
+            elif bubble_upscale_method == "model_lite":
+                context_manager = model_manager.upscale_lite_context()
+            else:
+                # For lanczos/none, create a dummy context manager that yields None
+                from contextlib import nullcontext
+                context_manager = nullcontext(None)
+            with context_manager as upscale_model:
+                bubble_data = prepare_bubble_images_for_translation(
+                    bubble_data,
+                    original_cv_image,
+                    upscale_model,
+                    config.device,
+                    mime_type,
+                    config.translation.bubble_min_side_pixels,
+                    bubble_upscale_method,
+                    verbose,
+                )
+            if bubble_upscale_method != "none":
+                log_message(
+                    f"Upscaled {len(bubble_data)} bubble images for translation",
+                    always_print=True,
+                )
+            else:
+                log_message(
+                    f"Prepared {len(bubble_data)} bubble images for translation",
+                    always_print=True,
+                )
+            valid_bubble_data = [b for b in bubble_data if b.get("image_b64")]
+            if not valid_bubble_data and not outside_text_data:
+                log_message(
+                    "No valid bubble images or outside text for translation",
+                    always_print=True,
+                )
+            else:  # Proceed if we have valid bubble data or outside text
+                if cancellation_manager and cancellation_manager.is_cancelled():
+                    raise CancellationError("Process cancelled by user.")
+                # Sort and Translate
+                reading_direction = config.translation.reading_direction
+                # Merge outside text data with speech bubbles for reading order calculation
+                if outside_text_data:
+                    log_message(
+                        f"Including {len(outside_text_data)} outside text regions in reading order calculation",
+                        verbose=verbose,
+                    )
+                    # Combine speech bubbles and OSB text for unified reading order sorting
+                    all_text_data = valid_bubble_data + outside_text_data
+                else:
+                    all_text_data = valid_bubble_data
+                log_message(
+                    f"Sorting all text elements ({reading_direction.upper()})",
+                    verbose=verbose,
+                )
+                # Detect panels if panel-aware sorting is enabled
+                panels = None
+                if config.detection.use_panel_sorting:
+                    try:
+                        log_message(
+                            "Detecting panels for panel-aware sorting...",
+                            verbose=verbose,
+                        )
+                        panels = detect_panels(
+                            image_path,
+                            confidence=config.detection.panel_confidence,
+                            device=config.device,
+                            verbose=verbose,
+                        )
+                        if panels:
+                            log_message(
+                                f"Detected {len(panels)} panels for sorting",
+                                always_print=True,
+                            )
+                        else:
+                            log_message(
+                                "No panels detected, using global sorting",
+                                verbose=verbose,
+                            )
+                    except Exception as e:
+                        log_message(
+                            f"Panel detection failed: {e}. Using global sorting.",
+                            always_print=True,
+                        )
+                        panels = None
+                # Sort all text elements (speech bubbles + OSB text) by reading order
+                sorted_bubble_data = sort_bubbles_by_reading_order(
+                    all_text_data, reading_direction, panels=panels
+                )
+                bubble_images_b64 = [
+                    bubble["image_b64"]
+                    for bubble in sorted_bubble_data
+                    if "image_b64" in bubble
+                ]
+                bubble_mime_types = [
+                    bubble["mime_type"]
+                    for bubble in sorted_bubble_data
+                    if "image_b64" in bubble and "mime_type" in bubble
+                ]
+                translated_texts = []
+                if not bubble_images_b64:
+                    log_message("No valid bubbles after sorting", always_print=True)
+                else:
+                    if getattr(config, "test_mode", False):
+                        placeholder_long = "Lorem **ipsum** *dolor* sit amet, consectetur adipiscing elit."
+                        placeholder_short = "Lorem **ipsum** *dolor* sit amet..."
+                        placeholder_osb = "Lorem"
+                        log_message(
+                            f"Test mode: generating placeholders for {len(sorted_bubble_data)} bubbles",
+                            always_print=True,
+                        )
+                        # Map for rendering info used in probe
+                        bubble_render_info_map_probe = {
+                            tuple(info["bbox"]): {
+                                "color": info["color"],
+                                "mask": info.get("mask"),
+                            }
+                            for info in processed_bubbles_info
+                            if "bbox" in info and "color" in info and "mask" in info
+                        }
+                        for i, bubble in enumerate(sorted_bubble_data):
+                            bbox = bubble["bbox"]
+                            is_outside_text = bubble.get("is_outside_text", False)
+                            # Use the short "Lorem" placeholder for OSB text in test mode
+                            if is_outside_text:
+                                translated_texts.append(placeholder_osb)
+                                continue
+                            probe_info = bubble_render_info_map_probe.get(
+                                tuple(bbox), {}
+                            )
+                            bubble_color_bgr = probe_info.get("color", (255, 255, 255))
+                            cleaned_mask = probe_info.get("mask")
+                            # Probe fit at max size without mutating the working image
+                            _probe_canvas = pil_cleaned_image.copy()
+                            probe_config = RenderingConfig(
+                                min_font_size=main_max_font,
+                                max_font_size=main_max_font,
+                                line_spacing_mult=config.rendering.line_spacing_mult,
+                                use_subpixel_rendering=config.rendering.use_subpixel_rendering,
+                                font_hinting=config.rendering.font_hinting,
+                                use_ligatures=config.rendering.use_ligatures,
+                                hyphenate_before_scaling=config.rendering.hyphenate_before_scaling,
+                                hyphen_penalty=config.rendering.hyphen_penalty,
+                                hyphenation_min_word_length=config.rendering.hyphenation_min_word_length,
+                                badness_exponent=config.rendering.badness_exponent,
+                                padding_pixels=padding_pixels,
+                                supersampling_factor=1,  # No supersampling for probe
+                            )
+                            try:
+                                _ = render_text_skia(
+                                    pil_image=_probe_canvas,
+                                    text=placeholder_long,
+                                    bbox=bbox,
+                                    font_dir=config.rendering.font_dir,
+                                    cleaned_mask=cleaned_mask,
+                                    bubble_color_bgr=bubble_color_bgr,
+                                    config=probe_config,
+                                    verbose=verbose,
+                                    bubble_id=str(i + 1),
+                                )
+                                fits = True
+                            except (RenderingError, FontError) as e:
+                                log_message(
+                                    f"Probe rendering failed: {e}", verbose=verbose
+                                )
+                                fits = False
+                            except Exception as e:
+                                log_message(
+                                    f"Probe rendering unexpected error: {e}",
+                                    always_print=True,
+                                )
+                                fits = False
+                            translated_texts.append(
+                                placeholder_long if fits else placeholder_short
+                            )
+                    else:
+                        log_message(
+                            f"Translating {len(bubble_images_b64)} bubbles: "
+                            f"{config.translation.input_language} → {config.translation.output_language}",
+                            always_print=True,
+                        )
+                        try:
+                            translated_texts = call_translation_api_batch(
+                                config=config.translation,
+                                images_b64=bubble_images_b64,
+                                full_image_b64=full_image_b64 or "",
+                                mime_types=bubble_mime_types,
+                                full_image_mime_type=full_image_mime_type
+                                or "image/jpeg",
+                                bubble_metadata=sorted_bubble_data,
+                                debug=verbose,
+                            )
+                        except TranslationError as e:
+                            error_str = str(e).lower()
+                            critical_tokens = (
+                                "429",
+                                "rate limit",
+                                "rate-limit",
+                                "auth",
+                                "unauthorized",
+                                "forbidden",
+                                "payment",
+                                "quota",
+                                "empty response",
+                                "api failed",
+                            )
+                            if any(token in error_str for token in critical_tokens):
+                                raise
+                            log_message(f"Translation failed: {e}", always_print=True)
+                            translated_texts = [f"[Translation Error: {e}]"] * len(
+                                bubble_images_b64
+                            )
+                        except Exception as e:
+                            log_message(
+                                f"Translation API error: {e}", always_print=True
+                            )
+                            translated_texts = [
+                                "[Translation Error: API call raised exception]"
+                                for _ in sorted_bubble_data
+                            ]
+                        valid_translations = [
+                            t
+                            for t in translated_texts
+                            if t
+                            and not t.startswith("[Translation Error")
+                            and not t.startswith("API Error")
+                            and t.strip()
+                            not in {
+                                "[OCR FAILED]",
+                                "[Empty response / no content]",
+                                f"[{config.translation.provider}: API call failed/blocked]",
+                                f"[{config.translation.provider}: OCR call failed/blocked]",
+                                f"[{config.translation.provider}: Failed to parse response]",
+                            }
+                        ]
+                        if bubble_images_b64 and not valid_translations:
+                            raise TranslationError(
+                                "Total translation failure: All bubbles failed."
+                            )
+                # Render Translations
+                bubble_render_info_map = {
+                    tuple(info["bbox"]): {
+                        "color": info["color"],
+                        "mask": info.get("mask"),
+                        "base_mask": info.get("base_mask"),
+                        "is_sam": info.get("is_sam", False),
+                        "is_colored": info.get("is_colored", False),
+                        "text_bbox": info.get("text_bbox"),
+                    }
+                    for info in processed_bubbles_info
+                    if "bbox" in info and "color" in info and "mask" in info
+                }
+                log_message("Rendering translations...", verbose=verbose)
+                if len(translated_texts) == len(sorted_bubble_data):
+                    for i, bubble in enumerate(sorted_bubble_data):
+                        bubble["translation"] = translated_texts[i]
+                        bbox = bubble["bbox"]
+                        text = bubble.get("translation", "")
+                        is_outside_text = bubble.get("is_outside_text", False)
+                        # Convert OSB text to uppercase
+                        if is_outside_text and text:
+                            text = text.upper()
+                            bubble["translation"] = text
+                        if (
+                            not text
+                            or text.startswith("API Error")
+                            or text.startswith("[Translation Error]")
+                            or text.startswith("[Translation Error:")
+                            or text.strip()
+                            in {
+                                "[OCR FAILED]",
+                                "[Empty response / no content]",
+                                f"[{config.translation.provider}: API call failed/blocked]",
+                                f"[{config.translation.provider}: OCR call failed/blocked]",
+                                f"[{config.translation.provider}: Failed to parse response]",
+                            }
+                        ):
+                            entry_type = "outside text" if is_outside_text else "bubble"
+                            log_message(
+                                f"Skipping {entry_type} {bbox} - invalid translation",
+                                verbose=verbose,
+                            )
+                            continue
+                        # Use OSB-specific settings for outside text, regular settings for speech bubbles
+                        if is_outside_text:
+                            log_message(
+                                f"Rendering outside text {bbox}: '{text[:30]}...'",
+                                verbose=verbose,
+                            )
+                            font_dir = (
+                                config.outside_text.osb_font_name
+                                if config.outside_text.osb_font_name
+                                else config.rendering.font_dir
+                            )
+                            min_font = osb_min_font
+                            max_font = osb_max_font
+                            line_spacing = config.outside_text.osb_line_spacing
+                            use_ligs = config.outside_text.osb_use_ligatures
+                            # Outside text was inpainted, no mask needed
+                            cleaned_mask = None
+                            # Use the detected text color from outside_text_processor
+                            is_dark_text = bubble.get("is_dark_text", True)
+                            # Set bubble_color_bgr to mimic the original text color
+                            # Dark text → dark background value → white rendering
+                            # Light text → light background value → black rendering
+                            bubble_color_bgr = (
+                                (50, 50, 50) if is_dark_text else (255, 255, 255)
+                            )
+                            # OSB renders default to horizontal; vertical stacking is fallback-only
+                            rotation_deg = 0.0
+                            vertical_stack = False
+                        else:
+                            log_message(
+                                f"Rendering bubble {bbox}: '{text[:30]}...'",
+                                verbose=verbose,
+                            )
+                            font_dir = config.rendering.font_dir
+                            min_font = main_min_font
+                            max_font = main_max_font
+                            line_spacing = config.rendering.line_spacing_mult
+                            use_ligs = config.rendering.use_ligatures
+                            render_info = bubble_render_info_map.get(tuple(bbox))
+                            bubble_color_bgr = (255, 255, 255)
+                            cleaned_mask = None
+                            base_mask = None
+                            is_sam_mask = False
+                            if render_info:
+                                bubble_color_bgr = render_info["color"]
+                                cleaned_mask = render_info.get("mask")
+                                base_mask = render_info.get("base_mask")
+                                is_sam_mask = render_info.get("is_sam", False)
+                            # No rotation/stacking for regular bubbles
+                            vertical_stack = False
+                            rotation_deg = 0.0
+                        # Only apply hyphenation for Latin-style languages
+                        should_hyphenate = config.rendering.hyphenate_before_scaling
+                        if not is_latin_style_language(
+                            config.translation.output_language
+                        ):
+                            should_hyphenate = False
+                        render_config = RenderingConfig(
+                            min_font_size=min_font,
+                            max_font_size=max_font,
+                            line_spacing_mult=line_spacing,
+                            use_subpixel_rendering=(
+                                config.outside_text.osb_use_subpixel_rendering
+                                if is_outside_text
+                                else config.rendering.use_subpixel_rendering
+                            ),
+                            font_hinting=(
+                                config.outside_text.osb_font_hinting
+                                if is_outside_text
+                                else config.rendering.font_hinting
+                            ),
+                            use_ligatures=use_ligs,
+                            hyphenate_before_scaling=should_hyphenate,
+                            hyphen_penalty=config.rendering.hyphen_penalty,
+                            hyphenation_min_word_length=config.rendering.hyphenation_min_word_length,
+                            badness_exponent=config.rendering.badness_exponent,
+                            padding_pixels=padding_pixels,
+                            outline_width=(
+                                osb_outline_width if is_outside_text else 0.0
+                            ),
+                            supersampling_factor=config.rendering.supersampling_factor,
+                        )
+                        success = False
+                        if is_outside_text:
+                            try:
+                                rendered_image = render_text_skia(
+                                    pil_image=pil_cleaned_image,
+                                    text=text,
+                                    bbox=bbox,
+                                    font_dir=font_dir,
+                                    cleaned_mask=cleaned_mask,
+                                    bubble_color_bgr=bubble_color_bgr,
+                                    config=render_config,
+                                    verbose=verbose,
+                                    bubble_id=str(i + 1),
+                                    rotation_deg=rotation_deg,
+                                    vertical_stack=vertical_stack,
+                                    raise_on_safe_error=False,
+                                )
+                                success = True
+                            except Exception as e:
+                                log_message(
+                                    f"Text rendering failed: {e}", verbose=verbose
+                                )
+                                rendered_image = pil_cleaned_image
+                                success = False
+                                # Absolute last-chance fallback: force vertical stacking before giving up
+                                if not vertical_stack:
+                                    # Fallback uses neutral rotation since we no longer track orientation
+                                    forced_stack_rotation = 0.0
+                                    try:
+                                        log_message(
+                                            "OSB render failed, retrying with vertical-stack fallback",
+                                            verbose=verbose,
+                                            always_print=True,
+                                        )
+                                        rendered_image = render_text_skia(
+                                            pil_image=pil_cleaned_image,
+                                            text=text,
+                                            bbox=bbox,
+                                            font_dir=font_dir,
+                                            cleaned_mask=cleaned_mask,
+                                            bubble_color_bgr=bubble_color_bgr,
+                                            config=render_config,
+                                            verbose=verbose,
+                                            bubble_id=str(i + 1),
+                                            rotation_deg=forced_stack_rotation,
+                                            vertical_stack=True,
+                                            raise_on_safe_error=False,
+                                        )
+                                        log_message(
+                                            "Vertical-stack fallback succeeded",
+                                            verbose=verbose,
+                                        )
+                                        success = True
+                                    except Exception as e2:
+                                        log_message(
+                                            f"Vertical-stack fallback failed: {e2}",
+                                            verbose=verbose,
+                                        )
+                                        # Restore original OSB patch if available
+                                        if "original_crop_pil" in bubble:
+                                            log_message(
+                                                f"Restoring original OSB patch for {bbox}",
+                                                verbose=verbose,
+                                                always_print=True,
+                                            )
+                                            rendered_image = pil_cleaned_image.copy()
+                                            original_patch = bubble["original_crop_pil"]
+                                            rendered_image.paste(
+                                                original_patch, (bbox[0], bbox[1])
+                                            )
+                                            success = True
+                                        else:
+                                            rendered_image = pil_cleaned_image
+                                            success = False
+                                else:
+                                    if "original_crop_pil" in bubble:
+                                        log_message(
+                                            f"Restoring original OSB patch for {bbox}",
+                                            verbose=verbose,
+                                            always_print=True,
+                                        )
+                                        rendered_image = pil_cleaned_image.copy()
+                                        original_patch = bubble["original_crop_pil"]
+                                        rendered_image.paste(
+                                            original_patch, (bbox[0], bbox[1])
+                                        )
+                                        success = True
+                                    else:
+                                        rendered_image = pil_cleaned_image
+                                        success = False
+                        else:
+                            try:
+                                rendered_image = render_text_skia(
+                                    pil_image=pil_cleaned_image,
+                                    text=text,
+                                    bbox=bbox,
+                                    font_dir=font_dir,
+                                    cleaned_mask=cleaned_mask,
+                                    bubble_color_bgr=bubble_color_bgr,
+                                    config=render_config,
+                                    verbose=verbose,
+                                    bubble_id=str(i + 1),
+                                    rotation_deg=rotation_deg,
+                                    vertical_stack=vertical_stack,
+                                    raise_on_safe_error=True,
+                                )
+                                success = True
+                            except ImageProcessingError as e:
+                                safe_area_failed = (
+                                    "Safe area calculation failed" in str(e)
+                                )
+                                retry_result = None
+                                if safe_area_failed and base_mask is not None:
+                                    log_message(
+                                        f"Safe area failed for bubble {bbox}, retrying mask with Otsu",
+                                        verbose=verbose,
+                                        always_print=True,
+                                    )
+                                    retry_result = retry_cleaning_with_otsu(
+                                        original_cv_image,
+                                        {
+                                            "base_mask": base_mask,
+                                            "bbox": bbox,
+                                            "is_sam": is_sam_mask,
+                                            "is_colored": (
+                                                render_info.get("is_colored", False)
+                                                if render_info
+                                                else False
+                                            ),
+                                            "text_bbox": (
+                                                render_info.get("text_bbox")
+                                                if render_info
+                                                else None
+                                            ),
+                                        },
+                                        config.cleaning.thresholding_value,
+                                        config.cleaning.roi_shrink_px,
+                                        processing_scale,
+                                        verbose=verbose,
+                                        classify_colored=(
+                                            config.cleaning.inpaint_colored_bubbles
+                                        ),
+                                    )
+                                if (
+                                    retry_result
+                                    and retry_result.get("mask") is not None
+                                ):
+                                    cleaned_mask = retry_result["mask"]
+                                    bubble_color_bgr = retry_result.get(
+                                        "color", bubble_color_bgr
+                                    )
+                                    base_mask = retry_result.get("base_mask", base_mask)
+                                    if render_info is not None:
+                                        render_info.update(
+                                            {
+                                                "mask": cleaned_mask,
+                                                "color": bubble_color_bgr,
+                                                "base_mask": base_mask,
+                                                "is_colored": retry_result.get(
+                                                    "is_colored",
+                                                    render_info.get(
+                                                        "is_colored", False
+                                                    ),
+                                                ),
+                                                "text_bbox": retry_result.get(
+                                                    "text_bbox",
+                                                    render_info.get("text_bbox"),
+                                                ),
+                                            }
+                                        )
+                                    try:
+                                        rendered_image = render_text_skia(
+                                            pil_image=pil_cleaned_image,
+                                            text=text,
+                                            bbox=bbox,
+                                            font_dir=font_dir,
+                                            cleaned_mask=cleaned_mask,
+                                            bubble_color_bgr=bubble_color_bgr,
+                                            config=render_config,
+                                            verbose=verbose,
+                                            bubble_id=str(i + 1),
+                                            rotation_deg=rotation_deg,
+                                            vertical_stack=vertical_stack,
+                                            raise_on_safe_error=False,
+                                        )
+                                        success = True
+                                    except (
+                                        RenderingError,
+                                        FontError,
+                                        ImageProcessingError,
+                                    ) as e2:
+                                        log_message(
+                                            f"Text rendering failed after Otsu retry: {e2}",
+                                            verbose=verbose,
+                                        )
+                                        rendered_image = pil_cleaned_image
+                                        success = False
+                                if not success:
+                                    # Final fallback to padded bbox path
+                                    fallback_msg = (
+                                        f"Safe area calculation failed for {bbox}, using padded bbox fallback"
+                                        if safe_area_failed
+                                        else f"Rendering retry fallback for {bbox}, using padded bbox method"
+                                    )
+                                    log_message(
+                                        fallback_msg,
+                                        verbose=verbose,
+                                    )
+                                    try:
+                                        rendered_image = render_text_skia(
+                                            pil_image=pil_cleaned_image,
+                                            text=text,
+                                            bbox=bbox,
+                                            font_dir=font_dir,
+                                            cleaned_mask=cleaned_mask,
+                                            bubble_color_bgr=bubble_color_bgr,
+                                            config=render_config,
+                                            verbose=verbose,
+                                            bubble_id=str(i + 1),
+                                            rotation_deg=rotation_deg,
+                                            vertical_stack=vertical_stack,
+                                            raise_on_safe_error=False,
+                                        )
+                                        success = True
+                                    except (RenderingError, FontError) as e2:
+                                        log_message(
+                                            f"Text rendering failed: {e2}",
+                                            verbose=verbose,
+                                        )
+                                        rendered_image = pil_cleaned_image
+                                        success = False
+                            except (RenderingError, FontError) as e:
+                                log_message(
+                                    f"Text rendering failed: {e}", verbose=verbose
+                                )
+                                rendered_image = pil_cleaned_image
+                                success = False
+                        if success:
+                            pil_cleaned_image = rendered_image
+                            final_image_to_save = pil_cleaned_image
+                        else:
+                            log_message(
+                                f"Failed to render bubble {bbox}", verbose=verbose
+                            )
+                else:
+                    log_message(
+                        f"Warning: Bubble/translation count mismatch "
+                        f"({len(sorted_bubble_data)}/{len(translated_texts)})",
+                        always_print=True,
+                    )
+    # Final Image Upscaling (optional)
+    if config.output.upscale_final_image:
+        log_message("Upscaling final image...", verbose=verbose, always_print=True)
+        final_image_to_save = upscale_image(
+            final_image_to_save,
+            config.output.image_upscale_factor,
+            model_type=config.output.image_upscale_model,
+            verbose=verbose,
+        )
+    # Save Output
+    if output_path:
+        if final_image_to_save.mode != target_mode:
+            log_message(f"Converting final image to {target_mode}", verbose=verbose)
+            final_image_to_save = final_image_to_save.convert(target_mode)
+        try:
+            save_image_with_compression(
+                final_image_to_save,
+                output_path,
+                jpeg_quality=config.output.jpeg_quality,
+                png_compression=config.output.png_compression,
+                verbose=verbose,
+            )
+        except ImageProcessingError as e:
+            log_message(f"Failed to save image: {e}", always_print=True)
+            raise
+    end_time = time.time()
+    processing_time = end_time - start_time
+    log_message(f"Processing completed in {processing_time:.2f}s", always_print=True)
+    return final_image_to_save
+def batch_translate_images(
+    input_dir: Union[str, Path],
+    config: MangaTranslatorConfig,
+    output_dir: Optional[Union[str, Path]] = None,
+    progress_callback: Optional[Callable[[float, str], None]] = None,
+    preserve_structure: bool = False,
+    cancellation_manager: Optional["CancellationManager"] = None,
+) -> Dict[str, Any]:
+    """
+    Process all images in a directory using a configuration object.
+    A failure on one image is logged and recorded in the result; the remaining images are still processed.
+    Args:
+        input_dir (str or Path): Directory containing images to process
+        config (MangaTranslatorConfig): Configuration object containing all settings.
+        output_dir (str or Path, optional): Directory to save translated images.
+                                            If None, uses "./output/<YYYYmmdd_HHMMSS>" (timestamp of this call).
+        progress_callback (callable, optional): Function to call with progress updates (0.0-1.0, message).
+        preserve_structure (bool): If True, recursively process subdirectories and preserve folder structure
+                                   in the output. If False, only processes files in the root directory.
+        cancellation_manager (CancellationManager, optional): Checked with is_cancelled() before each image
+                                   and passed through to translate_and_render.
+    Returns:
+        dict: Processing results with keys:
+            - "success_count": Number of successfully processed images
+            - "error_count": Number of images that failed to process
+            - "errors": Dictionary mapping filenames (paths relative to input_dir when
+                        preserve_structure is True) to error messages
+    Raises:
+        CancellationError: If cancellation is detected; it is logged and re-raised so the batch stops.
+    """
+    input_dir = Path(input_dir)
+    if not input_dir.is_dir():
+        log_message(f"Input path '{input_dir}' is not a directory", always_print=True)
+        return {"success_count": 0, "error_count": 0, "errors": {}}
+    if output_dir:
+        output_dir = Path(output_dir)
+    else:
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        output_dir = Path("./output") / timestamp
+    os.makedirs(output_dir, exist_ok=True)
+    image_extensions = {".jpg", ".jpeg", ".png", ".webp"}
+    if preserve_structure:
+        # Recursively find all image files preserving directory structure
+        image_files = []
+        for root, _dirs, files in os.walk(input_dir):
+            for file in files:
+                file_path = Path(root) / file
+                if file_path.suffix.lower() in image_extensions:
+                    image_files.append(file_path)
+    else:
+        image_files = [
+            f
+            for f in input_dir.iterdir()
+            if f.is_file() and f.suffix.lower() in image_extensions
+        ]
+    if not image_files:
+        log_message(f"No image files found in '{input_dir}'", always_print=True)
+        return {"success_count": 0, "error_count": 0, "errors": {}}
+    results = {"success_count": 0, "error_count": 0, "errors": {}}
+    total_images = len(image_files)
+    start_batch_time = time.time()
+    log_message(f"Starting batch processing: {total_images} images", always_print=True)
+    if progress_callback:
+        progress_callback(0.0, f"Starting batch processing of {total_images} images...")
+    for i, img_path in enumerate(image_files):
+        # Label the current image before entering the try block: both handlers
+        # below read these names, so they must never be unbound or left over
+        # from the previous iteration when an early step fails.
+        display_path = img_path.name
+        error_key = img_path.name
+        try:
+            if preserve_structure:
+                # Calculate relative path from input directory for structure preservation
+                relative_path = img_path.relative_to(input_dir)
+                display_path = str(relative_path)
+                error_key = str(relative_path)
+                # Create output subdirectory structure
+                output_subdir = output_dir / relative_path.parent
+                os.makedirs(output_subdir, exist_ok=True)
+                output_filename = f"{relative_path.stem}_translated"
+            else:
+                output_subdir = output_dir
+                output_filename = f"{img_path.stem}_translated"
+            if cancellation_manager and cancellation_manager.is_cancelled():
+                raise CancellationError("Batch process cancelled by user.")
+            if progress_callback:
+                current_progress = i / total_images
+                progress_callback(
+                    current_progress,
+                    f"Processing image {i + 1}/{total_images}: {display_path}",
+                )
+            original_ext = img_path.suffix.lower()
+            desired_format = config.output.output_format
+            if desired_format == "jpeg":
+                output_ext = ".jpg"
+            elif desired_format == "png":
+                output_ext = ".png"
+            elif desired_format == "auto":
+                output_ext = original_ext
+            else:
+                # Unknown format: keep the source extension but tell the user
+                output_ext = original_ext
+                log_message(
+                    f"Warning: Invalid output_format '{desired_format}' in config. "
+                    f"Using original extension '{original_ext}'.",
+                    always_print=True,
+                )
+            output_path = output_subdir / f"{output_filename}{output_ext}"
+            log_message(
+                f"Processing {i + 1}/{total_images}: {display_path}", always_print=True
+            )
+            translate_and_render(
+                img_path, config, output_path, cancellation_manager=cancellation_manager
+            )
+            results["success_count"] += 1
+            if progress_callback:
+                completed_progress = (i + 1) / total_images
+                progress_callback(
+                    completed_progress, f"Completed {i + 1}/{total_images} images"
+                )
+        except CancellationError:
+            # Cancellation is not a per-image failure: stop the whole batch
+            log_message(
+                f"Batch cancelled during processing of {display_path}",
+                verbose=config.verbose,
+            )
+            raise
+        except Exception as e:
+            # Per-image boundary: record the failure and continue with the next image
+            log_message(f"Error processing {display_path}: {str(e)}", always_print=True)
+            results["error_count"] += 1
+            results["errors"][error_key] = str(e)
+            if progress_callback:
+                completed_progress = (i + 1) / total_images
+                progress_callback(
+                    completed_progress,
+                    f"Completed {i + 1}/{total_images} images (with errors)",
+                )
+    if progress_callback:
+        progress_callback(1.0, "Processing complete")
+    end_batch_time = time.time()
+    total_batch_time = end_batch_time - start_batch_time
+    seconds_per_image = total_batch_time / total_images if total_images > 0 else 0
+    log_message(
+        f"Batch complete: {results['success_count']}/{total_images} images in "
+        f"{total_batch_time:.2f}s ({seconds_per_image:.2f}s/image)",
+        always_print=True,
+    )
+    if results["error_count"] > 0:
+        log_message(f"Failed: {results['error_count']} images", always_print=True)
+        for filename, error_msg in results["errors"].items():
+            log_message(f"  - {filename}: {error_msg}", always_print=True)
+    return results

core/scaling.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from typing import Optional, Tuple
+def _normalize_scale(scale: Optional[float]) -> float:
+    """Return the scale as a float, substituting 1.0 when it is None, zero or negative."""
+    if scale is not None and not scale <= 0:
+        return float(scale)
+    return 1.0
+def _clamp(value: float, minimum: Optional[float], maximum: Optional[float]) -> float:
+    """
+    Restrict a value to the given bounds; a bound of None is not applied.
+    The lower bound is applied first, so the upper bound wins if the two conflict.
+    """
+    floored = value if minimum is None else max(minimum, value)
+    return floored if maximum is None else min(maximum, floored)
+def scale_scalar(
+    value: float,
+    scale: Optional[float],
+    *,
+    minimum: Optional[float] = None,
+    maximum: Optional[float] = None,
+) -> float:
+    """
+    Multiply a scalar by the normalized processing scale and clamp the product.
+    The scale goes through _normalize_scale first; minimum/maximum are optional
+    bounds applied to the scaled value by _clamp.
+    """
+    return _clamp(value * _normalize_scale(scale), minimum, maximum)
+def scale_length(
+    value: float,
+    scale: Optional[float],
+    *,
+    minimum: Optional[float] = 1.0,
+    maximum: Optional[float] = None,
+) -> int:
+    """
+    Scale a pixel length and return it as a whole number of pixels.
+    The value is scaled and clamped by scale_scalar, then rounded to the nearest
+    integer. The result is never below 1, whatever bounds were passed.
+    """
+    bounded = scale_scalar(value, scale, minimum=minimum, maximum=maximum)
+    pixels = int(round(bounded))
+    # Floor at one pixel after rounding
+    return pixels if pixels > 1 else 1
+def scale_area(
+    value: float,
+    scale: Optional[float],
+    *,
+    minimum: Optional[float] = 1.0,
+    maximum: Optional[float] = None,
+) -> int:
+    """
+    Scale an area (square pixels) by the square of the normalized scale.
+    The scaled area is clamped to the optional bounds, rounded to the nearest
+    integer and never returned below 1.
+    """
+    factor = _normalize_scale(scale)
+    bounded = _clamp(value * (factor * factor), minimum, maximum)
+    return max(1, int(round(bounded)))
+def scale_kernel(
+    kernel: Tuple[int, int],
+    scale: Optional[float],
+    *,
+    minimum: int = 1,
+    maximum: int = 63,
+) -> Tuple[int, int]:
+    """
+    Scale both dimensions of a 2D kernel size and make each one odd
+    (odd sizes are required for many morphology ops).
+    Each dimension is scaled, rounded and clamped to [minimum, maximum]. An even
+    result is bumped to the next odd value, falling back to the one below when
+    the bump would not fit under maximum.
+    """
+    factor = _normalize_scale(scale)
+    def _odd_scaled(base: int) -> int:
+        scaled = scale_scalar(
+            base,
+            factor,
+            minimum=float(minimum),
+            maximum=float(maximum),
+        )
+        size = min(maximum, max(minimum, int(round(scaled))))
+        if size % 2:
+            # Already odd: nothing to adjust
+            return max(minimum, size)
+        # Prefer the larger odd neighbour so padding stays generous
+        size = min(maximum, size + 1)
+        if size % 2 == 0:
+            # The bump was clamped away by maximum; try the smaller neighbour
+            size = max(minimum, size - 1)
+            if size % 2 == 0:
+                size = max(minimum, size + 1)
+        return max(minimum, size)
+    kernel_width, kernel_height = kernel
+    return (_odd_scaled(kernel_width), _odd_scaled(kernel_height))
+def scale_font_size(
+    value: float,
+    scale: Optional[float],
+    *,
+    minimum: int = 4,
+    maximum: int = 256,
+) -> int:
+    """
+    Scale a font size linearly and return it as an int.
+    Delegates to scale_length, with bounds that default to 4 and 256.
+    """
+    font_size = scale_length(value, scale, minimum=minimum, maximum=maximum)
+    return font_size

core/services/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+"""
+External service integration modules for MangaTranslator.
+This subpackage contains modules for:
+- Translation API calls to various LLM providers
+- External service communication
+"""
+from core.image.sorting import sort_bubbles_by_reading_order
+from .translation import (
+    call_translation_api_batch,
+    prepare_bubble_images_for_translation,
+)
+# Names exported by this package. sort_bubbles_by_reading_order is imported
+# from core.image.sorting above and re-exported here next to the two helpers
+# from .translation.
+__all__ = [
+    "call_translation_api_batch",
+    "prepare_bubble_images_for_translation",
+    "sort_bubbles_by_reading_order",
+]

core/services/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (686 Bytes). View file

core/services/__pycache__/translation.cpython-311.pyc ADDED Viewed

Binary file (49.8 kB). View file

core/services/translation.py ADDED Viewed

	@@ -0,0 +1,1385 @@

+import base64
+import re
+from io import BytesIO
+from typing import Any, Dict, List, Optional
+import cv2
+import numpy as np
+from PIL import Image
+from core.caching import get_cache
+from core.config import TranslationConfig, calculate_reasoning_budget
+from core.image.image_utils import cv2_to_pil, pil_to_cv2, process_bubble_image_cached
+from core.image.ocr_detection import extract_text_with_manga_ocr
+from utils.endpoints import (
+    call_anthropic_endpoint,
+    call_deepseek_endpoint,
+    call_gemini_endpoint,
+    call_moonshot_endpoint,
+    call_openai_compatible_endpoint,
+    call_openai_endpoint,
+    call_openrouter_endpoint,
+    call_xai_endpoint,
+    call_zai_endpoint,
+    openrouter_is_reasoning_model,
+)
+from utils.exceptions import TranslationError
+from utils.logging import log_message
+from utils.model_metadata import (
+    get_max_tokens_cap,
+    is_deepseek_reasoning_model,
+    is_openai_compatible_reasoning_model,
+    is_opus_45_model,
+    is_xai_reasoning_model,
+    is_zai_reasoning_model,
+)
+# Matches one entry of a numbered list of the form `N: text`.
+#   group 1 - the item number
+#   group 2 - the item text, without surrounding whitespace or one optional
+#             double quote on either side
+# The lookahead ends an entry at the next `N:` line or at a line end.
+# NOTE(review): with re.MULTILINE the `\s*$` alternative already succeeds at
+# the end of the first line, so group 2 appears to stop there even though
+# re.DOTALL is set - confirm whether multi-line entries are expected.
+TRANSLATION_PATTERN = re.compile(
+    r'^\s*(\d+)\s*:\s*"?\s*(.*?)\s*"?\s*(?=\s*\n\s*\d+\s*:|\s*$)',
+    re.MULTILINE | re.DOTALL,
+)
+def _build_system_prompt_ocr(
+    input_language: Optional[str],
+    reading_direction: str,
+) -> str:
+    """
+    Build the system prompt for a transcription-only (OCR) request.
+    Args:
+        input_language: Source language name placed before the word "text" in the
+                        prompt; left out when empty or None.
+        reading_direction: "rtl" (any casing, also the fallback for an empty value)
+                           yields right-to-left wording; anything else left-to-right.
+    Returns:
+        The complete prompt text.
+    """
+    if input_language:
+        language_prefix = f"{input_language} "
+    else:
+        language_prefix = ""
+    is_rtl = (reading_direction or "rtl").lower() == "rtl"
+    reading_order = "right-to-left" if is_rtl else "left-to-right"
+    return f"""
+## ROLE
+You are an expert manga OCR transcriber.
+## OBJECTIVE
+Your sole purpose is to accurately transcribe the original text from a series of provided images. You must not translate, interpret, or add commentary.
+## CORE RULES
+- **Reading Context:** The image crops are presented in a {reading_order} reading order. Do not reorder them.
+- **Transcription Policy:** Preserve all original punctuation, ellipses, and casing. Collapse multi-line text into a single line, separated by a single space.
+- **Ignore Policy:** You must ignore image borders, speech bubble tails, watermarks, page numbers, and any decorative elements outside the text itself.
+- **Language Focus:** Transcribe only the original {language_prefix}text.
+- **Ruby/Furigana Policy:** If small phonetic characters (ruby/furigana) are present, you must ignore them and transcribe only the main, larger base text.
+- **Visual Emphasis Policy:** If the source text is visually emphasized (bold, slanted, etc.), you must mirror that emphasis in your transcription using markdown-style markers: `*italic*` for slanted text, `**bold**` for bold text, `***bold-italic***` for both.
+- **Edge Cases:**
+  - If an image contains standalone periods/ellipses, you must return it exactly as it appears.
+  - If text is indecipherable, you must return the exact token: `[OCR FAILED]`.
+## OUTPUT SCHEMA
+- You must return your response as a single numbered list with exactly one line per input image.
+- The numbering must correspond to the input image order (1, 2, 3...).
+- The format must be `i: <transcribed {language_prefix}text>` where `i` is the input image number.
+- Do not include section headers, explanations, or formatting outside of this list.
+"""  # noqa
+def _build_system_prompt_translation(
+    output_language: str,
+    mode: str,
+    reading_direction: str,
+    full_page_context: bool = False,
+) -> str:
+    """
+    Build the system prompt for a translation request.
+    Args:
+        output_language: Target language named in the objective and the output schema.
+        mode: "one-step" (image crops in, `transcription || translation` out) or
+              "two-step" (transcriptions in, translation out).
+        reading_direction: "rtl" (any casing, also the fallback for an empty value)
+                           yields right-to-left wording; anything else left-to-right.
+        full_page_context: When True, the cohesion rule also tells the model to refer
+                           to the full-page image.
+    Returns:
+        Role, objective, styling guide and core rules, followed by the output schema
+        for the selected mode.
+    Raises:
+        ValueError: If mode is neither "one-step" nor "two-step".
+    """
+    if mode not in ("one-step", "two-step"):
+        raise ValueError(
+            f"Invalid mode '{mode}' specified for translation system prompt."
+        )
+    two_step = mode == "two-step"
+    is_rtl = (reading_direction or "rtl").lower() == "rtl"
+    reading_order = "right-to-left" if is_rtl else "left-to-right"
+    source_kind = "transcriptions" if two_step else "image crops"
+    if full_page_context:
+        page_hint = " Refer to the full-page image to resolve ambiguous context."
+    else:
+        page_hint = ""
+    if two_step:
+        edge_case_rules = """- **Edge Cases:**
+  - If an input line contains standalone periods/ellipses, you must return it exactly as it appears.
+  - If an input line is the exact token `[OCR FAILED]`, you must output it unchanged."""
+    else:
+        edge_case_rules = """- **Edge Cases:**
+  - If an image contains standalone periods/ellipses, you must return it exactly as it appears.
+  - If text is indecipherable, you must return the exact token: `[OCR FAILED]`."""
+    rules_section = f"""
+## CORE RULES
+- **Reading Context:** The {source_kind} are presented in a {reading_order} reading order. Do not reorder them.
+- **Cohesion:** Treat the input lines as a continuous narrative. Ensure the translation flows logically and naturally as a cohesive whole.{page_hint}
+- **Fidelity:** Focus on intent; translate functionally rather than literally.
+- **Conciseness:** Keep translations idiomatic and concise.
+- **Emphasis:** If the source text is visually emphasized (bold, slanted, etc.), mirror that emphasis using the STYLING GUIDE.
+- **Punctuation:** Replace ellipses (e.g., "…") with consecutive periods (e.g., "...").
+- **Text Types:**
+  - **Spoken Dialogue/Internal Monologue:** Translate naturally, matching the character's personality.
+  - **Narration:** Translate neutrally without special styling.
+  - **Audible SFX:** Translate physical sounds (Giongo) as standard onomatopoeia.
+  - **Mimetic FX:** Translate atmospheric text (Gitaigo) or silent actions as descriptive verbs or adjectives. Do not add a period at the end of the word.
+{edge_case_rules}
+"""  # noqa
+    preamble = f"""
+## ROLE
+You are a professional manga localization translator and editor.
+## OBJECTIVE
+Your goal is to produce natural-sounding, high-quality translations in {output_language} that are faithful to the original source's meaning, tone, and visual emphasis.
+## STYLING GUIDE
+You must use the following markdown-style markers to convey emphasis:
+- `*italic*`: Used for onomatopoeias, thoughts, flashbacks, distant sounds, or dialogue mediated by a device (e.g., phone, radio).
+- `**bold**`: Used for sound effects (SFX), shouting, timestamps, or individual emphatic words.
+- `***bold-italic***`: Used for extremely loud sounds or dialogue that also meets the criteria for italics (e.g., shouting over a radio).
+{rules_section}
+"""  # noqa
+    if two_step:
+        schema_section = f"""
+## OUTPUT SCHEMA
+- You must return your response as a single numbered list with exactly one line per input text.
+- The numbering must correspond to the input order (1, 2, 3...).
+- The format must be `i: <translated {output_language} text>` where `i` is the input text number.
+- Do not include section headers, explanations, or formatting outside of this list.
+"""  # noqa
+    else:
+        schema_section = f"""
+## OUTPUT SCHEMA
+- You must return your response as a single numbered list with exactly one line per input image.
+- The numbering must correspond to the input image order (1, 2, 3...).
+- For each item, provide both transcription and translation in the format:
+  `i: <transcribed text> || <translated {output_language} text>` where `i` is the input image number.
+- Do not include section headers, explanations, or formatting outside of this list.
+"""
+    return preamble + schema_section
+def _is_reasoning_model_google(model_name: str) -> bool:
+    """
+    Check if a Google model is reasoning-capable.
+    A model counts as reasoning-capable when its name contains "gemini-2.5" or
+    "gemini-3". The match is case-insensitive and may occur anywhere in the name,
+    so a prefixed identifier such as "models/gemini-2.5-pro" is recognised.
+    An empty or None name returns False.
+    """
+    # Lower-case once so both families are matched the same way; the 2.5 check
+    # used to be case-sensitive while the 3 check was not.
+    lowered = (model_name or "").lower()
+    return "gemini-2.5" in lowered or "gemini-3" in lowered
+def _is_reasoning_model_openai(model_name: str) -> bool:
+    """
+    Check if an OpenAI model is reasoning-capable.
+    True when the lower-cased name starts with "gpt-5", "o1", "o3" or "o4-mini";
+    an empty or None name returns False.
+    """
+    reasoning_prefixes = ("gpt-5", "o1", "o3", "o4-mini")
+    return (model_name or "").lower().startswith(reasoning_prefixes)
+def _is_reasoning_model_anthropic(model_name: str) -> bool:
+    """
+    Check if an Anthropic model is reasoning-capable.
+    True when the lower-cased name starts with one of the known Claude family
+    prefixes; an empty or None name returns False.
+    """
+    normalized = (model_name or "").lower()
+    for family_prefix in (
+        "claude-opus-4",
+        "claude-sonnet-4",
+        "claude-haiku-4-5",
+        "claude-3-7-sonnet",
+    ):
+        if normalized.startswith(family_prefix):
+            return True
+    return False
+def _add_media_resolution_to_part(
+    part: Dict[str, Any],
+    media_resolution_ui: str,
+    is_gemini_3: bool,
+) -> Dict[str, Any]:
+    """
+    Attach a media_resolution entry to an inline_data part for Gemini 3 models.
+    Args:
+        part: Part dictionary; only parts containing an "inline_data" key are changed
+        media_resolution_ui: UI format media resolution ("auto"/"high"/"medium"/"low"),
+                             matched case-insensitively; unknown values map to
+                             MEDIA_RESOLUTION_UNSPECIFIED
+        is_gemini_3: Whether the model is Gemini 3
+    Returns:
+        A shallow copy of the part with "media_resolution" set when the model is
+        Gemini 3 and the part has inline_data; otherwise the same part object, unchanged
+    """
+    if not (is_gemini_3 and "inline_data" in part):
+        return part
+    level_by_ui_value = {
+        "auto": "MEDIA_RESOLUTION_UNSPECIFIED",
+        "high": "MEDIA_RESOLUTION_HIGH",
+        "medium": "MEDIA_RESOLUTION_MEDIUM",
+        "low": "MEDIA_RESOLUTION_LOW",
+    }
+    ui_value = media_resolution_ui.lower()
+    level = level_by_ui_value.get(ui_value, "MEDIA_RESOLUTION_UNSPECIFIED")
+    # Copy so the caller's part is not mutated
+    tagged_part = part.copy()
+    tagged_part["media_resolution"] = {"level": level}
+    return tagged_part
+def _build_generation_config(
+    provider: str,
+    model_name: str,
+    config: TranslationConfig,
+    debug: bool = False,
+) -> Dict[str, Any]:
+    """
+    Build provider-specific generation config dictionary.
+    Centralizes logic for:
+    - Base parameters (temperature, top_p, top_k)
+    - Provider-specific parameter names and constraints
+    - Reasoning model detection and token limits
+    - Special features (thinking, reasoning_effort, etc.)
+    Args:
+        provider: Provider name (Google, OpenAI, Anthropic, xAI, OpenRouter, OpenAI-Compatible)
+        model_name: Model identifier
+        config: TranslationConfig with all settings
+        debug: Whether to log debug messages
+    Returns:
+        Dictionary with generation config parameters for the specific provider
+    """
+    temperature = config.temperature
+    top_p = config.top_p
+    top_k = config.top_k
+    if config.max_tokens is not None:
+        max_tokens_value = config.max_tokens
+    else:
+        is_reasoning = False
+        if provider == "Google":
+            is_reasoning = _is_reasoning_model_google(model_name)
+        elif provider == "OpenAI":
+            is_reasoning = _is_reasoning_model_openai(model_name)
+        elif provider == "Anthropic":
+            is_reasoning = _is_reasoning_model_anthropic(model_name)
+        elif provider == "xAI":
+            is_reasoning = is_xai_reasoning_model(model_name)
+        elif provider == "OpenRouter":
+            is_reasoning = openrouter_is_reasoning_model(model_name, debug)
+        elif provider == "OpenAI-Compatible":
+            is_reasoning = is_openai_compatible_reasoning_model(model_name)
+        elif provider == "DeepSeek":
+            is_reasoning = is_deepseek_reasoning_model(model_name)
+        elif provider == "Z.ai":
+            is_reasoning = is_zai_reasoning_model(model_name)
+        max_tokens_value = 16384 if is_reasoning else 4096
+    max_tokens_cap = get_max_tokens_cap(provider, model_name)
+    if max_tokens_cap is not None and max_tokens_value > max_tokens_cap:
+        max_tokens_value = max_tokens_cap
+    if provider == "Google":
+        is_gemini_3 = "gemini-3" in model_name.lower()
+        generation_config = {
+            "temperature": temperature,
+            "topP": top_p,
+            "topK": top_k,
+            "maxOutputTokens": max_tokens_value,
+        }
+        if not is_gemini_3:
+            media_resolution_mapping = {
+                "auto": "MEDIA_RESOLUTION_UNSPECIFIED",
+                "high": "MEDIA_RESOLUTION_HIGH",
+                "medium": "MEDIA_RESOLUTION_MEDIUM",
+                "low": "MEDIA_RESOLUTION_LOW",
+            }
+            backend_media_resolution = media_resolution_mapping.get(
+                config.media_resolution.lower(), "MEDIA_RESOLUTION_UNSPECIFIED"
+            )
+            generation_config["media_resolution"] = backend_media_resolution
+        if is_gemini_3:
+            reasoning_effort = config.reasoning_effort or "high"
+            generation_config["thinkingConfig"] = {"thinkingLevel": reasoning_effort}
+            log_message(
+                f"Using reasoning effort '{reasoning_effort}' for {model_name}",
+                verbose=debug,
+            )
+        elif _is_reasoning_model_google(model_name) and not is_gemini_3:
+            reasoning_effort = config.reasoning_effort or "auto"
+            is_flash = "gemini-2.5-flash" in model_name.lower()
+            is_pro = "gemini-2.5-pro" in model_name.lower()
+            if reasoning_effort == "none":
+                if is_flash:
+                    generation_config["thinkingConfig"] = {"thinkingBudget": 0}
+                    log_message(f"Disabled reasoning for {model_name}", verbose=debug)
+                elif is_pro:
+                    generation_config["thinkingConfig"] = {"thinkingBudget": 128}
+                    log_message(
+                        f"Using 'none' reasoning effort (thinkingBudget: 128) for {model_name}",
+                        verbose=debug,
+                    )
+                else:
+                    log_message(
+                        f"Warning: 'none' not supported for {model_name}, using 'auto'",
+                        verbose=debug,
+                    )
+            elif reasoning_effort == "auto":
+                log_message(
+                    f"Using auto reasoning allocation for {model_name}", verbose=debug
+                )
+            else:
+                thinking_budget = calculate_reasoning_budget(
+                    max_tokens_value, reasoning_effort
+                )
+                generation_config["thinkingConfig"] = {
+                    "thinkingBudget": thinking_budget
+                }
+                log_message(
+                    f"Using reasoning effort '{reasoning_effort}' (budget: {thinking_budget} tokens) for {model_name}",
+                    verbose=debug,
+                )
+        return generation_config
+    elif provider == "OpenAI":
+        generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_output_tokens": max_tokens_value,
+        }  # top_k not supported by OpenAI
+        if config.reasoning_effort:
+            lm = (model_name or "").lower()
+            is_chat_variant = "chat" in lm
+            is_gpt5_1 = lm.startswith("gpt-5.1")
+            is_gpt5_2 = lm.startswith("gpt-5.2")
+            effort = config.reasoning_effort
+            if effort == "xhigh" and not is_gpt5_2:
+                effort = "high"
+            if not is_chat_variant and (is_gpt5_1 or is_gpt5_2 or effort != "none"):
+                generation_config["reasoning_effort"] = effort
+        return generation_config
+    elif provider == "Anthropic":
+        is_reasoning = _is_reasoning_model_anthropic(model_name)
+        is_opus_45 = is_opus_45_model(model_name)
+        clamped_temp = min(temperature, 1.0)  # Anthropic caps at 1.0
+        generation_config = {
+            "temperature": clamped_temp,
+            "top_p": top_p,
+            "top_k": top_k,
+            "max_tokens": max_tokens_value,
+        }
+        if is_reasoning:
+            generation_config["reasoning_effort"] = config.reasoning_effort or "none"
+        if is_opus_45 and config.effort:
+            generation_config["effort"] = config.effort
+        return generation_config
+    elif provider == "xAI":
+        is_reasoning = is_xai_reasoning_model(model_name)
+        generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens_value,
+        }
+        if is_reasoning:
+            generation_config["reasoning_effort"] = config.reasoning_effort or "high"
+        return generation_config
+    elif provider == "DeepSeek":
+        is_reasoning = is_deepseek_reasoning_model(model_name)
+        generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "max_tokens": max_tokens_value,
+        }
+        return generation_config
+    elif provider == "Z.ai":
+        is_reasoning = is_zai_reasoning_model(model_name)
+        generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "max_tokens": max_tokens_value,
+        }
+        if is_reasoning:
+            # Z.ai uses thinking parameter with {"type": "enabled"} or {"type": "disabled"}
+            # Map reasoning_effort: "high" -> enabled, "none" -> disabled
+            reasoning_effort = config.reasoning_effort or "high"
+            thinking_type = "enabled" if reasoning_effort == "high" else "disabled"
+            generation_config["thinking"] = {"type": thinking_type}
+        return generation_config
+    elif provider == "Moonshot AI":
+        # Moonshot AI is text-only, reasoning models have always-on reasoning
+        generation_config = {
+            "temperature": min(temperature, 1.0),  # Moonshot caps at 1.0
+            "top_p": top_p,
+            "max_tokens": max_tokens_value,
+        }
+        return generation_config
+    elif provider == "OpenRouter":
+        model_lower = (model_name or "").lower()
+        is_openai_model = "openai/" in model_lower or model_lower.startswith("gpt-")
+        is_anthropic_model = "anthropic/" in model_lower or model_lower.startswith(
+            "claude-"
+        )
+        is_grok_model = "grok-4" in model_lower
+        is_gemini_3 = "gemini-3" in model_lower
+        generation_config = {
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "max_tokens": max_tokens_value,
+        }
+        is_openai_reasoning = is_openai_model and (
+            "gpt-5" in model_lower
+            or "o1" in model_lower
+            or "o3" in model_lower
+            or "o4-mini" in model_lower
+        )
+        is_gpt5_1 = is_openai_model and "gpt-5.1" in model_lower
+        is_gpt5 = is_openai_model and "gpt-5" in model_lower and not is_gpt5_1
+        # For OpenRouter, Anthropic models use dots (4.5) not hyphens (4-5)
+        # Claude 3.7 Sonnet :thinking variant is reasoning-capable, non-thinking is not
+        is_claude_37_sonnet_thinking = (
+            is_anthropic_model
+            and "claude-3.7-sonnet" in model_lower
+            and ":thinking" in model_lower
+        )
+        is_anthropic_reasoning = is_anthropic_model and (
+            "claude-opus-4" in model_lower
+            or "claude-sonnet-4" in model_lower
+            or "claude-haiku-4.5" in model_lower
+            or is_claude_37_sonnet_thinking
+        )
+        # For OpenRouter, Grok models don't have "reasoning" in the name (e.g., "grok-4.1-fast")
+        is_grok_reasoning = is_grok_model and "non-reasoning" not in model_lower
+        # Add metadata flags for OpenRouter endpoint to avoid re-parsing model names
+        generation_config["_metadata"] = {
+            "is_openai_model": is_openai_model,
+            "is_anthropic_model": is_anthropic_model,
+            "is_grok_model": is_grok_model,
+            "is_gemini_3": is_gemini_3,
+            "is_google_model": "google/" in model_lower or "gemini" in model_lower,
+            "is_openai_reasoning": is_openai_reasoning,
+            "is_anthropic_reasoning": is_anthropic_reasoning,
+            "is_grok_reasoning": is_grok_reasoning,
+            "is_claude_37_sonnet_thinking": is_claude_37_sonnet_thinking,
+            "is_gpt5_1": is_gpt5_1,
+            "is_gpt5": is_gpt5,
+        }
+        if is_openai_reasoning or is_anthropic_reasoning or is_grok_reasoning:
+            if is_anthropic_reasoning:
+                reasoning_effort = config.reasoning_effort or "none"
+                generation_config["reasoning_effort"] = reasoning_effort
+            elif is_gpt5_1:
+                generation_config["reasoning_effort"] = config.reasoning_effort
+            elif config.reasoning_effort and config.reasoning_effort != "none":
+                generation_config["reasoning_effort"] = config.reasoning_effort
+        elif "gemini" in model_lower or "google/" in model_lower:
+            if config.reasoning_effort:
+                generation_config["reasoning_effort"] = config.reasoning_effort
+        return generation_config
+    elif provider == "OpenAI-Compatible":
+        return {
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "max_tokens": max_tokens_value,
+        }
+    else:
+        raise TranslationError(f"Unknown provider for generation config: {provider}")
def _call_llm_endpoint(
    config: TranslationConfig,
    parts: List[Dict[str, Any]],
    prompt_text: str,
    debug: bool = False,
    system_prompt: Optional[str] = None,
) -> Optional[str]:
    """Send one request to the endpoint function matching ``config.provider``.

    The prompt is appended to ``parts`` as a trailing ``{"text": ...}`` part
    (the caller's list is not mutated), the provider credentials are checked,
    the generation config is built, and the provider's endpoint function is
    invoked.

    Args:
        config: Supplies the provider name, model name, credentials and the
            ``enable_web_search`` flag.
        parts: Request parts that precede the prompt text.
        prompt_text: Prompt appended as the final part.
        debug: Forwarded to the generation-config builder and the endpoint.
        system_prompt: Optional system prompt forwarded to the endpoint.

    Returns:
        Whatever the selected endpoint function returns.

    Raises:
        TranslationError: If the provider is unknown, its API key is empty,
            or (for "OpenAI-Compatible") the base URL is empty.
    """
    provider = config.provider
    model_name = config.model_name
    request_parts = parts + [{"text": prompt_text}]

    # provider -> (config attribute holding the API key, message raised when
    # that key is empty, endpoint function, whether enable_web_search is passed)
    keyed_providers = {
        "Google": (
            "google_api_key",
            "Google API key is missing.",
            call_gemini_endpoint,
            True,
        ),
        "OpenAI": (
            "openai_api_key",
            "OpenAI API key is missing.",
            call_openai_endpoint,
            True,
        ),
        "Anthropic": (
            "anthropic_api_key",
            "Anthropic API key is missing.",
            call_anthropic_endpoint,
            True,
        ),
        "xAI": (
            "xai_api_key",
            "xAI API key is missing.",
            call_xai_endpoint,
            True,
        ),
        "DeepSeek": (
            "deepseek_api_key",
            "DeepSeek API key is missing.",
            call_deepseek_endpoint,
            False,
        ),
        "Z.ai": (
            "zai_api_key",
            "Z.ai API key is missing.",
            call_zai_endpoint,
            True,
        ),
        "Moonshot AI": (
            "moonshot_api_key",
            "Moonshot API key is missing.",
            call_moonshot_endpoint,
            True,
        ),
        "OpenRouter": (
            "openrouter_api_key",
            "OpenRouter API key is missing.",
            call_openrouter_endpoint,
            True,
        ),
    }

    if provider in keyed_providers:
        key_attr, missing_key_message, endpoint, forwards_web_search = (
            keyed_providers[provider]
        )
        api_key = getattr(config, key_attr)
        if not api_key:
            raise TranslationError(missing_key_message)
        call_kwargs = {
            "api_key": api_key,
            "model_name": model_name,
            "parts": request_parts,
            "generation_config": _build_generation_config(
                provider, model_name, config, debug
            ),
            "system_prompt": system_prompt,
            "debug": debug,
        }
        if forwards_web_search:
            call_kwargs["enable_web_search"] = config.enable_web_search
        return endpoint(**call_kwargs)

    if provider == "OpenAI-Compatible":
        # Only the URL is mandatory here; the API key may be empty.
        base_url = config.openai_compatible_url
        api_key = config.openai_compatible_api_key
        if not base_url:
            raise TranslationError("OpenAI-Compatible URL is missing.")
        generation_config = _build_generation_config(
            provider, model_name, config, debug
        )
        return call_openai_compatible_endpoint(
            base_url=base_url,
            api_key=api_key,
            model_name=model_name,
            parts=request_parts,
            generation_config=generation_config,
            system_prompt=system_prompt,
            debug=debug,
        )

    raise TranslationError(f"Unknown translation provider specified: {provider}")
def _parse_llm_response_unified(
    response_text: Optional[str],
    total_elements: int,
    provider: str,
    debug: bool = False,
) -> List[str]:
    """Parse an LLM response formatted as a single numbered list.

    Items are recognised as a line starting with ``N:`` or ``N.`` followed by
    the item text; one pair of surrounding double quotes is stripped. Numbers
    outside ``1..total_elements`` are ignored, and when a number appears more
    than once the last occurrence wins.

    Args:
        response_text: Raw text returned by the endpoint.
        total_elements: Number of items the caller expects.
        provider: Label used in log messages and placeholder strings.
        debug: Enables the verbose log lines.

    Returns:
        A list of exactly ``total_elements`` strings. Items absent from the
        response become ``"[{provider}: Missing item N]"``; if parsing itself
        raises, every entry is ``"[{provider}: Parse error]"``.

    Raises:
        TranslationError: If ``response_text`` is ``None`` or ``""``.
    """
    if response_text is None:
        log_message(f"API call failed: {provider} returned None", always_print=True)
        raise TranslationError(f"{provider}: API failed (returned None)")
    elif response_text == "":
        log_message(f"API call returned empty response: {provider}", always_print=True)
        raise TranslationError(f"{provider}: Empty response")
    try:
        log_message(
            f"Parsing {provider} unified response: {len(response_text)} chars",
            verbose=debug,
        )
        log_message(f"Raw response:\n---\n{response_text}\n---", always_print=True)
        # Matches "1: text" or "1. text" (a ':' or '.' separator is required).
        # The first alternative handles an item with no text at all: without
        # it, the whitespace after the separator would swallow the newline and
        # the *next* numbered line would be captured as this item's text,
        # shifting every following item out of place.
        pattern = re.compile(
            r"^\s*(\d+)\s*[:.]"
            r"(?:"
            r"(?=\s*\n\s*\d+\s*[:.])"
            r'|\s*"?\s*(.*?)\s*"?\s*(?=\s*\n\s*\d+\s*[:.]|\s*$)'
            r")",
            re.MULTILINE | re.DOTALL,
        )
        result_dict = {}
        # findall yields "" for the text group when the empty-item alternative
        # matched. int() accepts every string \d+ can match, so no guard is
        # needed around the conversion.
        for num_str, text in pattern.findall(response_text):
            num = int(num_str)
            if 1 <= num <= total_elements:
                result_dict[num] = text.strip()
        final_list = [
            result_dict.get(i, f"[{provider}: Missing item {i}]")
            for i in range(1, total_elements + 1)
        ]
        log_message(
            f"Parsed {len(result_dict)} items from unified response (expected {total_elements})",
            verbose=debug,
        )
        return final_list
    except Exception as e:
        # Best effort: a parsing failure degrades to placeholders instead of
        # aborting the whole page.
        log_message(
            f"Failed to parse {provider} unified response: {str(e)}",
            always_print=True,
        )
        return [f"[{provider}: Parse error]"] * total_elements
def _prepare_images_for_ocr(
    images_b64: List[str], verbose: bool = False
) -> List[Optional[Image.Image]]:
    """Decode base64 image strings into RGB PIL images.

    Args:
        images_b64: Base64-encoded image payloads.
        verbose: Accepted for signature compatibility; not read here.

    Returns:
        One entry per input, in order: an RGB-mode PIL image, or ``None``
        when that input could not be decoded or opened (the failure is
        logged).
    """

    def _decode(encoded: str) -> Optional[Image.Image]:
        try:
            picture = Image.open(BytesIO(base64.b64decode(encoded)))
            return picture if picture.mode == "RGB" else picture.convert("RGB")
        except Exception as exc:
            # One bad image must not abort the batch; keep its slot as None.
            log_message(
                f"Failed to decode image for manga-ocr: {exc}",
                always_print=True,
            )
            return None

    return [_decode(encoded) for encoded in images_b64]
def _format_ocr_results(
    extracted_texts: List[str],
    bubble_metadata: List[Dict[str, Any]],
) -> None:
    """Log the OCR output as a numbered list tagged by element type.

    Each line reads ``N: [OSB] text`` when the element's metadata has a truthy
    ``is_outside_text`` flag, and ``N: [Bubble] text`` otherwise. Texts with
    no corresponding metadata entry are tagged ``[Bubble]``. Nothing is
    logged when there are no texts.

    Args:
        extracted_texts: OCR text per element.
        bubble_metadata: Per-element metadata dicts; only ``is_outside_text``
            is read.
    """

    def _tag(position: int) -> str:
        meta = bubble_metadata[position] if position < len(bubble_metadata) else {}
        return "[OSB]" if meta.get("is_outside_text", False) else "[Bubble]"

    numbered = [
        f"{position + 1}: {_tag(position)} {text}"
        for position, text in enumerate(extracted_texts)
    ]
    if numbered:
        listing = "\n".join(numbered)
        log_message(
            f"Raw OCR output:\n---\n{listing}\n---",
            always_print=True,
        )
+def _check_ocr_failure(texts: List[str], provider: Optional[str] = None) -> bool:
+    """Check if all OCR results indicate failure.
+    Args:
+        texts: List of extracted text strings
+        provider: Optional provider name for LLM OCR failure detection
+    Returns:
+        True if all texts indicate failure, False otherwise
+    """
+    if not texts:
+        return True
+    if provider:
+        for text in texts:
+            if f"[{provider}-OCR:" not in text:
+                return False
+        return True
+    else:
+        return all(text == "[OCR FAILED]" for text in texts)
def _format_special_instructions(config: TranslationConfig) -> str:
    """Render the user's special instructions as a prompt section.

    Args:
        config: Object whose ``special_instructions`` attribute is read.

    Returns:
        A ``## SPECIAL INSTRUCTIONS`` section containing the stripped
        instructions, wrapped in newlines, or ``""`` when the instructions
        are unset, empty or whitespace-only.
    """
    instructions = config.special_instructions
    if not instructions:
        return ""
    trimmed = instructions.strip()
    if not trimmed:
        return ""
    return f"\n## SPECIAL INSTRUCTIONS\n{trimmed}\n"
def _perform_manga_ocr(
    images_b64: List[str],
    bubble_metadata: List[Dict[str, Any]],
    debug: bool = False,
) -> List[str]:
    """Run manga-ocr over the element images, with result caching.

    A cached result is returned only if its length matches the number of
    images. Otherwise the images are decoded and passed to manga-ocr, empty
    results are replaced by ``"[OCR FAILED]"``, the list is padded or
    truncated to one entry per image, and the final list is written to the
    cache (including when every entry is a failure).

    Args:
        images_b64: Base64-encoded element images.
        bubble_metadata: Per-element metadata, used only for logging.
        debug: Enables verbose log lines.

    Returns:
        One string per input image; failed elements are ``"[OCR FAILED]"``.
    """
    expected_count = len(images_b64)
    log_message("Using manga-ocr for text extraction", verbose=debug)

    ocr_cache = get_cache()
    key = ocr_cache.get_manga_ocr_cache_key(images_b64, expected_count)
    cached = ocr_cache.get_manga_ocr_result(key)
    if cached is not None:
        if len(cached) == expected_count:
            log_message("Using cached manga-ocr results", verbose=debug)
            return cached
        log_message("Discarding manga-ocr cache due to length mismatch", verbose=debug)

    decoded_images = _prepare_images_for_ocr(images_b64, verbose=debug)
    raw_texts = extract_text_with_manga_ocr(decoded_images, verbose=debug)
    # Any empty result is reported with the shared failure placeholder.
    results = [text or "[OCR FAILED]" for text in raw_texts]
    _format_ocr_results(results, bubble_metadata)

    if len(results) != expected_count:
        log_message(
            f"Warning: extracted_texts length ({len(results)}) "
            f"doesn't match total_elements ({expected_count})",
            always_print=True,
        )
        # Pad short output with failures, then cut any surplus.
        shortfall = expected_count - len(results)
        results = (results + ["[OCR FAILED]"] * shortfall)[:expected_count]

    if not results:
        log_message("manga-ocr returned empty results", verbose=debug)
        results = ["[OCR FAILED]"] * expected_count
    elif _check_ocr_failure(results):
        log_message("manga-ocr returned only failures", verbose=debug)

    ocr_cache.set_manga_ocr_result(key, results, debug)
    return results
def _perform_llm_ocr(
    config: TranslationConfig,
    images_b64: List[str],
    mime_types: List[str],
    ocr_prompt: str,
    is_gemini_3: bool,
    provider: str,
    input_language: Optional[str],
    reading_direction: str,
    debug: bool = False,
) -> List[str]:
    """Transcribe the element images with a vision LLM.

    Every image becomes an ``inline_data`` part (with a media-resolution
    setting applied when ``is_gemini_3`` is true), the parts are sent together
    with ``ocr_prompt`` and an OCR system prompt, and the reply is parsed as a
    numbered list under the label ``"{provider}-OCR"``. No exception is
    caught here, so errors from the endpoint call or the parser propagate.

    Args:
        config: Translation configuration passed to the endpoint call.
        images_b64: Base64-encoded element images.
        mime_types: MIME type per image; ``"image/jpeg"`` is used for any
            image beyond the end of this list.
        ocr_prompt: User prompt for the OCR request.
        is_gemini_3: Whether to attach the bubble media-resolution setting.
        provider: Provider name used in labels and placeholders.
        input_language: Passed to the OCR system-prompt builder.
        reading_direction: Passed to the OCR system-prompt builder.
        debug: Enables verbose log lines.

    Returns:
        One string per input image, as produced by the response parser.
    """
    expected_count = len(images_b64)

    def _as_part(position: int, encoded: str) -> Dict[str, Any]:
        mime = mime_types[position] if position < len(mime_types) else "image/jpeg"
        part = {"inline_data": {"mime_type": mime, "data": encoded}}
        if is_gemini_3:
            part = _add_media_resolution_to_part(
                part, config.media_resolution_bubbles, is_gemini_3
            )
        return part

    image_parts = [
        _as_part(position, encoded) for position, encoded in enumerate(images_b64)
    ]
    system_prompt = _build_system_prompt_ocr(input_language, reading_direction)
    response_text = _call_llm_endpoint(
        config,
        image_parts,
        ocr_prompt,
        debug,
        system_prompt=system_prompt,
    )
    texts = _parse_llm_response_unified(
        response_text,
        expected_count,
        provider + "-OCR",
        debug,
    )

    # Defensive guard retained from the original implementation.
    if texts is None:
        log_message("OCR API call failed", always_print=True)
        return [f"[{provider}: OCR failed]"] * expected_count

    if _check_ocr_failure(texts, provider):
        log_message("OCR returned only placeholders", verbose=debug)
    return texts
+def call_translation_api_batch(
+    config: TranslationConfig,
+    images_b64: List[str],
+    full_image_b64: str,
+    mime_types: List[str],
+    full_image_mime_type: str,
+    bubble_metadata: List[Dict[str, Any]],
+    debug: bool = False,
+) -> List[str]:
+    """
+    Generates prompts and calls the appropriate LLM API endpoint based on the provider and mode
+    specified in the configuration, translating text from speech bubbles and outside-bubble text.
+    Supports "one-step" (OCR+Translate+Style) and "two-step" (OCR then Translate+Style) modes.
+    Args:
+        config (TranslationConfig): Configuration object.
+        images_b64 (list): List of base64 encoded images of all text elements, in reading order.
+        full_image_b64 (str): Base64 encoded image of the full manga page.
+        mime_types (List[str]): List of MIME types for each text element image.
+        full_image_mime_type (str): MIME type of the full page image.
+        bubble_metadata (List[Dict]): List of metadata dicts with 'is_outside_text' flags for each image.
+        debug (bool): Whether to print debugging information.
+    Returns:
+        list: List of translated strings (potentially with style markers), one for each input text element.
+              Returns placeholder messages on errors or empty responses.
+    Raises:
+        ValueError: If required config (API key, provider, URL) is missing or invalid.
+        RuntimeError: If an API call fails irrecoverably after retries (raised by endpoint functions).
+    """
+    provider = config.provider
+    input_language = config.input_language
+    output_language = config.output_language
+    reading_direction = config.reading_direction
+    translation_mode = config.translation_mode
+    # Include conditional bubble hints
+    total_elements = len(images_b64)
+    dialogue_indices = [
+        i + 1
+        for i, meta in enumerate(bubble_metadata)
+        if not meta.get("is_outside_text", False)
+    ]
+    osb_indices = [
+        i + 1
+        for i, meta in enumerate(bubble_metadata)
+        if meta.get("is_outside_text", False)
+    ]
+    hints = []
+    if dialogue_indices:
+        dialogue_list_str = ", ".join(map(str, dialogue_indices))
+        hints.append(f"Items [{dialogue_list_str}] contain spoken dialogue.")
+    if osb_indices:
+        osb_list_str = ", ".join(map(str, osb_indices))
+        hints.append(
+            f"Items [{osb_list_str}] contain sound effects, mimetic effects, narration, or internal monologues."
+        )
+    context_hints = ""
+    if hints:
+        context_hints = "\nNote: " + " ".join(hints) + " Translate them accordingly."
+    reading_order_desc = (
+        "right-to-left, top-to-bottom"
+        if reading_direction == "rtl"
+        else "left-to-right, top-to-bottom"
+    )
+    cache = get_cache()
+    cache_key = cache.get_translation_cache_key(images_b64, full_image_b64, config)
+    cached_translation = cache.get_translation(cache_key)
+    if cached_translation is not None:
+        log_message("  - Using cached translation", verbose=debug)
+        return cached_translation
+    model_name = config.model_name
+    is_gemini_3 = provider == "Google" and "gemini-3" in model_name.lower()
+    base_parts = []
+    for i, img_b64 in enumerate(images_b64):
+        mime_type = mime_types[i] if i < len(mime_types) else "image/jpeg"
+        bubble_part = {"inline_data": {"mime_type": mime_type, "data": img_b64}}
+        if is_gemini_3:
+            bubble_part = _add_media_resolution_to_part(
+                bubble_part, config.media_resolution_bubbles, is_gemini_3
+            )
+        base_parts.append(bubble_part)
+    if config.send_full_page_context and full_image_b64:
+        context_part = {
+            "inline_data": {
+                "mime_type": full_image_mime_type,
+                "data": full_image_b64,
+            }
+        }
+        if is_gemini_3:
+            context_part = _add_media_resolution_to_part(
+                context_part, config.media_resolution_context, is_gemini_3
+            )
+        base_parts.append(context_part)
+    try:
+        if translation_mode == "two-step":
+            special_instructions_section = _format_special_instructions(config)
+            ocr_prompt = f"""
+## CONTEXT
+You have been provided with {total_elements} individual text images from a manga page. They are presented in their natural reading order ({reading_order_desc}).
+## TASK
+Apply your OCR transcription rules to each image provided.{special_instructions_section}
+"""  # noqa
+            log_message("Starting OCR step", verbose=debug)
+            if config.ocr_method == "manga-ocr":
+                extracted_texts = _perform_manga_ocr(
+                    images_b64,
+                    bubble_metadata,
+                    debug,
+                )
+            else:
+                extracted_texts = _perform_llm_ocr(
+                    config,
+                    images_b64,
+                    mime_types,
+                    ocr_prompt,
+                    is_gemini_3,
+                    provider,
+                    input_language,
+                    reading_direction,
+                    debug,
+                )
+            log_message("Starting translation step", verbose=debug)
+            formatted_texts = []
+            ocr_failed_indices = set()
+            for i, text in enumerate(extracted_texts):
+                if f"[{provider}-OCR:" in text or text == "[OCR FAILED]":
+                    formatted_texts.append("[OCR FAILED]")
+                    ocr_failed_indices.add(i)
+                else:
+                    formatted_texts.append(text)
+            ocr_input_section = """
+## INPUT DATA
+"""
+            for i, text in enumerate(formatted_texts):
+                ocr_input_section += f"{i + 1}: {text}\n"
+            full_page_context = (
+                "A full-page image is also provided for visual and narrative context."
+                if (
+                    config.ocr_method != "manga-ocr"
+                    and config.send_full_page_context
+                    and full_image_b64
+                )
+                else ""
+            )
+            special_instructions_section = _format_special_instructions(config)
+            translation_prompt = f"""
+## CONTEXT
+You have been provided with a list of {total_elements} transcribed text segments from a manga page. {full_page_context}
+{context_hints}
+{ocr_input_section}
+## TASK
+Apply your translation and styling rules to the text in the `## INPUT DATA` section.
+The target language is {output_language}. Use the appropriate translation approach for each text type.{special_instructions_section}
+"""  # noqa
+            translation_parts = []
+            if (
+                config.ocr_method != "manga-ocr"
+                and config.send_full_page_context
+                and full_image_b64
+            ):
+                context_part = {
+                    "inline_data": {
+                        "mime_type": full_image_mime_type,
+                        "data": full_image_b64,
+                    }
+                }
+                if is_gemini_3:
+                    context_part = _add_media_resolution_to_part(
+                        context_part, config.media_resolution_context, is_gemini_3
+                    )
+                translation_parts.append(context_part)
+            translation_system = _build_system_prompt_translation(
+                output_language,
+                mode="two-step",
+                reading_direction=reading_direction,
+                full_page_context=(
+                    config.send_full_page_context and bool(full_image_b64)
+                ),
+            )
+            translation_response_text = _call_llm_endpoint(
+                config,
+                translation_parts,
+                translation_prompt,
+                debug,
+                system_prompt=translation_system,
+            )
+            final_translations = _parse_llm_response_unified(
+                translation_response_text,
+                total_elements,
+                provider + "-Translate",
+                debug,
+            )
+            if final_translations is None:
+                log_message("Translation API call failed", always_print=True)
+                combined_results = []
+                for i in range(total_elements):
+                    if i in ocr_failed_indices:
+                        combined_results.append(f"[{provider}: OCR Failed]")
+                    else:
+                        combined_results.append(f"[{provider}: Translation failed]")
+                return combined_results
+            combined_results = []
+            for i in range(total_elements):
+                if i in ocr_failed_indices:
+                    if final_translations[i] == "[OCR FAILED]":
+                        combined_results.append("[OCR FAILED]")
+                    else:
+                        log_message(
+                            f"Element {i + 1}: LLM ignored OCR failure instruction",
+                            verbose=debug,
+                        )
+                        combined_results.append("[OCR FAILED]")
+                else:
+                    combined_results.append(final_translations[i])
+            cache.set_translation(cache_key, combined_results)
+            return combined_results
+        elif translation_mode == "one-step":
+            log_message("Starting one-step translation", verbose=debug)
+            full_page_context = (
+                "A full-page image is also provided for visual and narrative context."
+                if config.send_full_page_context
+                else ""
+            )
+            special_instructions_section = _format_special_instructions(config)
+            one_step_prompt = f"""
+## CONTEXT
+You have been provided with {total_elements} individual text images from a manga page. {full_page_context}
+{context_hints}
+## TASK
+For each image, you must perform two steps:
+1.  **Transcribe:** Extract the original text exactly as it appears.
+2.  **Translate:** Translate the text you just transcribed into {output_language}, applying your translation and styling rules.{special_instructions_section}
+## OUTPUT FORMAT
+You must return your response as a single numbered list with exactly one line per input image.
+The numbering must correspond to the input image order (1, 2, 3...).
+Format: `i: <transcribed text> || <translated {output_language} text>`
+"""  # noqa
+            one_step_system = _build_system_prompt_translation(
+                output_language,
+                mode="one-step",
+                reading_direction=reading_direction,
+                full_page_context=(
+                    config.send_full_page_context and bool(full_image_b64)
+                ),
+            )
+            response_text = _call_llm_endpoint(
+                config,
+                base_parts,
+                one_step_prompt,
+                debug,
+                system_prompt=one_step_system,
+            )
+            # Parse one-step format ("Original || Translated")
+            raw_lines = _parse_llm_response_unified(
+                response_text, total_elements, provider, debug
+            )
+            translations = []
+            for line in raw_lines:
+                if "||" in line:
+                    parts = line.split("||", 1)
+                    translations.append(parts[1].strip())
+                else:
+                    translations.append(line)
+            cache.set_translation(cache_key, translations)
+            return translations
+        else:
+            raise TranslationError(
+                f"Unknown translation_mode specified in config: {translation_mode}"
+            )
+    except TranslationError:
+        raise
+    except (ValueError, RuntimeError) as e:
+        log_message(f"Translation error: {e}", always_print=True)
+        return [f"[Translation Error: {e}]"] * total_elements
+def prepare_bubble_images_for_translation(
+    bubble_data: List[Dict[str, Any]],
+    original_cv_image: np.ndarray,
+    upscale_model: Any,
+    device: Any,
+    mime_type: str,
+    bubble_min_side_pixels: int,
+    upscale_method: str = "model_lite",
+    verbose: bool = False,
+) -> List[Dict[str, Any]]:
+    """
+    Prepare bubble images for translation by cropping, upscaling, color matching, and encoding.
+    This function processes each speech bubble to prepare it for the translation API:
+    1. Crops the bubble from the original image
+    2. Upscales the bubble to meet minimum size requirements (based on upscale_method)
+    3. Matches colors to preserve visual consistency (only for model upscaling)
+    4. Encodes the processed bubble as base64 for API transmission
+    Args:
+        bubble_data: List of bubble detection dicts with 'bbox' keys
+        original_cv_image: OpenCV image array of the original image
+        upscale_model: Loaded upscaling model
+        device: PyTorch device for model inference
+        mime_type: MIME type for image encoding
+        upscale_method: Method for upscaling - "model", "lanczos", or "none"
+        verbose: Whether to print detailed logging
+    Returns:
+        List of bubble dicts with added 'image_b64' and 'mime_type' keys
+        (immutable approach - returns new list without mutating input)
+    """
+    cv2_ext = ".png" if mime_type == "image/png" else ".jpg"
+    prepared_bubbles = []
+    if upscale_method == "model":
+        log_message(
+            f"Upscaling {len(bubble_data)} bubble images with 2x-AnimeSharpV4_RCAN",
+            always_print=True,
+        )
+    elif upscale_method == "model_lite":
+        log_message(
+            f"Upscaling {len(bubble_data)} bubble images with 2x-AnimeSharpV4_Fast_RCAN_PU (Lite)",
+            always_print=True,
+        )
+    elif upscale_method == "lanczos":
+        log_message(
+            f"Upscaling {len(bubble_data)} bubble images with LANCZOS",
+            always_print=True,
+        )
+    else:  # upscale_method == "none"
+        log_message(
+            f"Processing {len(bubble_data)} bubble images without upscaling",
+            always_print=True,
+        )
+    for bubble in bubble_data:
+        prepared_bubble = bubble.copy()
+        x1, y1, x2, y2 = bubble["bbox"]
+        bubble_image_cv = original_cv_image[y1:y2, x1:x2].copy()
+        bubble_image_pil = cv2_to_pil(bubble_image_cv)
+        if upscale_method == "model" or upscale_method == "model_lite":
+            final_bubble_pil = process_bubble_image_cached(
+                bubble_image_pil,
+                upscale_model,
+                device,
+                bubble_min_side_pixels,
+                "min",
+                upscale_method,
+                verbose,
+            )
+        elif upscale_method == "lanczos":
+            w, h = bubble_image_pil.size
+            min_side = min(w, h)
+            if min_side < bubble_min_side_pixels:
+                scale_factor = bubble_min_side_pixels / min_side
+                new_w = int(w * scale_factor)
+                new_h = int(h * scale_factor)
+                resized_bubble = bubble_image_pil.resize((new_w, new_h), Image.LANCZOS)
+            else:
+                resized_bubble = bubble_image_pil
+            final_bubble_pil = resized_bubble
+        else:  # upscale_method == "none"
+            final_bubble_pil = bubble_image_pil
+        final_bubble_cv = pil_to_cv2(final_bubble_pil)
+        try:
+            is_success, buffer = cv2.imencode(cv2_ext, final_bubble_cv)
+            if is_success:
+                image_b64 = base64.b64encode(buffer).decode("utf-8")
+                prepared_bubble["image_b64"] = image_b64
+                prepared_bubble["mime_type"] = mime_type
+                log_message(
+                    f"Bubble {x1},{y1} ({final_bubble_pil.size[0]}x{final_bubble_pil.size[1]})",
+                    verbose=verbose,
+                )
+            else:
+                log_message(
+                    f"Failed to encode bubble {bubble['bbox']}", verbose=verbose
+                )
+                prepared_bubble["image_b64"] = None
+        except Exception as e:
+            log_message(f"Error encoding bubble {bubble['bbox']}: {e}", verbose=verbose)
+            prepared_bubble["image_b64"] = None
+        prepared_bubbles.append(prepared_bubble)
+    return prepared_bubbles

core/text/__init__.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""
+Text processing and rendering modules for MangaTranslator.
+This subpackage contains modules for:
+- Text processing and tokenization
+- Font management and loading
+- Layout engine for optimal text placement
+- Drawing engine using Skia
+- High-level text rendering orchestration
+"""
+from .drawing_engine import (
+    draw_layout,
+    load_font_resources,
+    pil_to_skia_surface,
+    skia_surface_to_pil,
+)
+from .font_manager import (
+    LRUCache,
+    find_font_variants,
+    get_font_features,
+    load_font_data,
+)
+from .layout_engine import find_optimal_layout, shape_line
+from .text_processing import (
+    find_optimal_breaks_dp,
+    parse_styled_segments,
+    tokenize_styled_text,
+    try_hyphenate_word,
+)
+from .text_renderer import render_text_skia
+# Public names re-exported by this package, grouped by the submodule they
+# are imported from.
+__all__ = [
+    # .drawing_engine
+    "draw_layout",
+    "load_font_resources",
+    "pil_to_skia_surface",
+    "skia_surface_to_pil",
+    # .font_manager
+    "find_font_variants",
+    "get_font_features",
+    "LRUCache",
+    "load_font_data",
+    # .layout_engine
+    "find_optimal_layout",
+    "shape_line",
+    # .text_processing
+    "find_optimal_breaks_dp",
+    "parse_styled_segments",
+    "tokenize_styled_text",
+    "try_hyphenate_word",
+    # .text_renderer
+    "render_text_skia",
+]

core/text/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (1.37 kB). View file