# TransateKRtoEN.py # -*- coding: utf-8 -*- import json import logging import shutil import threading import queue import uuid import inspect import os, sys, io, zipfile, time, re, mimetypes, subprocess, tiktoken import builtins import ebooklib from ebooklib import epub from bs4 import BeautifulSoup, NavigableString try: from bs4 import XMLParsedAsHTMLWarning import warnings # Suppress the warning since we handle both HTML and XHTML content warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) except ImportError: # Older versions of BeautifulSoup might not have this warning pass from collections import Counter from unified_api_client import UnifiedClient, UnifiedClientError # Translation thread submission throttling (batch) to align queued logs with actual delay _translation_thread_submit_lock = threading.Lock() _translation_last_thread_submit = 0.0 import hashlib import tempfile import unicodedata from difflib import SequenceMatcher import unicodedata import re import time from history_manager import HistoryManager from chapter_splitter import ChapterSplitter from image_translator import ImageTranslator from typing import Dict, List, Tuple from txt_processor import TextFileProcessor from ai_hunter_enhanced import ImprovedAIHunterDetection import GlossaryManager # Module with glossary functions import csv from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed # Module-level functions for ProcessPoolExecutor compatibility from tqdm import tqdm class ProgressBar: """Simple in-place progress bar for terminal output""" _last_line_length = 0 @classmethod def update(cls, current, total, prefix="Progress", bar_length=30): """Update progress bar in-place Args: current: Current progress value total: Total value for 100% completion prefix: Text to show before the bar bar_length: Length of the progress bar in characters """ if total == 0: return percent = min(100, int(100 * current / total)) filled = int(bar_length * current / total) bar = '█' * filled + '░' * (bar_length - filled) # Build the line line = f"\r{prefix}: [{bar}] {current}/{total} ({percent}%)" # Pad with spaces to clear previous line if it was longer if len(line) < cls._last_line_length: line += ' ' * (cls._last_line_length - len(line)) cls._last_line_length = len(line) # Print without newline print(line, end='', flush=True) @classmethod def finish(cls): """Finish progress bar and move to next line""" print() # Move to next line cls._last_line_length = 0 def is_traditional_translation_api(model: str) -> bool: """Check if the model is a traditional translation API""" return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/') def get_chapter_terminology(is_text_file, chapter_data=None): """Get appropriate terminology (Chapter/Section) based on source type""" if is_text_file: return "Section" if chapter_data: if chapter_data.get('filename', '').endswith('.txt') or chapter_data.get('is_chunk', False): return "Section" return "Chapter" def extract_text_from_raw_content(raw_obj) -> str: """ Safely extract human-readable text from a Gemini raw_content_object. Skips reasoning-only parts (thought=True) but preserves normal text. """ try: parts = [] if hasattr(raw_obj, 'parts'): parts = raw_obj.parts or [] elif isinstance(raw_obj, dict): parts = raw_obj.get('parts', []) or [] texts = [] for p in parts: is_thought = False text_val = None if hasattr(p, 'thought'): is_thought = bool(getattr(p, 'thought', False)) elif isinstance(p, dict): is_thought = bool(p.get('thought', False)) if hasattr(p, 'text'): text_val = getattr(p, 'text', None) elif isinstance(p, dict): text_val = p.get('text') if text_val and not is_thought: texts.append(str(text_val)) return "\n".join(texts).strip() except Exception: return "" def build_gemini_model_message(content: str = "", raw_obj=None) -> dict: """ Build a Gemini 3-compatible assistant-role message with parts: - text part (when available) - thought_signature part (when available) Using assistant keeps roles valid while preserving parts for Gemini 3. """ import base64 parts = [] # Prefer text from raw_obj parts if present; else use provided content text_added = False if raw_obj: candidate_parts = [] if hasattr(raw_obj, "parts"): candidate_parts = raw_obj.parts or [] elif isinstance(raw_obj, dict): candidate_parts = raw_obj.get("parts", []) or [] for p in candidate_parts: if hasattr(p, "text") and getattr(p, "text", None): parts.append({"text": str(getattr(p, "text"))}) text_added = True elif isinstance(p, dict) and p.get("text"): parts.append({"text": str(p.get("text"))}) text_added = True if content and not text_added: parts.append({"text": str(content)}) # Find thought signature (snake or camel case, bytes or dict) sig_bytes = None if raw_obj: def _extract_sig_from_part(part): ts = None if hasattr(part, "thought_signature"): ts = getattr(part, "thought_signature", None) elif hasattr(part, "thoughtSignature"): ts = getattr(part, "thoughtSignature", None) elif isinstance(part, dict): ts = part.get("thought_signature") or part.get("thoughtSignature") return ts # Check top-level then parts top_ts = None if isinstance(raw_obj, dict): top_ts = raw_obj.get("thought_signature") or raw_obj.get("thoughtSignature") if hasattr(raw_obj, "thought_signature"): top_ts = getattr(raw_obj, "thought_signature", None) if hasattr(raw_obj, "thoughtSignature"): top_ts = getattr(raw_obj, "thoughtSignature", None) if top_ts is not None: sig_bytes = top_ts else: cand_parts = [] if hasattr(raw_obj, "parts"): cand_parts = raw_obj.parts or [] elif isinstance(raw_obj, dict): cand_parts = raw_obj.get("parts", []) or [] for p in cand_parts: ts = _extract_sig_from_part(p) if ts is not None: sig_bytes = ts break if sig_bytes is not None: if isinstance(sig_bytes, dict) and sig_bytes.get("_type") == "bytes" and sig_bytes.get("data"): data_b64 = sig_bytes.get("data") elif isinstance(sig_bytes, (bytes, bytearray)): data_b64 = base64.b64encode(sig_bytes).decode("utf-8") else: # If provided as string (already b64) keep as-is data_b64 = str(sig_bytes) parts.append({"thought_signature": {"_type": "bytes", "data": data_b64}}) # Fallback to text-only part if nothing found if not parts and content: parts.append({"text": str(content)}) return {"role": "assistant", "parts": parts} if parts else {"role": "assistant", "parts": []} def _merge_split_paragraphs(html_body: str) -> str: """Merge paragraphs that were artificially split across PDF pages. PDFs are extracted page-by-page, which can split paragraphs mid-sentence. This function merges consecutive justified paragraphs that don't end with sentence-ending punctuation, creating more natural paragraph breaks. Only affects PDFs, not EPUBs. """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_body, 'html.parser') # Find all

tags paragraphs = soup.find_all('p') if len(paragraphs) < 2: return html_body # Nothing to merge # Process paragraphs and merge when appropriate i = 0 while i < len(paragraphs) - 1: current_p = paragraphs[i] next_p = paragraphs[i + 1] # Skip if either is None or not a tag if not current_p or not next_p: i += 1 continue # Get paragraph classes - only merge justified paragraphs current_class = current_p.get('class', []) next_class = next_p.get('class', []) current_is_justified = 'align-justify' in current_class if current_class else False next_is_justified = 'align-justify' in next_class if next_class else False # Only merge if both are justified (regular body text) if not (current_is_justified and next_is_justified): i += 1 continue # Get text content of current paragraph current_text = current_p.get_text().strip() # Check if current paragraph ends with sentence-ending punctuation ends_with_sentence = bool(re.search(r'[.!?]\s*$', current_text)) # Check if next paragraph looks like continuation (doesn't start with capital) next_text = next_p.get_text().strip() starts_with_capital = bool(re.match(r'^[A-Z"\(]', next_text)) if next_text else False # Merge if: # - Current doesn't end with sentence punctuation, OR # - Current ends with sentence but next doesn't start with capital (likely continuation) should_merge = not ends_with_sentence or (ends_with_sentence and not starts_with_capital) if should_merge: # Merge next paragraph's content into current # Add a space between them current_p.append(' ') for content in list(next_p.contents): try: current_p.append(content.extract()) except Exception: current_p.append(content) # Remove the next paragraph next_p.decompose() # Update list and continue without increment to consider further merges paragraphs = soup.find_all('p') continue else: # Can't merge, move to next pair i += 1 # Use decode() instead of str() to preserve original formatting and attributes return soup.decode(formatter='minimal') def _merge_image_only_pages(html_body: str) -> str: """Merge image-only extracted PDF page containers into the previous container. Motivation: When PDFs are extracted page-by-page, some pages contain only a single image. Keeping them as a standalone container often produces large wasted whitespace in the final PDF/HTML output. This pass moves the image(s) into the previous page container. We treat a container as "image-only" if: - it contains at least one - its visible text (after stripping whitespace/nbsp) is empty This is a best-effort layout hint; the renderer may still paginate based on available space. """ try: from bs4 import BeautifulSoup import re as _re soup = BeautifulSoup(html_body, 'html.parser') # Common page wrapper IDs produced by our pipeline / MuPDF id_pat = _re.compile(r'^(?:mupdf-page0-\d+|page\d+|page0)$') def _is_image_only(div) -> bool: if not div: return False imgs = div.find_all('img') if not imgs: return False txt = (div.get_text(' ', strip=True) or '').replace('\xa0', '').strip() return txt == '' changed = True while changed: changed = False divs = soup.find_all('div', id=id_pat) for idx in range(1, len(divs)): div = divs[idx] if not _is_image_only(div): continue prev = div.find_previous('div', id=id_pat) if not prev: continue # Move children into previous container for child in list(div.contents): try: prev.append(child.extract()) except Exception: prev.append(child) div.decompose() changed = True break # restart scan since tree changed return soup.decode(formatter='minimal') except Exception: return html_body def _keep_text_with_following_image(html_body: str, *, min_text_chars: int = 40) -> str: """Reduce image-only PDF pages by keeping the last text block together with the following image. If an image doesn't fit at the bottom of a page, renderers will push it to the next page, sometimes resulting in a page that contains only the image. By wrapping the last text block immediately before an image together with that image in a container that avoids page breaks inside, the renderer will move BOTH to the next page when needed. This intentionally trades some extra whitespace on the previous page to avoid image-only pages. """ try: from bs4 import BeautifulSoup soup = BeautifulSoup(html_body, 'html.parser') # Target

blocks (most of your extracted images are in this shape) for p in soup.find_all('p'): imgs = p.find_all('img') if len(imgs) != 1: continue # Ensure this

is basically image-only txt = (p.get_text(' ', strip=True) or '').replace('\xa0', '').strip() if txt: continue # Find a preceding text block sibling (h1-h6 or p with text) prev = p.find_previous_sibling(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']) if not prev: continue prev_txt = (prev.get_text(' ', strip=True) or '').replace('\xa0', '').strip() if len(prev_txt) < min_text_chars: continue # Wrap prev + image-paragraph together wrapper = soup.new_tag('div') wrapper['class'] = (wrapper.get('class', []) or []) + ['keep-with-image'] wrapper['style'] = 'break-inside:avoid; page-break-inside:avoid;' prev.insert_before(wrapper) wrapper.append(prev.extract()) wrapper.append(p.extract()) return soup.decode(formatter='minimal') except Exception: return html_body def _generate_and_replace_toc(html_body: str) -> str: """Generate a proper table of contents from headers and replace any existing broken TOC. Only affects PDFs, not EPUBs. """ from bs4 import BeautifulSoup soup = BeautifulSoup(html_body, 'html.parser') # Find all h1 and h2 headers (skip those in first 3 pages/divs as they're likely title page) headers = [] all_divs = soup.find_all('div', id=lambda x: x and x.startswith('page')) # Start collecting headers after the first 3 pages for div in all_divs[3:] if len(all_divs) > 3 else []: for header in div.find_all(['h1', 'h2']): header_text = header.get_text().strip() if header_text and len(header_text) > 2: # Skip very short headers # Create anchor ID if not header.get('id'): anchor_id = re.sub(r'[^a-zA-Z0-9]+', '-', header_text[:50].lower()).strip('-') header['id'] = anchor_id else: anchor_id = header['id'] headers.append({ 'text': header_text, 'id': anchor_id, 'level': int(header.name[1]) # h1 -> 1, h2 -> 2 }) # If we found headers, generate TOC if headers: # Build TOC HTML toc_html = '

\n' toc_html += '

Table of Contents

\n' for h in headers: indent = '' if h['level'] == 1 else '    ' toc_html += f'

{indent}{h["text"]}

\n' toc_html += '
\n' # Search for existing TOC by looking for "Table of Contents" or "Contents" text toc_replaced = False # Method 1: Search for any element containing "Table of Contents" text for element in soup.find_all(string=re.compile(r'table of contents|^contents$', re.IGNORECASE)): # Find the containing page div page_div = element.find_parent('div', id=lambda x: x and x.startswith('page')) if page_div: page_div.clear() page_div.append(BeautifulSoup(toc_html, 'html.parser')) toc_replaced = True print(f" • Replaced broken TOC with generated TOC ({len(headers)} entries)") break # Method 2: If not found by text, check page divs for TOC-like content if not toc_replaced: for i, div in enumerate(all_divs[:10]): # Check first 10 pages div_text = div.get_text().lower().strip() # Check if this looks like a TOC page (has "contents" early in the page) if ('table of contents' in div_text or (div_text.startswith('contents') or 'contents' in div_text[:100])): # Replace entire div content with new TOC div.clear() div.append(BeautifulSoup(toc_html, 'html.parser')) toc_replaced = True print(f" • Replaced broken TOC on page {i+1} with generated TOC ({len(headers)} entries)") break # Use decode() instead of str() to preserve original formatting and attributes return soup.decode(formatter='minimal') # ===================================================== # CONFIGURATION AND ENVIRONMENT MANAGEMENT # ===================================================== class TranslationConfig: """Centralized configuration management""" def __init__(self): self.MODEL = os.getenv("MODEL", "gemini-1.5-flash") self.input_path = os.getenv("input_path", "default.epub") self.PROFILE_NAME = os.getenv("PROFILE_NAME", "korean").lower() self.CONTEXTUAL = os.getenv("CONTEXTUAL", "1") == "1" self.DELAY = float(os.getenv("SEND_INTERVAL_SECONDS", "1")) # Use large_env to bypass Windows 32,767-char env var limit for large prompts try: import large_env self.SYSTEM_PROMPT = (large_env.get_env("SYSTEM_PROMPT", "") or "").strip() except Exception: self.SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip() self.ASSISTANT_PROMPT = os.getenv("ASSISTANT_PROMPT", "").strip() # Optional assistant prefill self.REQUEST_MERGING_ENABLED = os.getenv("REQUEST_MERGING_ENABLED", "0") == "1" # Read merge count early so we can use it for placeholder handling self.REQUEST_MERGE_COUNT = int(os.getenv("REQUEST_MERGE_COUNT", "3")) # Handle split marker instruction placeholder # Always strip the placeholder from base prompt - we'll add the instruction dynamically # based on whether we're actually merging multiple chapters in a given request if self.SYSTEM_PROMPT: import re # Strip placeholder - the actual instruction will be added via get_system_prompt() self.SYSTEM_PROMPT = re.sub(r'\s*\{split_marker_instruction\}\s*', '', self.SYSTEM_PROMPT) self.REMOVE_AI_ARTIFACTS = os.getenv("REMOVE_AI_ARTIFACTS", "0") == "1" self.TEMP = float(os.getenv("TRANSLATION_TEMPERATURE", "0.3")) self.HIST_LIMIT = int(os.getenv("TRANSLATION_HISTORY_LIMIT", "20")) self.MAX_OUTPUT_TOKENS = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) self.EMERGENCY_RESTORE = os.getenv("EMERGENCY_PARAGRAPH_RESTORE", "1") == "1" self.BATCH_TRANSLATION = os.getenv("BATCH_TRANSLATION", "0") == "1" self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10")) self.BATCHING_MODE = os.getenv("BATCHING_MODE", "aggressive") self.BATCH_GROUP_SIZE = int(os.getenv("BATCH_GROUP_SIZE", os.getenv("CONSERVATIVE_BATCH_GROUP_SIZE", "3"))) # Note: REQUEST_MERGING_ENABLED and REQUEST_MERGE_COUNT are set earlier (before split_marker_instruction handling) # Synthetic header injection for merged requests (Split-the-Merge helper) self.SYNTHETIC_MERGE_HEADERS = os.getenv("SYNTHETIC_MERGE_HEADERS", "1") == "1" self.ENABLE_IMAGE_TRANSLATION = os.getenv("ENABLE_IMAGE_TRANSLATION", "1") == "1" # Auto-disable image translation for html2text and BeautifulSoup profiles # These profiles are designed for text extraction and don't need image translation if self.ENABLE_IMAGE_TRANSLATION and self.PROFILE_NAME: profile_lower = self.PROFILE_NAME.lower() if 'html2text' in profile_lower or 'beautifulsoup' in profile_lower: self.ENABLE_IMAGE_TRANSLATION = False print(f"ℹ️ Image translation disabled for {self.PROFILE_NAME} profile") self.TRANSLATE_BOOK_TITLE = os.getenv("TRANSLATE_BOOK_TITLE", "1") == "1" self.DISABLE_ZERO_DETECTION = os.getenv("DISABLE_ZERO_DETECTION", "0") == "1" self.ENABLE_AUTO_GLOSSARY = os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1" self.COMPREHENSIVE_EXTRACTION = os.getenv("COMPREHENSIVE_EXTRACTION", "0") == "1" self.MANUAL_GLOSSARY = os.getenv("MANUAL_GLOSSARY") self.RETRY_TRUNCATED = os.getenv("RETRY_TRUNCATED", "1") == "1" try: self.TRUNCATION_RETRY_ATTEMPTS = int(os.getenv("TRUNCATION_RETRY_ATTEMPTS", "1")) except Exception: self.TRUNCATION_RETRY_ATTEMPTS = 1 # Char-ratio truncation detection (silent truncation) self.CHAR_RATIO_TRUNCATION_ENABLED = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1" try: self.CHAR_RATIO_TRUNCATION_PERCENT = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50")) except Exception: self.CHAR_RATIO_TRUNCATION_PERCENT = 50.0 try: self.CHAR_RATIO_TRUNCATION_ATTEMPTS = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1")) except Exception: self.CHAR_RATIO_TRUNCATION_ATTEMPTS = 1 try: self.CHAR_RATIO_MIN_OUTPUT_CHARS = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100")) except Exception: self.CHAR_RATIO_MIN_OUTPUT_CHARS = 100 self.RETRY_SPLIT_FAILED = os.getenv("RETRY_SPLIT_FAILED", "0") == "1" try: self.SPLIT_FAILED_RETRY_ATTEMPTS = int(os.getenv("SPLIT_FAILED_RETRY_ATTEMPTS", "1")) except Exception: self.SPLIT_FAILED_RETRY_ATTEMPTS = 1 self.RETRY_DUPLICATE_BODIES = os.getenv("RETRY_DUPLICATE_BODIES", "1") == "1" self.RETRY_TIMEOUT = os.getenv("RETRY_TIMEOUT", "0") == "1" self.CHUNK_TIMEOUT = int(os.getenv("CHUNK_TIMEOUT", "1800")) self.DISABLE_MERGE_FALLBACK = os.getenv("DISABLE_MERGE_FALLBACK", "0") == "1" self.MAX_RETRY_TOKENS = int(os.getenv("MAX_RETRY_TOKENS", "16384")) self.DUPLICATE_LOOKBACK_CHAPTERS = int(os.getenv("DUPLICATE_LOOKBACK_CHAPTERS", "3")) self.USE_ROLLING_SUMMARY = os.getenv("USE_ROLLING_SUMMARY", "0") == "1" self.ROLLING_SUMMARY_EXCHANGES = int(os.getenv("ROLLING_SUMMARY_EXCHANGES", "5")) self.ROLLING_SUMMARY_MODE = os.getenv("ROLLING_SUMMARY_MODE", "replace") # New: maximum number of rolling summary entries to retain when in append mode (0 = unlimited) self.ROLLING_SUMMARY_MAX_ENTRIES = int(os.getenv("ROLLING_SUMMARY_MAX_ENTRIES", "10")) self.DUPLICATE_DETECTION_MODE = os.getenv("DUPLICATE_DETECTION_MODE", "basic") self.AI_HUNTER_THRESHOLD = int(os.getenv("AI_HUNTER_THRESHOLD", "75")) self.TRANSLATION_HISTORY_ROLLING = os.getenv("TRANSLATION_HISTORY_ROLLING", "0") == "1" self.API_KEY = (os.getenv("API_KEY") or os.getenv("OPENAI_API_KEY") or os.getenv("OPENAI_OR_Gemini_API_KEY") or os.getenv("GEMINI_API_KEY")) # NEW: Simple chapter number offset self.CHAPTER_NUMBER_OFFSET = int(os.getenv("CHAPTER_NUMBER_OFFSET", "0")) self.ENABLE_WATERMARK_REMOVAL = os.getenv("ENABLE_WATERMARK_REMOVAL", "1") == "1" self.SAVE_CLEANED_IMAGES = os.getenv("SAVE_CLEANED_IMAGES", "1") == "1" self.EMERGENCY_IMAGE_RESTORE = os.getenv("EMERGENCY_IMAGE_RESTORE", "0") == "1" self.WATERMARK_PATTERN_THRESHOLD = int(os.getenv("WATERMARK_PATTERN_THRESHOLD", "10")) self.WATERMARK_CLAHE_LIMIT = float(os.getenv("WATERMARK_CLAHE_LIMIT", "3.0")) self.COMPRESSION_FACTOR = float(os.getenv("COMPRESSION_FACTOR", "2.0")) # Multi API key support self.use_multi_api_keys = os.environ.get('USE_MULTI_API_KEYS', '0') == '1' self.multi_api_keys = [] if self.use_multi_api_keys: multi_keys_json = os.environ.get('MULTI_API_KEYS', '[]') try: if multi_keys_json and str(multi_keys_json).strip() not in ('', '[]', 'null', 'None'): self.multi_api_keys = json.loads(multi_keys_json) else: # Fallback: UnifiedClient may have the keys stored in-memory to avoid Windows env var limits try: from unified_api_client import UnifiedClient with UnifiedClient._in_memory_multi_keys_lock: self.multi_api_keys = UnifiedClient._in_memory_multi_keys or [] except Exception: self.multi_api_keys = [] print(f"Loaded {len(self.multi_api_keys)} API keys for multi-key mode") if not self.multi_api_keys: self.use_multi_api_keys = False except Exception as e: print(f"Failed to load multi API keys: {e}") self.use_multi_api_keys = False # Fallback keys (for direct fallback retries) self.use_fallback_keys = os.environ.get('USE_FALLBACK_KEYS', '0') == '1' self.fallback_keys = [] if self.use_fallback_keys: fk_json = os.environ.get('FALLBACK_KEYS', '[]') try: self.fallback_keys = json.loads(fk_json) except Exception as e: print(f"Failed to load fallback keys: {e}") self.use_fallback_keys = False def get_effective_output_limit(self) -> int: """Return the effective output token limit, considering per-key overrides. - Start from the global MAX_OUTPUT_TOKENS. - Check if the model has a discovered limit (from auto-adjustment) - If multi-key mode is enabled, intersect with any per-key individual_output_token_limit values (min of all >0 limits). - If fallback keys are enabled, also intersect with their per-key individual_output_token_limit values. """ effective = self.MAX_OUTPUT_TOKENS # Check if we've discovered a model limit via auto-adjustment try: from unified_api_client import UnifiedClient with UnifiedClient._model_limits_lock: cached_limit = UnifiedClient._model_token_limits.get(self.MODEL) if cached_limit and cached_limit < effective: effective = cached_limit except Exception: pass # Collect per-key limits from multi-key pool (only from enabled keys) per_key_limits = [] try: for idx, key_data in enumerate(self.multi_api_keys or []): if not isinstance(key_data, dict): continue # Skip disabled keys if not key_data.get('enabled', True): continue raw = key_data.get('individual_output_token_limit') if raw in (None, "", 0): continue try: val = int(raw) if val > 0: per_key_limits.append(val) except Exception: continue except Exception: pass # Collect per-key limits from fallback keys (only from enabled keys) try: for idx, fb in enumerate(self.fallback_keys or []): if not isinstance(fb, dict): continue # Skip disabled keys if not fb.get('enabled', True): continue raw = fb.get('individual_output_token_limit') if raw in (None, "", 0): continue try: val = int(raw) if val > 0: per_key_limits.append(val) except Exception: continue except Exception: pass if per_key_limits: effective = min(effective, min(per_key_limits)) return effective def get_system_prompt(self, actual_merge_count: int = 1) -> str: """Return the system prompt, optionally with split marker instruction. Args: actual_merge_count: The actual number of chapters being merged in this request. If > 1, the split marker instruction will be added. If 1 (default), no split marker instruction is added. Returns: The system prompt string, with or without split marker instruction. """ if not self.SYSTEM_PROMPT: return self.SYSTEM_PROMPT # Only add split marker instruction if actually merging multiple chapters if actual_merge_count > 1 and self.REQUEST_MERGING_ENABLED: split_instr = ("- CRITICAL Requirement: If you see any HTML tags containing 'SPLIT MARKER' " "(Example:

SPLIT MARKER: Do Not Remove This Tag

), " "you MUST preserve them EXACTLY as they appear. Do not translate, modify, or remove these markers.") # Append to end of system prompt return self.SYSTEM_PROMPT + "\n\n" + split_instr return self.SYSTEM_PROMPT # ===================================================== # REQUEST MERGING UTILITIES # ===================================================== class RequestMerger: """Handles merging multiple chapters into a single request""" @classmethod def merge_chapters(cls, chapters_data, log_injections=True): """Merge multiple chapters into a single content block. This is used both for request-size estimation and for the actual merged request that is sent to the API. Before concatenating, we inject an invisible split marker at the beginning of each chapter. This greatly improves the reliability of Split-the-Merge, because the splitter can simply find these markers instead of carefully parsing headers. Args: chapters_data: List of tuples (chapter_num, content, chapter_obj) log_injections: If False, perform marker injection silently (no console logging). Used for size-estimation previews to avoid duplicate log lines. Returns: Merged content string """ if not chapters_data: return "" # Split markers are only needed when split-the-merge is enabled # Check if the feature is turned on split_the_merge_enabled = os.getenv('SPLIT_THE_MERGE', '0') == '1' split_markers_enabled = split_the_merge_enabled merged_parts = [] for chapter_num, content, chapter_obj in chapters_data: # Defensive: if something goes wrong in the marker injection # logic, fall back to the original content rather than breaking # the whole merge. try: if isinstance(content, str): # Only add split markers if split-the-merge is enabled if split_markers_enabled: # Use H1 tag as split marker - AI will preserve visible HTML elements split_marker = f'

SPLIT MARKER: Do Not Remove This Tag

\n' marked_content = split_marker + content if log_injections: preview = marked_content[:120].replace('\n', ' ') print( f" ℹ️ Request Merging: Injected H1 split marker for " f"chapter {chapter_num}: {preview}..." ) merged_parts.append(marked_content) else: # No split markers - just append content as-is merged_parts.append(content) else: # Non-string content, just append as-is merged_parts.append(content) except Exception as e: # Fallback: append original content if anything goes wrong if log_injections: print(f" ⚠️ Request Merging: Failed to inject split marker for chapter {chapter_num}: {e}") merged_parts.append(content) return "\n\n".join(merged_parts) @classmethod def create_merge_groups(cls, chapters_to_translate, merge_count): """Group chapters into merge groups, keeping only nearby chapters together. This prevents cases like chapter 7 being merged with chapter 29 just because chapters 8–28 were already translated or merged earlier. Args: chapters_to_translate: List of tuples. Supported shapes: - (idx, chapter_obj) - (idx, chapter_obj, actual_num, ...) merge_count: Maximum number of chapters to merge per request. Returns: List of merge groups, each group is a list of chapter tuples taken from ``chapters_to_translate`` in order. """ if merge_count <= 1 or not chapters_to_translate: # No merging, return each chapter as its own group return [[ch] for ch in chapters_to_translate] def _get_actual_num(item): """Best-effort extraction of the logical chapter number for grouping. This is primarily used as a *display* / fallback value. For actual proximity checks we prefer OPF spine order when available (see ``_get_proximity_key`` below). We try, in order: 1. Explicit ``actual_num`` in position 2 (non-text merge path). 2. ``chapter_obj['actual_chapter_num']`` if present. 3. ``chapter_obj['num']``. 4. Fallback to idx (position 0). """ # Shape: (idx, chapter_obj, actual_num, ...) try: if len(item) >= 3 and isinstance(item[2], (int, float)): return item[2] except Exception: pass # Shape: (idx, chapter_obj) try: chapter_obj = item[1] if isinstance(chapter_obj, dict): if 'actual_chapter_num' in chapter_obj: return chapter_obj.get('actual_chapter_num') return chapter_obj.get('num') except Exception: pass # Fallback: idx try: return item[0] except Exception: return None def _get_proximity_key(item): """Return a numeric key representing *reading order* proximity. We want proximity to reflect where chapters sit in the *book* rather than their logical numbering, so that multiple files with the same chapter number (e.g. notice pages vs. main text) don't get merged just because their labels are "4, 5, 4". Strategy (in order): 1. Use ``spine_order`` or ``opf_spine_position`` if present on the chapter object (true reading order from content.opf). 2. Fall back to the chapter index ``idx`` (position 0 in the tuple), which preserves the original ordering of the ``chapters`` list. 3. As a last resort, fall back to ``_get_actual_num``. """ # 1) Prefer explicit spine-based order from OPF if available try: chapter_obj = item[1] if isinstance(chapter_obj, dict): spine_pos = chapter_obj.get('spine_order') if spine_pos is None: spine_pos = chapter_obj.get('opf_spine_position') if spine_pos is not None: return float(spine_pos) except Exception: pass # 2) Fall back to the chapter's index in the master chapter list. # ``idx`` is stored in position 0 in all supported shapes. try: return float(item[0]) except Exception: pass # 3) Ultimate fallback – use the logical chapter number. return _get_actual_num(item) groups = [] current_group = [] prev_num = None for ch in chapters_to_translate: # Use proximity key (spine order when available) instead of the # logical chapter number alone. This prevents far‑apart chapters # with the same numeric label (e.g. multiple "Ch.004" entries in # different parts of the book) from being merged together when # there are many intervening chapters in the OPF spine. current_num = _get_proximity_key(ch) if not current_group: # Start the first group current_group = [ch] prev_num = current_num continue # If we've hit the per-request limit, start a new group if len(current_group) >= merge_count: groups.append(current_group) current_group = [ch] prev_num = current_num continue # If we can't safely determine chapter numbers, be conservative and # start a new group so we never merge far‑apart chapters by accident. if current_num is None or prev_num is None: groups.append(current_group) current_group = [ch] prev_num = current_num continue # Only merge if chapters are numerically adjacent (or effectively so). # This means sequences like 1→2→3 will merge, but 1→4 will not. try: gap = abs(float(current_num) - float(prev_num)) except Exception: gap = None if gap is not None and gap <= 1: # Close enough in chapter numbering, keep in same group current_group.append(ch) else: # Too far apart (e.g. 7 then 29) → start a new group groups.append(current_group) current_group = [ch] prev_num = current_num if current_group: groups.append(current_group) return groups @classmethod def split_by_markers(cls, content, expected_count): """ Split merged translation output by split markers. This method is robust to broken or missing split tags: - Handles partial marker tags (e.g., missing closing tag) - Handles malformed id attributes - Falls back to ANY h1 tag if split markers are missing - Works even if some markers are completely missing Args: content: The translated HTML content expected_count: Expected number of sections (should match merged chapter count) Returns: List of content sections if we can reliably split, or None if splitting is not possible (fallback to normal merged behavior) """ import re from bs4 import BeautifulSoup # Try multiple strategies in order of reliability: # 1. Perfect split markers with proper id="split-N" # 2. Any h1 tag with "split" in the id (even broken) # 3. Any h1 tag containing "SPLIT MARKER" text # 4. Any h1 tag at all # Strategy 1: Perfect markers perfect_pattern = r']*id="split-\d+"[^>]*>.*?' perfect_markers = list(re.finditer(perfect_pattern, content, flags=re.DOTALL | re.IGNORECASE)) if len(perfect_markers) == expected_count: print(f" ✓️ Split the Merge: Found {len(perfect_markers)} perfect split markers") return cls._split_by_positions(content, [m.start() for m in perfect_markers]) print(f" ⚠️ Split the Merge: Found {len(perfect_markers)} perfect markers, expected {expected_count}. Trying fallback strategies...") # Strategy 2: Broken markers with "split" in id (handles broken closing tags, etc.) try: soup = BeautifulSoup(content, 'html.parser') h1_tags = soup.find_all('h1') # Try markers with "split" in id split_id_tags = [tag for tag in h1_tags if tag.get('id') and 'split' in tag.get('id', '').lower()] if len(split_id_tags) == expected_count: print(f" ✓️ Split the Merge: Found {len(split_id_tags)} h1 tags with 'split' in id (broken marker format)") positions = [] for tag in split_id_tags: # Find position of this tag in original content tag_str = str(tag) # Search for the opening tag opening_tag = re.escape(tag_str.split('>')[0] + '>') match = re.search(opening_tag, content, flags=re.IGNORECASE) if match: positions.append(match.start()) if len(positions) == expected_count: return cls._split_by_positions(content, sorted(positions)) except Exception as e: print(f" ⚠️ Split the Merge: BeautifulSoup fallback failed: {e}") # Strategy 3: H1 tags containing "SPLIT MARKER" text try: soup = BeautifulSoup(content, 'html.parser') h1_tags = soup.find_all('h1') marker_text_tags = [tag for tag in h1_tags if 'split marker' in tag.get_text().lower()] if len(marker_text_tags) == expected_count: print(f" ✓️ Split the Merge: Found {len(marker_text_tags)} h1 tags with 'SPLIT MARKER' text") positions = [] for tag in marker_text_tags: tag_str = str(tag) opening_tag = re.escape(tag_str.split('>')[0] + '>') match = re.search(opening_tag, content, flags=re.IGNORECASE) if match: positions.append(match.start()) if len(positions) == expected_count: return cls._split_by_positions(content, sorted(positions)) except Exception as e: print(f" ⚠️ Split the Merge: Text marker fallback failed: {e}") # All strategies failed print(f" ❌ Split the Merge: Could not reliably split content (found varying marker counts across strategies)") return None @classmethod def _split_by_positions(cls, content, positions): """ Helper to split content at specific character positions. Args: content: Full content string positions: List of character positions where splits should occur (sorted) Returns: List of content sections """ if not positions: return [content] sections = [] # First section is before the first marker (usually empty/whitespace) first_section = content[:positions[0]].strip() if first_section: # Only include if non-empty sections.append(first_section) # Middle sections between markers for i in range(len(positions) - 1): # Find where the actual content starts (after the marker tag) start_pos = positions[i] # Skip past the h1 tag marker_end = content.find('', start_pos) if marker_end != -1: content_start = marker_end + 5 # len('') else: # Broken closing tag, try to skip past the opening tag at least next_close_bracket = content.find('>', start_pos) content_start = next_close_bracket + 1 if next_close_bracket != -1 else start_pos section = content[content_start:positions[i + 1]].strip() sections.append(section) # Last section after the last marker last_marker_pos = positions[-1] marker_end = content.find('', last_marker_pos) if marker_end != -1: content_start = marker_end + 5 else: next_close_bracket = content.find('>', last_marker_pos) content_start = next_close_bracket + 1 if next_close_bracket != -1 else last_marker_pos last_section = content[content_start:].strip() sections.append(last_section) print(f" ✓️ Split the Merge: Successfully split into {len(sections)} sections") return sections # ===================================================== # UNIFIED PATTERNS AND CONSTANTS # ===================================================== class PatternManager: """Centralized pattern management""" CHAPTER_PATTERNS = [ # English patterns (r'chapter[\s_-]*(\d+)', re.IGNORECASE, 'english_chapter'), (r'\bch\.?\s*(\d+)\b', re.IGNORECASE, 'english_ch'), (r'part[\s_-]*(\d+)', re.IGNORECASE, 'english_part'), (r'episode[\s_-]*(\d+)', re.IGNORECASE, 'english_episode'), # Chinese patterns (r'第\s*(\d+)\s*[章节話话回]', 0, 'chinese_chapter'), (r'第\s*([一二三四五六七八九十百千万]+)\s*[章节話话回]', 0, 'chinese_chapter_cn'), (r'(\d+)[章节話话回]', 0, 'chinese_short'), # Japanese patterns (r'第\s*(\d+)\s*話', 0, 'japanese_wa'), (r'第\s*(\d+)\s*章', 0, 'japanese_chapter'), (r'その\s*(\d+)', 0, 'japanese_sono'), (r'(\d+)話目', 0, 'japanese_wame'), # Korean patterns (r'제\s*(\d+)\s*[장화권부편]', 0, 'korean_chapter'), (r'(\d+)\s*[장화권부편]', 0, 'korean_short'), (r'에피소드\s*(\d+)', 0, 'korean_episode'), # Generic numeric patterns (r'^\s*(\d+)\s*[-–—.\:]', re.MULTILINE, 'generic_numbered'), (r'_(\d+)\.x?html?$', re.IGNORECASE, 'filename_number'), (r'/(\d+)\.x?html?$', re.IGNORECASE, 'path_number'), (r'(\d+)', 0, 'any_number'), ] FILENAME_EXTRACT_PATTERNS = [ # IMPORTANT: More specific patterns MUST come first r'^\d{3}(\d)_(\d{2})_\.x?html?$', # Captures both parts for decimal: group1.group2 r'^\d{4}_(\d+)\.x?html?$', # "0000_1.xhtml" - extracts 1, not 0000 r'^\d+_(\d+)[_\.]', # Any digits followed by underscore then capture next digits r'^(\d+)[_\.]', # Standard: "0249_" or "0249." r'response_(\d+)_', # Standard pattern: response_001_ r'response_(\d+)\.', # Pattern: response_001. r'(\d{3,5})[_\.]', # 3-5 digit pattern with padding r'[Cc]hapter[_\s]*(\d+)', # Chapter word pattern r'[Cc]h[_\s]*(\d+)', # Ch abbreviation r'No(\d+)Chapter', # No prefix with Chapter - matches "No00013Chapter.xhtml" r'No(\d+)Section', # No prefix with Section - matches "No00013Section.xhtml" r'No(\d+)(?=\.|_|$)', # No prefix followed by end, dot, or underscore (not followed by text) r'第(\d+)[章话回]', # Chinese chapter markers r'_(\d+)(?:_|\.|$)', # Number between underscores or at end r'^(\d+)(?:_|\.|$)', # Starting with number r'(\d+)', # Any number (fallback) ] CJK_HONORIFICS = { 'korean': [ # Modern honorifics '님', '씨', '선배', '후배', '동기', '형', '누나', '언니', '오빠', '동생', '선생님', '교수님', '박사님', '사장님', '회장님', '부장님', '과장님', '대리님', '팀장님', '실장님', '이사님', '전무님', '상무님', '부사장님', '고문님', # Classical/formal honorifics '공', '옹', '군', '양', '낭', '랑', '생', '자', '부', '모', '시', '제', '족하', # Royal/noble address forms '마마', '마노라', '대감', '영감', '나리', '도령', '낭자', '아씨', '규수', '각하', '전하', '폐하', '저하', '합하', '대비', '대왕', '왕자', '공주', # Buddhist/religious '스님', '사부님', '조사님', '큰스님', '화상', '대덕', '대사', '법사', '선사', '율사', '보살님', '거사님', '신부님', '목사님', '장로님', '집사님', # Confucian/scholarly '부자', '선생', '대인', '어른', '어르신', '존자', '현자', '군자', '대부', '학사', '진사', '문하생', '제자', # Kinship honorifics '어르신', '할아버님', '할머님', '아버님', '어머님', '형님', '누님', '아주버님', '아주머님', '삼촌', '이모님', '고모님', '외삼촌', '장인어른', '장모님', '시아버님', '시어머님', '처남', '처형', '매형', '손님', # Verb-based honorific endings and speech levels '습니다', 'ㅂ니다', '습니까', 'ㅂ니까', '시다', '세요', '셔요', '십시오', '시오', '이에요', '예요', '이예요', '에요', '어요', '아요', '여요', '해요', '이세요', '으세요', '으시', '시', '으십니다', '십니다', '으십니까', '십니까', '으셨', '셨', '드립니다', '드려요', '드릴게요', '드리겠습니다', '올립니다', '올려요', '사옵니다', '사뢰', '여쭙니다', '여쭤요', '아뢰', '뵙니다', '뵈요', '모십니다', '시지요', '시죠', '시네요', '시는군요', '시는구나', '으실', '실', '드시다', '잡수시다', '주무시다', '계시다', '가시다', '오시다', # Common verb endings with 있다/없다/하다 '있어요', '있습니다', '있으세요', '있으십니까', '없어요', '없습니다', '없으세요', '해요', '합니다', '하세요', '하십시오', '하시죠', '하시네요', '했어요', '했습니다', '되세요', '되셨어요', '되십니다', '됩니다', '되요', '돼요', '이야', '이네', '이구나', '이군', '이네요', '인가요', '인가', '일까요', '일까', '거예요', '거에요', '겁니다', '건가요', '게요', '을게요', '을까요', '었어요', '었습니다', '겠습니다', '겠어요', '겠네요', '을겁니다', '을거예요', '을거에요', # Common endings '요', '죠', '네요', '는데요', '거든요', '니까', '으니까', '는걸요', '군요', '구나', '는구나', '는군요', '더라고요', '더군요', '던데요', '나요', '가요', '까요', '라고요', '다고요', '냐고요', '자고요', '란다', '단다', '냔다', '잔다', # Formal archaic endings '나이다', '사옵나이다', '옵니다', '오', '소서', '으오', '으옵소서', '사이다', '으시옵니다', '시옵니다', '으시옵니까', '시옵니까', '나이까', '리이까', '리이다', '옵소서', '으소서', '소이다', '로소이다', '이옵니다', '이올시다', '하옵니다' ], 'japanese': [ # Modern honorifics 'さん', 'ちゃん', '君', 'くん', '様', 'さま', '先生', 'せんせい', '殿', 'どの', '先輩', 'せんぱい', # Classical/historical '氏', 'し', '朝臣', 'あそん', '宿禰', 'すくね', '連', 'むらじ', '臣', 'おみ', '君', 'きみ', '真人', 'まひと', '道師', 'みちのし', '稲置', 'いなぎ', '直', 'あたい', '造', 'みやつこ', # Court titles '卿', 'きょう', '大夫', 'たいふ', '郎', 'ろう', '史', 'し', '主典', 'さかん', # Buddhist titles '和尚', 'おしょう', '禅師', 'ぜんじ', '上人', 'しょうにん', '聖人', 'しょうにん', '法師', 'ほうし', '阿闍梨', 'あじゃり', '大和尚', 'だいおしょう', # Shinto titles '大宮司', 'だいぐうじ', '宮司', 'ぐうじ', '禰宜', 'ねぎ', '祝', 'はふり', # Samurai era '守', 'かみ', '介', 'すけ', '掾', 'じょう', '目', 'さかん', '丞', 'じょう', # Keigo (honorific language) verb forms 'です', 'ます', 'ございます', 'いらっしゃる', 'いらっしゃいます', 'おっしゃる', 'おっしゃいます', 'なさる', 'なさいます', 'くださる', 'くださいます', 'いただく', 'いただきます', 'おります', 'でございます', 'ございません', 'いたします', 'いたしました', '申す', '申します', '申し上げる', '申し上げます', '存じる', '存じます', '存じ上げる', '伺う', '伺います', '参る', '参ります', 'お目にかかる', 'お目にかかります', '拝見', '拝見します', '拝聴', '拝聴します', '承る', '承ります', # Respectful prefixes/suffixes 'お', 'ご', '御', 'み', '美', '貴', '尊' ], 'chinese': [ # Modern forms '先生', '小姐', '夫人', '公子', '大人', '老师', '师父', '师傅', '同志', '同学', # Ancient/classical forms '子', '丈', '翁', '公', '侯', '伯', '叔', '仲', '季', '父', '甫', '卿', '君', '生', # Imperial court '陛下', '殿下', '千岁', '万岁', '圣上', '皇上', '天子', '至尊', '御前', '爷', # Nobility/officials '阁下', '大人', '老爷', '相公', '官人', '郎君', '娘子', '夫子', '足下', # Religious titles '上人', '法师', '禅师', '大师', '高僧', '圣僧', '神僧', '活佛', '仁波切', '真人', '天师', '道长', '道友', '仙长', '上仙', '祖师', '掌教', # Scholarly/Confucian '夫子', '圣人', '贤人', '君子', '大儒', '鸿儒', '宗师', '泰斗', '巨擘', # Martial arts '侠士', '大侠', '少侠', '女侠', '英雄', '豪杰', '壮士', '义士', # Family/kinship '令尊', '令堂', '令郎', '令爱', '贤弟', '贤侄', '愚兄', '小弟', '家父', '家母', # Humble forms '在下', '小人', '鄙人', '不才', '愚', '某', '仆', '妾', '奴', '婢', # Polite verbal markers '请', '请问', '敢问', '恭请', '敬请', '烦请', '有请', '请教', '赐教', '惠顾', '惠赐', '惠存', '笑纳', '雅正', '指正', '斧正', '垂询', '拜', '拜见', '拜访', '拜读', '拜托', '拜谢', '敬上', '谨上', '顿首' ], 'english': [ # Modern Korean romanizations (Revised Romanization of Korean - 2000) '-nim', '-ssi', '-seonbae', '-hubae', '-donggi', '-hyeong', '-nuna', '-eonni', '-oppa', '-dongsaeng', '-seonsaengnim', '-gyosunim', '-baksanim', '-sajangnim', '-hoejangnim', '-bujangnim', '-gwajangnim', '-daerim', '-timjangnim', '-siljangnim', '-isanim', '-jeonmunim', '-sangmunim', '-busajangnim', '-gomunnim', # Classical/formal Korean romanizations '-gong', '-ong', '-gun', '-yang', '-nang', '-rang', '-saeng', '-ja', '-bu', '-mo', '-si', '-je', '-jokha', # Royal/noble Korean romanizations '-mama', '-manora', '-daegam', '-yeonggam', '-nari', '-doryeong', '-nangja', '-assi', '-gyusu', '-gakha', '-jeonha', '-pyeha', '-jeoha', '-hapka', '-daebi', '-daewang', '-wangja', '-gongju', # Buddhist/religious Korean romanizations '-seunim', '-sabunim', '-josanim', '-keunseunim', '-hwasang', '-daedeok', '-daesa', '-beopsa', '-seonsa', '-yulsa', '-bosalnim', '-geosanim', '-sinbunim', '-moksanim', '-jangnonim', '-jipsanim', # Confucian/scholarly Korean romanizations '-buja', '-seonsaeng', '-daein', '-eoreun', '-eoreusin', '-jonja', '-hyeonja', '-gunja', '-daebu', '-haksa', '-jinsa', '-munhasaeng', '-jeja', # Kinship Korean romanizations '-harabeonim', '-halmeonim', '-abeonim', '-eomeonim', '-hyeongnim', '-nunim', '-ajubeonim', '-ajumeonim', '-samchon', '-imonim', '-gomonim', '-oesamchon', '-jangineoreun', '-jangmonim', '-siabeonim', '-sieomeonim', '-cheonam', '-cheohyeong', '-maehyeong', '-sonnim', # Korean verb endings romanized (Revised Romanization) '-seumnida', '-mnida', '-seumnikka', '-mnikka', '-sida', '-seyo', '-syeoyo', '-sipsio', '-sio', '-ieyo', '-yeyo', '-iyeyo', '-eyo', '-eoyo', '-ayo', '-yeoyo', '-haeyo', '-iseyo', '-euseyo', '-eusi', '-si', '-eusimnida', '-simnida', '-eusimnikka', '-simnikka', '-eusyeot', '-syeot', '-deurimnida', '-deuryeoyo', '-deurilgeyo', '-deurigesseumnida', '-ollimnida', '-ollyeoyo', '-saomnida', '-saroe', '-yeojjumnida', '-yeojjwoyo', '-aroe', '-boemnida', '-boeyo', '-mosimnida', '-sijiyo', '-sijyo', '-sineyo', '-sineungunyo', '-sineunguna', '-eusil', '-sil', '-deusida', '-japsusida', '-jumusida', '-gyesida', '-gasida', '-osida', # Common Korean verb endings romanized '-isseoyo', '-isseumnida', '-isseuseyo', '-isseusimnikka', '-eopseoyo', '-eopseumnida', '-eopseuseyo', '-hamnida', '-haseyo', '-hasipsio', '-hasijyo', '-hasineyo', '-haesseoyo', '-haesseumnida', '-doeseyo', '-doesyeosseoyo', '-doesimnida', '-doemnida', '-doeyo', '-dwaeyo', '-iya', '-ine', '-iguna', '-igun', '-ineyo', '-ingayo', '-inga', '-ilkkayo', '-ilkka', '-geoyeyo', '-geoeyo', '-geomnida', '-geongayo', '-geyo', '-eulgeyo', '-eulkkayo', '-eosseoyo', '-eosseumnida', '-gesseumnida', '-gesseoyo', '-genneyo', '-eulgeommida', '-eulgeoyeyo', '-eulgeoeyo', # Common Korean endings romanized '-yo', '-jyo', '-neyo', '-neundeyo', '-geodeunyo', '-nikka', '-eunikka', '-neungeolyo', '-gunyo', '-guna', '-neunguna', '-neungunyo', '-deoragoyo', '-deogunyo', '-deondeyo', '-nayo', '-gayo', '-kkayo', '-ragoyo', '-dagoyo', '-nyagoyo', '-jagoyo', '-randa', '-danda', '-nyanda', '-janda', # Formal archaic Korean romanized '-naida', '-saomnaida', '-omnida', '-o', '-soseo', '-euo', '-euopsoseo', '-saida', '-eusiomnida', '-siomnida', '-eusiomnikka', '-siomnikka', '-naikka', '-riikka', '-riida', '-opsoseo', '-eusoseo', '-soida', '-rosoida', '-iomnida', '-iolsida', '-haomnida', # Japanese keigo romanized (keeping existing) '-san', '-chan', '-kun', '-sama', '-sensei', '-senpai', '-dono', '-shi', '-tan', '-chin', '-desu', '-masu', '-gozaimasu', '-irassharu', '-irasshaimasu', '-ossharu', '-osshaimasu', '-nasaru', '-nasaimasu', '-kudasaru', '-kudasaimasu', '-itadaku', '-itadakimasu', '-orimasu', '-degozaimasu', '-gozaimasen', '-itashimasu', '-itashimashita', '-mousu', '-moushimasu', '-moushiageru', '-moushiagemasu', '-zonjiru', '-zonjimasu', '-ukagau', '-ukagaimasu', '-mairu', '-mairimasu', '-haiken', '-haikenshimasu', # Chinese romanizations (keeping existing) '-xiong', '-di', '-ge', '-gege', '-didi', '-jie', '-jiejie', '-meimei', '-shixiong', '-shidi', '-shijie', '-shimei', '-gongzi', '-guniang', '-xiaojie', '-daren', '-qianbei', '-daoyou', '-zhanglao', '-shibo', '-shishu', '-shifu', '-laoshi', '-xiansheng', '-daxia', '-shaoxia', '-nvxia', '-jushi', '-shanren', '-dazhang', '-zhenren', # Ancient Chinese romanizations '-zi', '-gong', '-hou', '-bo', '-jun', '-qing', '-weng', '-fu', '-sheng', '-lang', '-langjun', '-niangzi', '-furen', '-gege', '-jiejie', '-yeye', '-nainai', # Chinese politeness markers romanized '-qing', '-jing', '-gong', '-hui', '-ci', '-bai', '-gan', '-chui', 'qingwen', 'ganwen', 'gongjing', 'jingjing', 'baijian', 'baifang', 'baituo' ] } TITLE_PATTERNS = { 'korean': [ # Modern titles r'\b(왕|여왕|왕자|공주|황제|황후|대왕|대공|공작|백작|자작|남작|기사|장군|대장|원수|제독|함장|대신|재상|총리|대통령|시장|지사|검사|판사|변호사|의사|박사|교수|신부|목사|스님|도사)\b', r'\b(폐하|전하|각하|예하|님|대감|영감|나리|도련님|아가씨|부인|선생)\b', # Historical/classical titles r'\b(대왕|태왕|왕비|왕후|세자|세자빈|대군|군|옹주|공주|부마|원자|원손)\b', r'\b(영의정|좌의정|우의정|판서|참판|참의|정승|판사|사또|현령|군수|목사|부사)\b', r'\b(대제학|제학|대사간|사간|대사헌|사헌|도승지|승지|한림|사관|내시|환관)\b', r'\b(병조판서|이조판서|호조판서|예조판서|형조판서|공조판서)\b', r'\b(도원수|부원수|병마절도사|수군절도사|첨절제사|만호|천호|백호)\b', r'\b(정일품|종일품|정이품|종이품|정삼품|종삼품|정사품|종사품|정오품|종오품)\b', # Korean honorific verb endings patterns r'(습니다|ㅂ니다|습니까|ㅂ니까|세요|셔요|십시오|시오)$', r'(이에요|예요|이예요|에요|어요|아요|여요|해요)$', r'(으시|시)(었|겠|ㄹ|을|는|던)*(습니다|ㅂ니다|어요|아요|세요)', r'(드립니다|드려요|드릴게요|드리겠습니다|올립니다|올려요)$', r'(사옵니다|여쭙니다|여쭤요|뵙니다|뵈요|모십니다)$', r'(나이다|사옵나이다|옵니다|으오|으옵소서|사이다)$' ], 'japanese': [ # Modern titles r'\b(王|女王|王子|姫|皇帝|皇后|天皇|皇太子|大王|大公|公爵|伯爵|子爵|男爵|騎士|将軍|大将|元帥|提督|艦長|大臣|宰相|総理|大統領|市長|知事|検事|裁判官|弁護士|医者|博士|教授|神父|牧師|僧侶|道士)\b', r'\b(陛下|殿下|閣下|猊下|様|大人|殿|卿|君|氏)\b', # Historical titles r'\b(天皇|皇后|皇太子|親王|内親王|王|女王|太政大臣|左大臣|右大臣|内大臣|大納言|中納言|参議)\b', r'\b(関白|摂政|征夷大将軍|管領|執権|守護|地頭|代官|奉行|与力|同心)\b', r'\b(太政官|神祇官|式部省|治部省|民部省|兵部省|刑部省|大蔵省|宮内省)\b', r'\b(大僧正|僧正|大僧都|僧都|律師|大法師|法師|大禅師|禅師)\b', r'\b(正一位|従一位|正二位|従二位|正三位|従三位|正四位|従四位|正五位|従五位)\b', r'\b(大和守|山城守|摂津守|河内守|和泉守|伊賀守|伊勢守|尾張守|三河守|遠江守)\b', # Japanese keigo (honorific language) patterns r'(です|ます|ございます)$', r'(いらっしゃ|おっしゃ|なさ|くださ)(います|いました|る|った)$', r'(いただ|お|ご|御)(き|きます|きました|く|ける|けます)', r'(申し上げ|申し|存じ上げ|存じ|伺い|参り)(ます|ました|る)$', r'(拝見|拝聴|承り|承)(します|しました|いたします|いたしました)$', r'お[^あ-ん]+[になる|になります|くださる|くださいます]' ], 'chinese': [ # Modern titles r'\b(王|女王|王子|公主|皇帝|皇后|大王|大公|公爵|伯爵|子爵|男爵|骑士|将军|大将|元帅|提督|舰长|大臣|宰相|总理|大总统|市长|知事|检察官|法官|律师|医生|博士|教授|神父|牧师|和尚|道士)\b', r'\b(陛下|殿下|阁下|大人|老爷|夫人|小姐|公子|少爷|姑娘|先生)\b', # Imperial titles r'\b(天子|圣上|皇上|万岁|万岁爷|太上皇|皇太后|太后|皇后|贵妃|妃|嫔|贵人|常在|答应)\b', r'\b(太子|皇子|皇孙|亲王|郡王|贝勒|贝子|公主|格格|郡主|县主|郡君|县君)\b', # Ancient official titles r'\b(丞相|相国|太师|太傅|太保|太尉|司徒|司空|大司马|大司农|大司寇)\b', r'\b(尚书|侍郎|郎中|员外郎|主事|知府|知州|知县|同知|通判|推官|巡抚|总督)\b', r'\b(御史大夫|御史中丞|监察御史|给事中|都察院|翰林院|国子监|钦天监)\b', r'\b(大学士|学士|侍读|侍讲|编修|检讨|庶吉士|举人|进士|状元|榜眼|探花)\b', # Military ranks r'\b(大元帅|元帅|大将军|将军|都督|都指挥使|指挥使|千户|百户|总兵|副将|参将|游击|都司|守备)\b', r'\b(提督|总兵官|副总兵|参将|游击将军|都司|守备|千总|把总|外委)\b', # Religious titles r'\b(国师|帝师|法王|活佛|堪布|仁波切|大和尚|方丈|住持|首座|维那|知客)\b', r'\b(天师|真人|道长|掌教|监院|高功|都讲|总理|提点|知观)\b', # Nobility ranks r'\b(公|侯|伯|子|男|开国公|郡公|国公|郡侯|县侯|郡伯|县伯|县子|县男)\b', r'\b(一品|二品|三品|四品|五品|六品|七品|八品|九品|正一品|从一品|正二品|从二品)\b', # Chinese politeness markers r'(请|敢|恭|敬|烦|有)(问|请|赐|教|告|示)', r'(拜|惠|赐|垂|雅|笑)(见|访|读|托|谢|顾|赐|存|纳|正|询)', r'(敬|谨|顿)(上|呈|启|白|首)' ], 'english': [ # Western titles r'\b(King|Queen|Prince|Princess|Emperor|Empress|Duke|Duchess|Marquis|Marquess|Earl|Count|Countess|Viscount|Viscountess|Baron|Baroness|Knight|Lord|Lady|Sir|Dame|General|Admiral|Captain|Major|Colonel|Commander|Lieutenant|Sergeant|Minister|Chancellor|President|Mayor|Governor|Judge|Doctor|Professor|Father|Reverend|Master|Mistress)\b', r'\b(His|Her|Your|Their)\s+(Majesty|Highness|Grace|Excellency|Honor|Worship|Lordship|Ladyship)\b', # Romanized historical titles r'\b(Tianzi|Huangdi|Huanghou|Taizi|Qinwang|Junwang|Beile|Beizi|Gongzhu|Gege)\b', r'\b(Chengxiang|Zaixiang|Taishi|Taifu|Taibao|Taiwei|Situ|Sikong|Dasima)\b', r'\b(Shogun|Daimyo|Samurai|Ronin|Ninja|Tenno|Mikado|Kampaku|Sessho)\b', r'\b(Taewang|Wangbi|Wanghu|Seja|Daegun|Gun|Ongju|Gongju|Buma)\b' ] } # Expanded Chinese numbers including classical forms CHINESE_NUMS = { # Basic numbers '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15, '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20, '二十一': 21, '二十二': 22, '二十三': 23, '二十四': 24, '二十五': 25, '三十': 30, '四十': 40, '五十': 50, '六十': 60, '七十': 70, '八十': 80, '九十': 90, '百': 100, # Classical/formal numbers '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10, '佰': 100, '仟': 1000, '萬': 10000, '万': 10000, # Ordinal indicators '第一': 1, '第二': 2, '第三': 3, '第四': 4, '第五': 5, '首': 1, '次': 2, '初': 1, '末': -1, } # Common words - keeping the same for filtering COMMON_WORDS = { '이', '그', '저', '우리', '너희', '자기', '당신', '여기', '거기', '저기', '오늘', '내일', '어제', '지금', '아까', '나중', '먼저', '다음', '마지막', '모든', '어떤', '무슨', '이런', '그런', '저런', '같은', '다른', '새로운', '하다', '있다', '없다', '되다', '하는', '있는', '없는', '되는', '것', '수', '때', '년', '월', '일', '시', '분', '초', '은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '만', '에서', '으로', '로', '까지', '부터', '에게', '한테', '께', '께서', 'この', 'その', 'あの', 'どの', 'これ', 'それ', 'あれ', 'どれ', 'わたし', 'あなた', 'かれ', 'かのじょ', 'わたしたち', 'あなたたち', 'きょう', 'あした', 'きのう', 'いま', 'あとで', 'まえ', 'つぎ', 'の', 'は', 'が', 'を', 'に', 'で', 'と', 'も', 'や', 'から', 'まで', '这', '那', '哪', '这个', '那个', '哪个', '这里', '那里', '哪里', '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '今天', '明天', '昨天', '现在', '刚才', '以后', '以前', '后来', '的', '了', '在', '是', '有', '和', '与', '或', '但', '因为', '所以', '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', } # ===================================================== # CHUNK CONTEXT MANAGER (unchanged - already optimal) # ===================================================== class ChunkContextManager: """Manage context within a chapter separate from history""" def __init__(self): self.current_chunks = [] self.chapter_num = None self.chapter_title = None def start_chapter(self, chapter_num, chapter_title): """Start a new chapter context""" self.current_chunks = [] self.chapter_num = chapter_num self.chapter_title = chapter_title def add_chunk(self, user_content, assistant_content, chunk_idx, total_chunks): """Add a chunk to the current chapter context""" self.current_chunks.append({ "user": user_content, "assistant": assistant_content, "chunk_idx": chunk_idx, "total_chunks": total_chunks }) def get_context_messages(self, limit=3): """Get last N chunks as messages for API context""" context = [] for chunk in self.current_chunks[-limit:]: context.extend([ {"role": "user", "content": chunk["user"]}, {"role": "assistant", "content": chunk["assistant"]} ]) return context def get_summary_for_history(self): """Create a summary representation for the history""" if not self.current_chunks: return None, None total_chunks = len(self.current_chunks) user_summary = f"[Chapter {self.chapter_num}: {self.chapter_title}]\n" user_summary += f"[{total_chunks} chunks processed]\n" if self.current_chunks: first_chunk = self.current_chunks[0]['user'] if len(first_chunk) > 500: user_summary += first_chunk[:500] + "..." else: user_summary += first_chunk assistant_summary = f"[Chapter {self.chapter_num} Translation Complete]\n" assistant_summary += f"[Translated in {total_chunks} chunks]\n" if self.current_chunks: samples = [] first_trans = self.current_chunks[0]['assistant'] samples.append(f"Beginning: {first_trans[:200]}..." if len(first_trans) > 200 else f"Beginning: {first_trans}") if total_chunks > 2: mid_idx = total_chunks // 2 mid_trans = self.current_chunks[mid_idx]['assistant'] samples.append(f"Middle: {mid_trans[:200]}..." if len(mid_trans) > 200 else f"Middle: {mid_trans}") if total_chunks > 1: last_trans = self.current_chunks[-1]['assistant'] samples.append(f"End: {last_trans[:200]}..." if len(last_trans) > 200 else f"End: {last_trans}") assistant_summary += "\n".join(samples) return user_summary, assistant_summary def clear(self): """Clear the current chapter context""" self.current_chunks = [] self.chapter_num = None self.chapter_title = None # ===================================================== # UNIFIED UTILITIES # ===================================================== class FileUtilities: """Utilities for file and path operations""" @staticmethod def extract_actual_chapter_number(chapter, patterns=None, config=None): """Extract actual chapter number from filename using improved logic""" # IMPORTANT: Check if this is a pre-split TEXT FILE chunk first if (chapter.get('is_chunk', False) and 'num' in chapter and isinstance(chapter['num'], float) and chapter.get('filename', '').endswith('.txt')): # For text file chunks only, preserve the decimal number return chapter['num'] # This will be 1.1, 1.2, etc. # Get filename for extraction (broadened to match GUI/spine data) filename = ( chapter.get('original_basename') or chapter.get('original_filename') or chapter.get('filename') or chapter.get('source_filename') or chapter.get('href') or chapter.get('idref') or chapter.get('id') or chapter.get('name') or chapter.get('key') or '' ) opf_spine_position = chapter.get('spine_order') if opf_spine_position is None: opf_spine_position = chapter.get('opf_spine_position') actual_num, method = extract_chapter_number_from_filename(filename, opf_spine_position=opf_spine_position) # If extraction failed (no digits and no special), fall back to spine/file data if actual_num is None and opf_spine_position is not None: actual_num = opf_spine_position method = 'opf_spine_fallback' # Only fall back to file_chapter_num when we still have no number if actual_num is None and chapter.get('file_chapter_num') is not None: actual_num = chapter['file_chapter_num'] method = 'file_chapter_num_fallback' # Prefer OPF spine position when available (ensures range selection follows content.opf) # opf_spine_position = chapter.get('spine_order') # opf_spine_data = chapter.get('opf_spine_data') # Use our improved extraction function # actual_num, method = extract_chapter_number_from_filename( # filename, # opf_spine_position=opf_spine_position, # opf_spine_data=opf_spine_data # ) # If extraction succeeded, return the result if actual_num is not None: #print(f"[DEBUG] Extracted {actual_num} from '{filename}' using method: {method}") return actual_num # Fallback to original complex logic for edge cases actual_num = None if patterns is None: patterns = PatternManager.FILENAME_EXTRACT_PATTERNS # Try to extract from original basename first if chapter.get('original_basename'): basename = chapter['original_basename'] # Check if decimal chapters are enabled for EPUBs enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' # For EPUBs, only check decimal patterns if the toggle is enabled if enable_decimal: # Check for standard decimal chapter numbers (e.g., Chapter_1.1, 1.2.html) decimal_match = re.search(r'(\d+)\.(\d+)', basename) if decimal_match: actual_num = float(f"{decimal_match.group(1)}.{decimal_match.group(2)}") return actual_num # Check for the XXXX_YY pattern where it represents X.YY decimal chapters decimal_prefix_match = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename) if decimal_prefix_match: first_part = decimal_prefix_match.group(1) second_part = decimal_prefix_match.group(2) if len(second_part) == 2 and int(second_part) > 9: chapter_num = int(first_part[-1]) decimal_part = second_part actual_num = float(f"{chapter_num}.{decimal_part}") return actual_num # Standard XXXX_Y format handling (existing logic) prefix_suffix_match = re.match(r'^(\d+)_(\d+)', basename) if prefix_suffix_match: second_part = prefix_suffix_match.group(2) if not enable_decimal: actual_num = int(second_part) return actual_num else: if len(second_part) == 1 or (len(second_part) == 2 and int(second_part) <= 9): actual_num = int(second_part) return actual_num # Check other patterns if no match yet for pattern in patterns: if pattern in [r'^(\d+)[_\.]', r'(\d{3,5})[_\.]', r'^(\d+)_']: continue match = re.search(pattern, basename, re.IGNORECASE) if match: actual_num = int(match.group(1)) break # Final fallback to chapter num if actual_num is None: actual_num = chapter.get("num", 0) print(f"[DEBUG] No pattern matched, using chapter num: {actual_num}") return actual_num @staticmethod def create_chapter_filename(chapter, actual_num=None): """Create consistent chapter filename""" # Check if we should use header as output name use_header_output = os.getenv("USE_HEADER_AS_OUTPUT", "0") == "1" # Check if this is for a text file is_text_file = chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) # Respect toggle: retain source extension and remove 'response_' prefix retain = should_retain_source_extension() # Helper to compute full original extension chain (e.g., '.html.xhtml') def _full_ext_from_original(ch): fn = ch.get('original_filename') if not fn: return '.html' bn = os.path.basename(fn) root, ext = os.path.splitext(bn) if not ext: return '.html' full_ext = '' while ext: full_ext = ext + full_ext root, ext = os.path.splitext(root) return full_ext or '.html' if use_header_output and chapter.get('title'): chapter_num_for_name = actual_num or chapter.get('num', 0) safe_title = make_safe_filename(chapter['title'], chapter_num_for_name) # For comparison, handle both int and float chapter numbers if isinstance(chapter_num_for_name, float): major = int(chapter_num_for_name) minor = int(round((chapter_num_for_name - major) * 100)) if minor > 0: comparison_name = f"chapter_{major:03d}_{minor:02d}" else: comparison_name = f"chapter_{major:03d}" else: comparison_name = f"chapter_{chapter_num_for_name:03d}" if safe_title and safe_title != comparison_name: if is_text_file: return f"{safe_title}.txt" if retain else f"response_{safe_title}.txt" else: # If retaining, use full original ext chain; else default .html if retain: return f"{safe_title}{_full_ext_from_original(chapter)}" return f"response_{safe_title}.html" # Check if decimal chapters are enabled enable_decimal = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1' # For EPUBs with decimal detection enabled if enable_decimal and 'original_basename' in chapter and chapter['original_basename']: basename = chapter['original_basename'] # Check for standard decimal pattern (e.g., Chapter_1.1) decimal_match = re.search(r'(\d+)\.(\d+)', basename) if decimal_match: # Create a modified basename that preserves the decimal base = os.path.splitext(basename)[0] # Replace dots with underscores for filesystem compatibility base = base.replace('.', '_') # Use .txt extension for text files if is_text_file: return f"{base}.txt" if retain else f"response_{base}.txt" else: if retain: return f"{base}{_full_ext_from_original(chapter)}" return f"response_{base}.html" # NOTE: Removed broken XXXX_YY decimal pattern handling that was mangling filenames # Files like 0009_10.xhtml should just use original_basename (handled below at line 1707+) # The old code was incorrectly extracting just the last digit and losing leading zeros # Standard EPUB handling - use original basename if 'original_basename' in chapter and chapter['original_basename']: base = os.path.splitext(chapter['original_basename'])[0] # Use .txt extension for text files if is_text_file: return f"{base}.txt" if retain else f"response_{base}.txt" else: if retain: # Preserve the full original extension chain return f"{base}{_full_ext_from_original(chapter)}" return f"response_{base}.html" else: # Text file handling (no original basename) if actual_num is None: actual_num = chapter.get('actual_chapter_num', chapter.get('num', 0)) # Handle decimal chapter numbers from text file splitting if isinstance(actual_num, float): major = int(actual_num) minor = int(round((actual_num - major) * 10)) # Use *10 to get 0, 1, 2, etc. from 1.0, 1.1, 1.2 # PDF CHUNK FIX: Check if the chunk has a specific filename with extension # For PDF chunks, preserve the .html or .md extension from the original filename chunk_filename = chapter.get('filename', '') if chunk_filename and (chunk_filename.endswith('.html') or chunk_filename.endswith('.md')): # Use the extension from the chunk's original filename file_ext = '.html' if chunk_filename.endswith('.html') else '.md' if retain: return f"section_{major}_{minor}{file_ext}" else: return f"response_section_{major}_{minor}{file_ext}" elif is_text_file: return f"section_{major}_{minor}.txt" if retain else f"response_section_{major}_{minor}.txt" else: return f"{major:03d}_{minor:02d}.html" if retain else f"response_{major:03d}_{minor:02d}.html" else: # For integer chapter numbers, use standard formatting if is_text_file: return f"section_{actual_num}.txt" if retain else f"response_section_{actual_num}.txt" else: return f"{actual_num:03d}.html" if retain else f"response_{actual_num:03d}.html" # ===================================================== # UNIFIED PROGRESS MANAGER # ===================================================== class ProgressManager: """Unified progress management""" def __init__(self, payloads_dir): self.payloads_dir = payloads_dir self.PROGRESS_FILE = os.path.join(payloads_dir, "translation_progress.json") self.prog = self._init_or_load() # Disable auto-dedup unless explicitly enabled; dedup can drop distinct chapters sharing filenames if os.getenv("ENABLE_PROGRESS_DEDUP", "0") == "1": self._dedup_by_output() def _init_or_load(self): """Initialize or load progress tracking with improved structure""" if os.path.exists(self.PROGRESS_FILE): try: with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: prog = json.load(pf) except json.JSONDecodeError as e: print(f"⚠️ Warning: Progress file is corrupted: {e}") print("🔧 Attempting to fix JSON syntax...") try: with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf: content = pf.read() content = re.sub(r',\s*\]', ']', content) content = re.sub(r',\s*\}', '}', content) prog = json.loads(content) with open(self.PROGRESS_FILE, "w", encoding="utf-8") as pf: json.dump(prog, pf, ensure_ascii=False, indent=2) print("✅ Successfully fixed and saved progress file") except Exception as fix_error: print(f"❌ Could not fix progress file: {fix_error}") print("🔄 Creating backup and starting fresh...") backup_name = f"translation_progress_backup_{int(time.time())}.json" backup_path = os.path.join(self.payloads_dir, backup_name) try: shutil.copy(self.PROGRESS_FILE, backup_path) print(f"📁 Backup saved to: {backup_name}") except: pass prog = { "chapters": {}, "chapter_chunks": {}, "version": "2.0" } if "chapters" not in prog: prog["chapters"] = {} for idx in prog.get("completed", []): prog["chapters"][str(idx)] = { "status": "completed", "timestamp": None } if "chapter_chunks" not in prog: prog["chapter_chunks"] = {} else: prog = { "chapters": {}, "chapter_chunks": {}, "image_chunks": {}, "version": "2.1" } return prog def _dedup_by_output(self): """Keep a single entry per normalized output filename; priority: qa_failed > pending > failed > in_progress > completed.""" def _norm_out(fname: str): if not fname: return None base = os.path.basename(fname) if base.startswith("response_"): base = base[len("response_"):] return os.path.splitext(base)[0] def _infer_num(fname: str): if not fname: return None nums = re.findall(r"\d+", fname) if not nums: return None nums = list(map(int, nums)) if nums[0] == 0 and nums[-1] > 0: return nums[-1] return nums[0] # Prefer completed over failed/pending/in_progress, but keep qa_failed highest severity = {'qa_failed': 6, 'completed': 5, 'merged': 5, 'pending': 4, 'failed': 3, 'in_progress': 2, 'unknown': 0} dedup = {} for key, info in list(self.prog.get("chapters", {}).items()): out = info.get("output_file") norm = _norm_out(out) or key if (info.get("actual_num") in (None, 0)) and out: hint = _infer_num(out) if hint is not None: info["actual_num"] = hint current = dedup.get(norm) if current: cur_rank = severity.get(current.get("status", "unknown"), 0) new_rank = severity.get(info.get("status", "unknown"), 0) if (new_rank > cur_rank) or (new_rank == cur_rank and info.get("last_updated", 0) > current.get("last_updated", 0)): dedup[norm] = info else: dedup[norm] = info new_chapters = {} for norm, info in dedup.items(): new_key = str(info["actual_num"]) if info.get("actual_num") is not None else norm if new_key in new_chapters: cur_rank = severity.get(new_chapters[new_key].get("status", "unknown"), 0) new_rank = severity.get(info.get("status", "unknown"), 0) if (new_rank > cur_rank) or (new_rank == cur_rank and info.get("last_updated", 0) > new_chapters[new_key].get("last_updated", 0)): new_chapters[new_key] = info else: new_chapters[new_key] = info self.prog["chapters"] = new_chapters # NOTE: caller is responsible for saving after dedup def _get_chapter_key(self, actual_num, output_file=None, chapter_obj=None, content_hash=None): """Generate consistent chapter key, handling collisions with composite keys. Returns the key that should be used for this chapter in the progress dict. """ def _normalize_fname(fname): """Normalize filename for comparison regardless of response_ prefix or extension.""" if not fname: return None base = os.path.basename(fname) if base.startswith('response_'): base = base[len('response_'):] # Strip extension only for comparison so .html vs .xhtml don't diverge return os.path.splitext(base)[0] def _make_spine_key(num, spine_pos): if spine_pos is None: return None return f"{num}@{spine_pos}" spine_pos = None if chapter_obj: spine_pos = chapter_obj.get('spine_order') if spine_pos is None: spine_pos = chapter_obj.get('opf_spine_position') # CHUNK FIX: For decimal chapter numbers (e.g., 1.0, 1.1), use the full decimal in the key # This prevents collisions when multiple chunks share the same integer part if isinstance(actual_num, float) and actual_num != int(actual_num): # Convert to string preserving decimal: "1.0", "1.1", etc. chapter_key = str(actual_num) else: chapter_key = str(actual_num) # Determine the output filename if output_file: filename = output_file elif chapter_obj: from TransateKRtoEN import FileUtilities filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num) else: # No way to determine filename, use simple key return chapter_key # SPECIAL FILES FIX: Check if there's an in-progress entry with matching content_hash # This allows us to update the same entry when completing a special file if content_hash and chapter_key in self.prog["chapters"]: existing_info = self.prog["chapters"][chapter_key] existing_hash = existing_info.get("content_hash") existing_file = existing_info.get("output_file") # If hashes match and it's in-progress (no output file yet), keep using simple key if existing_hash == content_hash and not existing_file: return chapter_key # If a spine key already exists, prefer it spine_key = _make_spine_key(actual_num, spine_pos) if spine_key and spine_key in self.prog["chapters"]: existing_info = self.prog["chapters"][spine_key] existing_file = existing_info.get("output_file") # Require exact filename match to avoid mixing notice/chapter files with same number if existing_file == filename: return spine_key # Check if simple key exists and matches this file if chapter_key in self.prog["chapters"]: existing_info = self.prog["chapters"][chapter_key] existing_file = existing_info.get("output_file") existing_status = existing_info.get("status") # If the existing entry is for the same file, use simple key if existing_file == filename: return chapter_key # NEW: tolerate retain-source toggle changes (response_ prefix / extension) existing_norm = _normalize_fname(existing_file) new_norm = _normalize_fname(filename) if existing_norm and new_norm and existing_norm == new_norm: return chapter_key # MERGED STATUS FIX: If existing entry is merged, always use simple key # Merged chapters point to parent's output_file, so filename won't match # but we still want to use the same key to find the merged status if existing_status == "merged": return chapter_key # Different file with same chapter number - prefer spine-based composite, else filename-based if spine_key: return spine_key file_basename = os.path.splitext(os.path.basename(filename))[0] file_basename = file_basename.replace("response_", "") composite_key = f"{actual_num}_{file_basename}" # NEW: if existing entry is pending and for a different file, don't overwrite it if existing_status and str(existing_status).lower().startswith("pending"): if existing_file and existing_file != filename: return composite_key return composite_key # Check if composite key already exists for this file file_basename = os.path.splitext(os.path.basename(filename))[0] file_basename = file_basename.replace("response_", "") composite_key = f"{actual_num}_{file_basename}" spine_composite = spine_key if spine_composite and spine_composite in self.prog["chapters"]: return spine_composite if composite_key in self.prog["chapters"]: return composite_key # No existing entry - use simple key for new entries return spine_key or chapter_key def save(self): """Save progress to file""" try: self.prog["completed_list"] = [] for chapter_key, chapter_info in self.prog.get("chapters", {}).items(): if chapter_info.get("status") == "completed" and chapter_info.get("output_file"): actual_num = chapter_info.get("actual_num", 0) self.prog["completed_list"].append({ "num": actual_num, "idx": 0, # idx is not used anymore "title": f"Chapter {actual_num}", "file": chapter_info.get("output_file", ""), "key": chapter_key }) if self.prog.get("completed_list"): self.prog["completed_list"].sort(key=lambda x: x["num"]) temp_file = self.PROGRESS_FILE + '.tmp' with open(temp_file, "w", encoding="utf-8") as pf: json.dump(self.prog, pf, ensure_ascii=False, indent=2) if os.path.exists(self.PROGRESS_FILE): os.remove(self.PROGRESS_FILE) os.rename(temp_file, self.PROGRESS_FILE) except Exception as e: print(f"⚠️ Warning: Failed to save progress: {e}") temp_file = self.PROGRESS_FILE + '.tmp' if os.path.exists(temp_file): try: os.remove(temp_file) except: pass def update(self, idx, actual_num, content_hash, output_file, status="in_progress", ai_features=None, raw_num=None, chapter_obj=None, merged_chapters=None, qa_issues_found=None): """Update progress for a chapter""" # Use helper method to get consistent key chapter_key = self._get_chapter_key(actual_num, output_file, chapter_obj, content_hash) # Log if we're using a composite key if "_" in chapter_key and chapter_key != str(actual_num): print(f"📌 Using composite key for chapter {actual_num}: {chapter_key}") # MERGED CHAPTERS FIX: If this chapter has merged children and status changes to failed/pending, # clear the merged status from all child chapters so they can be retranslated if status in ["qa_failed", "failed", "pending", "error"] and chapter_key in self.prog["chapters"]: existing_info = self.prog["chapters"][chapter_key] merged_child_nums = existing_info.get("merged_chapters", []) if merged_child_nums: print(f"🔓 Clearing merged status from {len(merged_child_nums)} child chapters due to parent status: {status}") # Find and clear merged status from all child chapters for child_chapter_key, child_info in list(self.prog["chapters"].items()): if child_info.get("status") == "merged" and child_info.get("merged_parent_chapter") == actual_num: child_actual_num = child_info.get("actual_num") print(f" 🔓 Clearing merged status for chapter {child_actual_num}") # Delete the merged child entry so it will be retranslated del self.prog["chapters"][child_chapter_key] chapter_info = { "actual_num": actual_num, "content_hash": content_hash, "output_file": output_file, "status": status, "last_updated": time.time() } # CRITICAL: Store original_basename for OPF->output mapping in GUI if chapter_obj: if chapter_obj.get('original_basename'): chapter_info["original_basename"] = chapter_obj['original_basename'] elif chapter_obj.get('original_filename'): chapter_info["original_basename"] = os.path.basename(chapter_obj['original_filename']) # Add raw number tracking if raw_num is not None: chapter_info["raw_chapter_num"] = raw_num # Check if zero detection was disabled if hasattr(builtins, '_DISABLE_ZERO_DETECTION') and builtins._DISABLE_ZERO_DETECTION: chapter_info["zero_adjusted"] = False else: chapter_info["zero_adjusted"] = (raw_num != actual_num) if raw_num is not None else False # FIXED: Store AI features if provided if ai_features is not None: chapter_info["ai_features"] = ai_features # Preserve existing AI features if not overwriting elif chapter_key in self.prog["chapters"] and "ai_features" in self.prog["chapters"][chapter_key]: chapter_info["ai_features"] = self.prog["chapters"][chapter_key]["ai_features"] # Add merged chapters list if provided (for parent chapters in request merging) if merged_chapters is not None: chapter_info["merged_chapters"] = merged_chapters # Add QA issues if provided (for qa_failed status) if qa_issues_found is not None: chapter_info["qa_issues"] = True chapter_info["qa_timestamp"] = time.time() chapter_info["qa_issues_found"] = qa_issues_found # IMPORTANT: When changing to in_progress or failed status, explicitly clear QA fields # This ensures old qa_failed markers don't persist elif status in ["in_progress", "failed"]: # Don't add QA fields - they will be excluded from chapter_info pass self.prog["chapters"][chapter_key] = chapter_info def mark_as_merged(self, idx, actual_num, content_hash, parent_chapter_num, chapter_obj=None, parent_output_file=None): """Mark a chapter as merged into a parent chapter""" chapter_key = self._get_chapter_key(actual_num, output_file=None, chapter_obj=chapter_obj, content_hash=content_hash) merged_info = { "actual_num": actual_num, "content_hash": content_hash, "output_file": parent_output_file, # Point to parent's output file "status": "merged", "merged_parent_chapter": parent_chapter_num, "last_updated": time.time() } # Add original_basename so GUI can match by source filename if chapter_obj and 'original_basename' in chapter_obj: merged_info["original_basename"] = chapter_obj['original_basename'] elif chapter_obj and 'filename' in chapter_obj: merged_info["original_basename"] = chapter_obj['filename'] self.prog["chapters"][chapter_key] = merged_info def update_merged_chapters_list(self, parent_chapter_num, merged_chapter_nums, parent_content_hash=None, parent_chapter_obj=None): """Update the parent chapter to track which chapters were merged into it""" chapter_key = self._get_chapter_key(parent_chapter_num, output_file=None, chapter_obj=parent_chapter_obj, content_hash=parent_content_hash) if chapter_key in self.prog["chapters"]: self.prog["chapters"][chapter_key]["merged_chapters"] = merged_chapter_nums def check_chapter_status(self, chapter_idx, actual_num, content_hash, output_dir, chapter_obj=None): """Check if a chapter needs translation""" # Define _norm helper early so it's available throughout this method def _norm(fname: str): """ Normalize a filename for comparison: - drop leading response_ prefix - strip *all* extensions (handle .html.xhtml, .md.html, etc.) - lowercase for case-insensitive matching on Windows """ if not fname: return "" base = os.path.basename(fname) if base.startswith("response_"): base = base[len("response_"):] # Strip all extensions, not just the last one while True: base, ext = os.path.splitext(base) if not ext: break return base.lower() # If caller passed 0/None, recompute from filename/spine to avoid collapsing to chapter 0 if (actual_num is None or actual_num <= 0) and chapter_obj: try: from TransateKRtoEN import FileUtilities recomputed = FileUtilities.extract_actual_chapter_number(chapter_obj, patterns=None, config=None) if recomputed is not None: actual_num = recomputed except Exception: pass # Use helper method to get consistent key chapter_key = self._get_chapter_key(actual_num, output_file=None, chapter_obj=chapter_obj, content_hash=content_hash) # Check if we have tracking for this chapter if chapter_key in self.prog["chapters"]: chapter_info = self.prog["chapters"][chapter_key] status = chapter_info.get("status") status_l = status.lower() if isinstance(status, str) else status or "" # Failed statuses ALWAYS trigger retranslation if status in ["qa_failed", "failed", "error", "file_missing"]: return True, None, None # Merged status - skip translation, content is in parent chapter if status == "merged": parent_chapter = chapter_info.get("merged_parent_chapter") return False, f"Chapter {actual_num} merged into chapter {parent_chapter}", None # Completed - check file exists if status in ["completed", "completed_empty", "completed_image_only"]: output_file = chapter_info.get("output_file") if output_file: output_path = os.path.join(output_dir, output_file) if os.path.exists(output_path): return False, f"Chapter {actual_num} already translated: {output_file}", output_file # Fallback: look for any file with same base name (ignore extensions) expected_norm = _norm(output_file) try: for f in os.listdir(output_dir): if _norm(f) == expected_norm: alt_path = os.path.join(output_dir, f) if os.path.exists(alt_path): # Update stored filename to the discovered one self.prog["chapters"][chapter_key]["output_file"] = f self.save() return False, f"Chapter {actual_num} already translated: {f}", f except Exception: pass # File missing - retranslate del self.prog["chapters"][chapter_key] if chapter_key in self.prog.get("chapter_chunks", {}): del self.prog["chapter_chunks"][chapter_key] self.save() return True, None, None # Any other status - retranslate return True, None, None # No entry in progress tracking - check if file exists on disk # This handles the case where progress file was deleted but translated files remain if chapter_obj: from TransateKRtoEN import FileUtilities output_filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num) output_path = os.path.join(output_dir, output_filename) # If a differently-keyed entry already tracks this file, reuse it instead of auto-discovering expected_norm = _norm(output_filename) for k, info in self.prog.get("chapters", {}).items(): if _norm(info.get("output_file")) == expected_norm: status = info.get("status") if status in ["completed", "completed_empty", "completed_image_only"]: if info.get("output_file"): if os.path.exists(os.path.join(output_dir, info["output_file"])): return False, f"Chapter {info.get('actual_num', actual_num)} already translated: {info['output_file']}", info["output_file"] # If tracked with other status, treat as tracked (will retranslate if non-completed) return True, None, info.get("output_file") # Check if file exists for auto-discovery if os.path.exists(output_path): print(f"📁 Found existing file for chapter {actual_num}: {output_filename}") self.prog["chapters"][chapter_key] = { "actual_num": actual_num, "content_hash": content_hash, "output_file": output_filename, "status": "completed", "last_updated": os.path.getmtime(output_path), "auto_discovered": True } self.save() return False, f"Chapter {actual_num} already exists: {output_filename}", output_filename # No entry and no file - needs translation return True, None, None def cleanup_missing_files(self, output_dir): """Remove missing files and clear merged children of missing parents""" cleaned_count = 0 deleted_parents = set() # Track which parent chapters were deleted parents_with_missing_files = set() # Track parents with missing files (for merged children clearing) # First pass: Remove entries for missing files (except merged children and certain non-final states) for chapter_key, chapter_info in list(self.prog["chapters"].items()): output_file = chapter_info.get("output_file") status = chapter_info.get("status") status_l = status.lower().strip() if isinstance(status, str) else (str(status).lower().strip() if status is not None else "") # MERGED CHAPTERS FIX: Don't delete merged children in first pass # They will be handled in second pass if their parent was deleted if status == "merged": continue # QA_FAILED / FAILED / IN_PROGRESS / PENDING FIX: # Don't delete entries that are meant to be visible in the retranslation UI # even when their output file is missing. # - qa_failed/failed: should remain visible for investigation/retry # - in_progress: file doesn't exist yet because translation is ongoing # - pending: user explicitly marked for retranslation; file may have been deleted on purpose if status_l.startswith("pending") or status_l in ["qa_failed", "failed", "in_progress"]: continue if output_file: output_path = os.path.join(output_dir, output_file) if not os.path.exists(output_path): # Before deleting, check if the file was renamed (response_/extension toggle) _html_exts = {'.html', '.xhtml', '.htm', '.xml'} def _norm_cleanup(fn): b = os.path.basename(fn) if b.startswith('response_'): b = b[len('response_'):] while True: b2, e2 = os.path.splitext(b) if e2.lower() in _html_exts: b = b2 else: break return b.lower() expected_norm = _norm_cleanup(output_file) renamed_match = None try: for f in os.listdir(output_dir): if f.lower().endswith(('.html', '.xhtml', '.htm')): if _norm_cleanup(f) == expected_norm: renamed_match = f break except Exception: pass if renamed_match: # File was renamed (retain toggle) – update the stored filename chapter_info['output_file'] = renamed_match continue actual_num = chapter_info.get("actual_num") if actual_num is not None: # Track if this was a parent of merged chapters deleted_parents.add(actual_num) # Also track if this chapter has merged children (for later clearing) if chapter_info.get("merged_chapters"): parents_with_missing_files.add(actual_num) # Delete the entry del self.prog["chapters"][chapter_key] # Remove chunk data if chapter_key in self.prog.get("chapter_chunks", {}): del self.prog["chapter_chunks"][chapter_key] cleaned_count += 1 # Second pass: Clear merged children whose parents were deleted OR have missing files if deleted_parents or parents_with_missing_files: all_affected_parents = deleted_parents | parents_with_missing_files for chapter_key, chapter_info in list(self.prog["chapters"].items()): if chapter_info.get("status") == "merged": parent_num = chapter_info.get("merged_parent_chapter") if parent_num in all_affected_parents: actual_num = chapter_info.get("actual_num") print(f"🔓 Clearing merged child chapter {actual_num} (parent {parent_num} file is missing)") del self.prog["chapters"][chapter_key] cleaned_count += 1 if cleaned_count > 0: print(f"🔄 Removed {cleaned_count} missing file entries") def migrate_to_content_hash(self, chapters): """Change keys to match actual_num values for proper mapping and sort by chapter number""" def _normalize_out(fname: str): if not fname: return None base = os.path.basename(fname) if base.startswith('response_'): base = base[len('response_'):] return os.path.splitext(base)[0] def _infer_num_from_filename(fname: str): if not fname: return None nums = re.findall(r'\\d+', fname) if not nums: return None nums = list(map(int, nums)) if nums[0] == 0 and nums[-1] > 0: return nums[-1] return nums[0] # Priority: qa_failed > pending > failed > in_progress > completed severity_rank = {'qa_failed': 6, 'completed': 5, 'merged': 5, 'pending': 4, 'failed': 3, 'in_progress': 2, 'unknown': 0} # First, deduplicate by normalized output filename choosing highest severity then latest timestamp dedup = {} for old_key, chapter_info in self.prog["chapters"].items(): out = chapter_info.get("output_file") norm = _normalize_out(out) if not norm: norm = old_key # fallback to key to avoid losing entry # Fix actual_num if missing or zero using filename hint actual_num = chapter_info.get("actual_num") if (actual_num in (None, 0)) and out: hint = _infer_num_from_filename(out) if hint is not None: chapter_info["actual_num"] = hint actual_num = hint current_best = dedup.get(norm) if current_best: best_sev = severity_rank.get(current_best.get("status", "unknown"), 0) cur_sev = severity_rank.get(chapter_info.get("status", "unknown"), 0) if (cur_sev > best_sev) or (cur_sev == best_sev and chapter_info.get("last_updated", 0) > current_best.get("last_updated", 0)): dedup[norm] = chapter_info else: dedup[norm] = chapter_info new_chapters = {} migrated_count = 0 for norm, chapter_info in dedup.items(): actual_num = chapter_info.get("actual_num") key_candidate = None # Prefer numeric key when available if actual_num is not None: key_candidate = str(actual_num) else: key_candidate = norm # If non-numeric key, keep as-is if not key_candidate.isdigit(): new_key = key_candidate else: new_key = key_candidate # Handle collisions by severity and timestamp if new_key in new_chapters: existing = new_chapters[new_key] best_sev = severity_rank.get(existing.get("status", "unknown"), 0) cur_sev = severity_rank.get(chapter_info.get("status", "unknown"), 0) if (cur_sev > best_sev) or (cur_sev == best_sev and chapter_info.get("last_updated", 0) > existing.get("last_updated", 0)): new_chapters[new_key] = chapter_info else: new_chapters[new_key] = chapter_info migrated_count += 1 # Sort chapters by actual_num field, then by key as fallback def sort_key(item): key, chapter_info = item actual_num = chapter_info.get("actual_num") if actual_num is not None: return actual_num else: # Fallback to key if no actual_num try: return int(key) except ValueError: # For non-numeric keys, sort them at the end return float('inf') sorted_chapters = dict(sorted(new_chapters.items(), key=sort_key)) if migrated_count > 0: # Also migrate and sort chapter_chunks if they exist if "chapter_chunks" in self.prog: new_chunks = {} for old_key, chunk_data in self.prog["chapter_chunks"].items(): if not str(old_key).isdigit(): new_chunks[old_key] = chunk_data elif old_key in self.prog["chapters"] and "actual_num" in self.prog["chapters"][old_key]: new_key = str(self.prog["chapters"][old_key]["actual_num"]) new_chunks[new_key] = chunk_data else: new_chunks[old_key] = chunk_data # Sort chapter_chunks using the same sorting logic sorted_chunks = dict(sorted(new_chunks.items(), key=sort_key)) self.prog["chapter_chunks"] = sorted_chunks self.prog["chapters"] = sorted_chapters self.save() print(f"✅ Migrated {migrated_count} entries to use actual_num as key and sorted by chapter number") else: # Even if no migration occurred, still apply sorting self.prog["chapters"] = sorted_chapters if "chapter_chunks" in self.prog: sorted_chunks = dict(sorted(self.prog["chapter_chunks"].items(), key=sort_key)) self.prog["chapter_chunks"] = sorted_chunks self.save() print("✅ Sorted chapters by chapter number") def get_stats(self, output_dir): """Get statistics about translation progress""" stats = { "total_tracked": len(self.prog["chapters"]), "completed": 0, "missing_files": 0, "in_progress": 0 } for chapter_info in self.prog["chapters"].values(): status = chapter_info.get("status") output_file = chapter_info.get("output_file") if status == "completed" and output_file: output_path = os.path.join(output_dir, output_file) if os.path.exists(output_path): stats["completed"] += 1 else: stats["missing_files"] += 1 elif status == "in_progress": stats["in_progress"] += 1 elif status == "file_missing": stats["missing_files"] += 1 return stats # ===================================================== # UNIFIED CONTENT PROCESSOR # ===================================================== class ContentProcessor: """Unified content processing""" @staticmethod def clean_ai_artifacts(text, remove_artifacts=True): """Remove AI response artifacts from text - but ONLY when enabled""" if not remove_artifacts: return text # IMPORTANT: Protect split markers used by request merging # These must NEVER be removed as they're critical for split-the-merge split_marker_pattern = r']*id="split-\d+"[^>]*>.*?SPLIT MARKER.*?' has_split_markers = bool(re.search(split_marker_pattern, text, re.DOTALL | re.IGNORECASE)) if has_split_markers: # Extract and preserve split markers temporarily split_markers = [] def preserve_marker(match): marker_id = f"__SPLIT_MARKER_{len(split_markers)}__" split_markers.append(match.group(0)) return marker_id text = re.sub(split_marker_pattern, preserve_marker, text, flags=re.DOTALL | re.IGNORECASE) # First, remove thinking tags if they exist text = ContentProcessor._remove_thinking_tags(text) # After removing thinking tags, re-analyze the text structure # to catch AI artifacts that may now be at the beginning lines = text.split('\n') # Clean up empty lines at the beginning while lines and not lines[0].strip(): lines.pop(0) if not lines: # Restore split markers before returning if has_split_markers: for i, marker in enumerate(split_markers): text = text.replace(f"__SPLIT_MARKER_{i}__", marker) return text # Check the first non-empty line for AI artifacts first_line = lines[0].strip() ai_patterns = [ r'^(?:Sure|Okay|Understood|Of course|Got it|Alright|Certainly|Here\'s|Here is)', r'^(?:I\'ll|I will|Let me) (?:translate|help|assist)', r'^(?:System|Assistant|AI|User|Human|Model)\s*:', r'^\[PART\s+\d+/\d+\]', r'^(?:Translation note|Note|Here\'s the translation|I\'ve translated)', r'^```(?:html|xml|text)?\s*$', # Enhanced code block detection r'^', remaining_text, re.IGNORECASE) or len(remaining_text.strip()) > 50): # Reduced from 100 to 50 print(f"✂️ Removed AI artifact: {first_line[:50]}...") return remaining_text.lstrip() if first_line.lower() in ['html', 'text', 'content', 'translation', 'output']: remaining_lines = lines[1:] remaining_text = '\n'.join(remaining_lines) if remaining_text.strip(): print(f"✂️ Removed single word artifact: {first_line}") result = remaining_text.lstrip() # Restore split markers if has_split_markers: for i, marker in enumerate(split_markers): result = result.replace(f"__SPLIT_MARKER_{i}__", marker) return result result = '\n'.join(lines) # Restore split markers before returning if has_split_markers: for i, marker in enumerate(split_markers): result = result.replace(f"__SPLIT_MARKER_{i}__", marker) return result @staticmethod def _remove_thinking_tags(text): """Remove thinking tags that some AI models produce""" if not text: return text # Common thinking tag patterns used by various AI models thinking_patterns = [ # XML-style thinking tags (r'.*?', 'thinking'), (r'.*?', 'think'), (r'.*?', 'thoughts'), (r'.*?', 'reasoning'), (r'.*?', 'analysis'), (r'.*?', 'reflection'), # OpenAI o1-style reasoning blocks - fix the regex escaping (r'<\|thinking\|>.*?', 'o1-thinking'), # Claude-style thinking blocks (r'\[thinking\].*?\[/thinking\]', 'claude-thinking'), # Generic bracketed thinking patterns (r'\[THINKING\].*?\[/THINKING\]', 'bracketed-thinking'), (r'\[ANALYSIS\].*?\[/ANALYSIS\]', 'bracketed-analysis'), ] original_text = text removed_count = 0 for pattern, tag_type in thinking_patterns: # Use DOTALL flag to match across newlines matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE) if matches: text = re.sub(pattern, '', text, flags=re.DOTALL | re.IGNORECASE) removed_count += len(matches) # Also remove standalone code block markers that might be artifacts # But preserve all actual content - only remove the ``` markers themselves code_block_removed = 0 code_block_patterns = [ (r'^```\w*\s*\n', '\n'), # Opening code blocks - replace with newline (r'\n```\s*$', ''), # Closing code blocks at end - remove entirely (r'^```\w*\s*$', ''), # Standalone ``` on its own line - remove entirely ] for pattern, replacement in code_block_patterns: matches = re.findall(pattern, text, re.MULTILINE) if matches: text = re.sub(pattern, replacement, text, flags=re.MULTILINE) code_block_removed += len(matches) # Clean up any extra whitespace or empty lines left after removing thinking tags total_removed = removed_count + code_block_removed if total_removed > 0: # Remove multiple consecutive newlines text = re.sub(r'\n\s*\n\s*\n', '\n\n', text) # Remove leading/trailing whitespace text = text.strip() if removed_count > 0 and code_block_removed > 0: print(f"🧠 Removed {removed_count} thinking tag(s) and {code_block_removed} code block marker(s)") elif removed_count > 0: print(f"🧠 Removed {removed_count} thinking tag(s)") elif code_block_removed > 0: print(f"📝 Removed {code_block_removed} code block marker(s)") return text @staticmethod def clean_memory_artifacts(text): """Remove any memory/summary artifacts that leaked into the translation""" text = re.sub(r'\[MEMORY\].*?\[END MEMORY\]', '', text, flags=re.DOTALL) lines = text.split('\n') cleaned_lines = [] skip_next = False for line in lines: if any(marker in line for marker in ['[MEMORY]', '[END MEMORY]', 'Previous context summary:', 'memory summary', 'context summary', '[Context]']): skip_next = True continue if skip_next and line.strip() == '': skip_next = False continue skip_next = False cleaned_lines.append(line) return '\n'.join(cleaned_lines) @staticmethod def emergency_restore_paragraphs(text, original_html=None, verbose=True): """Emergency restoration when AI returns wall of text without proper paragraph tags""" def log(message): if verbose: print(message) if text.count('

') >= 3: return text if original_html: original_para_count = original_html.count('

') current_para_count = text.count('

') if current_para_count < original_para_count / 2: log(f"⚠️ Paragraph mismatch! Original: {original_para_count}, Current: {current_para_count}") log("🔧 Attempting emergency paragraph restoration...") if '

' not in text and len(text) > 300: log("❌ No paragraph tags found - applying emergency restoration") if '\n\n' in text: parts = text.split('\n\n') paragraphs = ['

' + part.strip() + '

' for part in parts if part.strip()] return '\n'.join(paragraphs) dialogue_pattern = r'(?<=[.!?])\s+(?=[""\u201c\u201d])' if re.search(dialogue_pattern, text): parts = re.split(dialogue_pattern, text) paragraphs = [] for part in parts: part = part.strip() if part: if not part.startswith('

'): part = '

' + part if not part.endswith('

'): part = part + '

' paragraphs.append(part) return '\n'.join(paragraphs) sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af])' sentences = re.split(sentence_boundary, text) if len(sentences) > 1: paragraphs = [] current_para = [] for sentence in sentences: sentence = sentence.strip() if not sentence: continue current_para.append(sentence) should_break = ( len(current_para) >= 3 or sentence.rstrip().endswith(('"', '"', '"')) or '* * *' in sentence or '***' in sentence or '---' in sentence ) if should_break: para_text = ' '.join(current_para) if not para_text.startswith('

'): para_text = '

' + para_text if not para_text.endswith('

'): para_text = para_text + '

' paragraphs.append(para_text) current_para = [] if current_para: para_text = ' '.join(current_para) if not para_text.startswith('

'): para_text = '

' + para_text if not para_text.endswith('

'): para_text = para_text + '

' paragraphs.append(para_text) result = '\n'.join(paragraphs) log(f"✅ Restored {len(paragraphs)} paragraphs from wall of text") return result words = text.split() if len(words) > 100: paragraphs = [] words_per_para = max(100, len(words) // 10) for i in range(0, len(words), words_per_para): chunk = ' '.join(words[i:i + words_per_para]) if chunk.strip(): paragraphs.append('

' + chunk.strip() + '

') return '\n'.join(paragraphs) elif '

' in text and text.count('

') < 3 and len(text) > 1000: log("⚠️ Very few paragraphs for long text - checking if more breaks needed") soup = BeautifulSoup(text, 'html.parser') existing_paras = soup.find_all('p') new_paragraphs = [] for para in existing_paras: para_text = para.get_text() if len(para_text) > 500: sentences = re.split(r'(?<=[.!?])\s+', para_text) if len(sentences) > 5: chunks = [] current = [] for sent in sentences: current.append(sent) if len(current) >= 3: chunks.append('

' + ' '.join(current) + '

') current = [] if current: chunks.append('

' + ' '.join(current) + '

') new_paragraphs.extend(chunks) else: new_paragraphs.append(str(para)) else: new_paragraphs.append(str(para)) return '\n'.join(new_paragraphs) return text @staticmethod def emergency_restore_images(text, original_html=None, verbose=True): """Emergency restoration of images lost during translation - Filename Pattern Search Args: text: Translated HTML or markdown text original_html: Original HTML before translation (can be actual HTML or converted markdown) verbose: Whether to print debug messages Returns: Text with restored image tags """ if not original_html or not text: return text def log(message): if verbose: print(message) try: import re import os # Parse both documents soup_orig = BeautifulSoup(original_html, 'html.parser') soup_text = BeautifulSoup(text, 'html.parser') # Extract images from source orig_images = soup_orig.find_all('img') if not orig_images: return text # Extract images from translation text_images = soup_text.find_all('img') # If counts match, nothing to do if len(orig_images) == len(text_images): return text # If translation has fewer images, try to restore them if len(text_images) < len(orig_images): log(f"🖼️ Image mismatch! Source: {len(orig_images)}, Translation: {len(text_images)}") log("🔧 Attempting emergency image restoration (filename search method)...") # Get the set of image sources present in translation present_srcs = set() for img in text_images: src = img.get('src') if src: present_srcs.add(src) # Collect missing images missing_images = [] for img in orig_images: src = img.get('src') if src and src not in present_srcs: missing_images.append((src, img)) if not missing_images: return text # Convert both to strings for searching source_str = str(original_html) text_str = str(text) inserted_count = 0 # For each missing image, find where it appears in source and insert at same relative position in output for src, orig_img in missing_images: # Extract just the filename from the path filename = os.path.basename(src) log(f" 🔍 Processing missing image: {src}") # Try to find with full filename first (most specific) pattern = re.escape(filename) source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE)) log(f" Searching for full filename '{filename}': {len(source_matches)} matches") # If not found, try without response_ prefix if not source_matches and filename.lower().startswith('response_'): filename_no_prefix = filename[9:] # Remove 'response_' pattern = re.escape(filename_no_prefix) source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE)) log(f" Searching without response_ prefix '{filename_no_prefix}': {len(source_matches)} matches") # If still not found, try core name without extension (least specific) if not source_matches: core_name = os.path.splitext(filename)[0] if core_name.lower().startswith('response_'): core_name = core_name[9:] pattern = re.escape(core_name) source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE)) log(f" Searching for core name '{core_name}': {len(source_matches)} matches") if source_matches: # Found the filename in source! Calculate its relative position source_pos = source_matches[0].start() source_len = len(source_str) # Calculate proportional position (0.0 to 1.0) relative_pos = source_pos / source_len if source_len > 0 else 0.5 log(f" Position in source: {source_pos}/{source_len} ({relative_pos:.1%})") # Calculate corresponding position in translation text_len = len(text_str) insert_pos = int(relative_pos * text_len) log(f" Initial insert position in translation: {insert_pos}/{text_len}") # Find a good insertion point - prefer after closing tag or before opening tag # Search backwards for '>' or forwards for '<' within reasonable distance original_insert_pos = insert_pos max_search_distance = 200 # Search backwards for closing tag backward_pos = insert_pos search_start = max(0, insert_pos - max_search_distance) while backward_pos > search_start and text_str[backward_pos] != '>': backward_pos -= 1 backward_found = (text_str[backward_pos] == '>') backward_distance = insert_pos - backward_pos if backward_found else max_search_distance + 1 # Search forwards for opening tag (but skip closing tags like ) forward_pos = insert_pos search_end = min(len(text_str), insert_pos + max_search_distance) forward_found = False while forward_pos < search_end: if text_str[forward_pos] == '<': # Check if it's a closing tag if forward_pos + 1 < len(text_str) and text_str[forward_pos + 1] != '/': # It's an opening tag, use it forward_found = True break # It's a closing tag, keep searching forward_pos += 1 forward_distance = forward_pos - insert_pos if forward_found else max_search_distance + 1 # Use whichever is closer if backward_found and backward_distance <= forward_distance: insert_pos = backward_pos + 1 # After the '>' log(f" Adjusted to after closing tag at position {insert_pos} (moved {original_insert_pos - insert_pos} chars back)") elif forward_found: insert_pos = forward_pos # Before the '<' log(f" Adjusted to before opening tag at position {insert_pos} (moved {insert_pos - original_insert_pos} chars forward)") else: log(f" No nearby tags found within {max_search_distance} chars, using original position: {insert_pos}") # Show context around insertion point context_start = max(0, insert_pos - 30) context_end = min(len(text_str), insert_pos + 30) before_context = text_str[context_start:insert_pos] after_context = text_str[insert_pos:context_end] log(f" Context: ...{before_context}[INSERT HERE]{after_context}...") # Create the image tag HTML img_html = f'

' in content_stripped[:500] and not '

' in content_stripped[:500] # No common HTML tags ): # This looks like plain text or markdown from html2text is_plain_text = True if is_plain_text: # For plain text, just check the length text_length = len(content_stripped) # Be more lenient with plain text since it's already extracted return text_length > 50 # Much lower threshold for plain text # Original HTML parsing logic soup = BeautifulSoup(html_content, 'html.parser') soup_copy = BeautifulSoup(str(soup), 'html.parser') for img in soup_copy.find_all('img'): img.decompose() text_elements = soup_copy.find_all(['p', 'div', 'span']) text_content = ' '.join(elem.get_text(strip=True) for elem in text_elements) headers = soup_copy.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']) header_text = ' '.join(h.get_text(strip=True) for h in headers) if headers and len(text_content.strip()) > 1: return True if len(text_content.strip()) > 200: return True if len(header_text.strip()) > 100: return True return False except Exception as e: print(f"Warning: Error checking text content: {e}") return True @staticmethod def is_only_image_links(html_content): """Return True if content contains only image links/paths (no meaningful text).""" try: if not html_content: return False content_stripped = html_content.strip() if not content_stripped: return False # Prefer HTML text extraction when it looks like HTML if content_stripped.startswith('<') or '<' in content_stripped[:200]: try: soup = BeautifulSoup(html_content, 'html.parser') text = soup.get_text(separator='\n', strip=True) except Exception: text = content_stripped else: text = content_stripped if not text: return False image_ext = r'(?:png|jpe?g|gif|webp|svg|bmp)' md_img = re.compile(r'!\[[^\]]*\]\(([^)]+)\)', re.IGNORECASE) url_pat = re.compile(r'https?://[^\s)>\"]+\.' + image_ext + r'(?:\?[^\s)>\"]*)?', re.IGNORECASE) path_pat = re.compile(r'(?:[A-Za-z]:)?[^\s)>\"]+\.' + image_ext + r'(?:\?[^\s)>\"]*)?', re.IGNORECASE) found_any = False lines = [ln.strip() for ln in text.splitlines() if ln.strip()] if not lines: return False for line in lines: line_work = line # Markdown image syntax for m in md_img.findall(line_work): if url_pat.search(m) or path_pat.search(m): found_any = True line_work = md_img.sub(' ', line_work) if url_pat.search(line_work): found_any = True line_work = url_pat.sub(' ', line_work) if path_pat.search(line_work): found_any = True line_work = path_pat.sub(' ', line_work) # If any meaningful text remains, it's not image-only if re.sub(r'[\s\.,;:\(\)\[\]<>\"\'\-\u200b]+', '', line_work): return False return found_any except Exception: return False # ===================================================== # UNIFIED TRANSLATION PROCESSOR # ===================================================== STOP_LOGGED = False def log_stop_once(message="❌ Translation stopped by user request."): """Print a single stop message per run.""" global STOP_LOGGED if not STOP_LOGGED: # print(message) # Redundant with "Translation stopped by user" from exception STOP_LOGGED = True class TranslationProcessor: """Handles the translation of individual chapters""" def __init__(self, config, client, out_dir, log_callback=None, stop_callback=None, uses_zero_based=False, is_text_file=False): self.config = config self.client = client self.out_dir = out_dir self.log_callback = log_callback self.stop_callback = stop_callback self.chapter_splitter = ChapterSplitter(model_name=config.MODEL) self.uses_zero_based = uses_zero_based self.is_text_file = is_text_file # Check and log multi-key status if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: stats = self.client.get_stats() self._log(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys") self._log(f" Active keys: {stats.get('active_keys', 0)}") def _log(self, message): """Log a message""" if self.log_callback: self.log_callback(message) else: print(message) def report_key_status(self): """Report multi-key status if available""" if hasattr(self.client, 'get_stats'): stats = self.client.get_stats() if stats.get('multi_key_mode', False): self._log(f"\n📊 API Key Status:") self._log(f" Active Keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") self._log(f" Success Rate: {stats.get('success_rate', 0):.1%}") self._log(f" Total Requests: {stats.get('total_requests', 0)}\n") def check_stop(self): """Check if translation should stop""" # During graceful stop, ALWAYS return False to let current chapter complete fully # The main loop will check GRACEFUL_STOP at the START of each new chapter if os.environ.get('GRACEFUL_STOP') == '1': return False if self.stop_callback and self.stop_callback(): log_stop_once() return True def check_duplicate_content(self, result, idx, prog, out, actual_num=None): """Check if translated content is duplicate - with mode selection""" # Get detection mode from config detection_mode = getattr(self.config, 'DUPLICATE_DETECTION_MODE', 'basic') print(f" 🔍 DEBUG: Detection mode = '{detection_mode}'") print(f" 🔍 DEBUG: Lookback chapters = {self.config.DUPLICATE_LOOKBACK_CHAPTERS}") # Extract content_hash if available from progress content_hash = None if detection_mode == 'ai-hunter': # Try to get content_hash from the current chapter info # Use actual_num if provided, otherwise fallback to idx+1 if actual_num is not None: chapter_key = str(actual_num) else: chapter_key = str(idx + 1) if chapter_key in prog.get("chapters", {}): chapter_info = prog["chapters"][chapter_key] content_hash = chapter_info.get("content_hash") print(f" 🔍 DEBUG: Found content_hash for chapter {idx}: {content_hash}") if detection_mode == 'ai-hunter': print(" 🤖 DEBUG: Routing to AI Hunter detection...") # Check if AI Hunter method is available (injected by the wrapper) if hasattr(self, '_check_duplicate_ai_hunter'): return self._check_duplicate_ai_hunter(result, idx, prog, out, content_hash) else: print(" ⚠️ AI Hunter method not available, falling back to basic detection") return self._check_duplicate_basic(result, idx, prog, out) elif detection_mode == 'cascading': print(" 🔄 DEBUG: Routing to Cascading detection...") return self._check_duplicate_cascading(result, idx, prog, out) else: print(" 📋 DEBUG: Routing to Basic detection...") return self._check_duplicate_basic(result, idx, prog, out) def _check_duplicate_basic(self, result, idx, prog, out): """Original basic duplicate detection""" try: result_clean = re.sub(r'<[^>]+>', '', result).strip().lower() result_sample = result_clean[:1000] lookback_chapters = self.config.DUPLICATE_LOOKBACK_CHAPTERS for prev_idx in range(max(0, idx - lookback_chapters), idx): prev_key = str(prev_idx) if prev_key in prog["chapters"] and prog["chapters"][prev_key].get("output_file"): prev_file = prog["chapters"][prev_key]["output_file"] prev_path = os.path.join(out, prev_file) if os.path.exists(prev_path): try: with open(prev_path, 'r', encoding='utf-8') as f: prev_content = f.read() prev_clean = re.sub(r'<[^>]+>', '', prev_content).strip().lower() prev_sample = prev_clean[:1000] # Use SequenceMatcher for similarity comparison similarity = SequenceMatcher(None, result_sample, prev_sample).ratio() if similarity >= 0.85: # 85% threshold print(f" 🚀 Basic detection: Duplicate found ({int(similarity*100)}%)") return True, int(similarity * 100) except Exception as e: print(f" Warning: Failed to read {prev_path}: {e}") continue return False, 0 except Exception as e: print(f" Warning: Failed to check duplicate content: {e}") return False, 0 def _check_duplicate_cascading(self, result, idx, prog, out): """Cascading detection - basic first, then AI Hunter for borderline cases""" # Step 1: Basic is_duplicate_basic, similarity_basic = self._check_duplicate_basic(result, idx, prog, out) if is_duplicate_basic: return True, similarity_basic # Step 2: If basic detection finds moderate similarity, use AI Hunter if similarity_basic >= 60: # Configurable threshold print(f" 🤖 Moderate similarity ({similarity_basic}%) - running AI Hunter analysis...") if hasattr(self, '_check_duplicate_ai_hunter'): is_duplicate_ai, similarity_ai = self._check_duplicate_ai_hunter(result, idx, prog, out) if is_duplicate_ai: return True, similarity_ai else: print(" ⚠️ AI Hunter method not available for cascading analysis") return False, max(similarity_basic, 0) def _extract_text_features(self, text): """Extract multiple features from text for AI Hunter analysis""" features = { 'semantic': {}, 'structural': {}, 'characters': [], 'patterns': {} } # Semantic fingerprint lines = text.split('\n') # Character extraction (names that appear 3+ times) words = re.findall(r'\b[A-Z][a-z]+\b', text) word_freq = Counter(words) features['characters'] = [name for name, count in word_freq.items() if count >= 3] # Dialogue patterns dialogue_patterns = re.findall(r'"([^"]+)"', text) features['semantic']['dialogue_count'] = len(dialogue_patterns) features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]] # Speaker patterns speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower()) features['semantic']['speakers'] = list(set(speaker_patterns[:20])) # Number extraction numbers = re.findall(r'\b\d+\b', text) features['patterns']['numbers'] = numbers[:20] # Structural signature para_lengths = [] dialogue_count = 0 for para in text.split('\n\n'): if para.strip(): para_lengths.append(len(para)) if '"' in para: dialogue_count += 1 features['structural']['para_count'] = len(para_lengths) features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths)) features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths)) # Create structural pattern string pattern = [] for para in text.split('\n\n')[:20]: # First 20 paragraphs if para.strip(): if '"' in para: pattern.append('D') # Dialogue elif len(para) > 300: pattern.append('L') # Long elif len(para) < 100: pattern.append('S') # Short else: pattern.append('M') # Medium features['structural']['pattern'] = ''.join(pattern) return features def _calculate_exact_similarity(self, text1, text2): """Calculate exact text similarity""" return SequenceMatcher(None, text1.lower(), text2.lower()).ratio() def _calculate_smart_similarity(self, text1, text2): """Smart similarity with length-aware sampling""" # Check length ratio first len_ratio = len(text1) / max(1, len(text2)) if len_ratio < 0.7 or len_ratio > 1.3: return 0.0 # Smart sampling for large texts if len(text1) > 10000: sample_size = 3000 samples1 = [ text1[:sample_size], text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2], text1[-sample_size:] ] samples2 = [ text2[:sample_size], text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2], text2[-sample_size:] ] similarities = [SequenceMatcher(None, s1.lower(), s2.lower()).ratio() for s1, s2 in zip(samples1, samples2)] return sum(similarities) / len(similarities) else: # Use first 2000 chars for smaller texts return SequenceMatcher(None, text1[:2000].lower(), text2[:2000].lower()).ratio() def _calculate_semantic_similarity(self, sem1, sem2): """Calculate semantic fingerprint similarity""" score = 0.0 max_score = 0.0 # Compare dialogue counts if 'dialogue_count' in sem1 and 'dialogue_count' in sem2: max_score += 1.0 ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / max(1, max(sem1['dialogue_count'], sem2['dialogue_count'])) score += ratio * 0.3 # Compare speakers if 'speakers' in sem1 and 'speakers' in sem2: max_score += 1.0 if sem1['speakers'] and sem2['speakers']: overlap = len(set(sem1['speakers']) & set(sem2['speakers'])) total = len(set(sem1['speakers']) | set(sem2['speakers'])) score += (overlap / max(1, total)) * 0.4 # Compare dialogue lengths pattern if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2: max_score += 1.0 if sem1['dialogue_lengths'] and sem2['dialogue_lengths']: # Compare dialogue length patterns len1 = sem1['dialogue_lengths'][:10] len2 = sem2['dialogue_lengths'][:10] if len1 and len2: avg1 = sum(len1) / len(len1) avg2 = sum(len2) / len(len2) ratio = min(avg1, avg2) / max(1, max(avg1, avg2)) score += ratio * 0.3 return score / max(1, max_score) def _calculate_structural_similarity(self, struct1, struct2): """Calculate structural signature similarity""" score = 0.0 # Compare paragraph patterns if 'pattern' in struct1 and 'pattern' in struct2: pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio() score += pattern_sim * 0.4 # Compare paragraph statistics if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \ all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']): # Paragraph count ratio para_ratio = min(struct1['para_count'], struct2['para_count']) / max(1, max(struct1['para_count'], struct2['para_count'])) score += para_ratio * 0.2 # Average length ratio avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / max(1, max(struct1['avg_para_length'], struct2['avg_para_length'])) score += avg_ratio * 0.2 # Dialogue ratio similarity dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio']) score += (1 - dialogue_diff) * 0.2 return score def _calculate_character_similarity(self, chars1, chars2): """Calculate character name similarity""" if not chars1 or not chars2: return 0.0 # Find overlapping characters set1 = set(chars1) set2 = set(chars2) overlap = len(set1 & set2) total = len(set1 | set2) return overlap / max(1, total) def _calculate_pattern_similarity(self, pat1, pat2): """Calculate pattern-based similarity""" score = 0.0 # Compare numbers (they rarely change in translations) if 'numbers' in pat1 and 'numbers' in pat2: nums1 = set(pat1['numbers']) nums2 = set(pat2['numbers']) if nums1 and nums2: overlap = len(nums1 & nums2) total = len(nums1 | nums2) score = overlap / max(1, total) return score def generate_rolling_summary( self, history_manager, actual_num, base_system_content=None, source_text=None, previous_summary_text=None, previous_summary_chapter_num=None, prefer_translations_only_user=False, ): """Generate rolling summary after a chapter for context continuity. Uses a dedicated summary system prompt (with glossary) distinct from translation. Writes the summary to rolling_summary.txt and returns the summary string. IMPORTANT: The SUMMARY_ROLE setting controls what is sent to the summary API: - system: send system prompt + user message containing ONLY the translated text - user: send ONLY a user message (configured prompt template + translated text) - both: send system + user (current/legacy behavior) Optional: - previous_summary_text: when provided, it is sent as an assistant message for context. - prefer_translations_only_user: when True, the user message will be ONLY the translated text (even if SUMMARY_ROLE would otherwise use the configured user template). """ if not self.config.USE_ROLLING_SUMMARY: return None current_history = history_manager.load_history() messages_to_include = self.config.ROLLING_SUMMARY_EXCHANGES * 2 # Prefer directly provided source text (e.g., just-translated chapter) when available assistant_responses = [] if source_text and isinstance(source_text, str) and source_text.strip(): assistant_responses = [source_text] else: if len(current_history) >= 2: recent_messages = current_history[-messages_to_include:] if messages_to_include > 0 else current_history for h in recent_messages: if h.get("role") == "assistant": assistant_responses.append(h["content"]) # If still empty, skip quietly if not assistant_responses: return None # Build a dedicated summary system prompt (do NOT reuse main translation system prompt) # Append glossary to keep terminology consistent summary_system_template = os.getenv("ROLLING_SUMMARY_SYSTEM_PROMPT", "You create concise summaries for continuity.").strip() try: glossary_path = find_glossary_file(self.out_dir) except Exception: glossary_path = None # Rolling summary generation is a summarization-only call; do NOT append glossary here. # (This keeps prompts smaller and avoids glossary-compression logic for summaries.) _prev_append_glossary_env = os.environ.get("APPEND_GLOSSARY") try: os.environ["APPEND_GLOSSARY"] = "0" system_prompt = build_system_prompt(summary_system_template, glossary_path, source_text=source_text) finally: if _prev_append_glossary_env is None: os.environ.pop("APPEND_GLOSSARY", None) else: os.environ["APPEND_GLOSSARY"] = _prev_append_glossary_env # Add explicit instruction for clarity (glossary usage instructions come from APPEND_GLOSSARY_PROMPT). system_prompt += "\n\n[Instruction: Update the rolling summary using any prior summary context provided, plus the newly provided translated text. Do not include warnings or explanations.]" user_prompt_template = os.getenv( "ROLLING_SUMMARY_USER_PROMPT", "Summarize the key events, characters, tone, and important details from these translations. " "Focus on: character names/relationships, plot developments, and any special terminology used.\n\n" "{translations}" ) translations_text = "\n---\n".join(assistant_responses) user_prompt = user_prompt_template.replace("{translations}", translations_text) # Optional: provide the previous rolling summary as an assistant message for context. # IMPORTANT: This MUST NOT be duplicated into the user message. prev_summary_msg = None if previous_summary_text and isinstance(previous_summary_text, str) and previous_summary_text.strip(): prev_summary_msg = { "role": "assistant", "content": ( "[PREVIOUS ROLLING SUMMARY — UPDATE THIS]\n" + previous_summary_text.strip() + "\n[END PREVIOUS ROLLING SUMMARY]" ), } # SUMMARY_ROLE also controls the rolling-summary generation payload. # Default to 'both' to preserve legacy behavior when the env var isn't set. summary_role = (os.getenv("SUMMARY_ROLE", "both") or "both").strip().lower() # When requested, force the user message to be ONLY the translated text. if prefer_translations_only_user: summary_role = "system" # ensures we include system prompt + translations-only user message if summary_role == "system": # System prompt + user content containing ONLY the translated text summary_msgs = [{"role": "system", "content": system_prompt}] if prev_summary_msg: summary_msgs.append(prev_summary_msg) summary_msgs.append({"role": "user", "content": translations_text}) elif summary_role == "user": # User prompt only (as configured) with translated text inside it summary_msgs = [] if prev_summary_msg: summary_msgs.append(prev_summary_msg) summary_msgs.append({"role": "user", "content": user_prompt}) else: # both (current behavior) summary_msgs = [{"role": "system", "content": system_prompt}] if prev_summary_msg: summary_msgs.append(prev_summary_msg) summary_msgs.append({"role": "user", "content": f"[Rolling Summary of Chapter {actual_num}]\n" + user_prompt}) try: # Get configurable rolling summary token limit # -1 means: use the main MAX_OUTPUT_TOKENS value raw_max = os.getenv('ROLLING_SUMMARY_MAX_TOKENS', '-1') try: rolling_summary_max_tokens = int(str(raw_max).strip()) except Exception: rolling_summary_max_tokens = -1 if rolling_summary_max_tokens == -1: rolling_summary_max_tokens = int(getattr(self.config, 'MAX_OUTPUT_TOKENS', 8192)) send_result = send_with_interrupt( summary_msgs, self.client, self.config.TEMP, min(int(rolling_summary_max_tokens), self.config.MAX_OUTPUT_TOKENS), self.check_stop, context='summary' ) # send_with_interrupt may return: # - a plain string (content) # - (content, finish_reason) # - (content, finish_reason, raw_obj) # We only need the content for rolling summaries. if isinstance(send_result, tuple) and len(send_result) >= 1: summary_resp = send_result[0] else: summary_resp = send_result # Save the summary to the output folder summary_file = os.path.join(self.out_dir, "rolling_summary.txt") mode = "a" if self.config.ROLLING_SUMMARY_MODE == "append" else "w" # Header formatting: # - append mode: each appended block corresponds to a specific chapter → keep chapter-specific header # - replace mode: file is overwritten and represents the current rolling window → label as "Last N Chapters" if mode == "a": header_title = f"=== Rolling Summary of Chapter {actual_num} ===" else: try: _n = int(getattr(self.config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0) or 0) except Exception: _n = 0 header_title = f"=== Rolling Summary of Last {_n} Chapters ===" if _n > 0 else "=== Rolling Summary ===" header = header_title + "\n" with open(summary_file, mode, encoding="utf-8") as sf: if mode == "a": sf.write("\n\n") sf.write(header) sf.write(summary_resp.strip()) # If in append mode, trim to retain only the last N entries if configured try: if self.config.ROLLING_SUMMARY_MODE == "append": max_entries = int(getattr(self.config, "ROLLING_SUMMARY_MAX_ENTRIES", 0) or 0) if max_entries > 0: with open(summary_file, 'r', encoding='utf-8') as rf: content = rf.read() # Find the start of each summary block by header line headers = [m.start() for m in re.finditer(r"(?m)^===\s*Rolling Summary.*$", content)] if len(headers) > max_entries: # Keep only the last max_entries blocks keep_starts = headers[-max_entries:] blocks = [] for i, s in enumerate(keep_starts): e = keep_starts[i + 1] if i + 1 < len(keep_starts) else len(content) block = content[s:e].strip() if block: blocks.append(block) trimmed_content = ("\n\n".join(blocks) + "\n") if blocks else "" with open(summary_file, 'w', encoding='utf-8') as wf: wf.write(trimmed_content) # Optional log showing retained count try: self._log(f"📚 Total summaries in memory: {len(blocks)} (trimmed to last {max_entries})") except Exception: pass except Exception as _trim_err: try: self._log(f"⚠️ Failed to trim rolling summaries: {_trim_err}") except Exception: pass # Log to GUI if available, otherwise console try: self._log(f"📝 Generated rolling summary for Chapter {actual_num} ({'append' if mode=='a' else 'replace'} mode)") self._log(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") except Exception: print(f"📝 Generated rolling summary for Chapter {actual_num} ({'append' if mode=='a' else 'replace'} mode)") print(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)") return summary_resp.strip() except Exception as e: try: self._log(f"⚠️ Failed to generate rolling summary: {e}") except Exception: print(f"⚠️ Failed to generate rolling summary: {e}") return None def translate_with_retry(self, msgs, chunk_html, c, chunk_idx, total_chunks, merge_group_len=None, merged_chapters=None): """Handle translation with retry logic Args: merged_chapters: Optional list of chapter numbers that were merged into this request """ # CRITICAL FIX: Reset client state for each chunk if hasattr(self.client, 'reset_cleanup_state'): self.client.reset_cleanup_state() # Also ensure we're not in cleanup mode from previous operations if hasattr(self.client, '_in_cleanup'): self.client._in_cleanup = False if hasattr(self.client, '_cancelled'): self.client._cancelled = False # Reinitialize Gemini client if it was closed by a previous timeout if hasattr(self.client, 'gemini_client') and self.client.gemini_client is None: try: self.client._setup_client() except Exception: pass truncation_retry_count = 0 split_failed_retry_count = 0 # Get retry attempts from AI Hunter config if available ai_config = {} try: # Try to get AI Hunter config from environment variable first ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') if ai_hunter_config_str: ai_config = json.loads(ai_hunter_config_str) else: # Fallback to config attribute ai_config = getattr(self.config, 'ai_hunter_config', {}) except (json.JSONDecodeError, AttributeError): ai_config = {} if isinstance(ai_config, dict): max_retries = ai_config.get('retry_attempts', 3) max_duplicate_retries = ai_config.get('retry_attempts', 6) # Use same setting for duplicate retries else: max_retries = 3 max_duplicate_retries = 6 try: truncation_retry_limit = int(os.getenv("TRUNCATION_RETRY_ATTEMPTS", "1")) except Exception: truncation_retry_limit = 1 try: split_failed_retry_limit = int(getattr(self.config, 'SPLIT_FAILED_RETRY_ATTEMPTS', 2)) except Exception: split_failed_retry_limit = 2 disable_merge_fallback_flag = os.getenv("DISABLE_MERGE_FALLBACK", "0") == "1" or getattr(self.config, 'DISABLE_MERGE_FALLBACK', False) truncation_retry_enabled = (os.getenv("RETRY_TRUNCATED", "0") == "1") or bool(getattr(self.config, "RETRY_TRUNCATED", False)) split_retry_enabled = (os.getenv("RETRY_SPLIT_FAILED", "0") == "1") or bool(getattr(self.config, "RETRY_SPLIT_FAILED", False)) duplicate_retry_count = 0 timeout_retry_count = 0 try: max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) except Exception: max_timeout_retries = 2 history_purged = False original_max_tokens = self.config.MAX_OUTPUT_TOKENS original_temp = self.config.TEMP original_user_prompt = msgs[-1]["content"] # Determine stable chapter number for this chunk (used for payload metadata) idx = c.get('__index', 0) actual_num = c.get('actual_chapter_num', c.get('num', idx + 1)) # Determine chunk timeout respecting runtime env overrides. # If RETRY_TIMEOUT is "0"/false/blank, disable chunk timeouts entirely. env_retry = os.getenv("RETRY_TIMEOUT") if env_retry is not None: retry_timeout_enabled = env_retry.strip().lower() not in ("0", "false", "off", "") else: retry_timeout_enabled = bool(getattr(self.config, "RETRY_TIMEOUT", False)) chunk_timeout = None if retry_timeout_enabled: env_ct = os.getenv("CHUNK_TIMEOUT") if env_ct and str(env_ct).strip().lower() not in ("", "none", "0"): try: chunk_timeout = int(float(env_ct)) except Exception: chunk_timeout = getattr(self.config, "CHUNK_TIMEOUT", None) else: chunk_timeout = getattr(self.config, "CHUNK_TIMEOUT", None) # Treat non-positive timeouts as disabled try: if chunk_timeout is not None and float(chunk_timeout) <= 0: chunk_timeout = None except Exception: chunk_timeout = None result = None finish_reason = None # Fallback stop callback (overridden later for chunked chapters) def local_stop_cb(): # First check if we should abort due to internal error/cancel # Check stop_callback directly to bypass check_stop's graceful logic override stop_requested = False if self.stop_callback and self.stop_callback(): stop_requested = True if stop_requested: # User requested stop. Check graceful settings. graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if not graceful_stop_active: # Force stop log_stop_once() return True # Graceful stop is active. wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' # If wait_for_chunks is OFF (0), we should cancel immediately # UNLESS we are at the last chunk (chunk_idx == total_chunks), # in which case we wait for it to complete the chapter. is_last_chunk = (chunk_idx == total_chunks) if not wait_for_chunks and not is_last_chunk: log_stop_once() return True # If wait_for_chunks is ON (1), we return False to let it finish return False return False while True: if local_stop_cb(): return None, None, None try: current_max_tokens = self.config.MAX_OUTPUT_TOKENS current_temp = self.config.TEMP # Compute token counts, separating assistant (memory/context) tokens when present total_tokens = 0 assistant_tokens = 0 for m in msgs: content = m.get("content", "") tokens = self.chapter_splitter.count_tokens(content) total_tokens += tokens if m.get("role") == "assistant": assistant_tokens += tokens non_assistant_tokens = total_tokens - assistant_tokens # Determine file reference if c.get('is_chunk', False): # Handle float chapter numbers in file reference chapter_num_for_ref = c['num'] if isinstance(chapter_num_for_ref, float): # Keep decimal notation for display (e.g., "Section_1.0") file_ref = f"Section_{chapter_num_for_ref}" else: file_ref = f"Section_{chapter_num_for_ref}" else: # Check if this is a text file - need to access from self is_text_source = self.is_text_file or c.get('filename', '').endswith('.txt') terminology = "Section" if is_text_source else "Chapter" chapter_num_for_ref = c['num'] if isinstance(chapter_num_for_ref, float): file_ref = c.get('original_basename', f'{terminology}_{chapter_num_for_ref}') else: file_ref = c.get('original_basename', f'{terminology}_{chapter_num_for_ref}') # When contextual translation is enabled and we have assistant-role # context (memory, summaries, etc.), surface its token share explicitly. if getattr(self.config, 'CONTEXTUAL', False) and assistant_tokens > 0: print( f"💬 Chunk {chunk_idx}/{total_chunks} combined prompt: " f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, " f"assistant/memory: {assistant_tokens:,}) / {self.get_token_budget_str()} [File: {file_ref}]" ) else: print( f"💬 Chunk {chunk_idx}/{total_chunks} combined prompt: " f"{total_tokens:,} tokens (system + user) / {self.get_token_budget_str()} [File: {file_ref}]" ) self.client.context = 'translation' # Generate filename for chunks if chunk_idx and total_chunks > 1: # This is a chunk - use chunk naming format # Prefer original_basename to preserve source file's zero-padding (e.g., 0009_10) original_basename = c.get('original_basename', '') if original_basename: # Use original basename (without extension) + _chunk_N base = os.path.splitext(original_basename)[0] fname = f"response_{base}_chunk_{chunk_idx}.html" else: # Handle float chapter numbers (e.g., 1.0, 2.5) properly chapter_num = c['num'] if isinstance(chapter_num, float): # For decimal chapters like 1.5, use format like "response_001_5_chunk_1.html" major = int(chapter_num) minor = int(round((chapter_num - major) * 100)) # 1.5 -> 50, 1.1 -> 10 if minor > 0: fname = f"response_{major:03d}_{minor:02d}_chunk_{chunk_idx}.html" else: # It's like 1.0, just use the integer part fname = f"response_{major:03d}_chunk_{chunk_idx}.html" else: fname = f"response_{chapter_num:03d}_chunk_{chunk_idx}.html" else: # Not a chunk - use regular naming fname = FileUtilities.create_chapter_filename(c, c.get('actual_chapter_num', c['num'])) # Set output filename BEFORE the API call if hasattr(self.client, 'set_output_filename'): self.client.set_output_filename(fname) # Track the filename so truncation logs know which file this is if hasattr(self.client, '_current_output_file'): self.client._current_output_file = fname # Generate unique request ID for this chunk #request_id = f"{c['num']:03d}_chunk{chunk_idx}_{uuid.uuid4().hex[:8]}" chapter_ctx = { 'chapter': actual_num, 'chunk': chunk_idx, 'total_chunks': total_chunks, 'merged_chapters': merged_chapters, } result, finish_reason, raw_obj = send_with_interrupt( msgs, self.client, current_temp, current_max_tokens, local_stop_cb, chunk_timeout, context='translation', chapter_context=chapter_ctx, bypass_graceful_stop=True ) # Enhanced mode workflow: # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) # 2. Markdown sent to translation API (better for translation quality) # 3. Translated markdown -> HTML conversion (here) if result and c.get("enhanced_extraction", False): print(f"🔄 Converting translated markdown back to HTML...") result = convert_enhanced_text_to_html(result, c) # Emergency Image Restoration (if enabled globally OR forced for this chapter) # Check for forced flag in progress data attached to chunk prog_data = c.get('__progress', {}) force_restore = prog_data.get('force_image_restore', False) if isinstance(prog_data, dict) else False if result and (self.config.EMERGENCY_IMAGE_RESTORE or force_restore): # Use original HTML if available (for enhanced extraction), otherwise use chunk_html source_html = c.get('original_html', chunk_html) result = ContentProcessor.emergency_restore_images(result, source_html) retry_needed = False retry_reason = "" retry_limit_for_reason = None is_duplicate_retry = False # Mark if we're already in a truncation retry to prevent nested retries # This flag is set by the char-ratio check below to prevent infinite recursion in_truncation_retry = c.get('__in_truncation_retry', False) # Debug logging to verify the toggle state #print(f" DEBUG: finish_reason='{finish_reason}', truncation_enabled={truncation_retry_enabled}, split_retry_enabled={split_retry_enabled}") # DISABLED: Truncation retries are now handled entirely by unified_api_client.py # This prevents double/triple retry cascades (unified_api_client → translate_with_retry → char-ratio check) if False and finish_reason == "length" and not in_truncation_retry: if truncation_retry_enabled and truncation_retry_count < truncation_retry_limit: # Always attempt a truncation retry, even if token limits are equal new_token_limit = self.config.MAX_RETRY_TOKENS retry_needed = True retry_reason = "truncated output" retry_limit_for_reason = truncation_retry_limit old_limit = self.config.MAX_OUTPUT_TOKENS self.config.MAX_OUTPUT_TOKENS = new_token_limit truncation_retry_count += 1 print(f" 🔄 TRUNCATION RETRY: Attempt {truncation_retry_count}/{truncation_retry_limit} — tokens {old_limit} → {new_token_limit}") elif truncation_retry_enabled: print(f" ⚠️ TRUNCATION DETECTED: Max truncation retries ({truncation_retry_limit}) reached - accepting truncated response") else: print(f" ⏭️ TRUNCATION DETECTED: Auto-retry is DISABLED - accepting truncated response") elif False and finish_reason == "length" and in_truncation_retry: # We're in a char-ratio triggered retry - don't nest another retry print(f" 📋 Already in truncation retry chain - skipping nested retry") # Treat split failures like truncation for auto-retry split_failed_in_finish = bool(finish_reason and 'split' in str(finish_reason).lower()) split_failed_in_body = bool(isinstance(result, str) and 'SPLIT_FAILED' in result) # Check for split markers if this is a merged request split_validation_failed = False if merge_group_len and merge_group_len > 1 and result and isinstance(result, str): # We need to import RequestMerger here or assume it's available in module scope # RequestMerger is defined at module level try: # Clean artifacts first? No, we want to check raw result usually, # but split_by_markers is robust. # However, translate_with_retry doesn't clean artifacts yet. # Let's try splitting. split_sections = RequestMerger.split_by_markers(result, merge_group_len) if not split_sections or len(split_sections) != merge_group_len: print(f" ⚠️ Split validation failed: Expected {merge_group_len} sections") split_validation_failed = True except Exception as e: print(f" ⚠️ Split validation error: {e}") split_validation_failed = True if not retry_needed and (split_failed_in_finish or split_failed_in_body or split_validation_failed) and split_retry_enabled: if split_failed_retry_count < split_failed_retry_limit: retry_needed = True retry_reason = "split failed" retry_limit_for_reason = split_failed_retry_limit split_failed_retry_count += 1 print(f" 🔄 Split failed — retrying merged request (attempt {split_failed_retry_count}/{split_failed_retry_limit})") else: print(f" ⚠️ SPLIT FAILED: Max split-failed retries ({split_failed_retry_limit}) reached - accepting response") if not retry_needed: # Force re-read the environment variable to ensure we have current setting duplicate_enabled = os.getenv("RETRY_DUPLICATE_BODIES", "0") == "1" if duplicate_enabled and duplicate_retry_count < max_duplicate_retries: idx = c.get('__index', 0) prog = c.get('__progress', {}) print(f" 🔍 Checking for duplicate content...") # Get actual chapter number for duplicate detection actual_num = c.get('actual_chapter_num', c.get('num', idx + 1)) is_duplicate, similarity = self.check_duplicate_content(result, idx, prog, self.out_dir, actual_num) if is_duplicate: retry_needed = True is_duplicate_retry = True retry_reason = f"duplicate content (similarity: {similarity}%)" duplicate_retry_count += 1 # Check if temperature change is disabled disable_temp_change = ai_config.get('disable_temperature_change', False) if isinstance(ai_config, dict) else False if duplicate_retry_count >= 3 and not history_purged: print(f" 🧹 Clearing history after 3 attempts...") if 'history_manager' in c: c['history_manager'].save_history([]) history_purged = True if not disable_temp_change: self.config.TEMP = original_temp else: print(f" 🌡️ Temperature change disabled - keeping current temp: {self.config.TEMP}") elif duplicate_retry_count == 1: if disable_temp_change: print(f" 🔄 First duplicate retry - temperature change disabled") else: print(f" 🔄 First duplicate retry - same temperature") elif history_purged: if not disable_temp_change: attempts_since_purge = duplicate_retry_count - 3 self.config.TEMP = min(original_temp + (0.1 * attempts_since_purge), 1.0) print(f" 🌡️ Post-purge temp: {self.config.TEMP}") else: print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") else: if not disable_temp_change: self.config.TEMP = min(original_temp + (0.1 * (duplicate_retry_count - 1)), 1.0) print(f" 🌡️ Gradual temp increase: {self.config.TEMP}") else: print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}") if duplicate_retry_count == 1: user_prompt = f"[RETRY] Chapter {c['num']}: Ensure unique translation.\n{chunk_html}" elif duplicate_retry_count <= 3: user_prompt = f"[ATTEMPT {duplicate_retry_count}] Translate uniquely:\n{chunk_html}" else: user_prompt = f"Chapter {c['num']}:\n{chunk_html}" msgs[-1] = {"role": "user", "content": user_prompt} if retry_needed: if is_duplicate_retry: print(f" 🔄 Duplicate retry {duplicate_retry_count}/{max_duplicate_retries}") time.sleep(2) continue break except UnifiedClientError as e: error_msg = str(e) if "stopped by user" in error_msg: print("❌ Translation stopped by user during API call") return None, None, None # Treat cancelled errors (from client being closed) as timeout if "cancelled" in error_msg or "Gemini client not initialized" in error_msg: # Check stop flag before retrying if self.check_stop(): print("❌ Translation stopped by user during timeout retry") return None, None, None # During graceful stop, don't retry - skip this chunk graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Skipped (graceful stop)") return None, "graceful_stop", None if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: {error_msg}, retrying ({timeout_retry_count}/{max_timeout_retries})...") # Reinitialize the client if it was closed (check correct client based on type) client_type = getattr(self.client, 'client_type', 'unknown') needs_reinit = False if client_type == 'gemini': needs_reinit = hasattr(self.client, 'gemini_client') and self.client.gemini_client is None elif client_type == 'openai': needs_reinit = hasattr(self.client, 'openai_client') and self.client.openai_client is None if needs_reinit: try: print(f" 🔄 Reinitializing {client_type} client...") self.client._setup_client() except Exception as reinit_err: print(f" ⚠️ Failed to reinitialize client: {reinit_err}") # Use SEND_INTERVAL_SECONDS as base, random from half to full import random base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) retry_delay = random.uniform(base_delay / 2, base_delay) print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") time.sleep(retry_delay) continue else: print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached - marking chunk as failed") return "[TIMEOUT]", "timeout", None if "took" in error_msg and "timeout:" in error_msg: # Check stop flag before retrying if self.check_stop(): print("❌ Translation stopped by user during timeout retry") return None, None, None # During graceful stop, don't retry - skip this chunk graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timed out during graceful stop - skipping retry") return "[TIMEOUT]", "timeout", None if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f" ⏱️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Chunk took too long, retry {timeout_retry_count}/{max_timeout_retries}") # Use SEND_INTERVAL_SECONDS as base, random from half to full import random base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) retry_delay = random.uniform(base_delay / 2, base_delay) print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") time.sleep(retry_delay) continue else: print(f" ❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries reached - marking chunk as failed") return "[TIMEOUT]", "timeout", None elif "timed out" in error_msg and "timeout:" not in error_msg: # Check stop flag before retrying if self.check_stop(): print("❌ Translation stopped by user during timeout retry") return None, None, None # During graceful stop, don't retry - skip graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timed out during graceful stop - skipping retry") return "[TIMEOUT]", "timeout", None if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: {error_msg}, retrying ({timeout_retry_count}/{max_timeout_retries})...") # Use SEND_INTERVAL_SECONDS as base, random from half to full import random base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) retry_delay = random.uniform(base_delay / 2, base_delay) print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") time.sleep(retry_delay) continue else: print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached - marking chunk as failed") return "[TIMEOUT]", "timeout", None elif getattr(e, "error_type", None) == "rate_limit" or getattr(e, "http_status", None) == 429: # Rate limit errors - clean handling without traceback print("⚠️ Rate limited, sleeping 60s…") for i in range(60): if self.check_stop(): print("❌ Translation stopped during rate limit wait") return None, None, None time.sleep(1) continue else: # For unexpected errors, show the error message but suppress traceback in most cases if getattr(e, "error_type", None) in ["api_error", "validation", "prohibited_content"]: print(f"❌ API Error: {error_msg}") raise UnifiedClientError(f"API Error: {error_msg}") else: raise except Exception as e: print(f"❌ Unexpected error during API call: {e}") import traceback print(f"Full traceback:\n{traceback.format_exc()}") raise self.config.MAX_OUTPUT_TOKENS = original_max_tokens self.config.TEMP = original_temp total_simple_retries = truncation_retry_count + split_failed_retry_count if total_simple_retries > 0 or duplicate_retry_count > 0 or timeout_retry_count > 0: if duplicate_retry_count > 0: print(f" 🔄 Restored original temperature: {self.config.TEMP} (after {duplicate_retry_count} duplicate retries)") elif timeout_retry_count > 0: print(f" 🔄 Restored original settings after {timeout_retry_count} timeout retries") elif total_simple_retries > 0: print(f" 🔄 Restored original settings after {total_simple_retries} retries") if duplicate_retry_count >= max_duplicate_retries: print(f" ⚠️ WARNING: Duplicate content issue persists after {max_duplicate_retries} attempts") return result, finish_reason, raw_obj def get_token_budget_str(self): """Get token budget as string""" _tok_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip() max_tokens_limit, budget_str = parse_token_limit(_tok_env) return budget_str # ===================================================== # BATCH TRANSLATION PROCESSOR # ===================================================== class BatchTranslationProcessor: """Handles batch/parallel translation processing""" def __init__(self, config, client, base_msg, out_dir, progress_lock, save_progress_fn, update_progress_fn, check_stop_fn, image_translator=None, is_text_file=False, history_manager=None): self.config = config self.client = client self.base_msg = base_msg self.out_dir = out_dir self.progress_lock = progress_lock self.save_progress_fn = save_progress_fn self.update_progress_fn = update_progress_fn self.check_stop_fn = check_stop_fn self.image_translator = image_translator self.chapters_completed = 0 self.chunks_completed = 0 self.is_text_file = is_text_file # Optional shared HistoryManager for contextual translation across chapters self.history_manager = history_manager # Rolling summary support (batch mode): inject a snapshot per batch. # This is updated by the main thread between batches. import threading self._batch_rolling_summary_lock = threading.Lock() self._batch_rolling_summary_text = "" # exact rolling_summary.txt contents for current batch # Optionally log multi-key status if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys: stats = self.client.get_stats() print(f"🔑 Batch processor using multi-key mode: {stats.get('total_keys', 0)} keys") def set_batch_rolling_summary_text(self, text: str) -> None: """Set the rolling summary snapshot to be injected for the current batch.""" try: if text is None: text = "" except Exception: text = "" with self._batch_rolling_summary_lock: self._batch_rolling_summary_text = text def get_batch_rolling_summary_text(self) -> str: """Get the rolling summary snapshot (thread-safe).""" with self._batch_rolling_summary_lock: return self._batch_rolling_summary_text def process_single_chapter(self, chapter_data): """Process a single chapter (runs in thread)""" import threading from concurrent.futures import ThreadPoolExecutor, as_completed idx, chapter = chapter_data chap_num = chapter["num"] # Use the pre-calculated actual_chapter_num from the main loop actual_num = chapter.get('actual_chapter_num') # Fallback if not set (common in batch mode where first pass might be skipped) if actual_num is None: # CHUNK FIX: For split text/PDF chunks with decimal numbering, use chap_num directly # Chunks have 'is_chunk' flag and decimal 'num' values (1.0, 1.1, etc.) if chapter.get('is_chunk', False) and isinstance(chap_num, float): actual_num = chap_num else: # Try to extract it using the same logic as non-batch mode raw_num = FileUtilities.extract_actual_chapter_number(chapter, patterns=None, config=self.config) # Apply offset if configured offset = self.config.CHAPTER_NUMBER_OFFSET if hasattr(self.config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset # Check if zero detection is disabled if hasattr(self.config, 'DISABLE_ZERO_DETECTION') and self.config.DISABLE_ZERO_DETECTION: actual_num = raw_num elif hasattr(self.config, '_uses_zero_based') and self.config._uses_zero_based: # This is a 0-based novel, adjust the number actual_num = raw_num + 1 else: # Default to raw number (1-based or unknown) actual_num = raw_num print(f" 📖 Extracted actual chapter number: {actual_num} (from raw: {raw_num})") # APPLY INTERRUPTIBLE THREADING DELAY AFTER determining chapter number thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) if thread_delay > 0: # Check if we need to wait (same logic as unified_api_client) if hasattr(self.client, '_thread_submission_lock') and hasattr(self.client, '_last_thread_submission_time'): with self.client._thread_submission_lock: current_time = time.time() time_since_last = current_time - self.client._last_thread_submission_time if time_since_last < thread_delay: sleep_time = thread_delay - time_since_last thread_name = threading.current_thread().name # Use actual_num now that it's been determined # Only log if not during graceful stop (about to be cancelled) graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if not graceful_stop_active: print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.3f}s for Chapter {actual_num}") # Interruptible sleep - check stop flag every 0.1 seconds elapsed = 0 check_interval = 0.1 while elapsed < sleep_time: if self.check_stop_fn(): # Only log if not during graceful stop (expected interruption) graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if not graceful_stop_active: print(f"🛑 Threading delay interrupted by stop flag") raise Exception("Translation stopped by user during threading delay") sleep_chunk = min(check_interval, sleep_time - elapsed) time.sleep(sleep_chunk) elapsed += sleep_chunk self.client._last_thread_submission_time = time.time() if not hasattr(self.client, '_thread_submission_count'): self.client._thread_submission_count = 0 self.client._thread_submission_count += 1 # Initialize variables that might be needed in except block content_hash = None ai_features = None # Reinitialize Gemini client if it was closed by a previous timeout if hasattr(self.client, 'gemini_client') and self.client.gemini_client is None: try: self.client._setup_client() except Exception: pass try: # Check if this is from a text file is_text_source = self.is_text_file or chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f"🔄 Starting #{idx+1} (Internal: {terminology} {chap_num}, Actual: {terminology} {actual_num}) (thread: {threading.current_thread().name}) [File: {chapter.get('original_basename', f'{terminology}_{chap_num}')}]") content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"]) # Determine output filename early so we can track it in progress fname = FileUtilities.create_chapter_filename(chapter, actual_num) with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, fname, status="in_progress") self.save_progress_fn() chapter_body = chapter["body"] if chapter.get('has_images') and self.image_translator and self.config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Processing images for Chapter {actual_num}...") self.image_translator.set_current_chapter(actual_num) chapter_body, image_translations = process_chapter_images( chapter_body, actual_num, self.image_translator, self.check_stop_fn ) if image_translations: # Create a copy of the processed body from bs4 import BeautifulSoup c = chapter soup_for_text = BeautifulSoup(c["body"], 'html.parser') # Remove all translated content for trans_div in soup_for_text.find_all('div', class_='translated-text-only'): trans_div.decompose() # Use this cleaned version for text translation text_to_translate = str(soup_for_text) final_body_with_images = c["body"] else: text_to_translate = c["body"] image_translations = {} print(f"✅ Processed {len(image_translations)} images for Chapter {actual_num}") # Build chapter-specific system prompt with glossary compression glossary_path = find_glossary_file(self.out_dir) # Capture compression stats if enabled compress_glossary_enabled = os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1" if compress_glossary_enabled and glossary_path and os.path.exists(glossary_path): try: # Load glossary to get original size with open(glossary_path, 'r', encoding='utf-8') as f: if glossary_path.lower().endswith(('.csv', '.md', '.txt')): original_glossary = f.read() else: try: glossary_data = json.load(f) original_glossary = json.dumps(glossary_data, ensure_ascii=False, indent=2) except json.JSONDecodeError: # If JSON parsing fails, treat as text f.seek(0) original_glossary = f.read() original_length = len(original_glossary) # Build system prompt with compression # Use get_system_prompt(1) since this is a single chapter (no merging) base_prompt = self.config.get_system_prompt(actual_merge_count=1) chapter_system_prompt = build_system_prompt(base_prompt, glossary_path, source_text=chapter_body) # Extract compressed glossary from system prompt to measure compression # The glossary is appended after the prompt, so we can estimate the size prompt_without_glossary = base_prompt glossary_in_prompt = len(chapter_system_prompt) - len(prompt_without_glossary) if len(chapter_system_prompt) > len(prompt_without_glossary) else 0 if glossary_in_prompt > 0 and original_length > glossary_in_prompt: reduction_pct = ((original_length - glossary_in_prompt) / original_length * 100) # Calculate token savings try: import tiktoken try: enc = tiktoken.encoding_for_model(self.config.MODEL) except: enc = tiktoken.get_encoding('cl100k_base') original_tokens = len(enc.encode(original_glossary)) compressed_tokens = len(enc.encode(chapter_system_prompt)) - len(enc.encode(prompt_without_glossary)) token_reduction_pct = ((original_tokens - compressed_tokens) / original_tokens * 100) if original_tokens > 0 else 0 print(f"🗜️ Glossary: {original_length:,}→{glossary_in_prompt:,} chars ({reduction_pct:.1f}%), {original_tokens:,}→{compressed_tokens:,} tokens ({token_reduction_pct:.1f}%)") except ImportError: print(f"🗜️ Glossary compressed: {original_length:,} → {glossary_in_prompt:,} chars ({reduction_pct:.1f}% reduction)") except Exception as e: print(f"⚠️ Failed to measure glossary compression: {e}") chapter_system_prompt = build_system_prompt(self.config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=chapter_body) else: chapter_system_prompt = build_system_prompt(self.config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=chapter_body) # Check if chapter needs chunking from chapter_splitter import ChapterSplitter chapter_splitter = ChapterSplitter(model_name=self.config.MODEL) # Get token budget token_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip() if not token_env or token_env.lower() == "unlimited": max_input_tokens = 1000000 budget_str = "unlimited" elif token_env.isdigit(): max_input_tokens = int(token_env) budget_str = f"{max_input_tokens:,}" else: max_input_tokens = 1000000 budget_str = "unlimited" # Calculate available tokens for content based on effective OUTPUT limit (same as calculation phase) # Use output token limit with compression factor, not input limit max_output_tokens = self.config.get_effective_output_limit() safety_margin_output = 500 compression_factor = self.config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) available_tokens = max(available_tokens, 1000) # Ensure minimum # Split into chunks if needed # Get filename for content type detection chapter_filename = chapter.get('filename') or chapter.get('original_basename', '') chunks = chapter_splitter.split_chapter(chapter_body, available_tokens, filename=chapter_filename) total_chunks = len(chunks) file_ref = chapter.get('original_basename', f'{terminology}_{chap_num}') # Initialize shared structures for chunk processing (works for 1 or many chunks) translated_chunks = [None] * total_chunks # Pre-allocate to maintain order chunks_lock = threading.Lock() if total_chunks > 1: print(f"✂️ Chapter {actual_num} requires {total_chunks} chunks - processing in parallel") def process_chunk(chunk_data): """Process a single chunk in parallel""" chunk_html, chunk_idx, chunk_total = chunk_data # Check if stop requested - but respect wait_for_chunks setting if local_stop_cb(): graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if not (graceful_stop_active and wait_for_chunks and chunk_total > 1): # Return 5 values to match expected signature return None, chunk_idx, None, False, "cancelled" # If wait_for_chunks is enabled, continue processing # Build user prompt for this chunk if total_chunks > 1: chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]\\n{chunk_html}") user_prompt = chunk_prompt_template.format( chunk_idx=chunk_idx, total_chunks=total_chunks, chunk_html=chunk_html ) else: user_prompt = chunk_html # Build history-based memory when contextual translation is enabled memory_msgs = [] if ( self.config.CONTEXTUAL and self.history_manager is not None and getattr(self.config, 'HIST_LIMIT', 0) > 0 ): try: # Thread-safe history access - load_history() already has internal locking history = self.history_manager.load_history() hist_limit = getattr(self.config, 'HIST_LIMIT', 0) trimmed = history[-hist_limit * 2:] include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1" model_lower = getattr(self.config, 'MODEL', '').lower() is_gemini_3 = ('gemini-3' in model_lower) or ('gemini-exp-' in model_lower) if is_gemini_3: # Preserve raw content (thought signatures) and reconstruct text when missing for h in trimmed: if not isinstance(h, dict): continue role = h.get('role', 'user') raw_obj = h.get('_raw_content_object') content = h.get('content') or "" if (not content) and raw_obj: content = extract_text_from_raw_content(raw_obj) # Skip empty entries unless raw content exists if not content and raw_obj is None: continue if role == 'user' and not include_source: continue msg = {'role': role} if content: msg['content'] = content if raw_obj is not None: msg['_raw_content_object'] = raw_obj memory_msgs.append(msg) else: # Original memory block approach for non-Gemini 3 models memory_blocks = [] for h in trimmed: if not isinstance(h, dict): continue role = h.get('role', 'user') content = h.get('content', '') if not content: continue # Optionally skip previous source text when disabled if role == 'user' and not include_source: continue if role == 'user': prefix = ( "[MEMORY - PREVIOUS SOURCE TEXT]\\n" "This is prior source content provided for context only.\\n" "Do NOT translate or repeat this text directly in your response.\\n\\n" ) else: prefix = ( "[MEMORY - PREVIOUS TRANSLATION]\\n" "This is prior translated content provided for context only.\\n" "Do NOT repeat or re-output this translation.\\n\\n" ) footer = "\\n\\n[END MEMORY BLOCK]\\n" memory_blocks.append(prefix + content + footer) if memory_blocks: combined_memory = "\\n".join(memory_blocks) # Present history as an assistant message so the model # treats it as prior context, not a new user instruction. memory_msgs = [{ 'role': 'assistant', 'content': combined_memory }] except Exception as e: print(f"⚠️ Failed to build contextual memory for batch chunk: {e}") memory_msgs = [] # Build messages for this chunk (system + optional rolling summary + optional memory + user) rolling_summary_msgs = [] if getattr(self.config, 'USE_ROLLING_SUMMARY', False): try: rs_text = self.get_batch_rolling_summary_text() except Exception: rs_text = "" if isinstance(rs_text, str) and rs_text: # Do not strip/parse the file content. Only wrap to prevent accidental translation. rolling_summary_msgs = [{ "role": "assistant", "content": ( "CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n" "[MEMORY] Previous context summary:\n\n" + rs_text + "\n\n" "[END MEMORY]\n" "END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:" ) }] # Build optional assistant prefill message if configured assistant_prefill_msgs = [] if getattr(self.config, 'ASSISTANT_PROMPT', '') and self.config.ASSISTANT_PROMPT.strip(): assistant_prefill_msgs = [{"role": "assistant", "content": self.config.ASSISTANT_PROMPT.strip()}] chapter_msgs = ( [{"role": "system", "content": chapter_system_prompt}] + rolling_summary_msgs + memory_msgs + assistant_prefill_msgs + [{"role": "user", "content": user_prompt}] ) # Abort immediately if a prior chunk triggered prohibition (NOT for user stop) if chunk_abort_event.is_set(): raise UnifiedClientError("Chunk aborted due to prohibited content", error_type="cancelled") # Log combined prompt token count, including assistant/memory tokens when present total_tokens = 0 assistant_tokens = 0 for msg in chapter_msgs: content = msg.get("content", "") tokens = chapter_splitter.count_tokens(content) total_tokens += tokens if msg.get("role") == "assistant": assistant_tokens += tokens non_assistant_tokens = total_tokens - assistant_tokens if self.config.CONTEXTUAL and assistant_tokens > 0: print( f"💬 Chapter {actual_num}: Chunk {chunk_idx}/{total_chunks} combined prompt: " f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, " f"assistant/memory: {assistant_tokens:,}) / {budget_str} [File: {file_ref}]" ) else: print( f"💬 Chapter {actual_num}: Chunk {chunk_idx}/{total_chunks} combined prompt: " f"{total_tokens:,} tokens (system + user) / {budget_str} [File: {file_ref}]" ) # Generate filename before API call if chunk_idx < total_chunks: # This is a chunk - use chunk naming format # Prefer original_basename to preserve source file's zero-padding (e.g., 0009_10) original_basename = chapter.get('original_basename', '') if original_basename: # Use original basename (without extension) + _chunk_N base = os.path.splitext(original_basename)[0] fname = f"response_{base}_chunk_{chunk_idx}.html" elif isinstance(actual_num, float): # For decimal chapters like 1.5, use format like "response_001_5_chunk_1.html" major = int(actual_num) minor = int(round((actual_num - major) * 100)) # 1.5 -> 50, 1.1 -> 10 if minor > 0: fname = f"response_{major:03d}_{minor:02d}_chunk_{chunk_idx}.html" else: # It's like 1.0, just use the integer part fname = f"response_{major:03d}_chunk_{chunk_idx}.html" else: fname = f"response_{actual_num:03d}_chunk_{chunk_idx}.html" else: # Last chunk or single chunk - use regular naming fname = FileUtilities.create_chapter_filename(chapter, actual_num) if hasattr(self.client, 'set_output_filename'): self.client.set_output_filename(fname) if hasattr(self.client, '_current_output_file'): self.client._current_output_file = fname # Set thread-local label so downstream logs include chapter/chunk try: tls = self.client._get_thread_local_client() tls.current_request_label = f"Chapter {actual_num} (chunk {chunk_idx}/{total_chunks})" except Exception: pass # Log removed - unified_api_client._log_pre_stagger will log this # print(f"📤 Sending Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks} to API...") chapter_ctx = { 'chapter': actual_num, 'chunk': chunk_idx, 'total_chunks': total_chunks, } # Get chunk timeout from environment retry_timeout_enabled = os.getenv("RETRY_TIMEOUT", "0") == "1" chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "1800")) if retry_timeout_enabled else None # Timeout retry logic (same as sequential mode) try: max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) except Exception: max_timeout_retries = 2 timeout_retry_count = 0 while True: try: result, finish_reason, raw_obj_from_send = send_with_interrupt( chapter_msgs, self.client, self.config.TEMP, self.config.MAX_OUTPUT_TOKENS, local_stop_cb, chunk_timeout=chunk_timeout, context='translation', chapter_context=chapter_ctx, bypass_graceful_stop=True ) break # Success, exit retry loop except UnifiedClientError as e: error_msg = str(e) # Treat cancelled errors (from client being closed) as timeout if "cancelled" in error_msg or "Gemini client not initialized" in error_msg: # Check stop flag before retrying if local_stop_cb(): print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during timeout retry") return None, chunk_idx, None, False, "cancelled" # During graceful stop, don't retry - skip this chunk graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Skipped (graceful stop)") return None, chunk_idx, None, False, "graceful_stop" if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: API cancelled/client closed, retrying ({timeout_retry_count}/{max_timeout_retries})...") # Reinitialize the client if it was closed (check correct client based on type) client_type = getattr(self.client, 'client_type', 'unknown') needs_reinit = False if client_type == 'gemini': needs_reinit = hasattr(self.client, 'gemini_client') and self.client.gemini_client is None elif client_type == 'openai': needs_reinit = hasattr(self.client, 'openai_client') and self.client.openai_client is None if needs_reinit: try: print(f" 🔄 Reinitializing {client_type} client...") self.client._setup_client() except Exception as reinit_err: print(f" ⚠️ Failed to reinitialize client: {reinit_err}") # Stagger retries to avoid simultaneous API calls # Use SEND_INTERVAL_SECONDS as base, random from half to full import random base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) retry_delay = random.uniform(base_delay / 2, base_delay) print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") time.sleep(retry_delay) continue else: # Max retries reached, return timeout to trigger chapter abort print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached") return "[TIMEOUT]", chunk_idx, None, False, "timeout" # Check for timeout errors elif "timed out" in error_msg: # Check stop flag before retrying if local_stop_cb(): print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during timeout retry") return None, chunk_idx, None, False, "cancelled" # During graceful stop, don't retry - skip this chunk graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timed out during graceful stop - skipping retry") return "[TIMEOUT]", chunk_idx, None, False, "timeout" if timeout_retry_count < max_timeout_retries: timeout_retry_count += 1 print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: API call timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...") # Stagger retries to avoid simultaneous API calls # Use SEND_INTERVAL_SECONDS as base, random from half to full import random base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) retry_delay = random.uniform(base_delay / 2, base_delay) print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") time.sleep(retry_delay) continue else: # Max retries reached, return timeout to trigger chapter abort print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached") return "[TIMEOUT]", chunk_idx, None, False, "timeout" else: # Not a timeout error, re-raise raise # Use the raw object directly from send_with_interrupt raw_obj = raw_obj_from_send # if raw_obj: # print(f"🧠 Captured thought signature for chunk {chunk_idx}/{total_chunks}") if total_chunks and int(total_chunks) > 1: print(f"📥 Received Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks} response, finish_reason: {finish_reason}") else: print(f"📥 Received Chapter {actual_num} response, finish_reason: {finish_reason}") # Char-ratio truncation retry (silent truncation) char_ratio_exhausted = False try: retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" except Exception: retry_truncated_enabled = False char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1" if retry_truncated_enabled and char_ratio_enabled: has_base64_image = ('data:image' in chunk_html) or ('base64,' in chunk_html) used_fallback = getattr(self.client, '_used_fallback_key', False) # Parse settings with sane bounds try: char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50")) except Exception: char_ratio_threshold_pct = 50.0 try: char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1")) except Exception: char_ratio_retry_limit = 1 try: char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100")) except Exception: char_ratio_min_output_chars = 100 char_ratio_threshold_pct = max(0.0, min(100.0, char_ratio_threshold_pct)) char_ratio_threshold = char_ratio_threshold_pct / 100.0 if char_ratio_retry_limit < 1: char_ratio_retry_limit = 1 if char_ratio_min_output_chars < 0: char_ratio_min_output_chars = 0 char_ratio_retry_count = 0 while not has_base64_image: # Stop before any retries if local_stop_cb(): break input_char_count = len(chunk_html) output_char_count = len(result) if result else 0 char_ratio = (output_char_count / input_char_count) if input_char_count > 0 else 0 # Only apply the char-ratio check when we didn't already see a truncation/prohibited-content signal if finish_reason in ["length", "max_tokens", "content_filter", "prohibited_content"]: break if (char_ratio < char_ratio_threshold) and (output_char_count > char_ratio_min_output_chars): # If the key fallback logic triggered, accept the output to avoid burning retries on worse keys if used_fallback: print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio suggests truncation but fallback key was used - accepting output") break if char_ratio_retry_count >= char_ratio_retry_limit: print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: All {char_ratio_retry_limit} char-ratio retries exhausted; marking as TRUNCATED") char_ratio_exhausted = True break if char_ratio_retry_count == 0: print( f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio suggests truncation " f"(Input chars: {input_char_count}, Output chars: {output_char_count}, Ratio: {char_ratio:.2f} < {char_ratio_threshold:.2f}). " f"Attempting up to {char_ratio_retry_limit} retry(ies)..." ) char_ratio_retry_count += 1 print( f"🔄 Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry attempt " f"{char_ratio_retry_count}/{char_ratio_retry_limit}" ) # Force higher token limit on retries try: base_max_tokens = int(self.config.MAX_OUTPUT_TOKENS) except Exception: base_max_tokens = self.config.MAX_OUTPUT_TOKENS try: retry_cap = int(getattr(self.config, "MAX_RETRY_TOKENS", base_max_tokens)) except Exception: retry_cap = base_max_tokens if retry_cap <= 0: retry_cap = base_max_tokens retry_max_tokens = max(base_max_tokens, retry_cap) # Prevent nested truncation retries within the unified client during our char-ratio retries try: tls_retry_client = self.client._get_thread_local_client() except Exception: tls_retry_client = None if tls_retry_client is not None: setattr(tls_retry_client, "_in_truncation_retry", True) try: result_retry, finish_reason_retry, raw_obj_retry = send_with_interrupt( chapter_msgs, self.client, self.config.TEMP, retry_max_tokens, local_stop_cb, chunk_timeout=chunk_timeout, context='translation', chapter_context=chapter_ctx, bypass_graceful_stop=True ) except UnifiedClientError as e: # Treat timeout during char-ratio retry as a timeout for the chunk error_msg = str(e) if "cancelled" in error_msg or "Gemini client not initialized" in error_msg: if local_stop_cb(): print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during char-ratio retry") return None, chunk_idx, None, False, "cancelled" graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Skipped char-ratio retry (graceful stop)") return None, chunk_idx, None, False, "graceful_stop" print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry failed due to API cancellation") return "[TIMEOUT]", chunk_idx, None, False, "timeout" if "timed out" in error_msg: print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry timed out after {chunk_timeout} seconds") return "[TIMEOUT]", chunk_idx, None, False, "timeout" print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry error: {e}. Accepting current output.") break finally: if tls_retry_client is not None: try: setattr(tls_retry_client, "_in_truncation_retry", False) except Exception: pass retry_output_chars = len(result_retry) if result_retry else 0 if result_retry and retry_output_chars > output_char_count: print( f"✅ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry improved output " f"({output_char_count} → {retry_output_chars} chars)" ) result = result_retry finish_reason = finish_reason_retry raw_obj = raw_obj_retry # Re-check ratio / decide on further retries continue print( f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry did not improve output " f"({output_char_count} → {retry_output_chars} chars). Trying again if attempts remain..." ) continue # Ratio OK (or output too short to be meaningful): stop checking break # Treat truncation retries exhaustion as truncation even if finish_reason changed # In batch mode each worker has its own thread-local client; check that flag too try: tls_client = self.client._get_thread_local_client() except Exception: tls_client = None truncation_exhausted = False if tls_client is not None: truncation_exhausted = getattr(tls_client, "_truncation_retries_exhausted", False) if not truncation_exhausted: truncation_exhausted = getattr(self.client, "_truncation_retries_exhausted", False) # Clear the flag on whichever client had it so it doesn't bleed into later calls try: if tls_client is not None and getattr(tls_client, "_truncation_retries_exhausted", False): tls_client._truncation_retries_exhausted = False except Exception: pass try: if getattr(self.client, "_truncation_retries_exhausted", False): self.client._truncation_retries_exhausted = False except Exception: pass if finish_reason in ["length", "max_tokens"] or truncation_exhausted or char_ratio_exhausted: print(f" ⚠️ Chunk {chunk_idx}/{total_chunks} response was TRUNCATED!") # Track truncation status is_truncated = True else: is_truncated = False if result: # Remove chunk markers from result result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE) return result, chunk_idx, raw_obj, is_truncated, finish_reason else: raise Exception(f"Empty result for chunk {chunk_idx}/{total_chunks}") # Use ThreadPoolExecutor to process chunks in parallel # Use same batch size as chapter-level parallelism max_chunk_workers = min(total_chunks, self.config.BATCH_SIZE) # Shared abort flag for this chapter's chunks (set when a chunk hits prohibited content) chunk_abort_event = threading.Event() # Stop callback that also checks the per-chapter abort flag def _user_stop_requested() -> bool: try: return (self.check_stop_fn() if hasattr(self, "check_stop_fn") else False) except Exception: return False def local_stop_cb() -> bool: # 1. Check for immediate aborts (errors, etc.) if chunk_abort_event.is_set(): return True # 2. Check for user stop request if not _user_stop_requested(): return False # 3. User requested stop. Check type of stop. graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if not graceful_stop_active: # Force stop return True # 4. Graceful stop active. Check policies. wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if wait_for_chunks: # User explicitly wants to wait for chunks -> Do not stop. return False # 5. Graceful stop, WAIT_FOR_CHUNKS=0. # Only wait if ALL chunks are already sent/done. try: if _all_chunks_sent_or_done(): return False except Exception: pass # 6. Otherwise (Graceful stop, partial chunks in flight) -> Stop. return True # WAIT_FOR_CHUNKS semantics (batch translation): # - WAIT_FOR_CHUNKS=1: always wait for remaining chunks of this chapter # - WAIT_FOR_CHUNKS=0: ONLY wait if all chunks have already been *sent to the API* (post-stagger) # Otherwise, cancel this chapter (do not write partial output). sent_or_done_chunks = set() # chunk indices (1-based) that were in-flight or completed def _update_sent_or_done_from_watchdog() -> None: try: import unified_api_client st = unified_api_client.get_api_watchdog_state() if hasattr(unified_api_client, 'get_api_watchdog_state') else {} entries = st.get('in_flight_entries', []) if isinstance(st, dict) else [] if not isinstance(entries, list): return chap_key = str(actual_num) for e in entries: if not isinstance(e, dict): continue if e.get('status') != 'in_flight': continue if str(e.get('chapter')) != chap_key: continue try: tot = int(e.get('total_chunks') or 0) except Exception: tot = 0 if tot and int(total_chunks or 0) and tot != int(total_chunks or 0): continue try: ch = int(e.get('chunk') or 0) except Exception: ch = 0 if ch: sent_or_done_chunks.add(ch) except Exception: return def _all_chunks_sent_or_done() -> bool: try: _update_sent_or_done_from_watchdog() except Exception: pass return int(total_chunks or 0) > 0 and len(sent_or_done_chunks) >= int(total_chunks or 0) def _cancel_chapter_due_to_stop(reason: str): # Ensure remaining chunk workers abort quickly try: chunk_abort_event.set() except Exception: pass try: print(f"🛑 Chapter {actual_num}: cancelling chapter (WAIT_FOR_CHUNKS=0) — {reason}") except Exception: pass # Force a real cancel so in-flight requests stop too (user asked for full-stop for this chapter) try: from unified_api_client import UnifiedClientError import unified_api_client if hasattr(unified_api_client, 'set_stop_flag'): unified_api_client.set_stop_flag(True) if hasattr(unified_api_client, 'global_stop_flag'): unified_api_client.global_stop_flag = True if hasattr(unified_api_client, 'UnifiedClient'): unified_api_client.UnifiedClient._global_cancelled = True if hasattr(unified_api_client, 'hard_cancel_all'): unified_api_client.hard_cancel_all() raise UnifiedClientError("Operation cancelled by user", error_type="cancelled") except Exception as e: # If UnifiedClientError isn't available for some reason, raise a normal cancellation raise last_chunk_raw_obj = None chapter_truncated = False # Track if any chunk was truncated with ThreadPoolExecutor(max_workers=max_chunk_workers, thread_name_prefix=f"Ch{actual_num}Chunk") as chunk_executor: # Submit chunks with staggered delay to prevent simultaneous starts thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) future_to_chunk = {} for idx, chunk_data in enumerate(chunks): # Sleep BEFORE submitting (apply to all chunks when multiple chunks exist) if thread_delay > 0 and total_chunks > 1: chunk_num = chunk_data[1] # Extract chunk number for logging print(f"🧵 Chapter {actual_num}: Delaying {thread_delay}s before submitting chunk {chunk_num}/{total_chunks}") # Interruptible sleep - check stop flag every 0.1s # But respect WAIT_FOR_CHUNKS setting during graceful stop elapsed = 0 check_interval = 0.1 while elapsed < thread_delay: # Read env vars INSIDE loop to catch stop pressed mid-delay graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' user_stop = _user_stop_requested() if user_stop: if graceful_stop_active and wait_for_chunks: # Explicit wait-for-chunks: keep going (we will still submit remaining chunks) pass elif graceful_stop_active and (not wait_for_chunks): # WAIT_FOR_CHUNKS disabled: only wait if every chunk is already sent. if _all_chunks_sent_or_done(): # Should be rare here (we're still about to submit), but keep consistent. pass else: _cancel_chapter_due_to_stop("stop requested before all chunks were sent") else: # Immediate stop print(f"🛑 Chunk submission delay interrupted") raise Exception("Translation stopped by user during chunk submission delay") if chunk_abort_event.is_set(): raise Exception("Translation stopped (chapter abort)") sleep_chunk = min(check_interval, thread_delay - elapsed) time.sleep(sleep_chunk) elapsed += sleep_chunk # Now submit the chunk future = chunk_executor.submit(process_chunk, chunk_data) future_to_chunk[future] = chunk_data[1] # Store chunk index # Collect results as they complete completed_chunks = 0 for future in as_completed(future_to_chunk): # Read env vars INSIDE loop to catch stop pressed mid-chunk graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' stop_requested = local_stop_cb() # FIRST: Get the result (future already completed, so this is instant) # With graceful stop ON, we should save completed work before stopping try: result, chunk_idx, raw_obj, is_truncated, finish_reason = future.result() # Handle graceful-stop skipped chunks if finish_reason == "graceful_stop": save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False)) if save_partial_results: fname = FileUtilities.create_chapter_filename(chapter, actual_num) partial_content = None try: tls = self.client._get_thread_local_client() partial_content = getattr(tls, '_last_truncated_content', None) except Exception: partial_content = getattr(self.client, '_last_truncated_content', None) if isinstance(partial_content, str) and partial_content: try: with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(partial_content) except Exception: pass with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TRUNCATED"], chapter_obj=chapter ) self.save_progress_fn() print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — saved truncated output") else: with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["PARTIAL"], chapter_obj=chapter ) self.save_progress_fn() print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — marked QA failed (PARTIAL)") chunk_abort_event.set() chunk_executor.shutdown(wait=False, cancel_futures=True) # Let the outer handler mark the chapter as pending/skipped raise UnifiedClientError( "Graceful stop active - not starting new API call", error_type="cancelled" ) # Handle cancelled chunks (skipped due to stop request) if finish_reason == "cancelled" or (result is None and finish_reason != "stop"): print(f"⏭️ Chunk {chunk_idx}/{total_chunks} cancelled (stop requested)") chunk_executor.shutdown(wait=False, cancel_futures=True) raise Exception("Translation stopped by user") # Immediate QA fail: stop remaining chunks and mark chapter if finish_reason in ("content_filter", "prohibited_content", "error"): # Signal other chunk workers to abort quickly (chapter-local only) chunk_abort_event.set() fname = FileUtilities.create_chapter_filename(chapter, actual_num) save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False)) if save_prohibited_results: # Do NOT preserve original; save AI output if any, otherwise empty try: with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(result if isinstance(result, str) else "") except Exception: pass with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["PROHIBITED_CONTENT"], chapter_obj=chapter ) self.save_progress_fn() chunk_executor.shutdown(wait=False, cancel_futures=True) return False, actual_num, None, None, None # Handle timeout failures - abort chapter and mark as failed if finish_reason == "timeout": chunk_abort_event.set() fname = FileUtilities.create_chapter_filename(chapter, actual_num) print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timeout - aborting chapter") with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TIMEOUT"], chapter_obj=chapter ) self.save_progress_fn() chunk_executor.shutdown(wait=False, cancel_futures=True) return False, actual_num, None, None, None # Handle truncation - abort chapter and mark as failed # Check if RETRY_TRUNCATED is enabled - if so, truncation should abort chapter retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" if is_truncated and retry_truncated_enabled: chunk_abort_event.set() fname = FileUtilities.create_chapter_filename(chapter, actual_num) print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Truncated - aborting chapter") save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False)) if save_partial_results: try: with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(result if isinstance(result, str) else "") except Exception: pass with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TRUNCATED"], chapter_obj=chapter ) self.save_progress_fn() chunk_executor.shutdown(wait=False, cancel_futures=True) return False, actual_num, None, None, None if result: # Store result at correct index to maintain order with chunks_lock: translated_chunks[chunk_idx - 1] = result # chunk_idx is 1-based self.chunks_completed += 1 completed_chunks += 1 # Store the raw object if it's the last chunk (or the only chunk) if chunk_idx == total_chunks: last_chunk_raw_obj = raw_obj # Track if any chunk was truncated if is_truncated: chapter_truncated = True # Log redundant with "Received Chapter X, Chunk Y" above # print(f"✅ Chunk {chunk_idx}/{total_chunks} completed ({completed_chunks}/{total_chunks})") # Mark this chunk as done (if we got a real result) try: if isinstance(chunk_idx, int) and chunk_idx > 0: sent_or_done_chunks.add(int(chunk_idx)) except Exception: pass # AFTER storing result: check if we should stop if stop_requested and total_chunks > 1: if graceful_stop_active and (wait_for_chunks or _all_chunks_sent_or_done()): # Wait for remaining chunks - continue processing print(f"⏳ Graceful stop — waiting for remaining chunks of chapter {actual_num}...") else: # WAIT_FOR_CHUNKS disabled and not all chunks were actually sent: # cancel this chapter entirely (no partial output). try: chunk_executor.shutdown(wait=False, cancel_futures=True) except Exception: pass _cancel_chapter_due_to_stop("stop requested before all chunks were sent") except Exception as e: chunk_idx = future_to_chunk[future] # Don't print chunk error - will be printed at chapter level raise # Verify chunks - handle partial completion is_partial_result = False if None in translated_chunks: missing = [i+1 for i, chunk in enumerate(translated_chunks) if chunk is None] completed = [i+1 for i, chunk in enumerate(translated_chunks) if chunk is not None] graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and completed: # Only allow partial output when WAIT_FOR_CHUNKS is explicitly enabled. # When WAIT_FOR_CHUNKS is disabled, we cancel the whole chapter instead. if wait_for_chunks: print(f"⚠️ Chapter {actual_num}: partial translation ({len(completed)}/{total_chunks} chunks) due to graceful stop") translated_chunks = [c for c in translated_chunks if c is not None] is_partial_result = True else: _cancel_chapter_due_to_stop(f"missing chunks {missing} (WAIT_FOR_CHUNKS=0)") else: raise Exception(f"Failed to translate chunks: {missing}") # Combine all chunks if total_chunks > 1: result = '\n'.join(translated_chunks) print(f"🔗 Combined {total_chunks} chunks for Chapter {actual_num}") else: result = translated_chunks[0] if translated_chunks else None if not result: raise Exception("No translation result produced") # Enhanced mode workflow (same as non-batch): # 1. Original HTML -> html2text -> Markdown/plain text (during extraction) # 2. Markdown sent to translation API (better for translation quality) # 3. Translated markdown -> HTML conversion (here) if result and chapter.get("enhanced_extraction", False): print(f"🔄 Converting translated markdown back to HTML...") result = convert_enhanced_text_to_html(result, chapter) # Emergency Image Restoration (if enabled) if result and self.config.EMERGENCY_IMAGE_RESTORE: # Use original HTML if available (for enhanced extraction), otherwise use chapter_body source_html = chapter.get('original_html', chapter_body) result = ContentProcessor.emergency_restore_images(result, source_html) if self.config.REMOVE_AI_ARTIFACTS: result = ContentProcessor.clean_ai_artifacts(result, True) result = ContentProcessor.clean_memory_artifacts(result) cleaned = re.sub(r"^```(?:html)?\s*\n?", "", result, count=1, flags=re.MULTILINE) cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=self.config.REMOVE_AI_ARTIFACTS) # Post-process: Remove duplicate H1+P pairs from translated OUTPUT if enabled remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1' if remove_duplicate_h1_p and cleaned: # First: HTML-based duplicate removal from bs4 import BeautifulSoup output_soup = BeautifulSoup(cleaned, 'html.parser') for h1_tag in output_soup.find_all('h1'): h1_id = h1_tag.get('id', '') if h1_id and h1_id.startswith('split-'): continue h1_text = h1_tag.get_text(strip=True) if 'SPLIT MARKER' in h1_text: continue # Check next sibling (P after H1) next_sibling = h1_tag.find_next_sibling() if next_sibling and next_sibling.name == 'p': if h1_text == next_sibling.get_text(strip=True): next_sibling.decompose() continue # Check previous sibling (P before H1) prev_sibling = h1_tag.find_previous_sibling() if prev_sibling and prev_sibling.name == 'p': if h1_text == prev_sibling.get_text(strip=True): prev_sibling.decompose() cleaned = str(output_soup) # Second: Markdown-based duplicate removal (for enhanced extraction mode) # Pattern: "Title Text\n\n# Title Text" - remove the plain text line before markdown header def remove_markdown_duplicate_headers_batch(text): lines = text.split('\n') result_lines = [] i = 0 while i < len(lines): line = lines[i] # Check if this is a non-empty line followed by blank lines and then a markdown header if line.strip() and not line.strip().startswith('#'): # Look ahead for pattern: [blank lines] [# header with same text] j = i + 1 # Skip blank lines while j < len(lines) and not lines[j].strip(): j += 1 # Check if next non-blank line is a markdown header if j < len(lines): next_line = lines[j] header_match = re.match(r'^(#{1,6})\s+(.+)$', next_line) if header_match: header_text = header_match.group(2).strip() # Compare with current line (stripped) if line.strip() == header_text: # Skip this duplicate line, keep blanks and header i += 1 continue result_lines.append(line) i += 1 return '\n'.join(result_lines) cleaned = remove_markdown_duplicate_headers_batch(cleaned) # Check for empty or failed response BEFORE writing to disk if not cleaned or not str(cleaned).strip(): print(f"❌ Batch: Translation empty for chapter {actual_num} — skipping file write") with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, None, status="qa_failed", qa_issues_found=["EMPTY_OUTPUT"]) self.save_progress_fn() return False, actual_num, None, None, None if is_qa_failed_response(cleaned): failure_reason = get_failure_reason(cleaned) print(f"❌ Batch: Translation failed for chapter {actual_num} - marked as failed, no output file created (reason: {failure_reason})") with self.progress_lock: fname = FileUtilities.create_chapter_filename(chapter, actual_num) save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False)) save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False)) should_save = (save_prohibited_results if is_prohibited_failure(cleaned, failure_reason) else save_partial_results) if should_save: try: with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned if isinstance(cleaned, str) else "") except Exception: pass self.update_progress_fn(idx, actual_num, content_hash, fname, status="qa_failed", ai_features=ai_features) self.save_progress_fn() return False, actual_num, None, None, None # NOTE: We no longer append to translation history here in the worker thread. # History is now written in the main thread per batch, in a stable order. fname = FileUtilities.create_chapter_filename(chapter, actual_num) # CRITICAL: Unescape img tags that were converted to HTML entities (applies to ALL HTML) # Pattern matches: <img ... /> where the tag ends with / # Post-process: Fix empty attribute tags for BeautifulSoup mode if os.getenv('FIX_EMPTY_ATTR_TAGS_BS', '0') == '1' and not chapter.get('enhanced_extraction', False): cleaned = _fix_empty_attr_tags_bs(cleaned) img_count = len(re.findall(r'<img\s[^>]*?/>', cleaned, flags=re.IGNORECASE)) if img_count > 0: print(f"🖼️ Unescaping {img_count} img tag(s) from HTML entities (post-processing)") cleaned = re.sub( r'<(img\s[^>]*?/)>', r'<\1>', cleaned, flags=re.IGNORECASE ) if self.is_text_file: # For text files, save as plain text fname_txt = fname.replace('.html', '.txt') if fname.endswith('.html') else fname # Extract text from HTML from bs4 import BeautifulSoup soup = BeautifulSoup(cleaned, 'html.parser') text_content = soup.get_text(strip=True) # Merge image translations back with text translation if 'final_body_with_images' in locals() and image_translations: # Parse both versions soup_with_images = BeautifulSoup(final_body_with_images, 'html.parser') soup_with_text = BeautifulSoup(cleaned, 'html.parser') # Get the translated text content (without images) body_content = soup_with_text.body # Add image translations to the translated content for trans_div in soup_with_images.find_all('div', class_='translated-text-only'): body_content.insert(0, trans_div) final_html = str(soup_with_text) cleaned = final_html with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned) # Update with .txt filename with self.progress_lock: self.update_progress_fn(idx, actual_num, content_hash, fname_txt, status="completed", ai_features=ai_features) self.save_progress_fn() else: # Original code for EPUB files with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned) print(f"💾 Saved Chapter {actual_num}: {fname} ({len(cleaned)} chars)") # Initialize ai_features at the beginning to ensure it's always defined if ai_features is None: ai_features = None # Extract and save AI features for future duplicate detection if (self.config.RETRY_DUPLICATE_BODIES and hasattr(self.config, 'DUPLICATE_DETECTION_MODE') and self.config.DUPLICATE_DETECTION_MODE in ['ai-hunter', 'cascading']): try: # Extract features from the translated content cleaned_text = re.sub(r'<[^>]+>', '', cleaned).strip() # Note: self.translator doesn't exist, so we can't extract features here # The features will need to be extracted during regular processing print(f" ⚠️ AI features extraction not available in batch mode") except Exception as e: print(f" ⚠️ Failed to extract AI features: {e}") with self.progress_lock: # Check for truncation or partial result first if chapter_truncated: chapter_status = "qa_failed" print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: Response was truncated") self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features, qa_issues_found=["TRUNCATED"]) self.save_progress_fn() return False, actual_num, None, None, None elif is_partial_result: chapter_status = "qa_failed" print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: Partial translation (graceful stop)") self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features, qa_issues_found=["PARTIAL"]) self.save_progress_fn() return False, actual_num, None, None, None else: chapter_status = "completed" self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features) self.save_progress_fn() self.chapters_completed += 1 # Log removed - executor loop will log "Chapter X done" # print(f"✅ Chapter {actual_num} completed successfully") # Return chapter body and final cleaned translation so the main thread # can append to translation history in a stable batch order. return True, actual_num, chapter_body, cleaned, last_chunk_raw_obj except Exception as e: # Graceful-stop pre-send cancellations are expected (they prevent queued calls from starting). # Do not spam per-chapter "failed" logs, and do not mark these chapters as failed. error_msg = str(e) is_graceful_stop_skip = ( "graceful stop active - not starting new api call" in (error_msg or "").lower() or (hasattr(e, 'error_type') and getattr(e, 'error_type', None) == 'cancelled' and os.environ.get('GRACEFUL_STOP') == '1') ) if is_graceful_stop_skip: # Keep a concise log so the user understands why the chapter didn't run. # (Do NOT include the original error text, since it is noisy and is suppressed in the GUI logger.) print(f"⏭️ Chapter {actual_num} skipped (graceful stop)") try: fname = FileUtilities.create_chapter_filename(chapter, actual_num) with self.progress_lock: # Reset back to pending so it can be resumed later. self.update_progress_fn(idx, actual_num, content_hash, fname, status="pending") self.save_progress_fn() except Exception: pass return False, actual_num, None, None, None with self.progress_lock: # Use the same output filename so we can track failed chapters properly fname = FileUtilities.create_chapter_filename(chapter, actual_num) # Check if it's a timeout failure if "[TIMEOUT]" in error_msg or (hasattr(e, 'error_type') and e.error_type == 'timeout'): self.update_progress_fn(idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TIMEOUT"], chapter_obj=chapter) else: self.update_progress_fn(idx, actual_num, content_hash, fname, status="failed") self.save_progress_fn() # Print consolidated error message if total_chunks > 1: print(f"❌ Chapter {actual_num} failed (chunk {chunk_idx}/{total_chunks}): {e}") else: print(f"❌ Chapter {actual_num} failed: {e}") # No history for failed chapters return False, actual_num, None, None, None def process_merged_group(self, merge_group, progress_manager): """ Process a merge group (multiple chapters merged into a single API request). Args: merge_group: List of (idx, chapter) tuples to merge progress_manager: ProgressManager instance for updating merged chapter status Returns: List of results, each in format: (success, actual_num, hist_user, hist_assistant, raw_obj) """ import threading if len(merge_group) == 1: # Single chapter, process normally result = self.process_single_chapter(merge_group[0]) return [result] # Get info for all chapters in the group chapters_data = [] # List of (chapter_num, content, idx, chapter_obj, content_hash) parent_idx, parent_chapter = merge_group[0] parent_actual_num = parent_chapter.get('actual_chapter_num', parent_chapter['num']) # Check for graceful stop before starting work graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active or self.check_stop_fn(): # Return failed results for all chapters in the group results = [] for idx, chapter in merge_group: actual_num = chapter.get('actual_chapter_num', chapter['num']) results.append((False, actual_num, None, None, None)) raise Exception("Translation stopped by user") # Only log if not about to be stopped thread_name = threading.current_thread().name print(f"\n🔗 [{thread_name}] Processing MERGED group: Chapters {[c.get('actual_chapter_num', c['num']) for _, c in merge_group]}") # Double-check stop after logging but before doing real work if self.check_stop_fn(): # Return failed results for all chapters in the group results = [] for idx, chapter in merge_group: actual_num = chapter.get('actual_chapter_num', chapter['num']) results.append((False, actual_num, None, None, None)) raise Exception("Translation stopped by user") # Check ignore settings for filtering batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1' for idx, chapter in merge_group: actual_num = chapter.get('actual_chapter_num', chapter['num']) content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"]) # Get chapter body and apply ignore filters if needed chapter_body = chapter["body"] if (not use_title_tag or ignore_header_tags or remove_duplicate_h1_p) and chapter_body: from bs4 import BeautifulSoup body_soup = BeautifulSoup(chapter_body, 'html.parser') # Remove title tags if ignored (including those in ) if not use_title_tag: for title_tag in body_soup.find_all('title'): title_tag.decompose() # Remove header tags if ignored if ignore_header_tags: for header_tag in body_soup.find_all(['h1', 'h2', 'h3']): header_tag.decompose() # Remove duplicate H1+P pairs (where P is adjacent to H1 with same text) if remove_duplicate_h1_p: for h1_tag in body_soup.find_all('h1'): # Skip split marker H1 tags h1_id = h1_tag.get('id', '') if h1_id and h1_id.startswith('split-'): continue h1_text = h1_tag.get_text(strip=True) if 'SPLIT MARKER' in h1_text: continue # Check next sibling (P after H1) next_sibling = h1_tag.find_next_sibling() if next_sibling and next_sibling.name == 'p': p_text = next_sibling.get_text(strip=True) if h1_text == p_text: next_sibling.decompose() continue # Check previous sibling (P before H1) prev_sibling = h1_tag.find_previous_sibling() if prev_sibling and prev_sibling.name == 'p': p_text = prev_sibling.get_text(strip=True) if h1_text == p_text: prev_sibling.decompose() chapter_body = str(body_soup) # Also apply markdown-based duplicate removal (for enhanced extraction mode) if remove_duplicate_h1_p and chapter_body: def remove_markdown_duplicate_headers_input(text): lines = text.split('\n') result = [] i = 0 while i < len(lines): line = lines[i] # Check if this is a non-empty line followed by blank lines and then a markdown header if line.strip() and not line.strip().startswith('#'): # Look ahead for pattern: [blank lines] [# header with same text] j = i + 1 # Skip blank lines while j < len(lines) and not lines[j].strip(): j += 1 # Check if next non-blank line is a markdown header if j < len(lines): next_line = lines[j] header_match = re.match(r'^(#{1,6})\s+(.+)$', next_line) if header_match: header_text = header_match.group(2).strip() # Compare with current line (stripped) if line.strip() == header_text: # Skip this duplicate line, keep blanks and header i += 1 continue result.append(line) i += 1 return '\n'.join(result) chapter_body = remove_markdown_duplicate_headers_input(chapter_body) chapters_data.append((actual_num, chapter_body, idx, chapter, content_hash)) try: # Mark all chapters as in_progress for actual_num, _, idx, chapter, content_hash in chapters_data: with self.progress_lock: # Determine output filename for tracking (consistent with process_single_chapter) fname = FileUtilities.create_chapter_filename(chapter, actual_num) self.update_progress_fn(idx, actual_num, content_hash, fname, status="in_progress", chapter_obj=chapter) self.save_progress_fn() # Merge chapter contents merge_input = [(cn, content, ch) for cn, content, _, ch, _ in chapters_data] merged_content = RequestMerger.merge_chapters(merge_input) expected_chapters = [cn for cn, _, _, _, _ in chapters_data] print(f" 📊 Merged {len(merge_group)} chapters ({len(merged_content):,} chars total)") # Build system prompt with glossary # Use get_system_prompt() with actual merge count to conditionally include split marker instruction glossary_path = find_glossary_file(self.out_dir) base_system_prompt = self.config.get_system_prompt(actual_merge_count=len(merge_group)) chapter_system_prompt = build_system_prompt( base_system_prompt, glossary_path, source_text=merged_content ) # Build messages rolling_summary_msgs = [] if getattr(self.config, 'USE_ROLLING_SUMMARY', False): try: rs_text = self.get_batch_rolling_summary_text() except Exception: rs_text = "" if isinstance(rs_text, str) and rs_text: rolling_summary_msgs = [{ "role": "assistant", "content": ( "CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n" "[MEMORY] Previous context summary:\n\n" + rs_text + "\n\n" "[END MEMORY]\n" "END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:" ) }] memory_msgs = [] if (self.config.CONTEXTUAL and self.history_manager is not None and getattr(self.config, 'HIST_LIMIT', 0) > 0): try: history = self.history_manager.load_history() hist_limit = getattr(self.config, 'HIST_LIMIT', 0) trimmed = history[-hist_limit * 2:] include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1" for h in trimmed: if not isinstance(h, dict): continue role = h.get('role', 'user') raw_obj = h.get('_raw_content_object') content = h.get('content') or "" if role == 'user' and not include_source: continue if (not content) and raw_obj is None: continue msg = {'role': role} if content: msg['content'] = content if raw_obj is not None: msg['_raw_content_object'] = raw_obj memory_msgs.append(msg) except Exception as e: print(f" ⚠️ Failed to load history for merged group: {e}") # Build optional assistant prefill message if configured assistant_prefill_msgs = [] if getattr(self.config, 'ASSISTANT_PROMPT', '') and self.config.ASSISTANT_PROMPT.strip(): assistant_prefill_msgs = [{"role": "assistant", "content": self.config.ASSISTANT_PROMPT.strip()}] msgs = [{"role": "system", "content": chapter_system_prompt}] + rolling_summary_msgs + memory_msgs + assistant_prefill_msgs + [ {"role": "user", "content": merged_content} ] # Prepare split-failed retry controls try: split_retry_limit = int(getattr(self.config, 'SPLIT_FAILED_RETRY_ATTEMPTS', 2)) except Exception: split_retry_limit = 2 disable_fallback_flag = (os.getenv('DISABLE_MERGE_FALLBACK', '0') == '1') or bool(getattr(self.config, 'DISABLE_MERGE_FALLBACK', False)) # Use toggle/config for split retries (works in batch and non-batch) split_retry_enabled = (os.getenv('RETRY_SPLIT_FAILED', '0') == '1') or bool(getattr(self.config, 'RETRY_SPLIT_FAILED', False)) split_retry_attempts = 0 print(f" [DEBUG] Split retry enabled={split_retry_enabled}, limit={split_retry_limit}, disable_fallback={disable_fallback_flag}") # Log combined prompt token count for merged request (treated as Chunk 1/1). try: # Use the same token counter as regular batch splitting. # Instantiate a lightweight ChapterSplitter here for counting only. chapter_splitter = ChapterSplitter(model_name=self.config.MODEL) # Count tokens for system+assistant(user/memory) messages total_tokens = 0 assistant_tokens = 0 for m in msgs: content = m.get("content", "") tokens = chapter_splitter.count_tokens(content) total_tokens += tokens if m.get("role") == "assistant": assistant_tokens += tokens non_assistant_tokens = total_tokens - assistant_tokens # Determine a stable file reference based on parent chapter parent_file_ref = ( parent_chapter.get('original_basename') or parent_chapter.get('filename') or f"Chapter_{parent_actual_num}" ) # Get budget string from MAX_INPUT_TOKENS token_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip() _, budget_str = parse_token_limit(token_env) if self.config.CONTEXTUAL and assistant_tokens > 0: print( f"💬 Chunk 1/1 combined prompt: " f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, " f"assistant/memory: {assistant_tokens:,}) / {budget_str} [File: {parent_file_ref}]" ) else: print( f"💬 Chunk 1/1 combined prompt: " f"{total_tokens:,} tokens (system + user) / {budget_str} [File: {parent_file_ref}]" ) except Exception as e: # Never break translation due to logging issues. print(f" ⚠️ Failed to log combined prompt tokens for merged group: {e}") # Get max output tokens env_max_output = os.getenv("MAX_OUTPUT_TOKENS", "") if env_max_output.isdigit() and int(env_max_output) > 0: mtoks = int(env_max_output) else: mtoks = self.config.MAX_OUTPUT_TOKENS # Finite retry loop to avoid infinite re-requests when Split‑the‑Merge keeps failing. max_merge_attempts = (max(1, split_retry_limit) + 1) if split_retry_enabled else 1 split_retry_attempts = 0 # Track char-ratio retries across the entire merged-group request sequence # (don't reset per split-retry attempt) char_ratio_attempts_used = 0 while split_retry_attempts < max_merge_attempts: # Call API for merged content print(f" 🌐 Sending merged request to API...") # Build chapter context with merged chapter numbers for progress bar display merged_chapter_nums_for_context = [cn for cn, _, _, _, _ in chapters_data] chapter_ctx = { 'chapter': parent_actual_num, 'chunk': 1, 'total_chunks': 1, 'merged_chapters': merged_chapter_nums_for_context, } merged_response, finish_reason, raw_obj = send_with_interrupt( msgs, self.client, self.config.TEMP, mtoks, self.check_stop_fn, context='translation', chapter_context=chapter_ctx, ) # Preserve the finish reason from the merged API call for later status decisions. merged_finish_reason = finish_reason truncation_exhausted = getattr(self.client, "_truncation_retries_exhausted", False) if truncation_exhausted: try: self.client._truncation_retries_exhausted = False except Exception: pass # During graceful stop, let the split complete instead of aborting if self.check_stop_fn() and os.environ.get('GRACEFUL_STOP') != '1': raise Exception("Translation stopped by user") if not merged_response: raise Exception("Empty response from API for merged request") # Char-ratio truncation retry (silent truncation) char_ratio_exhausted = False try: retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" except Exception: retry_truncated_enabled = False char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1" if retry_truncated_enabled and char_ratio_enabled and not truncation_exhausted: has_base64_image = ('data:image' in merged_content) or ('base64,' in merged_content) used_fallback = getattr(self.client, '_used_fallback_key', False) # Parse settings with sane bounds try: char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50")) except Exception: char_ratio_threshold_pct = 50.0 try: char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1")) except Exception: char_ratio_retry_limit = 1 try: char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100")) except Exception: char_ratio_min_output_chars = 100 char_ratio_threshold_pct = max(0.0, min(100.0, char_ratio_threshold_pct)) char_ratio_threshold = char_ratio_threshold_pct / 100.0 if char_ratio_retry_limit < 1: char_ratio_retry_limit = 1 if char_ratio_min_output_chars < 0: char_ratio_min_output_chars = 0 char_ratio_retry_count = 0 while not has_base64_image: if self.check_stop_fn(): break # Only apply the char-ratio check when we didn't already see a truncation/prohibited-content signal if merged_finish_reason in ["length", "max_tokens", "content_filter", "prohibited_content"]: break input_char_count = len(merged_content) output_char_count = len(merged_response) if merged_response else 0 char_ratio = (output_char_count / input_char_count) if input_char_count > 0 else 0 if (char_ratio < char_ratio_threshold) and (output_char_count > char_ratio_min_output_chars): if used_fallback: print(f"⚠️ Merged group: Char-ratio suggests truncation but fallback key was used - accepting output") break # IMPORTANT: track retries across the whole merged-group request sequence # so split-failed merge retries don't reset the char-ratio retry budget. if char_ratio_attempts_used >= char_ratio_retry_limit: print(f"❌ Merged group: All {char_ratio_retry_limit} char-ratio retries exhausted; marking as TRUNCATED") char_ratio_exhausted = True break if char_ratio_retry_count == 0: remaining = max(0, char_ratio_retry_limit - char_ratio_attempts_used) print( f"⚠️ Merged group: Char-ratio suggests truncation " f"(Input chars: {input_char_count}, Output chars: {output_char_count}, Ratio: {char_ratio:.2f} < {char_ratio_threshold:.2f}). " f"Attempting up to {remaining} retry(ies)..." ) # Consume one attempt (global across this merged group) char_ratio_attempts_used += 1 char_ratio_retry_count += 1 print(f"🔄 Merged group: Char-ratio retry attempt {char_ratio_attempts_used}/{char_ratio_retry_limit}") # Force higher token limit on retries try: base_max_tokens = int(mtoks) except Exception: base_max_tokens = mtoks try: retry_cap = int(getattr(self.config, "MAX_RETRY_TOKENS", base_max_tokens)) except Exception: retry_cap = base_max_tokens if retry_cap <= 0: retry_cap = base_max_tokens retry_max_tokens = max(base_max_tokens, retry_cap) # Prevent nested truncation retries within the unified client during our char-ratio retries try: tls_retry_client = self.client._get_thread_local_client() except Exception: tls_retry_client = None if tls_retry_client is not None: setattr(tls_retry_client, "_in_truncation_retry", True) try: merged_response_retry, finish_reason_retry, raw_obj_retry = send_with_interrupt( msgs, self.client, self.config.TEMP, retry_max_tokens, self.check_stop_fn, context='translation', chapter_context=chapter_ctx, ) finally: if tls_retry_client is not None: try: setattr(tls_retry_client, "_in_truncation_retry", False) except Exception: pass # Capture truncation exhaustion that might have occurred during the retry try: retry_trunc_exhausted = getattr(self.client, "_truncation_retries_exhausted", False) if retry_trunc_exhausted: truncation_exhausted = True self.client._truncation_retries_exhausted = False except Exception: pass if self.check_stop_fn() and os.environ.get('GRACEFUL_STOP') != '1': raise Exception("Translation stopped by user") retry_output_chars = len(merged_response_retry) if merged_response_retry else 0 if merged_response_retry and retry_output_chars > output_char_count: print(f"✅ Merged group: Char-ratio retry improved output ({output_char_count} → {retry_output_chars} chars)") merged_response = merged_response_retry finish_reason = finish_reason_retry raw_obj = raw_obj_retry merged_finish_reason = finish_reason_retry continue print( f"⚠️ Merged group: Char-ratio retry did not improve output " f"({output_char_count} → {retry_output_chars} chars). Trying again if attempts remain..." ) continue break # Check for truncation (use preserved finish reason so retries/merges don't lose the flag) merged_truncated = merged_finish_reason in ["length", "max_tokens"] or truncation_exhausted or char_ratio_exhausted if merged_truncated: print(f" ⚠️ Merged response was TRUNCATED!") # Clean the merged response cleaned = merged_response if self.config.REMOVE_AI_ARTIFACTS: cleaned = ContentProcessor.clean_ai_artifacts(cleaned, True) cleaned = ContentProcessor.clean_memory_artifacts(cleaned) cleaned = re.sub(r"^```(?:html)?\s*\n?", "", cleaned, count=1, flags=re.MULTILINE) cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) # Post-process: Fix empty attribute tags for BeautifulSoup mode if os.getenv('FIX_EMPTY_ATTR_TAGS_BS', '0') == '1': try: enhanced_group_check = any(bool(ch.get('enhanced_extraction')) for _, _, _, ch, _ in chapters_data) except Exception: enhanced_group_check = False if not enhanced_group_check: cleaned = _fix_empty_attr_tags_bs(cleaned) # Get parent chapter info parent_actual_num, parent_content, parent_idx, parent_chapter, parent_content_hash = chapters_data[0] merged_child_nums = [cn for cn, _, _, _, _ in chapters_data[1:]] # Check if enhanced extraction was used try: enhanced_group = any(bool(ch.get('enhanced_extraction')) for _, _, _, ch, _ in chapters_data) except Exception: enhanced_group = False # Check if Split the Merge is enabled split_the_merge = os.getenv('SPLIT_THE_MERGE', '0') == '1' # If Split the Merge is enabled, SKIP markdown→HTML conversion here # We'll do it AFTER splitting so markers are preserved if not split_the_merge and enhanced_group and isinstance(cleaned, str): print(" 🔄 Converting merged enhanced text back to HTML...") try: cleaned = convert_enhanced_text_to_html(cleaned, parent_chapter) except Exception as conv_err: print(f" ⚠️ Enhanced HTML conversion failed: {conv_err} — saving raw content") # Emergency Image Restoration (if enabled) if self.config.EMERGENCY_IMAGE_RESTORE: cleaned = ContentProcessor.emergency_restore_images(cleaned, merged_content) # Optionally restore paragraphs if the output lacks structure if getattr(self.config, 'EMERGENCY_RESTORE', False): try: if cleaned and cleaned.count('

') < 3 and len(cleaned) > 300: cleaned = ContentProcessor.emergency_restore_paragraphs(cleaned) except Exception: pass # Check for truncation / QA failures first results = [] if is_qa_failed_response(cleaned): # Only save file for debugging if it contains meaningful content beyond error markers cleaned_stripped = cleaned.strip() is_only_error_marker = cleaned_stripped in [ "[TRANSLATION FAILED]", "[Content Blocked]", "[IMAGE TRANSLATION FAILED]", "[EXTRACTION FAILED]", "[RATE LIMITED]", "[]" ] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]") failure_reason = get_failure_reason(cleaned) save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False)) save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False)) should_save = (save_prohibited_results if is_prohibited_failure(cleaned, failure_reason) else save_partial_results) if should_save: parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) try: cleaned_to_save = cleaned if split_the_merge: cleaned_to_save = re.sub( r']*id=\"split-\\d+\"[^>]*>.*?\\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save if isinstance(cleaned_to_save, str) else "") except Exception: pass elif not is_only_error_marker and cleaned_stripped: parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) try: cleaned_to_save = cleaned if split_the_merge: cleaned_to_save = re.sub( r']*id=\"split-\\d+\"[^>]*>.*?\\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save) except Exception: pass # Use each chapter's own expected filename so we overwrite the existing in_progress entry for actual_num, _, idx, chapter, content_hash in chapters_data: chapter_fname = FileUtilities.create_chapter_filename(chapter, actual_num) with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, chapter_fname, status="qa_failed", chapter_obj=chapter, ) self.save_progress_fn() results.append((False, actual_num, None, None, None)) return results # Now handle split-the-merge disable_fallback = disable_fallback_flag split_sections = None if split_the_merge and len(chapters_data) > 1: # Try to split by invisible markers split_sections = RequestMerger.split_by_markers(cleaned, len(chapters_data)) # If split failed, optionally retry; if retries exhausted, mark qa_failed when fallback disabled if split_the_merge and (not split_sections or len(split_sections) != len(chapters_data)): if split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts: split_retry_attempts += 1 print(f" 🔄 Split failed — retrying merged request (attempt {split_retry_attempts}/{max_merge_attempts - 1})") continue if disable_fallback: print(f" ⚠️ Split failed and fallback disabled - marking merged group as qa_failed") # Only save file for debugging if it contains meaningful content beyond error markers cleaned_stripped = cleaned.strip() is_only_error_marker = cleaned_stripped in [ "[TRANSLATION FAILED]", "[Content Blocked]", "[IMAGE TRANSLATION FAILED]", "[EXTRACTION FAILED]", "[RATE LIMITED]", "[]" ] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]") if not is_only_error_marker and cleaned_stripped: # Save for debugging - contains actual translation attempt that failed split parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) try: cleaned_to_save = cleaned if split_the_merge: cleaned_to_save = re.sub( r']*id=\"split-\\d+\"[^>]*>.*?\\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save) except Exception: pass # IMPORTANT: # Use each chapter's own expected filename so we overwrite the # existing in_progress entry instead of creating composite keys. for actual_num, _, idx, chapter, content_hash in chapters_data: chapter_fname = FileUtilities.create_chapter_filename(chapter, actual_num) with self.progress_lock: self.update_progress_fn( idx, actual_num, content_hash, chapter_fname, status="qa_failed", qa_issues_found=["SPLIT_FAILED"], chapter_obj=chapter, ) self.save_progress_fn() results.append((False, actual_num, None, None, None)) return results # If split failed and fallback is allowed, optionally retry merged translation if split_the_merge and (not split_sections or len(split_sections) != len(chapters_data)) and split_retry_enabled: if split_retry_attempts < split_retry_limit: split_retry_attempts += 1 attempt_no = split_retry_attempts print(f" 🔄 Split failed retry {attempt_no}/{split_retry_limit} — requesting new merged translation") time.sleep(1) # Try a fresh merged request on next loop iteration continue else: print(f" ⚠️ Split failed after {split_retry_limit} retries, falling back to merged output") if split_sections and len(split_sections) == len(chapters_data): # Split successful - save each section as individual file print(f" ✂️ Splitting merged content into {len(split_sections)} individual files") saved_files = [] for i, (actual_num, content, idx, chapter, content_hash) in enumerate(chapters_data): section_content = split_sections[i] # NOW convert markdown→HTML for each section if enhanced extraction was used if enhanced_group and isinstance(section_content, str): try: section_content = convert_enhanced_text_to_html(section_content, chapter) except Exception as conv_err: print(f" ⚠️ Enhanced HTML conversion failed for chapter {actual_num}: {conv_err}") # Generate filename for this chapter using content.opf naming fname = FileUtilities.create_chapter_filename(chapter, actual_num) # Handle text file mode if getattr(self, 'is_text_file', False): fname = fname.replace('.html', '.txt') from bs4 import BeautifulSoup soup = BeautifulSoup(section_content, 'html.parser') section_content = soup.get_text(strip=True) # Save the section with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(section_content) saved_files.append((actual_num, fname, idx, chapter, content_hash)) print(f" 💾 Saved Chapter {actual_num}: {fname} ({len(section_content)} chars)") # Mark all chapters as completed or qa_failed (for truncated) with self.progress_lock: for actual_num, fname, idx, chapter, content_hash in saved_files: chapter_status = "qa_failed" if merged_truncated else "completed" qa_issues = ["TRUNCATED"] if merged_truncated else None self.update_progress_fn( idx, actual_num, content_hash, fname, status=chapter_status, qa_issues_found=qa_issues, chapter_obj=chapter ) self.chapters_completed += 1 # Save once after all updates self.save_progress_fn() # Build results - if truncated, treat as failure for all chapters if merged_truncated: for actual_num, _, idx, chapter, content_hash in chapters_data: results.append((False, actual_num, None, None, None)) else: results.append((True, chapters_data[0][0], merged_content, merged_response, raw_obj)) for actual_num, _, idx, chapter, content_hash in chapters_data[1:]: results.append((True, actual_num, None, None, None)) print(f" ✅ Split the Merge complete: {len(saved_files)} files created") return results # Normal merged behavior (split not enabled or header count mismatch) # Save entire merged response to parent chapter's file fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) # If Split-the-Merge was enabled but we couldn't split reliably, remove injected markers cleaned_to_save = cleaned if split_the_merge and len(chapters_data) > 1: cleaned_to_save = re.sub( r']*id=\"split-\\d+\"[^>]*>.*?\\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) # If translating a plain text source, mirror non-merged behavior and write .txt if getattr(self, 'is_text_file', False): parent_fname = fname.replace('.html', '.txt') from bs4 import BeautifulSoup soup = BeautifulSoup(cleaned_to_save, 'html.parser') text_content = soup.get_text(strip=True) with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f: f.write(text_content) saved_name = parent_fname else: with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save) saved_name = fname print(f" 💾 Saved merged content to Chapter {parent_actual_num}: {saved_name} ({len(cleaned_to_save)} chars)") with self.progress_lock: if merged_truncated: # Truncated merged response: mark ALL chapters as qa_failed # Check if we can retry this truncation failure as a general merge failure if (not char_ratio_exhausted) and split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts: split_retry_attempts += 1 print(f" 🔄 Truncated merged response — retrying request (attempt {split_retry_attempts}/{max_merge_attempts - 1})") time.sleep(2) continue # Check if we can retry this truncation failure as a general merge failure if (not char_ratio_exhausted) and split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts: split_retry_attempts += 1 print(f" 🔄 Truncated merged response — retrying request (attempt {split_retry_attempts}/{max_merge_attempts - 1})") time.sleep(2) continue qa_issues = ["TRUNCATED"] self.update_progress_fn( parent_idx, parent_actual_num, parent_content_hash, saved_name, status="qa_failed", qa_issues_found=qa_issues, chapter_obj=parent_chapter ) for actual_num, _, idx, chapter, content_hash in chapters_data[1:]: self.update_progress_fn( idx, actual_num, content_hash, None, status="qa_failed", qa_issues_found=qa_issues, chapter_obj=chapter ) self.chapters_completed += len(chapters_data) else: # Normal success path: parent completed, children merged self.update_progress_fn( parent_idx, parent_actual_num, parent_content_hash, saved_name, status="completed", merged_chapters=merged_child_nums, chapter_obj=parent_chapter ) self.chapters_completed += 1 # Then mark all child chapters as merged (only after parent is completed) for actual_num, _, idx, chapter, content_hash in chapters_data[1:]: progress_manager.mark_as_merged(idx, actual_num, content_hash, parent_actual_num, chapter, parent_output_file=saved_name) self.chapters_completed += 1 # Save once after all updates self.save_progress_fn() # Build results based on truncation status if merged_truncated: for actual_num, _, idx, chapter, content_hash in chapters_data: results.append((False, actual_num, None, None, None)) else: results.append((True, parent_actual_num, merged_content, merged_response, raw_obj)) for actual_num, _, idx, chapter, content_hash in chapters_data[1:]: results.append((True, actual_num, None, None, None)) return results # Should never hit this line; guard to prevent infinite loop raise RuntimeError("Merged translation exited retry loop without returning a result") except Exception as e: print(f"❌ Merged group failed: {e} (NOTE: API Error triggered cancellation logic)") # Mark all chapters as failed results = [] for actual_num, _, idx, chapter, content_hash in chapters_data: with self.progress_lock: fname = FileUtilities.create_chapter_filename(chapter, actual_num) self.update_progress_fn(idx, actual_num, content_hash, fname, status="failed", chapter_obj=chapter) self.save_progress_fn() results.append((False, actual_num, None, None, None)) return results # ===================================================== # UNIFIED UTILITIES # ===================================================== def sanitize_resource_filename(filename): """Sanitize resource filenames for filesystem compatibility""" filename = unicodedata.normalize('NFC', filename) replacements = { '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_', '"': '_', '<': '_', '>': '_', '|': '_', '\0': '', '\n': '_', '\r': '_' } for old, new in replacements.items(): filename = filename.replace(old, new) filename = ''.join(char for char in filename if ord(char) >= 32) name, ext = os.path.splitext(filename) if not name: name = 'resource' return name + ext def should_retain_source_extension(): """Read GUI toggle for retaining original extension and no 'response_' prefix. This is stored in config or env by the GUI; we read env as bridge. """ return os.getenv('RETAIN_SOURCE_EXTENSION', os.getenv('retain_source_extension', '0')) in ('1', 'true', 'True') def make_safe_filename(title, actual_num): """Create a safe filename that works across different filesystems""" if not title: return f"chapter_{actual_num:03d}" title = unicodedata.normalize('NFC', str(title)) dangerous_chars = { '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_', '"': '_', '<': '_', '>': '_', '|': '_', '\0': '', '\n': ' ', '\r': ' ', '\t': ' ' } for old, new in dangerous_chars.items(): title = title.replace(old, new) title = ''.join(char for char in title if ord(char) >= 32) title = re.sub(r'\s+', '_', title) title = title.strip('_.• \t') if not title or title == '_' * len(title): title = f"chapter_{actual_num:03d}" return title def get_content_hash(html_content): """Create a stable hash of content""" return ContentProcessor.get_content_hash(html_content) def clean_ai_artifacts(text, remove_artifacts=True): """Remove AI response artifacts from text""" return ContentProcessor.clean_ai_artifacts(text, remove_artifacts) def find_glossary_file(output_dir): """Return path to glossary file preferring CSV/MD/TXT over JSON, or None if not found""" candidates = [ os.path.join(output_dir, "glossary.csv"), os.path.join(output_dir, "glossary.md"), os.path.join(output_dir, "glossary.txt"), os.path.join(output_dir, "glossary.json"), ] for p in candidates: if os.path.exists(p): return p return None def clean_memory_artifacts(text): """Remove any memory/summary artifacts""" return ContentProcessor.clean_memory_artifacts(text) def emergency_restore_paragraphs(text, original_html=None, verbose=True): """Emergency restoration when AI returns wall of text""" return ContentProcessor.emergency_restore_paragraphs(text, original_html, verbose) def _fix_empty_attr_tags_bs(text: str) -> str: """Post-process: escape hallucinated empty-attribute tags in BeautifulSoup output. Transforms patterns like Content into <Tag Attr>Content for non-standard HTML tags, preserving real HTML tags untouched. """ known_tags = { 'html','head','body','title','meta','link','style','script','noscript', 'p','div','span','br','hr','img','a','h1','h2','h3','h4','h5','h6', 'ul','ol','li','dl','dt','dd', 'pre','code','em','strong','b','i','u','s','strike','del','ins','mark','small','sub','sup', 'table','thead','tbody','tr','td','th','caption','col','colgroup', 'blockquote','q','cite', 'section','article','header','footer','nav','main','aside','details','summary', 'figure','figcaption', 'form','input','button','select','option','textarea','label','fieldset','legend', 'iframe','canvas','svg','math', 'video','audio','source','track','embed','object','param', 'map','area', 'center', 'font', 'base' } def _repl_pair(m): tagname = m.group(1) if tagname.lower() in known_tags: return m.group(0) attrname = m.group(2) content = m.group(3) return f"<{tagname} {attrname}>{content}" text = re.sub(r'<([a-zA-Z0-9_\-]+)\s+([a-zA-Z0-9_\-]+)=""\s*>(.*?)', _repl_pair, text, flags=re.DOTALL) return text def is_meaningful_text_content(html_content): """Check if chapter has meaningful text beyond just structure""" return ContentProcessor.is_meaningful_text_content(html_content) # ===================================================== # GLOBAL SETTINGS AND FLAGS # ===================================================== logging.basicConfig(level=logging.DEBUG) try: if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='ignore') except AttributeError: if sys.stdout is None: devnull = open(os.devnull, "wb") sys.stdout = io.TextIOWrapper(devnull, encoding='utf-8', errors='ignore') elif hasattr(sys.stdout, 'buffer'): try: sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') except: pass _stop_requested = False def set_stop_flag(value): """Set the global stop flag""" global _stop_requested _stop_requested = value def is_stop_requested(): """Check if stop was requested""" global _stop_requested return _stop_requested def set_output_redirect(log_callback=None): """Redirect print statements to a callback function for GUI integration""" if log_callback: import threading class CallbackWriter: def __init__(self, callback): self.callback = callback self.main_thread = threading.main_thread() def write(self, text): if text.strip(): # The callback (append_log) is already thread-safe - it handles QTimer internally # So we can call it directly from any thread self.callback(text.strip()) def flush(self): pass sys.stdout = CallbackWriter(log_callback) # ===================================================== # EPUB AND FILE PROCESSING # ===================================================== def extract_chapter_number_from_filename(filename, opf_spine_position=None, opf_spine_data=None): """Extract chapter number from filename. Preference order: 1) Rightmost digits in the filename (0 if all zeros) 2) Special keywords with no digits -> 0 3) Legacy fallback patterns """ # Normalize: strip directory, extension, and response_ prefix for parsing basename = os.path.basename(filename) base_no_ext = os.path.splitext(basename)[0] if base_no_ext.lower().startswith('response_'): base_no_ext = base_no_ext[len('response_'):] base_no_ext_lower = base_no_ext.lower() # Priority 1: digits in filename (use rightmost match to mirror GUI column) numbers = re.findall(r'[0-9]+', base_no_ext) if numbers: last_num = int(numbers[-1]) if last_num == 0: return 0, 'filename_zero' return last_num, 'filename_digits' # Priority 2: special keyword files with no digits -> chapter 0 # Priority 3: special keyword files with no digits -> chapter 0 special_keywords = ['title', 'toc', 'cover', 'index', 'copyright', 'preface', 'nav', 'message', 'info', 'notice', 'colophon', 'dedication', 'epigraph', 'foreword', 'acknowledgment', 'author', 'appendix', 'glossary', 'bibliography'] if any(name in base_no_ext_lower for name in special_keywords): return 0, 'special_file' # Priority 3: legacy fallback patterns name_without_ext = base_no_ext fallback_patterns = [ (r'^response_(\d+)[_\.]', 'response_prefix'), (r'[Cc]hapter[_\s]*(\d+)', 'chapter_word'), (r'[Cc]h[_\s]*(\d+)', 'ch_abbreviation'), (r'No(\d+)', 'no_prefix'), (r'第(\d+)[章话回]', 'chinese_chapter'), (r'-h-(\d+)', 'h_suffix'), # For your -h-16 pattern (r'_(\d+)', 'underscore_suffix'), (r'-(\d+)', 'dash_suffix'), (r'(\d+)', 'trailing_number'), ] for pattern, method in fallback_patterns: match = re.search(pattern, name_without_ext, re.IGNORECASE) if match: return int(match.group(1)), method return None, None def process_chapter_images(chapter_html: str, actual_num: int, image_translator: ImageTranslator, check_stop_fn=None) -> Tuple[str, Dict[str, str]]: """Process and translate images in a chapter""" from bs4 import BeautifulSoup images = image_translator.extract_images_from_chapter(chapter_html) if not images: return chapter_html, {} print(f"🖼️ Found {len(images)} images in chapter {actual_num}") soup = BeautifulSoup(chapter_html, 'html.parser') image_translations = {} translated_count = 0 max_images_per_chapter = int(os.getenv('MAX_IMAGES_PER_CHAPTER', '10')) if len(images) > max_images_per_chapter: print(f" ⚠️ Chapter has {len(images)} images - processing first {max_images_per_chapter} only") images = images[:max_images_per_chapter] for idx, img_info in enumerate(images, 1): if check_stop_fn and check_stop_fn(): print("❌ Image translation stopped by user") break img_src = img_info['src'] original_img_src = img_src # keep for DOM matching img_path = None # Handle inline data URI images (e.g., PDF image render mode) if img_src.startswith('data:image'): try: import base64, uuid, mimetypes header, b64data = img_src.split(',', 1) mime = 'image/png' if ':' in header and ';' in header: mime = header.split(';')[0].split(':')[1] or mime ext = mimetypes.guess_extension(mime) or '.png' os.makedirs(image_translator.images_dir, exist_ok=True) temp_name = f"datauri_{actual_num}_{idx}_{uuid.uuid4().hex}{ext}" img_path = os.path.join(image_translator.images_dir, temp_name) with open(img_path, 'wb') as f: f.write(base64.b64decode(b64data)) # Keep img_src pointing to original so DOM match works; translator uses img_path except Exception as e: print(f" ❌ Failed to decode data URI image: {e}") continue if img_path is None and img_src.startswith('../'): img_path = os.path.join(image_translator.output_dir, img_src[3:]) elif img_path is None and img_src.startswith('./'): img_path = os.path.join(image_translator.output_dir, img_src[2:]) elif img_path is None and img_src.startswith('/'): img_path = os.path.join(image_translator.output_dir, img_src[1:]) elif img_path is None: possible_paths = [ os.path.join(image_translator.images_dir, os.path.basename(img_src)), os.path.join(image_translator.output_dir, img_src), os.path.join(image_translator.output_dir, 'images', os.path.basename(img_src)), os.path.join(image_translator.output_dir, os.path.basename(img_src)), os.path.join(image_translator.output_dir, os.path.dirname(img_src), os.path.basename(img_src)) ] img_path = None for path in possible_paths: if os.path.exists(path): img_path = path print(f" ✅ Found image at: {path}") break if not img_path: print(f" ❌ Image not found in any location for: {img_src}") print(f" Tried: {possible_paths}") continue img_path = os.path.normpath(img_path) if not os.path.exists(img_path): print(f" ⚠️ Image not found: {img_path}") print(f" 📁 Images directory: {image_translator.images_dir}") print(f" 📁 Output directory: {image_translator.output_dir}") print(f" 📁 Working directory: {os.getcwd()}") if os.path.exists(image_translator.images_dir): files = os.listdir(image_translator.images_dir) print(f" 📁 Files in images dir: {files[:5]}...") continue print(f" 🔍 Processing image {idx}/{len(images)}: {os.path.basename(img_path)}") context = "" if img_info.get('alt'): context += f", Alt text: {img_info['alt']}" if translated_count > 0: delay = float(os.getenv('IMAGE_API_DELAY', '1.0')) time.sleep(delay) translation_result = image_translator.translate_image(img_path, context, check_stop_fn) print(f"\n🔍 DEBUG: Image {idx}/{len(images)}") print(f" Translation result: {'Success' if translation_result and '[Image Translation Error:' not in translation_result else 'Failed'}") if translation_result and "[Image Translation Error:" in translation_result: print(f" Error message: {translation_result}") if translation_result: img_tag = None for img in soup.find_all('img'): if img.get('src') == original_img_src: img_tag = img break if img_tag: hide_label = os.getenv("HIDE_IMAGE_TRANSLATION_LABEL", "0") == "1" print(f" 🔍 DEBUG: Integration Phase") print(f" 🏷️ Hide label mode: {hide_label}") src_display = img_tag.get('src', '') if src_display.startswith('data:image'): src_display = src_display[:80] + '...' print(f" 📍 Found img tag: {src_display}") # Store the translation result in the dictionary FIRST image_translations[img_path] = translation_result # Parse the translation result to integrate into the chapter HTML if '

' in translation_result: trans_soup = BeautifulSoup(translation_result, 'html.parser') # Try to get the full container first full_container = trans_soup.find('div', class_=['translated-text-only', 'image-with-translation']) if full_container: # Clone the container to avoid issues new_container = BeautifulSoup(str(full_container), 'html.parser').find('div') img_tag.replace_with(new_container) print(f" ✅ Replaced image with full translation container") else: # Fallback: manually build the structure trans_div = trans_soup.find('div', class_='image-translation') if trans_div: container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'}) img_tag.replace_with(container) if not hide_label: new_img = soup.new_tag('img', src=img_src) if img_info.get('alt'): new_img['alt'] = img_info.get('alt') container.append(new_img) # Clone the translation div content new_trans_div = soup.new_tag('div', **{'class': 'image-translation'}) # Copy all children from trans_div to new_trans_div for child in trans_div.children: if hasattr(child, 'name'): new_trans_div.append(BeautifulSoup(str(child), 'html.parser')) else: new_trans_div.append(str(child)) container.append(new_trans_div) print(f" ✅ Built container with translation div") else: print(f" ⚠️ No translation div found in result") continue else: # Plain text translation - build structure manually container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'}) img_tag.replace_with(container) if not hide_label: new_img = soup.new_tag('img', src=img_src) if img_info.get('alt'): new_img['alt'] = img_info.get('alt') container.append(new_img) # Create translation div with content translation_div = soup.new_tag('div', **{'class': 'image-translation'}) if not hide_label: label_p = soup.new_tag('p') label_em = soup.new_tag('em') #label_em.string = "[Image text translation:]" label_p.append(label_em) translation_div.append(label_p) trans_p = soup.new_tag('p') trans_p.string = translation_result translation_div.append(trans_p) container.append(translation_div) print(f" ✅ Created plain text translation structure") translated_count += 1 # Save to translated_images folder trans_filename = f"ch{actual_num:03d}_img{idx:02d}_translation.html" trans_filepath = os.path.join(image_translator.translated_images_dir, trans_filename) # Extract just the translation content for saving save_soup = BeautifulSoup(translation_result, 'html.parser') save_div = save_soup.find('div', class_='image-translation') if not save_div: # Create a simple div for plain text save_div = f'

{translation_result}

' with open(trans_filepath, 'w', encoding='utf-8') as f: f.write(f""" Chapter {actual_num} - Image {idx} Translation

Chapter {actual_num} - Image {idx}

Original: {os.path.basename(img_path)}


{save_div} """) print(f" ✅ Saved translation to: {trans_filename}") else: print(f" ⚠️ Could not find image tag in HTML for: {img_src}") if translated_count > 0: print(f" 🖼️ Successfully translated {translated_count} images") # Debug output final_html = str(soup) trans_count = final_html.count('
') print(f" 📊 Final HTML has {trans_count} translation divs") print(f" 📊 image_translations dict has {len(image_translations)} entries") prog = image_translator.load_progress() if "image_chunks" in prog: completed_images = [] for img_key, img_data in prog["image_chunks"].items(): if len(img_data["completed"]) == img_data["total"]: completed_images.append(img_key) for img_key in completed_images: del prog["image_chunks"][img_key] if completed_images: image_translator.save_progress(prog) print(f" 🧹 Cleaned up progress for {len(completed_images)} completed images") image_translator.save_translation_log(actual_num, image_translations) return str(soup), image_translations else: print(f" ℹ️ No images were successfully translated") return chapter_html, {} def detect_novel_numbering(chapters): """Detect if the novel uses 0-based or 1-based chapter numbering with improved accuracy""" print("[DEBUG] Detecting novel numbering system...") if not chapters: return False if isinstance(chapters[0], str): print("[DEBUG] Text file detected, skipping numbering detection") return False patterns = PatternManager.FILENAME_EXTRACT_PATTERNS # Special check for prefix_suffix pattern like "0000_1.xhtml" prefix_suffix_pattern = r'^(\d+)_(\d+)[_\.]' # Track chapter numbers from different sources filename_numbers = [] content_numbers = [] has_prefix_suffix = False prefix_suffix_numbers = [] for idx, chapter in enumerate(chapters): extracted_num = None # Check filename patterns if 'original_basename' in chapter and chapter['original_basename']: filename = chapter['original_basename'] elif 'filename' in chapter: filename = os.path.basename(chapter['filename']) else: continue # First check for prefix_suffix pattern prefix_match = re.search(prefix_suffix_pattern, filename, re.IGNORECASE) if prefix_match: has_prefix_suffix = True # Use the SECOND number (after underscore) suffix_num = int(prefix_match.group(2)) prefix_suffix_numbers.append(suffix_num) extracted_num = suffix_num print(f"[DEBUG] Prefix_suffix pattern matched: {filename} -> Chapter {suffix_num}") else: # Try other patterns for pattern in patterns: match = re.search(pattern, filename) if match: extracted_num = int(match.group(1)) #print(f"[DEBUG] Pattern '{pattern}' matched: {filename} -> Chapter {extracted_num}") break if extracted_num is not None: filename_numbers.append(extracted_num) # Also check chapter content for chapter declarations if 'body' in chapter: # Look for "Chapter N" in the first 1000 characters content_preview = chapter['body'][:1000] content_match = re.search(r'Chapter\s+(\d+)', content_preview, re.IGNORECASE) if content_match: content_num = int(content_match.group(1)) content_numbers.append(content_num) print(f"[DEBUG] Found 'Chapter {content_num}' in content") # Decision logic with improved heuristics # 1. If using prefix_suffix pattern, trust those numbers exclusively if has_prefix_suffix and prefix_suffix_numbers: min_suffix = min(prefix_suffix_numbers) if min_suffix >= 1: print(f"[DEBUG] ✅ 1-based novel detected (prefix_suffix pattern starts at {min_suffix})") return False else: print(f"[DEBUG] ✅ 0-based novel detected (prefix_suffix pattern starts at {min_suffix})") return True # 2. If we have content numbers, prefer those over filename numbers if content_numbers: min_content = min(content_numbers) # Check if we have a good sequence starting from 0 or 1 if 0 in content_numbers and 1 in content_numbers: print(f"[DEBUG] ✅ 0-based novel detected (found both Chapter 0 and Chapter 1 in content)") return True elif min_content == 1: print(f"[DEBUG] ✅ 1-based novel detected (content chapters start at 1)") return False # 3. Fall back to filename numbers if filename_numbers: min_filename = min(filename_numbers) max_filename = max(filename_numbers) # Check for a proper sequence # If we have 0,1,2,3... it's likely 0-based # If we have 1,2,3,4... it's likely 1-based # Count how many chapters we have in sequence starting from 0 zero_sequence_count = 0 for i in range(len(chapters)): if i in filename_numbers: zero_sequence_count += 1 else: break # Count how many chapters we have in sequence starting from 1 one_sequence_count = 0 for i in range(1, len(chapters) + 1): if i in filename_numbers: one_sequence_count += 1 else: break print(f"[DEBUG] Zero-based sequence length: {zero_sequence_count}") print(f"[DEBUG] One-based sequence length: {one_sequence_count}") # If we have a better sequence starting from 1, it's 1-based if one_sequence_count > zero_sequence_count and min_filename >= 1: print(f"[DEBUG] ✅ 1-based novel detected (better sequence match starting from 1)") return False # If we have any 0 in filenames and it's part of a sequence if 0 in filename_numbers and zero_sequence_count >= 3: print(f"[DEBUG] ✅ 0-based novel detected (found 0 in sequence)") return True # 4. Default to 1-based if uncertain print(f"[DEBUG] ✅ Defaulting to 1-based novel (insufficient evidence for 0-based)") return False def validate_chapter_continuity(chapters): """Validate chapter continuity and warn about issues""" if not chapters: print("No chapters to translate") return issues = [] # Get all chapter numbers chapter_nums = [c['num'] for c in chapters] actual_nums = [c.get('actual_chapter_num', c['num']) for c in chapters] # Check for duplicates duplicates = [num for num in chapter_nums if chapter_nums.count(num) > 1] if duplicates: issues.append(f"Duplicate chapter numbers found: {set(duplicates)}") # Check for gaps in sequence min_num = min(chapter_nums) max_num = max(chapter_nums) expected = set(range(min_num, max_num + 1)) actual = set(chapter_nums) missing = expected - actual if missing: issues.append(f"Missing chapter numbers: {sorted(missing)}") # Show gaps more clearly gaps = [] sorted_missing = sorted(missing) if sorted_missing: start = sorted_missing[0] end = sorted_missing[0] for num in sorted_missing[1:]: if num == end + 1: end = num else: gaps.append(f"{start}-{end}" if start != end else str(start)) start = end = num gaps.append(f"{start}-{end}" if start != end else str(start)) issues.append(f"Gap ranges: {', '.join(gaps)}") # Check for duplicate titles title_map = {} for c in chapters: title_lower = c['title'].lower().strip() if title_lower in title_map: title_map[title_lower].append(c['num']) else: title_map[title_lower] = [c['num']] for title, nums in title_map.items(): if len(nums) > 1: issues.append(f"Duplicate title '{title}' in chapters: {nums}") # Print summary print("\n" + "="*60) print("📚 CHAPTER VALIDATION SUMMARY") print("="*60) print(f"Total chapters: {len(chapters)}") print(f"Chapter range: {min_num} to {max_num}") print(f"Expected count: {max_num - min_num + 1}") print(f"Actual count: {len(chapters)}") if len(chapters) != (max_num - min_num + 1): print(f"⚠️ Chapter count mismatch - missing {(max_num - min_num + 1) - len(chapters)} chapters") if issues: print("\n⚠️ Issues found:") for issue in issues: print(f" - {issue}") else: print("✅ No continuity issues detected") print("="*60 + "\n") def validate_epub_structure(output_dir): """Validate that all necessary EPUB structure files are present""" print("🔍 Validating EPUB structure...") required_files = { 'container.xml': 'META-INF container file (critical)', '*.opf': 'OPF package file (critical)', '*.ncx': 'Navigation file (recommended)' } found_files = {} missing_files = [] container_path = os.path.join(output_dir, 'container.xml') if os.path.exists(container_path): found_files['container.xml'] = 'Found' print(" ✅ container.xml - Found") else: missing_files.append('container.xml') print(" ❌ container.xml - Missing (CRITICAL)") opf_files = [] ncx_files = [] for file in os.listdir(output_dir): if file.lower().endswith('.opf'): opf_files.append(file) elif file.lower().endswith('.ncx'): ncx_files.append(file) if opf_files: found_files['opf'] = opf_files print(f" ✅ OPF file(s) - Found: {', '.join(opf_files)}") else: missing_files.append('*.opf') print(" ❌ OPF file - Missing (CRITICAL)") if ncx_files: found_files['ncx'] = ncx_files print(f" ✅ NCX file(s) - Found: {', '.join(ncx_files)}") else: missing_files.append('*.ncx') print(" ⚠️ NCX file - Missing (navigation may not work)") html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] if html_files: print(f" ✅ Translated chapters - Found: {len(html_files)} files") else: print(" ⚠️ No translated chapter files found") critical_missing = [f for f in missing_files if f in ['container.xml', '*.opf']] if not critical_missing: print("✅ EPUB structure validation PASSED") print(" All critical files present for EPUB reconstruction") return True else: print("❌ EPUB structure validation FAILED") print(f" Missing critical files: {', '.join(critical_missing)}") print(" EPUB reconstruction may fail without these files") return False def check_epub_readiness(output_dir): """Check if the output directory is ready for EPUB compilation""" print("📋 Checking EPUB compilation readiness...") issues = [] if not validate_epub_structure(output_dir): issues.append("Missing critical EPUB structure files") html_files = [f for f in os.listdir(output_dir) if f.lower().endswith('.html') and f.startswith('response_')] if not html_files: issues.append("No translated chapter files found") else: print(f" ✅ Found {len(html_files)} translated chapters") metadata_path = os.path.join(output_dir, 'metadata.json') if os.path.exists(metadata_path): print(" ✅ Metadata file present") try: with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) if 'title' not in metadata: issues.append("Metadata missing title") except Exception as e: issues.append(f"Metadata file corrupted: {e}") else: issues.append("Missing metadata.json file") resource_dirs = ['css', 'fonts', 'images'] found_resources = 0 for res_dir in resource_dirs: res_path = os.path.join(output_dir, res_dir) if os.path.exists(res_path): files = [f for f in os.listdir(res_path) if os.path.isfile(os.path.join(res_path, f))] if files: found_resources += len(files) print(f" ✅ Found {len(files)} {res_dir} files") if found_resources > 0: print(f" ✅ Total resources: {found_resources} files") else: print(" ⚠️ No resource files found (this may be normal)") if not issues: print("🎉 EPUB compilation readiness: READY") print(" All necessary files present for EPUB creation") return True else: print("⚠️ EPUB compilation readiness: ISSUES FOUND") for issue in issues: print(f" • {issue}") return False def cleanup_previous_extraction(output_dir): """Clean up any files from previous extraction runs (preserves CSS files)""" # Remove 'css' from cleanup_items to preserve CSS files cleanup_items = [ 'images', # Removed 'css' from this list '.resources_extracted' ] epub_structure_files = [ 'container.xml', 'content.opf', 'toc.ncx' ] cleaned_count = 0 # Clean up directories (except CSS) for item in cleanup_items: if item.startswith('.'): continue item_path = os.path.join(output_dir, item) try: if os.path.isdir(item_path): shutil.rmtree(item_path) print(f"🧹 Removed directory: {item}") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove directory {item}: {e}") # Clean up EPUB structure files for epub_file in epub_structure_files: file_path = os.path.join(output_dir, epub_file) try: if os.path.isfile(file_path): os.remove(file_path) print(f"🧹 Removed EPUB file: {epub_file}") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove {epub_file}: {e}") # Clean up any loose .opf and .ncx files try: for file in os.listdir(output_dir): if file.lower().endswith(('.opf', '.ncx')): file_path = os.path.join(output_dir, file) if os.path.isfile(file_path): os.remove(file_path) print(f"🧹 Removed EPUB file: {file}") cleaned_count += 1 except Exception as e: print(f"⚠️ Error scanning for EPUB files: {e}") # Remove extraction marker marker_path = os.path.join(output_dir, '.resources_extracted') try: if os.path.isfile(marker_path): os.remove(marker_path) print(f"🧹 Removed extraction marker") cleaned_count += 1 except Exception as e: print(f"⚠️ Could not remove extraction marker: {e}") # Check if CSS files exist and inform user they're being preserved css_path = os.path.join(output_dir, 'css') if os.path.exists(css_path): try: css_files = [f for f in os.listdir(css_path) if os.path.isfile(os.path.join(css_path, f))] if css_files: print(f"📚 Preserving {len(css_files)} CSS files") except Exception: pass if cleaned_count > 0: print(f"🧹 Cleaned up {cleaned_count} items from previous runs (CSS files preserved)") return cleaned_count # ===================================================== # API AND TRANSLATION UTILITIES # ===================================================== def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn, chunk_timeout=None, request_id=None, context=None, chapter_context=None, bypass_graceful_stop=False): """Send API request with interrupt capability and optional timeout retry. Optional context parameter is passed through to the client to improve payload labeling. chapter_context (dict) may contain "chapter", "chunk", "total_chunks", and "merged_chapters". When provided and the client supports set_chapter_context, it will be applied inside the API thread so that thread-local payload metadata is accurate. """ # Import UnifiedClientError at function level to avoid scoping issues from unified_api_client import UnifiedClientError # The client.send() call will handle multi-key rotation automatically result_queue = queue.Queue() cancel_event = threading.Event() # Honor RETRY_TIMEOUT toggle: when off, disable chunk timeout entirely retry_env = os.getenv("RETRY_TIMEOUT") # Default: wrapper chunk timeout is OFF unless RETRY_TIMEOUT is explicitly truthy retry_timeout_enabled = bool(retry_env) and retry_env.strip().lower() not in ("0", "false", "off", "") if not retry_timeout_enabled: chunk_timeout = None def _clear_watchdog_for_chapter_context() -> None: """Best-effort cleanup so the GUI watchdog doesn't stay stuck when this wrapper abandons a call.""" try: import unified_api_client clear_fn = getattr(unified_api_client, '_api_watchdog_clear_chapter', None) if not callable(clear_fn): return chap = None merged = None if isinstance(chapter_context, dict): chap = chapter_context.get('chapter') merged = chapter_context.get('merged_chapters') if chap is not None: clear_fn(chap) if merged: try: for mc in merged: if mc is not None: clear_fn(mc) except Exception: pass except Exception: pass def api_call(): try: start_time = time.time() # Apply chapter/chunk context in THIS thread so UnifiedClient's # thread-local chapter_info is visible to payload saving. if chapter_context and hasattr(client, 'set_chapter_context'): try: client.set_chapter_context( chapter=chapter_context.get('chapter'), chunk=chapter_context.get('chunk'), total_chunks=chapter_context.get('total_chunks'), merged_chapters=chapter_context.get('merged_chapters'), ) except Exception: # Context is best-effort and should never break the call pass # Build send parameters (context is optional) send_params = { 'messages': messages, 'temperature': temperature, 'max_tokens': max_tokens, } sig = inspect.signature(client.send) if 'context' in sig.parameters and context is not None: send_params['context'] = context result = client.send(**send_params) # If the caller has already timed out/cancelled, do not publish a stale result. if cancel_event.is_set(): return # Capture raw response object for thought signatures (if available) raw_obj = None if hasattr(client, 'get_last_response_object'): resp_obj = client.get_last_response_object() if resp_obj and hasattr(resp_obj, 'raw_content_object'): raw_obj = resp_obj.raw_content_object # print("🧠 Captured thought signature for history in send_with_interrupt") elapsed = time.time() - start_time # Include raw_obj in the result tuple result_queue.put((result, elapsed, raw_obj)) except Exception as e: # If already cancelled, suppress late exceptions from the abandoned call. if cancel_event.is_set(): return result_queue.put(e) # Pre-send submission spacing to align staggered logs with actual delay try: thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", os.getenv("THREAD_SUBMISSION_DELAY", "0.1"))) except Exception: thread_delay = 0.1 try: api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) except Exception: api_delay = 2.0 enforce_delay = max(thread_delay, api_delay) if enforce_delay > 0: global _translation_thread_submit_lock, _translation_last_thread_submit with _translation_thread_submit_lock: now = time.time() remaining = enforce_delay - (now - _translation_last_thread_submit) if remaining > 0: elapsed = 0.0 step = 0.1 while elapsed < remaining: if stop_check_fn(): raise UnifiedClientError("Translation stopped by user during threading delay", error_type="cancelled") dt = min(step, remaining - elapsed) time.sleep(dt) elapsed += dt _translation_last_thread_submit = time.time() else: _translation_last_thread_submit = now api_thread = threading.Thread(target=api_call) api_thread.daemon = True api_thread.start() timeout = chunk_timeout check_interval = 0.5 elapsed = 0 while True: try: result = result_queue.get(timeout=check_interval) if isinstance(result, Exception): # For expected errors like rate limits, preserve the error type without extra traceback if hasattr(result, 'error_type') and result.error_type == "rate_limit": raise result elif "429" in str(result) or "rate limit" in str(result).lower(): # Convert generic exceptions to UnifiedClientError for rate limits raise UnifiedClientError(str(result), error_type="rate_limit") else: raise result if isinstance(result, tuple): # Unpack the tuple (now includes raw_obj) if len(result) == 3: api_result, api_time, raw_obj = result # Store raw_obj as an attribute for later retrieval if hasattr(api_result, '__class__'): # If api_result is a tuple, return a new tuple with raw_obj if isinstance(api_result, tuple): return (*api_result, raw_obj) else: # Store as attribute for retrieval api_result._raw_obj = raw_obj else: # Backward compatibility for old format api_result, api_time = result if chunk_timeout is not None and api_time > chunk_timeout: # Set cleanup flag when chunk timeout occurs if hasattr(client, '_in_cleanup'): client._in_cleanup = True cancel_event.set() if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() # Clear watchdog entries for this chapter since we're abandoning the result. _clear_watchdog_for_chapter_context() try: api_thread.join(timeout=2.0) except Exception: pass raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)") # If graceful stop was requested, mark that an API call completed if os.environ.get('GRACEFUL_STOP') == '1': os.environ['GRACEFUL_STOP_COMPLETED'] = '1' return api_result return result except queue.Empty: # During graceful stop, don't cancel the API call - let it complete # Unless bypass_graceful_stop is enabled, in which case we defer to stop_check_fn logic should_stop = stop_check_fn() graceful_active = os.environ.get('GRACEFUL_STOP') == '1' # Hard cancellation (e.g. double-click force stop via hard_cancel_all) # overrides graceful stop protection for in-flight calls. hard_cancelled = hasattr(client, 'is_globally_cancelled') and client.is_globally_cancelled() # During graceful stop, protect in-flight calls unless hard-cancelled. should_cancel = hard_cancelled or (should_stop and not graceful_active) if should_cancel: # Set cleanup flag when user stops if hasattr(client, '_in_cleanup'): client._in_cleanup = True cancel_event.set() if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() # Clear watchdog entries for this chapter since we're abandoning the result. _clear_watchdog_for_chapter_context() try: api_thread.join(timeout=2.0) except Exception: pass raise UnifiedClientError("Translation stopped by user") elapsed += check_interval if chunk_timeout is not None and elapsed >= chunk_timeout: if hasattr(client, '_in_cleanup'): client._in_cleanup = True cancel_event.set() if hasattr(client, 'cancel_current_operation'): client.cancel_current_operation() # Clear watchdog entries for this chapter since we're abandoning the result. _clear_watchdog_for_chapter_context() # Give the background thread a brief chance to unwind after transport closure try: api_thread.join(timeout=2.0) except Exception: pass raise UnifiedClientError(f"API call timed out after {chunk_timeout} seconds") def handle_api_error(processor, error, chunk_info=""): """Handle API errors with multi-key support""" error_str = str(error) # Check for rate limit if "429" in error_str or "rate limit" in error_str.lower(): if processor.config.use_multi_api_keys: print(f"⚠️ Rate limit hit {chunk_info}, client should rotate to next key") stats = processor.client.get_stats() print(f"📊 API Stats - Active keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}") if stats.get('active_keys', 0) == 0: print("⏳ All API keys are cooling down - will wait and retry") print(f"🔄 Multi-key error handling: Rate limit processed, preparing for key rotation...") time.sleep(0.1) # Brief pause after rate limit detection for stability return True # Always retry else: print(f"⚠️ Rate limit hit {chunk_info}, waiting before retry...") time.sleep(60) print(f"🔄 Single-key error handling: Rate limit wait completed, ready for retry...") time.sleep(0.1) # Brief pause after rate limit wait for stability return True # Always retry # Other errors print(f"❌ API Error {chunk_info}: {error_str}") return False def parse_token_limit(env_value): """Parse token limit from environment variable""" if not env_value or env_value.strip() == "": return None, "unlimited" env_value = env_value.strip() if env_value.lower() == "unlimited": return None, "unlimited" if env_value.isdigit() and int(env_value) > 0: limit = int(env_value) return limit, str(limit) return 1000000, "1000000 (default)" def build_system_prompt(user_prompt, glossary_path=None, source_text=None): """Build the system prompt with glossary - TRUE BRUTE FORCE VERSION""" append_glossary = os.getenv("APPEND_GLOSSARY", "1") == "1" actual_glossary_path = glossary_path # Replace {target_lang} placeholder if present target_lang = os.getenv("OUTPUT_LANGUAGE", "English") if user_prompt and "{target_lang}" in user_prompt: user_prompt = user_prompt.replace("{target_lang}", target_lang) system = user_prompt if user_prompt else "" if append_glossary and actual_glossary_path and os.path.exists(actual_glossary_path): try: print(f"✅ Loading glossary from: {os.path.abspath(actual_glossary_path)}") # Try to load as JSON first try: with open(actual_glossary_path, "r", encoding="utf-8") as gf: glossary_data = json.load(gf) glossary_text = json.dumps(glossary_data, ensure_ascii=False, indent=2) print(f"Loaded as JSON") except json.JSONDecodeError: # If JSON fails, just read as raw text with open(actual_glossary_path, "r", encoding="utf-8") as gf: glossary_text = gf.read() # Apply glossary compression if enabled and source text is provided compress_glossary_enabled = os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1" if compress_glossary_enabled and source_text: try: from glossary_compressor import compress_glossary original_glossary_text = glossary_text # Store original for token counting original_length = len(glossary_text) glossary_text = compress_glossary(glossary_text, source_text, glossary_format='auto') compressed_length = len(glossary_text) reduction_pct = ((original_length - compressed_length) / original_length * 100) if original_length > 0 else 0 # Also calculate token savings if tiktoken is available try: import tiktoken try: enc = tiktoken.encoding_for_model(os.getenv("MODEL", "gpt-4")) except: enc = tiktoken.get_encoding("cl100k_base") # Count tokens for original and compressed glossary original_tokens = len(enc.encode(original_glossary_text)) compressed_tokens = len(enc.encode(glossary_text)) token_reduction = original_tokens - compressed_tokens token_reduction_pct = (token_reduction / original_tokens * 100) if original_tokens > 0 else 0 print(f"🗜️ Glossary: {original_length:,}→{compressed_length:,} chars ({reduction_pct:.1f}%), {original_tokens:,}→{compressed_tokens:,} tokens ({token_reduction_pct:.1f}%)") except ImportError: # If tiktoken is not available, just show character reduction print(f"🗜️ Glossary compressed: {original_length:,} → {compressed_length:,} chars ({reduction_pct:.1f}% reduction)") except Exception as e: print(f"⚠️ Glossary compression failed: {e}") # Continue with uncompressed glossary if system: system += "\n\n" custom_prompt = os.getenv("APPEND_GLOSSARY_PROMPT", "").strip() if not custom_prompt: raise ValueError( "APPEND_GLOSSARY_PROMPT environment variable is not set!\n" "Please configure your glossary append format in:\n" "Glossary Manager → Automatic Glossary → Glossary Append Format" ) system += f"{custom_prompt}\n{glossary_text}" print(f"✅ Glossary appended ({len(glossary_text):,} characters)") # Check for glossary extension file (only if ADD_ADDITIONAL_GLOSSARY is enabled) add_additional_glossary = os.getenv("ADD_ADDITIONAL_GLOSSARY", "0") == "1" if add_additional_glossary: glossary_dir = os.path.dirname(actual_glossary_path) # Check for extension with any supported format additional_glossary_path = None for ext in ['.csv', '.md', '.txt', '.json']: candidate = os.path.join(glossary_dir, f"glossary_extension{ext}") if os.path.exists(candidate): additional_glossary_path = candidate break if additional_glossary_path: try: print(f"✅ Loading glossary extension from: {os.path.basename(additional_glossary_path)}") with open(additional_glossary_path, "r", encoding="utf-8") as af: additional_glossary_text = af.read() # Apply same compression logic if enabled if compress_glossary_enabled and source_text: try: from glossary_compressor import compress_glossary original_add_length = len(additional_glossary_text) additional_glossary_text = compress_glossary(additional_glossary_text, source_text, glossary_format='auto') compressed_add_length = len(additional_glossary_text) add_reduction_pct = ((original_add_length - compressed_add_length) / original_add_length * 100) if original_add_length > 0 else 0 print(f"🗃️ Glossary extension compressed: {original_add_length:,} → {compressed_add_length:,} chars ({add_reduction_pct:.1f}% reduction)") except Exception as e: print(f"⚠️ Glossary extension compression failed: {e}") # Append glossary extension system += f"\n\n{additional_glossary_text}" print(f"✅ Glossary extension appended ({len(additional_glossary_text):,} characters)") except Exception as e: print(f"⚠️ Failed to load glossary extension: {e}") except Exception as e: print(f"[ERROR] Could not load glossary: {e}") import traceback print(f"[ERROR] Full traceback: {traceback.format_exc()}") else: if not append_glossary: #print(f"[DEBUG] ❌ Glossary append disabled") pass elif not actual_glossary_path: # Check if we're translating CSV/JSON files (they typically don't need glossaries) input_path = os.getenv('EPUB_PATH', '') if not input_path.lower().endswith(('.csv', '.json')): print(f"[DEBUG] ❌ No glossary path provided") elif not os.path.exists(actual_glossary_path): print(f"[DEBUG] ❌ Glossary file does not exist: {actual_glossary_path}") # Calculate token count for system prompt try: import tiktoken try: enc = tiktoken.encoding_for_model(os.getenv("MODEL", "gpt-4")) except: enc = tiktoken.get_encoding("cl100k_base") system_tokens = len(enc.encode(system)) print(f"🎯 Final system prompt: {len(system):,} chars, {system_tokens:,} tokens") except ImportError: print(f"🎯 Final system prompt length: {len(system)} characters") return system def translate_title(title, client, system_prompt, user_prompt, temperature=0.3): """Translate the book title using the configured settings""" if not title or not title.strip(): return title print(f"📚 Processing book title: {title}") try: if os.getenv("TRANSLATE_BOOK_TITLE", "1") == "0": print(f"📚 Book title translation disabled - keeping original") return title # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the text without AI prompts print(f"📚 Using translation service ({client_type}) - sending text directly") messages = [ {"role": "user", "content": title} ] max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) else: # For AI services, use prompts as before book_title_prompt = os.getenv("BOOK_TITLE_PROMPT", "Translate this book title to English while retaining any acronyms:") # Get the system prompt for book titles, with fallback to default book_title_system_prompt = os.getenv("BOOK_TITLE_SYSTEM_PROMPT", "You are a translator. Respond with only the translated text, nothing else. Do not add any explanation or additional content.") # Replace {target_lang} variable with output language output_lang = os.getenv("OUTPUT_LANGUAGE", "English") book_title_prompt = book_title_prompt.replace("{target_lang}", output_lang) book_title_system_prompt = book_title_system_prompt.replace("{target_lang}", output_lang) messages = [ {"role": "system", "content": book_title_system_prompt}, {"role": "user", "content": f"{book_title_prompt}\n\n{title}"} ] max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192")) translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens) print(f"[DEBUG] Raw API response: '{translated_title}'") print(f"[DEBUG] Response length: {len(translated_title)} (original: {len(title)})") newline = '\n' print(f"[DEBUG] Has newlines: {repr(translated_title) if newline in translated_title else 'No'}") translated_title = translated_title.strip() if ((translated_title.startswith('"') and translated_title.endswith('"')) or (translated_title.startswith("'") and translated_title.endswith("'"))): translated_title = translated_title[1:-1].strip() if '\n' in translated_title: print(f"⚠️ API returned multi-line content, keeping original title") return title # Check for JSON-like structured content, but allow simple brackets like [END] if (any(char in translated_title for char in ['{', '}']) or '"role":' in translated_title or '"content":' in translated_title or ('[[' in translated_title and ']]' in translated_title)): # Only flag double brackets print(f"⚠️ API returned structured content, keeping original title") return title if any(tag in translated_title.lower() for tag in ['

', '

', '

', '

', '= 2: return True # Single strong error indicator in very short response if len(content_str) < 50 and error_count >= 1: return True return False # Additional helper function for debugging def get_failure_reason(content): """ Returns the specific reason why content was marked as qa_failed Useful for debugging and logging """ if not content: return "Empty content" if os.getenv("DISABLE_QA_MARKER_CHECKS", "0") == "1": return "QA marker checks disabled" content_str = str(content).strip() content_lower = content_str.lower() # Check each category and return the first match failure_categories = { "Explicit Failure Marker": [ "[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]", "[IMAGE TRANSLATION FAILED]", "API response unavailable", "[]" ], "HTTP Error": [ "authentication_error", "rate_limit_error", "api_error" ], "Content Filter": [ "content_filter", "safety filter", "blocked by safety" ], "Timeout": [ "timeout", "timed out", "apitimeouterror" ], "Rate Limit": [ "rate limit exceeded", "quota exceeded", "too many requests" ], "Refusal Pattern": [ "i cannot", "i can't", "unable to process" ], "Empty Response": [ '"text": ""', "choices: [ { text: ''" ] } for category, markers in failure_categories.items(): if len(content_str) >= _qa_marker_limit: continue for marker in markers: if marker in content_str or marker in content_lower: return f"{category}: {marker}" if len(content_str) < 50: return f"Short response with error indicators: {content_str[:30]}..." return "Unknown failure pattern" def is_prohibited_failure(content, failure_reason=None): """Best-effort detection of prohibited/blocked failures for save routing.""" try: fr = str(failure_reason or "").lower() if "content filter" in fr or "blocked" in fr or "prohibited" in fr: return True except Exception: pass try: cl = str(content or "").lower() if "content_filter" in cl or "content blocked" in cl or "blocked by safety" in cl: return True except Exception: pass return False def convert_enhanced_text_to_html(plain_text, chapter_info=None): """Convert markdown/plain text back to HTML after translation (for enhanced mode) This function handles the conversion of translated markdown back to HTML. The input is the TRANSLATED text that was originally extracted using html2text. """ import re preserve_structure = chapter_info.get('preserve_structure', False) if chapter_info else False # Pre-process: Convert angle-bracket "tag-like" sequences into HTML entities. # This prevents markdown converters from stripping/mangling them. # IMPORTANT: Preserve real anchor tags ( and ) so EPUB TOC links remain clickable. def _escape_tag_like(m): inner = m.group(1) # e.g. 'a href="..."' or '/a' try: mname = re.match(r'\s*/?\s*([a-zA-Z0-9]+)', inner) tag = (mname.group(1) if mname else "").lower() except Exception: tag = "" # Allowlist: tags we must keep as real HTML in enhanced mode if tag in ("a",): return "<" + inner + ">" return "<" + inner + ">" plain_text = re.sub(r'<(/?[a-zA-Z][^>]*)>', _escape_tag_like, plain_text) # Check if user prefers markdown2 (legacy behavior) use_markdown2 = os.getenv('USE_MARKDOWN2_CONVERTER', '0') == '1' if use_markdown2: # Use markdown2 for conversion (legacy behavior) try: import markdown2 has_markdown = any([ '##' in plain_text, '**' in plain_text, '*' in plain_text and not '**' in plain_text, '[' in plain_text and '](' in plain_text, '```' in plain_text, '> ' in plain_text, '- ' in plain_text or '* ' in plain_text or '1. ' in plain_text ]) if has_markdown or preserve_structure: html = markdown2.markdown(plain_text, extras=[ 'cuddled-lists', 'fenced-code-blocks', 'break-on-newline', 'smarty-pants', 'tables', ]) # Post-process: Fix setext headers that were created from separator lines. # These are NOT real headers—just text followed by ==== or ----. # Restore both the text AND the underline so nothing is lost. html = re.sub(r'

(.*?)

', r'

\1

\n

====

', html) html = re.sub(r'

(.*?)

', r'

\1

\n

----

', html) if not '

' in html: lines = html.split('\n') processed_lines = [] for line in lines: line = line.strip() if line and not line.startswith('<') and not line.endswith('>'): processed_lines.append(f'

{line}

') elif line: processed_lines.append(line) html = '\n'.join(processed_lines) # CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities # Logic: # 1. Closing tags (e.g. ) -> Always unescape # 2. Container tags (svg, picture, figure, figcaption) -> Unescape even if bare (often used without attrs) # 3. Ambiguous tags (img, area, map, canvas, source, image) -> Unescape ONLY if they have attributes (space followed by content) # This avoids false positives like "The of effect" in fantasy text. img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE)) # if img_count > 0: # print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (markdown2)") html = re.sub( r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>', r'<\1>', html, flags=re.IGNORECASE | re.DOTALL ) return html except ImportError: print("⚠️ markdown2 not available, falling back to markdown library") # Use markdown library with setext headers disabled (default, recommended) try: import markdown from markdown.extensions import Extension from markdown.blockprocessors import BlockProcessor # Custom extension to disable setext headers class NoSetextHeadersExtension(Extension): def extendMarkdown(self, md): # Remove the setext header processor if 'setextheader' in md.parser.blockprocessors: md.parser.blockprocessors.deregister('setextheader') # Check if the text contains markdown patterns has_markdown = any([ '##' in plain_text, # Headers '**' in plain_text, # Bold '*' in plain_text and not '**' in plain_text, # Italic '[' in plain_text and '](' in plain_text, # Links '```' in plain_text, # Code blocks '> ' in plain_text, # Blockquotes '- ' in plain_text or '* ' in plain_text or '1. ' in plain_text # Lists ]) if has_markdown or preserve_structure: # Use markdown with setext headers disabled # Don't use 'extra' as it escapes parentheses and brackets md = markdown.Markdown(extensions=[ 'nl2br', 'sane_lists', 'fenced_code', 'tables', NoSetextHeadersExtension() ]) html = md.convert(plain_text) # Post-process to ensure proper paragraph structure if not '

' in html: # If markdown didn't create paragraphs, wrap content lines = html.split('\n') processed_lines = [] for line in lines: line = line.strip() if line and not line.startswith('<') and not line.endswith('>'): processed_lines.append(f'

{line}

') elif line: processed_lines.append(line) html = '\n'.join(processed_lines) # CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities # Logic: # 1. Closing tags (e.g. ) -> Always unescape # 2. Container tags (svg, picture, figure, figcaption) -> Unescape even if bare # 3. Ambiguous tags (img, area, map, canvas, source, image) -> Unescape ONLY if they have attributes img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE)) # if img_count > 0: # print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (markdown)") html = re.sub( r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>', r'<\1>', html, flags=re.IGNORECASE | re.DOTALL ) return html except ImportError: print("⚠️ markdown not available, using fallback HTML conversion") # Fallback: Manual markdown-to-HTML conversion lines = plain_text.strip().split('\n') html_parts = [] in_code_block = False code_block_content = [] for line in lines: # Handle code blocks if line.strip().startswith('```'): if in_code_block: # End code block html_parts.append('
' + '\n'.join(code_block_content) + '
') code_block_content = [] in_code_block = False else: # Start code block in_code_block = True continue if in_code_block: code_block_content.append(line) continue line = line.strip() if not line: # Preserve empty lines as paragraph breaks if html_parts and not html_parts[-1].endswith('

'): # Only add break if not already after a closing tag html_parts.append('
') continue # Check for markdown headers if line.startswith('#'): match = re.match(r'^(#+)\s*(.+)$', line) if match: level = min(len(match.group(1)), 6) header_text = match.group(2).strip() html_parts.append(f'{header_text}') continue # Check for blockquotes if line.startswith('> '): quote_text = line[2:].strip() html_parts.append(f'
{quote_text}
') continue # Check for lists if re.match(r'^[*\-+]\s+', line): list_text = re.sub(r'^[*\-+]\s+', '', line) html_parts.append(f'
  • {list_text}
  • ') continue if re.match(r'^\d+\.\s+', line): list_text = re.sub(r'^\d+\.\s+', '', line) html_parts.append(f'
  • {list_text}
  • ') continue # Convert inline markdown # Bold line = re.sub(r'\*\*(.+?)\*\*', r'\1', line) line = re.sub(r'__(.+?)__', r'\1', line) # Italic line = re.sub(r'\*(.+?)\*', r'\1', line) line = re.sub(r'_(.+?)_', r'\1', line) # Links line = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', line) # Code inline line = re.sub(r'`([^`]+)`', r'\1', line) # Regular paragraph html_parts.append(f'

    {line}

    ') # Post-process lists to wrap in ul/ol tags final_html = [] in_list = False list_type = None for part in html_parts: if part.startswith('
  • '): if not in_list: # Determine list type based on context (simplified) list_type = 'ul' # Default to unordered final_html.append(f'<{list_type}>') in_list = True final_html.append(part) else: if in_list: final_html.append(f'') in_list = False final_html.append(part) # Close any open list if in_list: final_html.append(f'') html = '\\n'.join(final_html) # CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities # Logic: # 1. Closing tags -> Always unescape # 2. Container tags -> Unescape even if bare # 3. Ambiguous tags -> Unescape ONLY if they have attributes img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE)) # if img_count > 0: # print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (fallback)") html = re.sub( r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>', r'<\1>', html, flags=re.IGNORECASE | re.DOTALL ) return html # ===================================================== # MAIN TRANSLATION FUNCTION # ===================================================== def main(log_callback=None, stop_callback=None): """Main translation function with enhanced duplicate detection and progress tracking""" global STOP_LOGGED, _stop_requested STOP_LOGGED = False _stop_requested = False # Reset stop flag for new translation instance # Also reset unified_api_client global flags try: from unified_api_client import set_stop_flag as uac_set_stop_flag uac_set_stop_flag(False) except Exception: pass config = TranslationConfig() builtins._DISABLE_ZERO_DETECTION = config.DISABLE_ZERO_DETECTION if config.DISABLE_ZERO_DETECTION: print("=" * 60) print("⚠️ 0-BASED DETECTION DISABLED BY USER") print("⚠️ All chapter numbers will be used exactly as found") print("=" * 60) args = None chapters_completed = 0 chunks_completed = 0 args = None chapters_completed = 0 chunks_completed = 0 input_path = config.input_path if not input_path and len(sys.argv) > 1: input_path = sys.argv[1] is_text_file = input_path.lower().endswith(('.txt', '.csv', '.json', '.md')) is_pdf_file = input_path.lower().endswith('.pdf') if is_text_file: os.environ["IS_TEXT_FILE_TRANSLATION"] = "1" import json as _json _original_load = _json.load def debug_json_load(fp, *args, **kwargs): result = _original_load(fp, *args, **kwargs) if isinstance(result, list) and len(result) > 0: if isinstance(result[0], dict) and 'original_name' in result[0]: print(f"[DEBUG] Loaded glossary list with {len(result)} items from {fp.name if hasattr(fp, 'name') else 'unknown'}") return result _json.load = debug_json_load if log_callback: set_output_redirect(log_callback) def check_stop(): if stop_callback and stop_callback(): # Don't log stop message if wait_for_chunks is active - let translation continue graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if not (graceful_stop_active and wait_for_chunks): log_stop_once() return True return is_stop_requested() if config.EMERGENCY_RESTORE: print("✅ Emergency paragraph restoration is ENABLED") else: print("⚠️ Emergency paragraph restoration is DISABLED") print(f"[DEBUG] REMOVE_AI_ARTIFACTS environment variable: {os.getenv('REMOVE_AI_ARTIFACTS', 'NOT SET')}") print(f"[DEBUG] REMOVE_AI_ARTIFACTS parsed value: {config.REMOVE_AI_ARTIFACTS}") if config.REMOVE_AI_ARTIFACTS: print("⚠️ AI artifact removal is ENABLED - will clean AI response artifacts") else: print("✅ AI artifact removal is DISABLED - preserving all content as-is") if '--epub' in sys.argv or (len(sys.argv) > 1 and sys.argv[1].endswith(('.epub', '.txt', '.csv', '.json', '.pdf', '.md'))): import argparse parser = argparse.ArgumentParser() parser.add_argument('epub', help='Input EPUB or text file') args = parser.parse_args() input_path = args.epub is_text_file = input_path.lower().endswith(('.txt', '.csv', '.json', '.md')) is_pdf_file = input_path.lower().endswith('.pdf') # Disable Break Split Count for EPUB files (only works with plain text files) if input_path.lower().endswith('.epub'): if os.getenv('BREAK_SPLIT_COUNT', ''): print("⚠️ Break Split Count disabled for EPUB files (only works with .txt files)") os.environ['BREAK_SPLIT_COUNT'] = '' if is_text_file: file_base = os.path.splitext(os.path.basename(input_path))[0] else: epub_base = os.path.splitext(os.path.basename(input_path))[0] file_base = epub_base # Allow callers (e.g. Discord bot) to control where outputs are written. # This avoids relying on process-wide cwd changes (os.chdir), which is unsafe in multi-threaded apps. output_root = (os.getenv("OUTPUT_DIRECTORY") or os.getenv("OUTPUT_DIR") or "").strip() if output_root: try: os.makedirs(output_root, exist_ok=True) except Exception: # If we can't create the root, fall back to relative output. output_root = "" out = os.path.join(output_root, file_base) if output_root else file_base os.makedirs(out, exist_ok=True) print(f"[DEBUG] Created output folder → {out}") cleanup_previous_extraction(out) os.environ["EPUB_OUTPUT_DIR"] = out payloads_dir = out # Manage translation history persistence based on contextual + rolling settings history_file = os.path.join(payloads_dir, "translation_history.json") if os.path.exists(history_file): if config.CONTEXTUAL and config.TRANSLATION_HISTORY_ROLLING: # Preserve existing history across runs when using rolling window print(f"[DEBUG] Preserving translation history (rolling window enabled) → {history_file}") elif config.CONTEXTUAL: # Contextual on but rolling disabled: start fresh each run os.remove(history_file) print(f"[DEBUG] CONTEXTUAL enabled without rolling - purged translation history → {history_file}") else: # Contextual off: never keep history os.remove(history_file) print("[DEBUG] CONTEXTUAL disabled - cleared translation history") history_manager = HistoryManager(payloads_dir) chapter_splitter = ChapterSplitter(model_name=config.MODEL) chunk_context_manager = ChunkContextManager() progress_manager = ProgressManager(payloads_dir) # Prepare progress callback for chapter extraction # Filter to show only every 10% progress update chapter_progress_callback = None _progress_state = {} # Track last shown percentage for each progress type if log_callback: def chapter_progress_callback(msg): # Check if this is a progress message with percentage import re # Try to extract percentage from formatted progress bars percent_match = re.search(r'\((\d+)%\)', msg) if percent_match: percent = int(percent_match.group(1)) # Determine progress type from message if '📂' in msg or 'Scanning' in msg: prog_type = 'scan' elif '📦' in msg or 'Extracting' in msg: prog_type = 'extract' elif '📚' in msg or 'Processing chapters' in msg: prog_type = 'process' elif '📊' in msg or 'metadata' in msg.lower(): prog_type = 'metadata' else: prog_type = 'other' # Get last shown percentage for this type last_percent = _progress_state.get(prog_type, -1) # Show if: crossed a 10% threshold, or reached 100% should_show = (percent // 10 > last_percent // 10) or (percent == 100) if should_show: _progress_state[prog_type] = percent log_callback(msg) else: # Not a progress percentage message, always show log_callback(msg) # Import Chapter_Extractor module functions import Chapter_Extractor # GlossaryManager is now a module with functions, not a class print("🔍 Checking for deleted output files...") progress_manager.cleanup_missing_files(out) progress_manager.save() if check_stop(): return # Check if model needs API key model_needs_api_key = not (config.MODEL.lower() in ['google-translate', 'google-translate-free'] or '@' in config.MODEL or config.MODEL.startswith('vertex/') or config.MODEL.startswith('authgpt/') or config.MODEL.startswith('antigravity/')) if model_needs_api_key and not config.API_KEY: print("❌ Error: Set API_KEY, OPENAI_API_KEY, or OPENAI_OR_Gemini_API_KEY in your environment.") return # Set dummy API key for models that don't need one if not config.API_KEY: config.API_KEY = 'dummy-key-not-required' #print(f"[DEBUG] Found API key: {config.API_KEY[:10]}...") print(f"[DEBUG] Using model = {config.MODEL}") print(f"[DEBUG] Max output tokens = {config.MAX_OUTPUT_TOKENS}") client = UnifiedClient(model=config.MODEL, api_key=config.API_KEY, output_dir=out) if hasattr(client, 'use_multi_keys') and client.use_multi_keys: stats = client.get_stats() print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") print(f" Active keys: {stats.get('active_keys', 0)}") else: print(f"🔑 Single-key mode: Using {config.MODEL}") # Reset cleanup state when starting new translation if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() if is_pdf_file: print("📄 Processing PDF file...") try: txt_processor = TextFileProcessor(input_path, out) chapters = txt_processor.extract_chapters() txt_processor.save_original_structure() metadata = { "title": os.path.splitext(os.path.basename(input_path))[0], "type": "pdf", "chapter_count": len(chapters) } except ImportError as e: print(f"❌ Error: PDF processor not available: {e}") if log_callback: log_callback(f"❌ Error: PDF processor not available: {e}") return except Exception as e: print(f"❌ Error processing PDF file: {e}") if log_callback: log_callback(f"❌ Error processing PDF file: {e}") return elif is_text_file: print("📄 Processing text file...") try: txt_processor = TextFileProcessor(input_path, out) chapters = txt_processor.extract_chapters() txt_processor.save_original_structure() metadata = { "title": os.path.splitext(os.path.basename(input_path))[0], "type": "text", "chapter_count": len(chapters) } except ImportError as e: print(f"❌ Error: Text file processor not available: {e}") if log_callback: log_callback(f"❌ Error: Text file processor not available: {e}") return except Exception as e: print(f"❌ Error processing text file: {e}") if log_callback: log_callback(f"❌ Error processing text file: {e}") return else: # Check if we should use async extraction (for GUI mode) use_async_extraction = os.getenv("USE_ASYNC_CHAPTER_EXTRACTION", "0") == "1" if use_async_extraction and log_callback: print("🚀 Using async chapter extraction (subprocess mode)...") from chapter_extraction_manager import ChapterExtractionManager # Create manager with log callback extraction_manager = ChapterExtractionManager(log_callback=log_callback) # Get extraction mode extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() # Define completion callback extraction_result = {"completed": False, "result": None} def on_extraction_complete(result): extraction_result["completed"] = True extraction_result["result"] = result # Safety check for None result if result is None: log_callback("❌ Chapter extraction failed: No result returned") return if result.get("success"): log_callback(f"✅ Chapter extraction completed: {result.get('chapters', 0)} chapters") else: log_callback(f"❌ Chapter extraction failed: {result.get('error', 'Unknown error')}") # Start async extraction extraction_manager.extract_chapters_async( input_path, out, extraction_mode=extraction_mode, progress_callback=lambda msg: log_callback(f"📊 {msg}"), completion_callback=on_extraction_complete ) # Wait for completion while not extraction_result["completed"]: if check_stop(): extraction_manager.stop_extraction() return time.sleep(0.1) # Check every 100ms # Check if extraction was successful if not extraction_result["result"] or not extraction_result["result"].get("success"): log_callback("❌ Chapter extraction failed") return # Load the extracted data metadata_path = os.path.join(out, "metadata.json") if os.path.exists(metadata_path): with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) else: metadata = extraction_result["result"].get("metadata", {}) # The async extraction should have saved chapters directly, similar to the sync version # We need to reconstruct the chapters list with body content # Check if the extraction actually created a chapters.json file with full content chapters_full_path = os.path.join(out, "chapters_full.json") chapters_info_path = os.path.join(out, "chapters_info.json") chapters = [] # First try to load full chapters if saved if os.path.exists(chapters_full_path): log_callback("Loading full chapters data...") with open(chapters_full_path, 'r', encoding='utf-8') as f: chapters = json.load(f) log_callback(f"✅ Loaded {len(chapters)} chapters with content") elif os.path.exists(chapters_info_path): # Fall back to loading from individual files log_callback("Loading chapter info and searching for content files...") with open(chapters_info_path, 'r', encoding='utf-8') as f: chapters_info = json.load(f) # List all files in the output directory all_files = os.listdir(out) log_callback(f"Found {len(all_files)} files in output directory") # Try to match chapter files for info in chapters_info: chapter_num = info['num'] found = False # Try different naming patterns patterns = [ f"chapter_{chapter_num:04d}_", # With leading zeros f"chapter_{chapter_num}_", # Without leading zeros f"ch{chapter_num:04d}_", # Shortened with zeros f"ch{chapter_num}_", # Shortened without zeros f"{chapter_num:04d}_", # Just number with zeros f"{chapter_num}_" # Just number ] for pattern in patterns: # Find files matching this pattern (any extension) matching_files = [f for f in all_files if f.startswith(pattern)] if matching_files: # Prefer HTML/XHTML files html_files = [f for f in matching_files if f.endswith(('.html', '.xhtml', '.htm'))] if html_files: chapter_file = html_files[0] else: chapter_file = matching_files[0] chapter_path = os.path.join(out, chapter_file) try: with open(chapter_path, 'r', encoding='utf-8') as f: content = f.read() chapters.append({ "num": chapter_num, "title": info.get("title", f"Chapter {chapter_num}"), "body": content, "filename": info.get("original_filename", ""), "has_images": info.get("has_images", False), "file_size": len(content), "content_hash": info.get("content_hash", "") }) found = True break except Exception as e: log_callback(f"⚠️ Error reading {chapter_file}: {e}") if not found: log_callback(f"⚠️ No file found for Chapter {chapter_num}") # Log available files for debugging if len(all_files) < 50: similar_files = [f for f in all_files if str(chapter_num) in f] if similar_files: log_callback(f" Similar files: {similar_files[:3]}") if not chapters: log_callback("❌ No chapters could be loaded!") log_callback(f"❌ Output directory: {out}") log_callback(f"❌ Files in directory: {len(os.listdir(out))} files") # Show first few files for debugging sample_files = os.listdir(out)[:10] log_callback(f"❌ Sample files: {sample_files}") return # Sort chapters by OPF spine order if available opf_path = os.path.join(out, 'content.opf') if os.path.exists(opf_path) and chapters: log_callback("📋 Sorting chapters according to OPF spine order...") # Call module-level function directly chapters = Chapter_Extractor._sort_by_opf_spine(chapters, opf_path) log_callback("✅ Chapters sorted according to OPF reading order") else: print("🚀 Using comprehensive chapter extraction with resource handling...") with zipfile.ZipFile(input_path, 'r') as zf: metadata = Chapter_Extractor._extract_epub_metadata(zf) chapters = Chapter_Extractor.extract_chapters(zf, out, progress_callback=chapter_progress_callback) print(f"\n📚 Extraction Summary:") print(f" Total chapters extracted: {len(chapters)}") if chapters: nums = [c.get('num', 0) for c in chapters] print(f" Chapter range: {min(nums)} to {max(nums)}") # Check for gaps in the sequence expected_count = max(nums) - min(nums) + 1 if len(chapters) < expected_count: print(f"\n⚠️ Potential missing chapters detected:") print(f" Expected {expected_count} chapters (from {min(nums)} to {max(nums)})") print(f" Actually found: {len(chapters)} chapters") print(f" Potentially missing: {expected_count - len(chapters)} chapters") validate_chapter_continuity(chapters) print("\n" + "="*50) validate_epub_structure(out) print("="*50 + "\n") progress_manager.migrate_to_content_hash(chapters) progress_manager.save() if check_stop(): return metadata_path = os.path.join(out, "metadata.json") if os.path.exists(metadata_path): with open(metadata_path, 'r', encoding='utf-8') as mf: metadata = json.load(mf) metadata["chapter_count"] = len(chapters) metadata["chapter_titles"] = {str(c["num"]): c["title"] for c in chapters} print(f"[DEBUG] Initializing client with model = {config.MODEL}") client = UnifiedClient(api_key=config.API_KEY, model=config.MODEL, output_dir=out) # Log translation anti-duplicate parameters usage if os.getenv("ENABLE_ANTI_DUPLICATE", "0") == "1": ad_top_p = os.getenv("TOP_P", "1.0") ad_top_k = os.getenv("TOP_K", "0") ad_freq = os.getenv("FREQUENCY_PENALTY", "0.0") ad_pres = os.getenv("PRESENCE_PENALTY", "0.0") ad_rep = os.getenv("REPETITION_PENALTY", "1.0") print(f"🎯 Anti-duplicate enabled for translation (top_p={ad_top_p}, top_k={ad_top_k}, freq_penalty={ad_freq}, presence_penalty={ad_pres}, repetition_penalty={ad_rep})") if hasattr(client, 'use_multi_keys') and client.use_multi_keys: stats = client.get_stats() print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded") print(f" Active keys: {stats.get('active_keys', 0)}") else: print(f"🔑 Single-key mode: Using {config.MODEL}") # Reset cleanup state when starting new translation if hasattr(client, 'reset_cleanup_state'): client.reset_cleanup_state() if "title" in metadata and config.TRANSLATE_BOOK_TITLE and not metadata.get("title_translated", False): original_title = metadata["title"] print(f"📚 Original title: {original_title}") if not check_stop(): translated_title = translate_title( original_title, client, None, None, config.TEMP ) metadata["original_title"] = original_title metadata["title"] = translated_title metadata["title_translated"] = True print(f"📚 Translated title: {translated_title}") else: print("❌ Title translation skipped due to stop request") # Translate other metadata fields if configured translate_metadata_fields_str = os.getenv('TRANSLATE_METADATA_FIELDS', '{}') metadata_translation_mode = os.getenv('METADATA_TRANSLATION_MODE', 'together') try: translate_metadata_fields = json.loads(translate_metadata_fields_str) if translate_metadata_fields and any(translate_metadata_fields.values()): # Filter out fields that should be translated (excluding already translated fields) fields_to_translate = {} skipped_fields = [] for field_name, should_translate in translate_metadata_fields.items(): if should_translate and field_name != 'title' and field_name in metadata: # Check if already translated if metadata.get(f"{field_name}_translated", False): skipped_fields.append(field_name) print(f"✓ Skipping {field_name} - already translated") else: fields_to_translate[field_name] = should_translate if fields_to_translate: print("\n" + "="*50) print("📋 METADATA TRANSLATION PHASE") print("="*50) print(f"🌐 Translating {len(fields_to_translate)} metadata fields...") # Get metadata system prompt from environment system_prompt = os.getenv('METADATA_SYSTEM_PROMPT', '') if system_prompt: # Get field-specific prompts field_prompts_str = os.getenv('METADATA_FIELD_PROMPTS', '{}') try: field_prompts = json.loads(field_prompts_str) except: field_prompts = {} if not field_prompts and not field_prompts.get('_default'): print("❌ No field prompts configured, skipping metadata translation") else: # Get language configuration lang_behavior = os.getenv('LANG_PROMPT_BEHAVIOR', 'auto') forced_source_lang = os.getenv('FORCED_SOURCE_LANG', 'Korean') output_language = os.getenv('OUTPUT_LANGUAGE', 'English') # Determine source language source_lang = metadata.get('language', '').lower() if lang_behavior == 'never': lang_str = "" elif lang_behavior == 'always': lang_str = forced_source_lang else: # auto if 'zh' in source_lang or 'chinese' in source_lang: lang_str = 'Chinese' elif 'ja' in source_lang or 'japanese' in source_lang: lang_str = 'Japanese' elif 'ko' in source_lang or 'korean' in source_lang: lang_str = 'Korean' else: lang_str = '' # Check if batch translation is enabled for parallel processing batch_translate_enabled = os.getenv('BATCH_TRANSLATION', '0') == '1' batch_size = int(os.getenv('BATCH_SIZE', '50')) # Default batch size if batch_translate_enabled and len(fields_to_translate) > 1: print(f"⚡ Using parallel metadata translation mode ({len(fields_to_translate)} fields, batch size: {batch_size})...") # Import ThreadPoolExecutor for parallel processing from concurrent.futures import ThreadPoolExecutor, as_completed import threading # Thread-safe results storage translation_results = {} results_lock = threading.Lock() def translate_metadata_field(field_name, original_value): """Translate a single metadata field""" try: print(f"\n📋 Translating {field_name}: {original_value[:100]}..." if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") # Get field-specific prompt prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) if not prompt_template: print(f"⚠️ No prompt configured for field '{field_name}', skipping") return None # Replace variables in prompt field_prompt = prompt_template.replace('{source_lang}', lang_str) field_prompt = field_prompt.replace('{output_lang}', output_language) field_prompt = field_prompt.replace('{target_lang}', output_language) field_prompt = field_prompt.replace('{field_value}', str(original_value)) # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the field value without AI prompts print(f"🌐 Using translation service ({client_type}) - sending field directly") messages = [ {"role": "user", "content": str(original_value)} ] else: # For AI services, use prompts as before messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} ] # Add delay for rate limiting if config.DELAY > 0: time.sleep(config.DELAY) # Make API call content, finish_reason = client.send( messages, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS ) translated_value = content.strip() # Store result thread-safely with results_lock: translation_results[field_name] = { 'original': original_value, 'translated': translated_value, 'success': True } print(f"✅ Translated {field_name}: {translated_value}") return translated_value except Exception as e: print(f"❌ Failed to translate {field_name}: {e}") with results_lock: translation_results[field_name] = { 'original': original_value, 'translated': None, 'success': False, 'error': str(e) } return None # Execute parallel translations with limited workers max_workers = min(len(fields_to_translate), batch_size) with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all translation tasks futures = {} for field_name in fields_to_translate: if field_name in metadata and not check_stop(): original_value = metadata[field_name] future = executor.submit(translate_metadata_field, field_name, original_value) futures[future] = field_name # Wait for completion for future in as_completed(futures): if check_stop(): print("❌ Metadata translation stopped by user") break # Apply results to metadata for field_name, result in translation_results.items(): if result['success'] and result['translated']: metadata[f"original_{field_name}"] = result['original'] metadata[field_name] = result['translated'] metadata[f"{field_name}_translated"] = True else: # Sequential translation mode (individual translation) mode_desc = "sequential" if not batch_translate_enabled else "sequential (single field)" print(f"📝 Using {mode_desc} translation mode...") for field_name in fields_to_translate: if not check_stop() and field_name in metadata: original_value = metadata[field_name] print(f"\n📋 Translating {field_name}: {original_value[:100]}..." if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}") # Get field-specific prompt prompt_template = field_prompts.get(field_name, field_prompts.get('_default', '')) if not prompt_template: print(f"⚠️ No prompt configured for field '{field_name}', skipping") continue # Replace variables in prompt field_prompt = prompt_template.replace('{source_lang}', lang_str) field_prompt = field_prompt.replace('{output_lang}', output_language) field_prompt = field_prompt.replace('{target_lang}', output_language) field_prompt = field_prompt.replace('{field_value}', str(original_value)) # Check if we're using a translation service (not AI) client_type = getattr(client, 'client_type', '') is_translation_service = client_type in ['deepl', 'google_translate'] if is_translation_service: # For translation services, send only the field value without AI prompts print(f"🌐 Using translation service ({client_type}) - sending field directly") messages = [ {"role": "user", "content": str(original_value)} ] else: # For AI services, use prompts as before messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": f"{field_prompt}\n\n{original_value}"} ] try: # Add delay using the config instance from main() if config.DELAY > 0: # ✅ FIXED - use config.DELAY instead of config.SEND_INTERVAL time.sleep(config.DELAY) # Use the same client instance from main() # ✅ FIXED - Properly unpack tuple response and provide max_tokens content, finish_reason = client.send( messages, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS # ✅ FIXED - provide max_tokens to avoid NoneType error ) translated_value = content.strip() # ✅ FIXED - use content from unpacked tuple metadata[f"original_{field_name}"] = original_value metadata[field_name] = translated_value metadata[f"{field_name}_translated"] = True print(f"✅ Translated {field_name}: {translated_value}") except Exception as e: print(f"❌ Failed to translate {field_name}: {e}") else: if check_stop(): print("❌ Metadata translation stopped by user") break else: print("📋 No additional metadata fields to translate") except Exception as e: print(f"⚠️ Error processing metadata translation settings: {e}") import traceback traceback.print_exc() with open(metadata_path, 'w', encoding='utf-8') as mf: json.dump(metadata, mf, ensure_ascii=False, indent=2) print(f"💾 Saved metadata with {'translated' if metadata.get('title_translated', False) else 'original'} title") print("\n" + "="*50) print("📑 GLOSSARY GENERATION PHASE") print("="*50) # Skip glossary generation for CSV/JSON/MD files (they are typically glossaries themselves) if input_path.lower().endswith(('.csv', '.json', '.md')): print("📑 Skipping glossary generation for CSV/JSON/MD file") print(" CSV/JSON/MD files are treated as plain text and typically don't need glossaries") else: print(f"📑 DEBUG: ENABLE_AUTO_GLOSSARY = '{os.getenv('ENABLE_AUTO_GLOSSARY', 'NOT SET')}'") print(f"📑 DEBUG: MANUAL_GLOSSARY = '{config.MANUAL_GLOSSARY}'") print(f"📑 DEBUG: Manual glossary exists? {os.path.isfile(config.MANUAL_GLOSSARY) if config.MANUAL_GLOSSARY else False}") print(f"📑 DEBUG: APPEND_GLOSSARY = '{os.getenv('APPEND_GLOSSARY', '1')}'") print(f"📑 DEBUG: APPEND_GLOSSARY_PROMPT = '{os.getenv('APPEND_GLOSSARY_PROMPT', 'NOT SET')}'") print(f"📑 DEBUG: Duplicate algorithm = '{os.getenv('GLOSSARY_DUPLICATE_ALGORITHM', 'auto')}'") print(f"📑 DEBUG: Fuzzy threshold = '{os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.90')}'") print(f"📑 DEBUG: Include gender context = '{os.getenv('GLOSSARY_INCLUDE_GENDER_CONTEXT', '0')}'") print(f"📑 DEBUG: Context window size = '{os.getenv('GLOSSARY_CONTEXT_WINDOW', '2')}'") print(f"📑 DEBUG: Min frequency = '{os.getenv('GLOSSARY_MIN_FREQUENCY', '1')}'") print(f"📑 DEBUG: Max names = '{os.getenv('GLOSSARY_MAX_NAMES', '50')}'") print(f"📑 DEBUG: Max titles = '{os.getenv('GLOSSARY_MAX_TITLES', '50')}'") print(f"📑 DEBUG: Translation batch = '{os.getenv('GLOSSARY_BATCH_SIZE', '50')}'") print(f"📑 DEBUG: Max text size = '{os.getenv('GLOSSARY_MAX_TEXT_SIZE', '50000')}'") print(f"📑 DEBUG: Max sentences = '{os.getenv('GLOSSARY_MAX_SENTENCES', '200')}'") print(f"📑 DEBUG: Use smart filter = '{os.getenv('GLOSSARY_USE_SMART_FILTER', '1')}'") print(f"📑 DEBUG: Chapter split threshold = '{os.getenv('GLOSSARY_CHAPTER_SPLIT_THRESHOLD', '50000')}'") print(f"📑 DEBUG: Target language = '{os.getenv('GLOSSARY_TARGET_LANGUAGE', 'English')}'") # Check if glossary.csv already exists in the source folder existing_glossary_csv = os.path.join(out, "glossary.csv") existing_glossary_json = os.path.join(out, "glossary.json") print(f"📑 DEBUG: Existing glossary.csv? {os.path.exists(existing_glossary_csv)}") print(f"📑 DEBUG: Existing glossary.json? {os.path.exists(existing_glossary_json)}") def _nonempty(path): try: return os.path.getsize(path) > 0 except Exception: return False def _has_glossary_data(path): """Return True only if the glossary file contains at least one entry.""" try: ext = os.path.splitext(path)[1].lower() if ext in [".csv", ".txt", ".md"]: with open(path, 'r', encoding='utf-8', errors='ignore') as f: lines = [line for line in f.readlines() if line.strip()] # Require at least one non-header data line return len(lines) > 1 if ext == ".json": with open(path, 'r', encoding='utf-8') as f: data = json.load(f) if isinstance(data, dict): if "entries" in data and isinstance(data["entries"], dict): return len(data["entries"]) > 0 return len(data) > 0 if isinstance(data, list): return len(data) > 0 # Unknown extension: fallback to non-empty size check return _nonempty(path) except Exception: return False # If manual glossary is present but empty/header-only, clear it so auto-gen can run if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY) and not _has_glossary_data(config.MANUAL_GLOSSARY): print("📑 Manual glossary is empty; ignoring to allow automatic generation.") config.MANUAL_GLOSSARY = "" os.environ.pop("MANUAL_GLOSSARY", None) if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY) and _has_glossary_data(config.MANUAL_GLOSSARY): ext = os.path.splitext(config.MANUAL_GLOSSARY)[1].lower() # Treat .txt and .md files as CSV format (keep original extension) if ext in [".csv", ".txt"]: target_name = "glossary.csv" elif ext == ".md": target_name = "glossary.md" elif ext == ".json": target_name = "glossary.json" else: # Default to CSV for unknown extensions target_name = "glossary.csv" print(f"⚠️ Unknown glossary extension '{ext}', treating as CSV") target_path = os.path.join(out, target_name) if os.path.abspath(config.MANUAL_GLOSSARY) != os.path.abspath(target_path): shutil.copy(config.MANUAL_GLOSSARY, target_path) print("📑 Using manual glossary from:", config.MANUAL_GLOSSARY) else: print("📑 Using existing glossary:", config.MANUAL_GLOSSARY) # Copy glossary extension if configured if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1': additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '') if additional_glossary_path and os.path.exists(additional_glossary_path): # Preserve original extension ext = os.path.splitext(additional_glossary_path)[1] additional_target = os.path.join(out, f"glossary_extension{ext}") # Only copy if target doesn't already exist if not os.path.exists(additional_target): try: shutil.copy(additional_glossary_path, additional_target) print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}") except Exception as e: print(f"⚠️ Failed to copy glossary extension: {e}") else: print(f"📑 Using existing glossary extension in output folder") # If existing glossaries in output are empty, delete them so they don't block auto-gen if os.path.exists(existing_glossary_csv) and not _has_glossary_data(existing_glossary_csv): try: os.remove(existing_glossary_csv) print("📑 Removed empty glossary.csv to allow automatic generation.") except Exception as e: print(f"⚠️ Could not remove empty glossary.csv: {e}") if os.path.exists(existing_glossary_json) and not _has_glossary_data(existing_glossary_json): try: os.remove(existing_glossary_json) print("📑 Removed empty glossary.json to allow automatic generation.") except Exception as e: print(f"⚠️ Could not remove empty glossary.json: {e}") elif (os.path.exists(existing_glossary_csv) and _has_glossary_data(existing_glossary_csv)) or \ (os.path.exists(existing_glossary_json) and _has_glossary_data(existing_glossary_json)): print("📑 Existing glossary file detected in source folder - skipping automatic generation") target_glossary_path = None if os.path.exists(existing_glossary_csv) and _has_glossary_data(existing_glossary_csv): print(f"📑 Using existing glossary.csv: {existing_glossary_csv}") target_glossary_path = existing_glossary_csv elif os.path.exists(existing_glossary_json) and _has_glossary_data(existing_glossary_json): print(f"📑 Using existing glossary.json: {existing_glossary_json}") target_glossary_path = existing_glossary_json # --- Check and inject book title if missing --- if target_glossary_path and target_glossary_path.endswith('.csv'): try: include_title = os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1" auto_inject = os.getenv("GLOSSARY_AUTO_INJECT_BOOK_TITLE", "0") == "1" # Auto-inject applies only to already loaded existing glossary files (post-dedup context) if include_title and auto_inject: # Read existing content with open(target_glossary_path, 'r', encoding='utf-8') as f: lines = f.readlines() # Check if book entry exists has_book_entry = False for line in lines: if line.lower().startswith("book,"): has_book_entry = True break if not has_book_entry: print("📑 Checking for missing book title entry in existing glossary...") # Use GlossaryManager to find/translate title import GlossaryManager # Get raw title from input EPUB epub_path_env = os.getenv("EPUB_PATH", "") raw_title = GlossaryManager._extract_raw_title_from_epub(epub_path_env) # Get translated title from output metadata trans_title = GlossaryManager._extract_translated_title_from_metadata(out) if raw_title or trans_title: # Determine values (prefer distinct, fallback to what we have) r_val = raw_title if raw_title else (trans_title if trans_title else "") t_val = trans_title if trans_title else (raw_title if raw_title else "") # Insert book entry in token-efficient format if detected, or standard CSV is_token_format = any(l.strip().startswith("Glossary Columns:") for l in lines) if is_token_format: # Insert into token efficient format # Find start of BOOKS section or create it at top book_lines = [ f"=== BOOKS ===\n", f"* {t_val} ({r_val})\n", "\n" ] # Find where to insert (after Glossary Columns) insert_idx = 0 for i, l in enumerate(lines): if l.strip().startswith("Glossary Columns:"): insert_idx = i + 2 # Skip blank line break # Check if BOOKS section already exists to avoid duplication has_books_section = any(l.strip() == "=== BOOKS ===" for l in lines) if not has_books_section: for bl in reversed(book_lines): lines.insert(insert_idx, bl) else: # Standard CSV injection book_line = f"book,{r_val},{t_val},,\n" # Find insertion point (after header if present) insert_idx = 0 if lines and "type," in lines[0].lower(): insert_idx = 1 lines.insert(insert_idx, book_line) # Write back with open(target_glossary_path, 'w', encoding='utf-8') as f: f.writelines(lines) print(f"📚 Auto-injected book title into existing glossary: {t_val} ({r_val})") except Exception as e: print(f"⚠️ Failed to inject book title: {e}") # ---------------------------------------------- # Copy glossary extension if configured if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1': additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '') if additional_glossary_path and os.path.exists(additional_glossary_path): # Preserve original extension ext = os.path.splitext(additional_glossary_path)[1] additional_target = os.path.join(out, f"glossary_extension{ext}") # Only copy if target doesn't already exist if not os.path.exists(additional_target): try: shutil.copy(additional_glossary_path, additional_target) print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}") except Exception as e: print(f"⚠️ Failed to copy glossary extension: {e}") else: print(f"📑 Using existing glossary extension in output folder") elif os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1": model = os.getenv("MODEL", "gpt-4") if is_traditional_translation_api(model): print("📑 Automatic glossary generation disabled") print(f" {model} does not support glossary extraction") print(" Traditional translation APIs cannot identify character names/terms") else: print("📑 Starting automatic glossary generation...") try: # Use the new process-safe glossary worker from glossary_process_worker import generate_glossary_in_process import concurrent.futures import multiprocessing instructions = "" # Get extraction workers setting extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) if extraction_workers == 1: # Auto-detect for better performance extraction_workers = min(os.cpu_count() or 4, 4) print(f"📑 Using {extraction_workers} CPU cores for glossary generation") # Collect environment variables to pass to subprocess env_vars = {} important_vars = [ 'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES', 'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS', 'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'GLOSSARY_MAX_SENTENCES', 'AUTO_GLOSSARY_PROMPT', 'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV', 'GLOSSARY_PARALLEL_ENABLED', 'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK', 'GLOSSARY_SKIP_ALL_VALIDATION', 'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY', 'MAX_OUTPUT_TOKENS', 'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY', 'GLOSSARY_DUPLICATE_ALGORITHM', 'GLOSSARY_INCLUDE_GENDER_CONTEXT', 'GLOSSARY_CONTEXT_WINDOW', 'GLOSSARY_INCLUDE_BOOK_TITLE', 'EPUB_PATH', # Match GUI batching settings 'BATCH_TRANSLATION', 'BATCH_SIZE', 'BATCHING_MODE', 'BATCH_GROUP_SIZE', # Keep submission staggering consistent with GUI 'THREAD_SUBMISSION_DELAY_SECONDS', ] for var in important_vars: if var in os.environ: env_vars[var] = os.environ[var] # NOTE: Avoid multiprocessing.Manager() here. # Instead, have the subprocess append logs to a file and tail it from the parent. log_queue = None # Write subprocess logs to a central logs folder (not the output book folder) try: _project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) except Exception: _project_root = os.path.abspath(".") logs_dir = os.path.join(_project_root, "logs") try: os.makedirs(logs_dir, exist_ok=True) except Exception: pass glossary_log_fp = os.path.join( logs_dir, f"glossary_subprocess_{int(time.time() * 1000)}_{os.getpid()}.log" ) try: # Ensure file exists (fresh per run) with open(glossary_log_fp, "w", encoding="utf-8") as _f: _f.write("") except Exception: pass # Use ProcessPoolExecutor for true parallelism (completely bypasses GIL) print("📑 Starting glossary generation in separate process...") with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: # Submit to separate process WITH log queue future = executor.submit( generate_glossary_in_process, out, chapters, instructions, env_vars, log_queue, # Queue disabled (None) glossary_log_fp # log_file_path for parent tailing ) # Poll for completion and stream logs in real-time poll_count = 0 graceful_stop_notice_shown = False # Tail the subprocess log file for visibility _log_pos = 0 _seen_worker_output = False _submit_ts = time.time() # Start wait logging after 5s (avoid a noisy "0s" line) _last_wait_log_ts = _submit_ts while not future.done(): poll_count += 1 # Tail subprocess log file (best-effort) try: if glossary_log_fp and os.path.exists(glossary_log_fp): with open(glossary_log_fp, "r", encoding="utf-8", errors="ignore") as _lf: _lf.seek(_log_pos) new = _lf.read() _log_pos = _lf.tell() if new: _seen_worker_output = True # Print as-is (already line-delimited) for _ln in new.splitlines(): if _ln.strip(): print(_ln) except Exception: pass # If the subprocess hasn't produced any output yet, show periodic wait logs. # This helps during Windows spawn/import/pickling startup delays. if not _seen_worker_output: now = time.time() if (now - _last_wait_log_ts) >= 5.0: _last_wait_log_ts = now try: elapsed = int(now - _submit_ts) except Exception: elapsed = 0 # Best-effort status: future.running() may become True once the worker begins executing. try: running = future.running() except Exception: running = False state = "booting" if running else "spawning" print(f"⏳ Waiting for glossary subprocess to start ({state})… {elapsed}s") # Super short sleep to yield to GUI time.sleep(0.01) # Check for stop every 100 polls if poll_count % 100 == 0: if check_stop(): graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' def _glossary_all_chunks_submitted() -> bool: """Best-effort check via GlossaryManager status file. IMPORTANT: We only consider chunks "submitted" once they have been sent to the API (i.e., transitioned to in-flight after any stagger/delay). Executor/thread submission is NOT sufficient for this. """ try: import json as _json import tempfile as _tempfile stop_fp = os.environ.get('GLOSSARY_STOP_FILE') or os.path.join(_tempfile.gettempdir(), 'glossarion_glossary.stop') status_fp = os.environ.get('GLOSSARY_STATUS_FILE') or f"{stop_fp}.status.json" if not status_fp or not os.path.exists(status_fp): return False with open(status_fp, 'r', encoding='utf-8') as f: st = _json.load(f) if not isinstance(st, dict): return False # Prefer the explicit "all_sent" flag written by GlossaryManager. if st.get('all_sent') is True: return True # Fallback: compare counts (sent_chunks is post-stagger/in-flight). total = int(st.get('total_chunks', 0) or 0) sent = int(st.get('sent_chunks', 0) or 0) return total > 0 and sent >= total except Exception: return False def _glossary_any_in_flight() -> bool: """Detect in-flight API calls from the glossary subprocess via watchdog files.""" try: import json as _json wd_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR') if not wd_dir or not os.path.isdir(wd_dir): return False import glob as _glob for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json')): try: with open(fp, 'r', encoding='utf-8') as f: st = _json.load(f) if isinstance(st, dict) and int(st.get('in_flight', 0) or 0) > 0: return True except Exception: continue except Exception: return False return False should_wait = False if graceful_stop_active: if wait_for_chunks: # WAIT_FOR_CHUNKS=1: allow in-flight work to complete. should_wait = True else: # WAIT_FOR_CHUNKS=0: wait if all chunks were already sent to the API. # Also check the watchdog directly — the status file written by # the subprocess _sent_monitor thread polls at 0.1s intervals and # may lag behind the actual API state. The watchdog file is # updated synchronously when an API call goes in-flight, so it # is the reliable ground truth. should_wait = _glossary_all_chunks_submitted() or _glossary_any_in_flight() if should_wait: if not graceful_stop_notice_shown: if wait_for_chunks: print("⏳ Graceful stop — waiting for glossary generation to finish...") else: print("⏳ Graceful stop — waiting (glossary API call in flight)...") graceful_stop_notice_shown = True continue print("📑 ❌ Glossary generation cancelled") # Escalate to FULL STOP for glossary cancellation: # 1) Touch the shared stop file so the glossary subprocess stops starting new work. try: import tempfile as _tempfile stop_fp = os.environ.get('GLOSSARY_STOP_FILE') or os.path.join(_tempfile.gettempdir(), 'glossarion_glossary.stop') os.environ['GLOSSARY_STOP_FILE'] = stop_fp with open(stop_fp, 'w', encoding='utf-8') as f: f.write('stop') except Exception: pass # 2) Hard-cancel HTTP sessions in THIS process (best-effort). try: import unified_api_client if hasattr(unified_api_client, 'set_stop_flag'): unified_api_client.set_stop_flag(True) if hasattr(unified_api_client, 'global_stop_flag'): unified_api_client.global_stop_flag = True if hasattr(unified_api_client, 'UnifiedClient'): unified_api_client.UnifiedClient._global_cancelled = True if hasattr(unified_api_client, 'hard_cancel_all'): unified_api_client.hard_cancel_all() except Exception: pass # 3) Terminate glossary subprocess(es) if they are still running. # ProcessPoolExecutor cancellation does NOT reliably kill a running worker. try: import psutil wd_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR') if wd_dir and os.path.isdir(wd_dir): import glob as _glob import re as _re for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json')): try: base = os.path.basename(fp) m = _re.search(r"api_watchdog_(\d+)\.json$", base) if not m: continue pid = int(m.group(1)) if pid == os.getpid(): continue p = psutil.Process(pid) # Best-effort terminate, then kill if needed. try: p.terminate() except Exception: pass except Exception: continue # Give processes a brief moment to exit, then kill any remaining. try: time.sleep(0.2) except Exception: pass if wd_dir and os.path.isdir(wd_dir): import glob as _glob import re as _re for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json')): try: base = os.path.basename(fp) m = _re.search(r"api_watchdog_(\d+)\.json$", base) if not m: continue pid = int(m.group(1)) if pid == os.getpid(): continue p = psutil.Process(pid) try: if p.is_running(): p.kill() except Exception: pass except Exception: continue except Exception: pass # 4) Best-effort: clear watchdog files so the GUI progress bar doesn't stick. try: wd_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR') if wd_dir and os.path.isdir(wd_dir): import glob as _glob for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json*')): try: os.remove(fp) except Exception: pass except Exception: pass try: executor.shutdown(wait=False, cancel_futures=True) except Exception: pass return # Get any remaining logs from subprocess log file try: if glossary_log_fp and os.path.exists(glossary_log_fp): with open(glossary_log_fp, "r", encoding="utf-8", errors="ignore") as _lf: _lf.seek(_log_pos) new = _lf.read() _log_pos = _lf.tell() if new: for _ln in new.splitlines(): if _ln.strip(): print(_ln) except Exception: pass # If Stop was requested at any point, treat as CANCELLED and do not continue. try: if check_stop(): print("📑 ❌ Automatic glossary generation CANCELLED") return except Exception: pass # Get result if future.done(): try: result = future.result(timeout=0.1) if isinstance(result, dict): if result.get('success'): print(f"📑 ✅ Glossary generation completed successfully") else: print(f"📑 ❌ Glossary generation failed: {result.get('error')}") if result.get('traceback'): print(f"📑 Error details:\n{result.get('traceback')}") except Exception as e: print(f"📑 ❌ Error retrieving glossary result: {e}") # Only mark completed when not stopping. print("✅ Automatic glossary generation COMPLETED") # If the user requested graceful stop (wait_for_chunks), stop here after glossary is done. try: graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and wait_for_chunks and check_stop(): print("✅ Glossary generation finished. Stopping as requested (wait for chunks).") return except Exception: pass # Copy glossary extension if configured (after auto-glossary generation) if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1': additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '') if additional_glossary_path and os.path.exists(additional_glossary_path): # Preserve original extension ext = os.path.splitext(additional_glossary_path)[1] additional_target = os.path.join(out, f"glossary_extension{ext}") # Only copy if target doesn't already exist if not os.path.exists(additional_target): try: shutil.copy(additional_glossary_path, additional_target) print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}") except Exception as e: print(f"⚠️ Failed to copy glossary extension: {e}") else: print(f"📑 Using existing glossary extension in output folder") # Handle deferred glossary appending if os.getenv('DEFER_GLOSSARY_APPEND') == '1': print("📑 Processing deferred glossary append to system prompt...") glossary_path = find_glossary_file(out) if glossary_path and os.path.exists(glossary_path): try: glossary_block = None if glossary_path.lower().endswith('.csv'): with open(glossary_path, 'r', encoding='utf-8') as f: glossary_block = f.read() else: with open(glossary_path, 'r', encoding='utf-8') as f: glossary_data = json.load(f) formatted_entries = {} if isinstance(glossary_data, dict) and 'entries' in glossary_data: formatted_entries = glossary_data['entries'] elif isinstance(glossary_data, dict): formatted_entries = {k: v for k, v in glossary_data.items() if k != "metadata"} if formatted_entries: glossary_block = json.dumps(formatted_entries, ensure_ascii=False, indent=2) else: glossary_block = None if glossary_block: glossary_prompt = os.getenv('GLOSSARY_APPEND_PROMPT', "Character/Term Glossary (use these translations consistently):") current_prompt = config.PROMPT if current_prompt: current_prompt += "\n\n" current_prompt += f"{glossary_prompt}\n{glossary_block}" config.PROMPT = current_prompt print(f"✅ Added auto-generated glossary to system prompt ({os.path.basename(glossary_path)})") if 'DEFER_GLOSSARY_APPEND' in os.environ: del os.environ['DEFER_GLOSSARY_APPEND'] if 'GLOSSARY_APPEND_PROMPT' in os.environ: del os.environ['GLOSSARY_APPEND_PROMPT'] else: print("⚠️ Auto-generated glossary has no entries - skipping append") if 'DEFER_GLOSSARY_APPEND' in os.environ: del os.environ['DEFER_GLOSSARY_APPEND'] if 'GLOSSARY_APPEND_PROMPT' in os.environ: del os.environ['GLOSSARY_APPEND_PROMPT'] except Exception as e: print(f"⚠️ Failed to append auto-generated glossary: {e}") else: print("⚠️ No glossary file found after automatic generation") except Exception as e: print(f"❌ Glossary generation failed: {e}") else: print("📑 Automatic glossary generation disabled") # Don't create an empty glossary - let any existing manual glossary remain glossary_file = find_glossary_file(out) # Only show glossary details if append glossary is enabled append_glossary_enabled = os.getenv("APPEND_GLOSSARY", "1") == "1" add_additional_enabled = os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1' if glossary_file and os.path.exists(glossary_file): if append_glossary_enabled: try: if glossary_file.lower().endswith(('.csv', '.txt', '.md')): # Quick CSV/TXT/MD stats with open(glossary_file, 'r', encoding='utf-8') as f: lines = [ln.strip() for ln in f.readlines() if ln.strip()] entry_count = max(0, len(lines) - 1) if lines and ',' in lines[0] else len(lines) if glossary_file.lower().endswith('.txt'): file_type = "TXT" elif glossary_file.lower().endswith('.md'): file_type = "MD" else: file_type = "CSV" print(f"📑 Glossary ready ({file_type}) with {entry_count} entries") print("📑 Sample glossary lines:") for ln in lines[1:6]: print(f" • {ln}") elif glossary_file.lower().endswith('.json'): with open(glossary_file, 'r', encoding='utf-8') as f: glossary_data = json.load(f) if isinstance(glossary_data, dict): if 'entries' in glossary_data and isinstance(glossary_data['entries'], dict): entry_count = len(glossary_data['entries']) sample_items = list(glossary_data['entries'].items())[:3] else: entry_count = len(glossary_data) sample_items = list(glossary_data.items())[:3] print(f"📑 Glossary ready with {entry_count} entries") print("📑 Sample glossary entries:") for key, value in sample_items: print(f" • {key} → {value}") elif isinstance(glossary_data, list): print(f"📑 Glossary ready with {len(glossary_data)} entries") print("📑 Sample glossary entries:") for i, entry in enumerate(glossary_data[:3]): if isinstance(entry, dict): original = entry.get('original_name', '?') translated = entry.get('name', original) print(f" • {original} → {translated}") else: print(f"⚠️ Unexpected glossary format: {type(glossary_data)}") # Check for glossary extension (after all glossary types) if add_additional_enabled: # Check for extension with any supported format additional_glossary = None for ext in ['.csv', '.md', '.txt', '.json']: candidate = os.path.join(out, f"glossary_extension{ext}") if os.path.exists(candidate): additional_glossary = candidate break if additional_glossary: try: with open(additional_glossary, 'r', encoding='utf-8') as f: add_lines = [ln.strip() for ln in f.readlines() if ln.strip()] add_entry_count = max(0, len(add_lines) - 1) if add_lines and ',' in add_lines[0] else len(add_lines) print(f"📑 Glossary extension loaded with {add_entry_count} entries") print("📑 Sample glossary extension lines:") for ln in add_lines[1:4]: print(f" • {ln}") except Exception as e: print(f"⚠️ Failed to read glossary extension: {e}") else: # Check if extension file exists but toggle is disabled for ext in ['.csv', '.md', '.txt', '.json']: additional_glossary = os.path.join(out, f"glossary_extension{ext}") if os.path.exists(additional_glossary): print("⏩ Skipping glossary extension - toggle disabled") break except Exception as e: print(f"⚠️ Failed to inspect glossary file: {e}") else: print("⏩ Skipping glossary - toggle disabled") else: if append_glossary_enabled: print("📑 No glossary file found") print("="*50) print("🚀 STARTING MAIN TRANSLATION PHASE") print("="*50 + "\n") glossary_path = find_glossary_file(out) if glossary_path and os.path.exists(glossary_path) and glossary_path.lower().endswith('.json'): try: with open(glossary_path, 'r', encoding='utf-8') as f: g_data = json.load(f) print(f"[DEBUG] Glossary type before translation: {type(g_data)}") if isinstance(g_data, list): print(f"[DEBUG] Glossary is a list") except Exception as e: print(f"[DEBUG] Error checking glossary: {e}") glossary_path = find_glossary_file(out) # Build system prompt without glossary compression initially # Compression will happen per-chapter when enabled # Use get_system_prompt(1) for initial setup (no merging) system = build_system_prompt(config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=None) base_msg = [{"role": "system", "content": system}] # Preserve the original system prompt to avoid in-place mutations original_system_prompt = system # Log assistant prompt if configured assistant_prompt = getattr(config, 'ASSISTANT_PROMPT', '') or '' if assistant_prompt and assistant_prompt.strip(): print(f"🤖 Assistant Prompt: {assistant_prompt}") last_summary_block_text = None # Will hold the last rolling summary text for the NEXT chapter only last_summary_chapter_num = None # Chapter number associated with last_summary_block_text image_translator = None if config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Image translation enabled for model: {config.MODEL}") print("🖼️ Image translation will use your custom system prompt and glossary") image_translator = ImageTranslator( client, out, config.PROFILE_NAME, system, config.TEMP, log_callback , progress_manager, history_manager, chunk_context_manager ) known_vision_models = [ 'gemini-1.5-pro', 'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro', 'gpt-4-turbo', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini', 'gpt-4.1-mini' 'gemini-3-pro-image-preview', ] if config.MODEL.lower() not in known_vision_models: print(f"⚠️ Note: {config.MODEL} may not have vision capabilities. Image translation will be attempted anyway.") else: print("ℹ️ Image translation disabled by user") total_chapters = len(chapters) # Only detect numbering if the toggle is not disabled if config.DISABLE_ZERO_DETECTION: print(f"📊 0-based detection disabled by user setting") uses_zero_based = False # Important: Set a flag that can be checked throughout the codebase config._force_disable_zero_detection = True else: if chapters: uses_zero_based = detect_novel_numbering(chapters) print(f"📊 Novel numbering detected: {'0-based' if uses_zero_based else '1-based'}") else: uses_zero_based = False config._force_disable_zero_detection = False # Store this for later use config._uses_zero_based = uses_zero_based rng = os.getenv("CHAPTER_RANGE", "") start = None end = None if rng and re.match(r"^\d+\s*-\s*\d+$", rng): start, end = map(int, rng.split("-", 1)) if config.DISABLE_ZERO_DETECTION: print(f"📊 0-based detection disabled - using range as specified: {start}-{end}") elif uses_zero_based: print(f"📊 0-based novel detected") print(f"📊 User range {start}-{end} will be used as-is (chapters are already adjusted)") else: print(f"📊 1-based novel detected") print(f"📊 Using range as specified: {start}-{end}") print("📊 Calculating total chunks needed...") total_chunks_needed = 0 chunks_per_chapter = {} chapters_to_process = 0 # Check if special files translation is disabled translate_special = os.getenv('TRANSLATE_SPECIAL_FILES', '0') == '1' # Helper: sequential numbering with zero-phase. # Start at 0; only start incrementing once a digit >0 is seen in the filename. def _assign_chapter_num(name_noext, seq_counter, zero_phase): nums = re.findall(r'\d+', name_noext) if name_noext else [] has_gt_zero = any(int(n) > 0 for n in nums) if zero_phase: if has_gt_zero: # first positive digit: begin incrementing if seq_counter == 0: seq_counter = 1 num = seq_counter seq_counter += 1 zero_phase = False else: # still zero phase num = 0 else: # already incrementing num = seq_counter seq_counter += 1 return num, seq_counter, zero_phase # When setting actual chapter numbers (in the main function) seq_counter = 0 zero_phase = True for idx, c in enumerate(chapters): chap_num = c["num"] content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Extract the raw chapter number from the file raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}") # Spine position (reading order) fallback spine_pos = c.get('spine_order') if spine_pos is None: spine_pos = c.get('opf_spine_position') if spine_pos is None: spine_pos = idx # ultimate fallback to list order # Normalize chapter number using extracted number (spine/file aware) normalized_num = raw_num if raw_num is not None else 0 offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num = normalized_num + offset # When toggle is disabled, use raw numbers without any 0-based adjustment if config.DISABLE_ZERO_DETECTION: c['actual_chapter_num'] = raw_num # Store raw number for consistency c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply adjustment only if this is a 0-based novel if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False # Now we can safely use actual_num actual_num = c['actual_chapter_num'] # Skip special files (chapter 0) if translation is disabled # IMPORTANT: Do NOT treat files with digits (including 0) in their name as special. if not translate_special and raw_num == 0: name = c.get('original_basename') or os.path.basename(c.get('filename', '')) name_noext = os.path.splitext(name)[0] if name else '' has_digits_in_name = bool(re.search(r'\d', name_noext)) if not has_digits_in_name: # Track skipped special files if not hasattr(config, '_skipped_special_files'): config._skipped_special_files = [] config._skipped_special_files.append(c.get('original_basename', f'Chapter {actual_num}')) chunks_per_chapter[idx] = 0 continue if start is not None: if not (start <= c['actual_chapter_num'] <= end): # Track skipped chapters for summary (don't print individually) if not hasattr(config, '_range_skipped_chapters'): config._range_skipped_chapters = [] config._range_skipped_chapters.append(c['actual_chapter_num']) continue # IMPORTANT: pass chapter_obj so ProgressManager can resolve composite keys # (e.g. when multiple spine items share the same chapter number). needs_translation, skip_reason, _ = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, chapter_obj=c ) if not needs_translation: chunks_per_chapter[idx] = 0 continue chapters_to_process += 1 chapter_key = str(actual_num) if chapter_key in progress_manager.prog["chapters"] and progress_manager.prog["chapters"][chapter_key].get("status") == "in_progress": pass # Calculate based on effective OUTPUT limit only max_output_tokens = config.get_effective_output_limit() safety_margin_output = 500 # Korean to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) # Debug output for first chapter if os.getenv('DEBUG_CHUNK_SPLITTING', '0') == '1' and idx == 0: print(f"\n[CHUNK CALC DEBUG] Configuration:") print(f" MAX_OUTPUT_TOKENS: {max_output_tokens:,}") print(f" safety_margin_output: {safety_margin_output:,}") print(f" COMPRESSION_FACTOR: {compression_factor}") print(f" Calculated available_tokens: {available_tokens:,}") print(f" Formula: ({max_output_tokens:,} - {safety_margin_output:,}) / {compression_factor} = {available_tokens:,}\n") #print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") # For mixed content chapters, calculate on clean text # For mixed content chapters, calculate on clean text # Get filename for content type detection (prefer source_file to detect PDF context) chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '') if c.get('has_images', False) and ContentProcessor.is_meaningful_text_content(c["body"]): # Don't modify c["body"] at all during chunk calculation # Just pass the body as-is, the chunking will be slightly off but that's OK chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename) else: chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename) chapter_key_str = content_hash old_key_str = str(idx) if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}): progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str] del progress_manager.prog["chapter_chunks"][old_key_str] #print(f"[PROGRESS] Migrated chunks for chapter {actual_num} to new tracking system") # Always count actual chunks - ignore "completed" tracking chunks_per_chapter[idx] = len(chunks) total_chunks_needed += chunks_per_chapter[idx] # Print range skip summary if any chapters were skipped if hasattr(config, '_range_skipped_chapters') and config._range_skipped_chapters: skipped = config._range_skipped_chapters print(f"📊 Skipped {len(skipped)} chapters outside range {start}-{end}") if len(skipped) <= 10: print(f" Skipped: {', '.join(map(str, sorted(skipped)))}") else: print(f" Range: {min(skipped)} to {max(skipped)}") # Print special files skip summary if hasattr(config, '_skipped_special_files') and config._skipped_special_files: skipped = config._skipped_special_files print(f"📊 Skipped {len(skipped)} special file(s) (TRANSLATE_SPECIAL_FILES is disabled)") if len(skipped) <= 5: for file in skipped: print(f" • {file}") # Check if no chapters will be processed and provide helpful error if chapters_to_process == 0: if start is not None and end is not None: # Check if chapters in the range exist but are already completed if chapters: available_chapters = [c.get('actual_chapter_num', c['num']) for c in chapters] chapters_in_range = [num for num in available_chapters if start <= num <= end] if chapters_in_range: # Chapters in range exist but are already completed print(f"\n✅ All chapters in range {start}-{end} are already translated - nothing to do!") else: # No chapters exist in the specified range min_chapter = min(available_chapters) max_chapter = max(available_chapters) print(f"\n❌ ERROR: Chapter range {start}-{end} doesn't match any chapters!") print(f"📚 Available chapters in this EPUB: {min_chapter}-{max_chapter} ({len(chapters)} total)") print(f"💡 Please adjust your chapter range in the settings to match the available chapters.") if hasattr(config, '_range_skipped_chapters') and config._range_skipped_chapters: print(f"\n📊 All {len(config._range_skipped_chapters)} chapters were outside the specified range.") raise ValueError(f"Chapter range {start}-{end} doesn't match any available chapters ({min_chapter}-{max_chapter})") else: print(f"\n❌ ERROR: No chapters found in EPUB to translate!") raise ValueError("No chapters found in EPUB") elif not translate_special and total_chapters > 0: print(f"\n⚠️ WARNING: All chapters are special files (chapter 0) and TRANSLATE_SPECIAL_FILES is disabled.") print(f"💡 Enable 'Translate Special Files' in settings if you want to translate these files.") elif total_chunks_needed == 0 and total_chapters > 0: print(f"\n✅ All chapters already translated - nothing to do!") else: print(f"\n❌ ERROR: No chapters to process!") terminology = "Sections" if is_text_file else "Chapters" print(f"📊 Total chunks to translate: {total_chunks_needed}") print(f"📚 {terminology} to process: {chapters_to_process}") multi_chunk_chapters = [(idx, count) for idx, count in chunks_per_chapter.items() if count > 1] if multi_chunk_chapters: # Determine terminology based on file type terminology = "Sections" if is_text_file else "Chapters" print(f"📄 {terminology} requiring multiple chunks:") for idx, chunk_count in multi_chunk_chapters: chap = chapters[idx] section_term = "Section" if is_text_file else "Chapter" print(f" • {section_term} {idx+1} ({chap['title'][:30]}...): {chunk_count} chunks") translation_start_time = time.time() chunks_completed = 0 chapters_completed = 0 current_chunk_number = 0 if config.BATCH_TRANSLATION: # Check if request merging is enabled (for PDF and EPUB files) use_request_merging = config.REQUEST_MERGING_ENABLED and config.REQUEST_MERGE_COUNT > 1 and (is_pdf_file or not is_text_file) if use_request_merging: print(f"\n🔗 REQUEST MERGING + BATCH MODE ENABLED") print(f"🔗 Merging {config.REQUEST_MERGE_COUNT} chapters per API request") print(f"📦 Processing with up to {config.BATCH_SIZE} concurrent merged requests") else: print(f"\n📦 PARALLEL TRANSLATION MODE ENABLED") print(f"📦 Processing chapters with up to {config.BATCH_SIZE} concurrent API calls") import concurrent.futures from threading import Lock progress_lock = Lock() chapters_to_translate = [] # FIX: First pass to set actual chapter numbers for ALL chapters # This ensures batch mode has the same chapter numbering as non-batch mode print("📊 Setting chapter numbers...") seq_counter = 0 zero_phase = True for idx, c in enumerate(chapters): # PDF/TEXT CHUNK FIX: Skip extract_actual_chapter_number for chunks - preserve decimal from c['num'] if is_text_file and c.get('is_chunk', False): # For text/PDF chunks, use the decimal number directly (1.0, 1.1, etc.) c['actual_chapter_num'] = c['num'] c['raw_chapter_num'] = c['num'] c['zero_adjusted'] = False continue raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) raw_num = raw_num if raw_num is not None else 0 # Apply offset if configured offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset if config.DISABLE_ZERO_DETECTION: # Use raw numbers without adjustment c['actual_chapter_num'] = raw_num c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply 0-based adjustment if detected if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False for idx, c in enumerate(chapters): chap_num = c["num"] content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Check if this is a pre-split text chunk with decimal number # IMPORTANT: Check is_chunk FIRST, then use c['num'] regardless of float value # This handles cases like 1.0 where float equals integer but should still be preserved if is_text_file and c.get('is_chunk', False): actual_num = c['num'] # Preserve the decimal for text/PDF chunks c['actual_chapter_num'] = actual_num # UPDATE THE CHAPTER DICT! else: actual_num = c.get('actual_chapter_num', c['num']) # Now this will exist! # Skip special files (chapter 0) if translation is disabled raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)) if not translate_special and raw_num == 0: name = c.get('original_basename') or os.path.basename(c.get('filename', '')) name_noext = os.path.splitext(name)[0] if name else '' has_digits_in_name = bool(re.search(r'\d', name_noext)) if not has_digits_in_name: continue # Skip chapters outside the range if start is not None and not (start <= actual_num <= end): continue # Check if chapter needs translation needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, c # Pass the chapter object ) # Add explicit file check for supposedly completed chapters if not needs_translation and existing_file: file_path = os.path.join(out, existing_file) if not os.path.exists(file_path): print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}") needs_translation = True skip_reason = None # Update status to file_missing progress_manager.update(idx, actual_num, content_hash, None, status="file_missing", chapter_obj=c) progress_manager.save() # ------------------------------------------------------------------------- # BATCH PRE-PROCESSING # ------------------------------------------------------------------------- if needs_translation and c.get("body"): batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active if (not use_title_tag or ignore_header_tags): try: from bs4 import BeautifulSoup content_soup = BeautifulSoup(c["body"], 'html.parser') modified = False if not use_title_tag: for title_tag in content_soup.find_all('title'): title_tag.decompose() modified = True if ignore_header_tags: for header_tag in content_soup.find_all(['h1', 'h2', 'h3']): header_tag.decompose() modified = True if modified: c["body"] = str(content_soup) except Exception as e: print(f"⚠️ Failed to filter batch content for chapter {actual_num}: {e}") # ------------------------------------------------------------------------- if not needs_translation: # Track skips for summary instead of printing each one if not hasattr(config, '_batch_skipped_chapters'): config._batch_skipped_chapters = [] is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" config._batch_skipped_chapters.append((actual_num, terminology, skip_reason)) chapters_completed += 1 continue # Check for empty or image-only chapters has_images = c.get('has_images', False) has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) text_size = c.get('file_size', 0) is_image_link_only = ContentProcessor.is_only_image_links(c["body"]) is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only)) is_image_only_chapter = (has_images and not has_meaningful_text) # Handle empty chapters if is_empty_chapter: print(f"📄 Empty chapter {actual_num} detected (preserving original content as-is)") safe_title = make_safe_filename(c['title'], c['num']) if isinstance(c['num'], float): fname = FileUtilities.create_chapter_filename(c, c['num']) else: fname = FileUtilities.create_chapter_filename(c, c['num']) # IMPORTANT: For completed_empty, preserve the ORIGINAL XHTML/HTML markup. # In enhanced/html2text extraction modes, c['body'] may be empty because it contains only extracted text. original_markup = ( c.get("original_html") or c.get("source_html") or c.get("raw_html") or c.get("body") or "" ) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(original_markup) progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty", chapter_obj=c) progress_manager.save() chapters_completed += 1 continue # Add to chapters to translate chapters_to_translate.append((idx, c)) # Print skip summary for batch mode if hasattr(config, '_batch_skipped_chapters') and config._batch_skipped_chapters: skipped = config._batch_skipped_chapters print(f"\n📊 Skipped {len(skipped)} already completed chapters") if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1' and len(skipped) <= 5: for num, term, reason in skipped[:5]: print(f" • {term} {num}: {reason.split('(')[0].strip()}") print(f"📊 Found {len(chapters_to_translate)} chapters to translate in parallel") # Continue with the rest of the existing batch processing code... batch_processor = BatchTranslationProcessor( config, client, base_msg, out, progress_lock, progress_manager.save, lambda idx, actual_num, content_hash, output_file=None, status="completed", **kwargs: progress_manager.update(idx, actual_num, content_hash, output_file, status, **kwargs), check_stop, image_translator, is_text_file=is_text_file, history_manager=history_manager ) # Batch-mode rolling summary: updated once per batch and injected into the NEXT batch. rolling_summary_for_next_batch = "" # exact rolling_summary.txt contents import threading rolling_summary_update_lock = threading.Lock() summary_translation_processor = None if config.USE_ROLLING_SUMMARY: # Dedicated processor for summarization between batches (no concurrency with translation threads). summary_translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file) total_to_process = len(chapters_to_translate) processed = 0 # ========================== # Batching mode selection # ========================== batching_mode = getattr(config, 'BATCHING_MODE', 'direct') batch_group_size_cfg = max(1, int(getattr(config, 'BATCH_GROUP_SIZE', 3))) if batching_mode not in ('direct', 'conservative', 'aggressive'): batching_mode = 'direct' # Backwards compatibility with CONSERVATIVE_BATCHING env if os.getenv('CONSERVATIVE_BATCHING', '0') == '1': batching_mode = 'conservative' if batching_mode == 'conservative': batch_group_size = config.BATCH_SIZE * batch_group_size_cfg print(f"📦 Using conservative batching: group size {batch_group_size} (batch size {config.BATCH_SIZE}, multiplier {batch_group_size_cfg})") elif batching_mode == 'direct': batch_group_size = config.BATCH_SIZE # legacy behavior print(f"📦 Using direct batching: group size {batch_group_size}, parallel {config.BATCH_SIZE}") else: # aggressive batch_group_size = batch_group_size_cfg # not used for throttling, only for logging/summary grouping print(f"⚡ Using AGGRESSIVE batching: keeps {config.BATCH_SIZE} parallel calls, auto-refills when any finishes") # Create merge groups if request merging is enabled if use_request_merging: # Build proximity runs first (so we never merge far-apart chapters), # then pack each run under the token budget. This avoids patterns like # 2+1, 2+1 when REQUEST_MERGE_COUNT=3 but only 2 chapters fit; instead # we repack into 2+2, 2+2 when possible. proximity_runs = RequestMerger.create_merge_groups( chapters_to_translate, max(1, len(chapters_to_translate)), ) max_output_tokens = config.get_effective_output_limit() safety_margin_output = 500 compression_factor = getattr(config, 'COMPRESSION_FACTOR', 1.0) or 1.0 available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) available_tokens = max(available_tokens, 1000) merge_groups = [] for run in proximity_runs: if len(run) <= 1: merge_groups.append(run) continue i = 0 while i < len(run): group = [run[i]] i += 1 # Try to grow the group up to REQUEST_MERGE_COUNT, but stop # when adding the next chapter would exceed the token budget. while i < len(run) and len(group) < config.REQUEST_MERGE_COUNT: candidate = run[i] merge_input = [ (ch.get('actual_chapter_num', ch['num']), ch["body"], ch) for (idx, ch) in (group + [candidate]) ] merged_preview = RequestMerger.merge_chapters(merge_input, log_injections=False) merged_tokens = chapter_splitter.count_tokens(merged_preview) if merged_tokens <= available_tokens: group.append(candidate) i += 1 else: break merge_groups.append(group) print(f"🔗 Created {len(merge_groups)} merge groups from {total_to_process} chapters (after size adjustment)") units_to_process = merge_groups is_merged_mode = True else: units_to_process = [[ch] for ch in chapters_to_translate] # Wrap each chapter as single-item group is_merged_mode = False with concurrent.futures.ThreadPoolExecutor(max_workers=config.BATCH_SIZE) as executor: if batching_mode == 'aggressive': import threading batch_submit_lock = threading.Lock() active_futures = {} next_unit_idx = 0 def submit_next_unit(): nonlocal next_unit_idx if next_unit_idx >= len(units_to_process): return False unit = units_to_process[next_unit_idx] if config.USE_ROLLING_SUMMARY: batch_processor.set_batch_rolling_summary_text(rolling_summary_for_next_batch) time.sleep(0.000001) if is_merged_mode: fut = executor.submit(batch_processor.process_merged_group, unit, progress_manager) else: fut = executor.submit(batch_processor.process_single_chapter, unit[0]) active_futures[fut] = unit next_unit_idx += 1 return True # Prime the executor with batch_submit_lock: while len(active_futures) < config.BATCH_SIZE and submit_next_unit(): pass graceful_stop_message_shown = False # Track if we've shown the message while active_futures or next_unit_idx < len(units_to_process): # Check for graceful stop before submitting new work graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' # Ensure we have work submitted (but not during graceful stop) if not active_futures: with batch_submit_lock: while len(active_futures) < config.BATCH_SIZE and submit_next_unit(): pass if not active_futures: break # No more work to do elif not graceful_stop_active: # Auto-refill: submit new work to maintain BATCH_SIZE parallel calls # But DON'T submit new work if graceful stop is active with batch_submit_lock: while len(active_futures) < config.BATCH_SIZE and submit_next_unit(): pass # Use wait() with FIRST_COMPLETED to properly handle dynamic future sets done, _ = concurrent.futures.wait(active_futures.keys(), return_when=concurrent.futures.FIRST_COMPLETED) # Track if we should exit the outer loop after processing done futures should_exit_outer_loop = False for future in done: if check_stop(): # Check if wait_for_chunks is enabled - if so, let current chapters finish graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and wait_for_chunks: # Only print message once if not graceful_stop_message_shown: print("⏳ Graceful stop — waiting for current chapter(s) to finish...") graceful_stop_message_shown = True # Process only completed futures, skip cancelled ones # Clear all remaining futures and exit both loops active_futures.clear() should_exit_outer_loop = True break else: # print("❌ Translation stopped") # Redundant with "Translation stopped by user" from exception executor.shutdown(wait=False, cancel_futures=True) return unit = active_futures.pop(future) completed_in_batch = 0 failed_in_batch = 0 batch_history_map = {} chapters_in_batch = sum(len(u) for u in [unit]) try: if is_merged_mode: results = future.result() for result in results: success, chap_num, hist_user, hist_assistant, raw_obj = result if success: completed_in_batch += 1 if hist_user and hist_assistant: for idx, ch in unit: if ch.get('actual_chapter_num', ch['num']) == chap_num: batch_history_map[idx] = (hist_user, hist_assistant, raw_obj) break else: failed_in_batch += 1 processed += 1 print(f"✅ Merged group done: {len(results)} chapters") else: success, chap_num, hist_user, hist_assistant, raw_obj = future.result() idx, chapter = unit[0] if success: completed_in_batch += 1 if hist_user and hist_assistant: batch_history_map[idx] = (hist_user, hist_assistant, raw_obj) print(f"✅ Chapter {chap_num} done") # Clear any stale watchdog entries for this chapter try: import unified_api_client if hasattr(unified_api_client, '_api_watchdog_clear_chapter'): unified_api_client._api_watchdog_clear_chapter(chap_num) except Exception: pass else: failed_in_batch += 1 # Error already printed by worker thread processed += 1 except Exception as e: if is_merged_mode: failed_in_batch += len(unit) processed += len(unit) else: failed_in_batch += 1 processed += 1 print(f"❌ Thread error: {e}") progress_percent = (processed / total_to_process) * 100 print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)") # History append immediately for this unit if config.CONTEXTUAL and getattr(config, 'HIST_LIMIT', 0) > 0: hist_limit = getattr(config, 'HIST_LIMIT', 0) sorted_chapters = sorted(unit, key=lambda x: x[0]) for idx, chapter in sorted_chapters: if idx in batch_history_map: user_content, assistant_content, raw_obj = batch_history_map[idx] try: time.sleep(0.000001) history_manager.append_to_history( user_content, assistant_content, hist_limit, reset_on_limit=True, rolling_window=config.TRANSLATION_HISTORY_ROLLING, raw_assistant_object=raw_obj ) except Exception as e: actual_num_for_log = chapter.get('actual_chapter_num', chapter.get('num')) print(f"⚠️ Failed to append Chapter {actual_num_for_log} to translation history (batch): {e}") # Rolling summary update per unit if config.USE_ROLLING_SUMMARY and summary_translation_processor is not None: try: batch_items = sorted(unit, key=lambda x: x[0]) translated_blocks = [] last_actual_num_in_batch = None for idx, chapter in batch_items: actual_num = chapter.get('actual_chapter_num', chapter.get('num')) last_actual_num_in_batch = actual_num fname_guess = FileUtilities.create_chapter_filename(chapter, actual_num) candidates = [fname_guess] if isinstance(fname_guess, str) and fname_guess.endswith('.html'): candidates.insert(0, fname_guess.replace('.html', '.txt')) elif isinstance(fname_guess, str) and fname_guess.endswith('.txt'): candidates.append(fname_guess.replace('.txt', '.html')) content = "" for cand in candidates: fp = os.path.join(out, cand) if os.path.exists(fp): with open(fp, 'r', encoding='utf-8') as f: content = f.read() if content: break if isinstance(content, str) and content: translated_blocks.append(content) batch_translations_text = "\n\n---\n\n".join(translated_blocks) if batch_translations_text: old_mode = getattr(config, 'ROLLING_SUMMARY_MODE', 'replace') old_max_entries = getattr(config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0) try: config.ROLLING_SUMMARY_MODE = 'replace' config.ROLLING_SUMMARY_MAX_ENTRIES = int(chapters_in_batch or 0) with rolling_summary_update_lock: time.sleep(0.000001) summary_translation_processor.generate_rolling_summary( history_manager, last_actual_num_in_batch, base_system_content=None, source_text=batch_translations_text, previous_summary_text=None, previous_summary_chapter_num=None, prefer_translations_only_user=True, ) summary_file = os.path.join(out, 'rolling_summary.txt') if os.path.exists(summary_file): with open(summary_file, 'r', encoding='utf-8') as sf: rolling_summary_for_next_batch = (sf.read() or "") else: rolling_summary_for_next_batch = "" finally: config.ROLLING_SUMMARY_MODE = old_mode config.ROLLING_SUMMARY_MAX_ENTRIES = old_max_entries else: rolling_summary_for_next_batch = "" except Exception as e: print(f"⚠️ Batch rolling summary update failed: {e}") rolling_summary_for_next_batch = "" # Refill slots aggressively (but not if stop requested) if not check_stop(): with batch_submit_lock: while len(active_futures) < config.BATCH_SIZE and submit_next_unit(): pass # Exit outer loop if graceful stop was triggered if should_exit_outer_loop: break # After all futures complete, if stop was requested with wait_for_chunks, exit if check_stop(): graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and wait_for_chunks: print("\n✅ All current chapter(s) completed. Stopping as requested (wait for chunks).") return else: # direct or conservative: keep legacy batch grouping behaviour for batch_start in range(0, len(units_to_process), batch_group_size if not is_merged_mode else config.BATCH_SIZE): if check_stop(): print("❌ Translation stopped during parallel processing") executor.shutdown(wait=False) return effective_batch_size = batch_group_size if not is_merged_mode else config.BATCH_SIZE batch_end = min(batch_start + effective_batch_size, len(units_to_process)) current_batch_units = units_to_process[batch_start:batch_end] # Count total chapters in this batch chapters_in_batch = sum(len(unit) for unit in current_batch_units) batch_number = (batch_start // effective_batch_size) + 1 if is_merged_mode: print(f"\n📦 Submitting batch {batch_number}: {len(current_batch_units)} merged groups ({chapters_in_batch} chapters)") else: print(f"\n📦 Submitting batch {batch_number}: {chapters_in_batch} chapters") if config.USE_ROLLING_SUMMARY: batch_processor.set_batch_rolling_summary_text(rolling_summary_for_next_batch) time.sleep(0.000001) if is_merged_mode: future_to_unit = { executor.submit(batch_processor.process_merged_group, unit, progress_manager): unit for unit in current_batch_units } else: future_to_unit = { executor.submit(batch_processor.process_single_chapter, unit[0]): unit for unit in current_batch_units } completed_in_batch = 0 failed_in_batch = 0 batch_history_map = {} for future in concurrent.futures.as_completed(future_to_unit): if check_stop(): # Check if wait_for_chunks is enabled - if so, let current chapters finish graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and wait_for_chunks: print("⏳ Graceful stop — waiting for current chapter(s) to finish...") # Don't shutdown - let this batch complete else: # print("❌ Translation stopped") # Redundant with "Translation stopped by user" from exception executor.shutdown(wait=False) return unit = future_to_unit[future] try: if is_merged_mode: results = future.result() for result in results: success, chap_num, hist_user, hist_assistant, raw_obj = result if success: completed_in_batch += 1 if hist_user and hist_assistant: for idx, ch in unit: if ch.get('actual_chapter_num', ch['num']) == chap_num: batch_history_map[idx] = (hist_user, hist_assistant, raw_obj) break else: failed_in_batch += 1 processed += 1 print(f"✅ Merged group done: {len(results)} chapters") else: success, chap_num, hist_user, hist_assistant, raw_obj = future.result() idx, chapter = unit[0] if success: completed_in_batch += 1 print(f"✅ Chapter {chap_num} done ({completed_in_batch + failed_in_batch}/{chapters_in_batch} in batch)") # Clear any stale watchdog entries for this chapter try: import unified_api_client if hasattr(unified_api_client, '_api_watchdog_clear_chapter'): unified_api_client._api_watchdog_clear_chapter(chap_num) except Exception: pass if hist_user and hist_assistant: batch_history_map[idx] = (hist_user, hist_assistant, raw_obj) else: failed_in_batch += 1 # Error already printed by worker thread processed += 1 except Exception as e: if is_merged_mode: failed_in_batch += len(unit) processed += len(unit) else: failed_in_batch += 1 processed += 1 print(f"❌ Thread error: {e}") progress_percent = (processed / total_to_process) * 100 print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)") # After all futures in this batch complete, append their history entries if config.CONTEXTUAL and getattr(config, 'HIST_LIMIT', 0) > 0: hist_limit = getattr(config, 'HIST_LIMIT', 0) all_chapters_in_batch = [] for unit in current_batch_units: all_chapters_in_batch.extend(unit) sorted_chapters = sorted(all_chapters_in_batch, key=lambda x: x[0]) for idx, chapter in sorted_chapters: if idx in batch_history_map: user_content, assistant_content, raw_obj = batch_history_map[idx] try: time.sleep(0.000001) history_manager.append_to_history( user_content, assistant_content, hist_limit, reset_on_limit=True, rolling_window=config.TRANSLATION_HISTORY_ROLLING, raw_assistant_object=raw_obj ) except Exception as e: actual_num_for_log = chapter.get('actual_chapter_num', chapter.get('num')) print(f"⚠️ Failed to append Chapter {actual_num_for_log} to translation history (batch): {e}") # After the batch completes, update rolling_summary.txt ONCE (for the next batch). if config.USE_ROLLING_SUMMARY and summary_translation_processor is not None: try: batch_items = [] for unit in current_batch_units: batch_items.extend(unit) batch_items = sorted(batch_items, key=lambda x: x[0]) translated_blocks = [] last_actual_num_in_batch = None for idx, chapter in batch_items: try: actual_num = chapter.get('actual_chapter_num', chapter.get('num')) last_actual_num_in_batch = actual_num fname_guess = FileUtilities.create_chapter_filename(chapter, actual_num) candidates = [fname_guess] if isinstance(fname_guess, str) and fname_guess.endswith('.html'): candidates.insert(0, fname_guess.replace('.html', '.txt')) elif isinstance(fname_guess, str) and fname_guess.endswith('.txt'): candidates.append(fname_guess.replace('.txt', '.html')) content = "" for cand in candidates: fp = os.path.join(out, cand) if os.path.exists(fp): with open(fp, 'r', encoding='utf-8') as f: content = f.read() if content: break if isinstance(content, str) and content: translated_blocks.append(content) except Exception: continue batch_translations_text = "\n\n---\n\n".join(translated_blocks) if batch_translations_text: old_mode = getattr(config, 'ROLLING_SUMMARY_MODE', 'replace') old_max_entries = getattr(config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0) try: config.ROLLING_SUMMARY_MODE = 'replace' try: config.ROLLING_SUMMARY_MAX_ENTRIES = int(chapters_in_batch or 0) except Exception: config.ROLLING_SUMMARY_MAX_ENTRIES = 0 with rolling_summary_update_lock: time.sleep(0.000001) summary_translation_processor.generate_rolling_summary( history_manager, last_actual_num_in_batch, base_system_content=None, source_text=batch_translations_text, previous_summary_text=None, previous_summary_chapter_num=None, prefer_translations_only_user=True, ) summary_file = os.path.join(out, 'rolling_summary.txt') if os.path.exists(summary_file): with open(summary_file, 'r', encoding='utf-8') as sf: rolling_summary_for_next_batch = (sf.read() or "") else: rolling_summary_for_next_batch = "" finally: config.ROLLING_SUMMARY_MODE = old_mode config.ROLLING_SUMMARY_MAX_ENTRIES = old_max_entries else: rolling_summary_for_next_batch = "" except Exception as e: print(f"⚠️ Batch rolling summary update failed: {e}") rolling_summary_for_next_batch = "" print(f"\n📦 Batch Summary:") print(f" ✅ Successful: {completed_in_batch}") print(f" ❌ Failed: {failed_in_batch}") # After batch completes, if stop was requested with wait_for_chunks, exit if check_stop(): graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if graceful_stop_active and wait_for_chunks: print("\n✅ Current batch completed. Stopping as requested (wait for chunks).") return if batch_end < total_to_process: print(f"⏳ Waiting {config.DELAY}s before next batch...") time.sleep(config.DELAY) chapters_completed = batch_processor.chapters_completed chunks_completed = batch_processor.chunks_completed print(f"\n🎉 Parallel translation complete!") print(f" Total chapters processed: {processed}") # Count qa_failed chapters correctly qa_failed_count = 0 actual_successful = 0 for idx, c in enumerate(chapters): # Get the chapter's actual number if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) # Check if this chapter was processed and has qa_failed status content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Check if this chapter exists in progress chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) status = chapter_info.get("status") if status == "qa_failed": qa_failed_count += 1 elif status == "completed": actual_successful += 1 # Correct the displayed counts print(f" Successful: {actual_successful}") if qa_failed_count > 0: print(f"\n⚠️ {qa_failed_count} chapters failed due to content policy violations:") qa_failed_chapters = [] for idx, c in enumerate(chapters): if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) chapter_info = progress_manager.prog["chapters"].get(content_hash, {}) if chapter_info.get("status") == "qa_failed": qa_failed_chapters.append(actual_num) print(f" Failed chapters: {', '.join(map(str, sorted(qa_failed_chapters)))}") # Stop translation completely after batch mode print("\n📌 Batch translation completed.") elif not config.BATCH_TRANSLATION: translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file) # Only initialize AI Hunter when both the detection mode AND duplicate retry are enabled. if config.DUPLICATE_DETECTION_MODE == 'ai-hunter' and getattr(config, 'RETRY_DUPLICATE_BODIES', False): # Build the main config from environment variables and config object main_config = { 'duplicate_lookback_chapters': config.DUPLICATE_LOOKBACK_CHAPTERS, 'duplicate_detection_mode': config.DUPLICATE_DETECTION_MODE, } # Check if AI Hunter config was passed via environment variable ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG') if ai_hunter_config_str: try: ai_hunter_config = json.loads(ai_hunter_config_str) main_config['ai_hunter_config'] = ai_hunter_config print("🤖 AI Hunter: Loaded configuration from environment") except json.JSONDecodeError: print("⚠️ AI Hunter: Failed to parse AI_HUNTER_CONFIG from environment") # If no AI Hunter config in environment, try to load from file as fallback if 'ai_hunter_config' not in main_config: # Try multiple locations for config.json config_paths = [ os.path.join(os.getcwd(), 'config.json'), os.path.join(out, '..', 'config.json'), ] if getattr(sys, 'frozen', False): config_paths.append(os.path.join(os.path.dirname(sys.executable), 'config.json')) else: script_dir = os.path.dirname(os.path.abspath(__file__)) config_paths.extend([ os.path.join(script_dir, 'config.json'), os.path.join(os.path.dirname(script_dir), 'config.json') ]) for config_path in config_paths: if os.path.exists(config_path): try: with open(config_path, 'r', encoding='utf-8') as f: file_config = json.load(f) if 'ai_hunter_config' in file_config: main_config['ai_hunter_config'] = file_config['ai_hunter_config'] print(f"🤖 AI Hunter: Loaded configuration from {config_path}") break except Exception as e: print(f"⚠️ Failed to load config from {config_path}: {e}") # Always create and inject the improved AI Hunter when ai-hunter mode is selected ai_hunter = ImprovedAIHunterDetection(main_config) # The TranslationProcessor class has a method that checks for duplicates # We need to replace it with our enhanced AI Hunter # Create a wrapper to match the expected signature def enhanced_duplicate_check(self, result, idx, prog, out, actual_num=None): # If actual_num is not provided, try to get it from progress if actual_num is None: # Look for the chapter being processed for ch_key, ch_info in prog.get("chapters", {}).items(): if ch_info.get("chapter_idx") == idx: actual_num = ch_info.get("actual_num", idx + 1) break # Fallback to idx+1 if not found if actual_num is None: actual_num = idx + 1 return ai_hunter.detect_duplicate_ai_hunter_enhanced(result, idx, prog, out, actual_num) # Bind the enhanced method to the processor instance translation_processor.check_duplicate_content = enhanced_duplicate_check.__get__(translation_processor, TranslationProcessor) print("🤖 AI Hunter: Using enhanced detection with configurable thresholds") # First pass: set actual chapter numbers respecting the config for idx, c in enumerate(chapters): raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config) #print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}") # Apply offset if configured offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0 raw_num += offset if config.DISABLE_ZERO_DETECTION: # Use raw numbers without adjustment c['actual_chapter_num'] = raw_num c['raw_chapter_num'] = raw_num c['zero_adjusted'] = False else: # Store raw number c['raw_chapter_num'] = raw_num # Apply 0-based adjustment if detected if uses_zero_based: c['actual_chapter_num'] = raw_num + 1 c['zero_adjusted'] = True else: c['actual_chapter_num'] = raw_num c['zero_adjusted'] = False # Request merging preprocessing merge_groups = {} # Maps parent_idx -> list of child (idx, chapter) tuples merged_children = set() # Set of idx that are merged into another chapter # Request merging for EPUB/PDF (non-text) in non-batch mode if config.REQUEST_MERGING_ENABLED and config.REQUEST_MERGE_COUNT > 1 and (is_pdf_file or not is_text_file): print(f"\n🔗 REQUEST MERGING ENABLED: Combining up to {config.REQUEST_MERGE_COUNT} chapters per request") # Collect chapters that need translation chapters_needing_translation = [] for idx, c in enumerate(chapters): if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Skip special files (chapter 0) if translation is disabled raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)) if not translate_special and raw_num == 0: name = c.get('original_basename') or os.path.basename(c.get('filename', '')) name_noext = os.path.splitext(name)[0] if name else '' has_digits_in_name = bool(re.search(r'\d', name_noext)) if not has_digits_in_name: continue if start is not None and not (start <= actual_num <= end): continue needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, c ) # Check file exists if not needs_translation and existing_file: file_path = os.path.join(out, existing_file) if not os.path.exists(file_path): needs_translation = True # Skip empty/image-only chapters from merging has_images = c.get('has_images', False) has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) text_size = c.get('file_size', 0) is_image_link_only = ContentProcessor.is_only_image_links(c["body"]) is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only)) is_image_only_chapter = (has_images and not has_meaningful_text) if needs_translation and not is_empty_chapter and not is_image_only_chapter: chapters_needing_translation.append((idx, c, actual_num, content_hash)) # Create merge groups groups = RequestMerger.create_merge_groups( chapters_needing_translation, config.REQUEST_MERGE_COUNT ) # Build proximity runs first (so we never merge far-apart chapters), # then pack each run under the token budget (repacking avoids 2+1,2+1 patterns). max_output_tokens = config.get_effective_output_limit() safety_margin_output = 500 compression_factor = getattr(config, 'COMPRESSION_FACTOR', 1.0) or 1.0 available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) available_tokens = max(available_tokens, 1000) proximity_runs = RequestMerger.create_merge_groups( chapters_needing_translation, max(1, len(chapters_needing_translation)), ) groups = [] for run in proximity_runs: if len(run) <= 1: groups.append(run) continue i = 0 while i < len(run): group = [run[i]] i += 1 while i < len(run) and len(group) < config.REQUEST_MERGE_COUNT: candidate = run[i] merge_input = [ (g_actual_num, g_chapter["body"], g_chapter) for (g_idx, g_chapter, g_actual_num, g_content_hash) in (group + [candidate]) ] merged_preview = RequestMerger.merge_chapters(merge_input, log_injections=False) merged_tokens = chapter_splitter.count_tokens(merged_preview) if merged_tokens <= available_tokens: group.append(candidate) i += 1 else: break groups.append(group) # Check graceful stop before logging merge groups graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' for group in groups: if len(group) > 1: parent_idx = group[0][0] # First chapter in group is the parent parent_actual_num = group[0][2] merge_groups[parent_idx] = group # Track children to skip - but DON'T mark as merged yet # (they'll be marked as merged only after parent completes) for i, (idx, c, actual_num, content_hash) in enumerate(group): if i > 0: merged_children.add(idx) # Only log merge planning if not in graceful stop if not graceful_stop_active: child_nums = [g[2] for g in group[1:]] print(f" 📎 Chapters {parent_actual_num} + {child_nums} will be merged into one request") if not graceful_stop_active: print(f" 📊 Created {len(merge_groups)} merge groups from {len(chapters_needing_translation)} chapters") # Second pass: process chapters for idx, c in enumerate(chapters): chap_num = c["num"] # Graceful stop check: stop processing new chapters when graceful stop is active graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active or os.environ.get('GRACEFUL_STOP_COMPLETED') == '1': print("✅ Graceful stop: Stopping new chapter processing...") break # Skip if this chapter was merged into another if idx in merged_children: if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] else: actual_num = c.get('actual_chapter_num', c['num']) is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f"\n⏭️ Skipping {terminology} {actual_num} (merged into parent)") chapters_completed += 1 continue # Check if this is a pre-split text chunk with decimal number if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)): actual_num = c['num'] # Preserve the decimal for text files only else: actual_num = c.get('actual_chapter_num', c['num']) content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"]) # Skip special files (chapter 0) if translation is disabled raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)) if not translate_special and raw_num == 0: name = c.get('original_basename') or os.path.basename(c.get('filename', '')) name_noext = os.path.splitext(name)[0] if name else '' has_digits_in_name = bool(re.search(r'\d', name_noext)) if not has_digits_in_name: continue if start is not None and not (start <= actual_num <= end): # Skip silently (already summarized in earlier pass) continue needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status( idx, actual_num, content_hash, out, c # Pass the chapter object ) # Add explicit file check for supposedly completed chapters if not needs_translation and existing_file: file_path = os.path.join(out, existing_file) if not os.path.exists(file_path): print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}") needs_translation = True skip_reason = None # Update status to file_missing progress_manager.update(idx, actual_num, content_hash, None, status="file_missing", chapter_obj=c) progress_manager.save() if not needs_translation: # Track skips for summary (already printed in batch mode section above) if not hasattr(config, '_sequential_skipped_chapters'): config._sequential_skipped_chapters = [] is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" config._sequential_skipped_chapters.append((actual_num, terminology, skip_reason)) continue chapter_position = f"{chapters_completed + 1}/{chapters_to_process}" # Determine if this is a text file is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Determine file reference based on type if c.get('is_chunk', False): file_ref = f"Section_{c['num']}" else: file_ref = c.get('original_basename', f'{terminology}_{actual_num}') print(f"\n🔄 Processing #{idx+1}/{total_chapters} (Actual: {terminology} {actual_num}) ({chapter_position} to translate): {c['title']} [File: {file_ref}]") chunk_context_manager.start_chapter(chap_num, c['title']) # Initialize merge_info for this chapter (will be populated if this is a parent in a merge group) merge_info = None has_images = c.get('has_images', False) has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"]) text_size = c.get('file_size', 0) is_image_link_only = ContentProcessor.is_only_image_links(c["body"]) is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only)) is_image_only_chapter = (has_images and not has_meaningful_text) is_mixed_content = (has_images and has_meaningful_text) is_text_only = (not has_images and has_meaningful_text) if is_empty_chapter: print(f"📄 Empty chapter {actual_num} detected (preserving original content as-is)") # Create filename for empty chapter if isinstance(c['num'], float): fname = FileUtilities.create_chapter_filename(c, c['num']) else: fname = FileUtilities.create_chapter_filename(c, actual_num) # Save ORIGINAL markup for empty chapters. # In enhanced/html2text extraction modes, c['body'] can be blank (it may only contain extracted text). original_markup = ( c.get("original_html") or c.get("source_html") or c.get("raw_html") or c.get("body") or "" ) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(original_markup) # Update progress tracking progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty", chapter_obj=c) progress_manager.save() chapters_completed += 1 # CRITICAL: Skip translation! continue elif is_image_only_chapter: print(f"📸 Image-only chapter: {c.get('image_count', 0)} images") translated_html = c["body"] image_translations = {} # Step 1: Process images if image translation is enabled if image_translator and config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Translating {c.get('image_count', 0)} images...") image_translator.set_current_chapter(chap_num) translated_html, image_translations = process_chapter_images( c["body"], actual_num, image_translator, check_stop ) if image_translations: print(f"✅ Translated {len(image_translations)} images") # Step 2: Check for headers/titles that need translation from bs4 import BeautifulSoup soup = BeautifulSoup(c["body"], 'html.parser') # Look for headers headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) # If we have headers, we should translate them even in "image-only" chapters if headers and any(h.get_text(strip=True) for h in headers): print(f"📝 Found headers to translate in image-only chapter") # Create a minimal HTML with just the headers for translation headers_html = "" for header in headers: if header.get_text(strip=True): headers_html += str(header) + "\n" if headers_html: print(f"📤 Translating chapter headers...") # Send just the headers for translation header_msgs = base_msg + [{"role": "user", "content": headers_html}] # Use the standard filename fname = FileUtilities.create_chapter_filename(c, actual_num) client.set_output_filename(fname) # Simple API call for headers header_result, _ = client.send( header_msgs, temperature=config.TEMP, max_tokens=config.MAX_OUTPUT_TOKENS ) if header_result: # Clean the result header_result = re.sub(r"^```(?:html)?\s*\n?", "", header_result, count=1, flags=re.MULTILINE) header_result = re.sub(r"\n?```\s*$", "", header_result, count=1, flags=re.MULTILINE) # Parse both the translated headers and the original body soup_headers = BeautifulSoup(header_result, 'html.parser') soup_body = BeautifulSoup(translated_html, 'html.parser') # Replace headers in the body with translated versions translated_headers = soup_headers.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) original_headers = soup_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']) # Match and replace headers for orig, trans in zip(original_headers, translated_headers): if trans and trans.get_text(strip=True): orig.string = trans.get_text(strip=True) translated_html = str(soup_body) print(f"✅ Headers translated successfully") status = "completed" else: print(f"⚠️ Failed to translate headers") status = "completed_image_only" else: status = "completed_image_only" else: print(f"ℹ️ No headers found to translate") status = "completed_image_only" # Step 3: Save with correct filename fname = FileUtilities.create_chapter_filename(c, actual_num) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(translated_html) print(f"[Chapter {idx+1}/{total_chapters}] ✅ Saved image-only chapter") progress_manager.update(idx, actual_num, content_hash, fname, status=status, chapter_obj=c) progress_manager.save() chapters_completed += 1 continue else: # Set default text to translate text_to_translate = c["body"] image_translations = {} if is_mixed_content and image_translator and config.ENABLE_IMAGE_TRANSLATION: print(f"🖼️ Processing {c.get('image_count', 0)} images first...") print(f"[DEBUG] Content before image processing (first 200 chars):") print(c["body"][:200]) print(f"[DEBUG] Has h1 tags: {'

    ' in c['body']}") print(f"[DEBUG] Has h2 tags: {'

    ' in c['body']}") image_translator.set_current_chapter(chap_num) # Store the original body before processing original_body = c["body"] # Calculate original chapter tokens before modification original_chapter_tokens = chapter_splitter.count_tokens(original_body) # Process images and get body with translations body_with_images, image_translations = process_chapter_images( c["body"], actual_num, image_translator, check_stop ) if image_translations: print(f"✅ Translated {len(image_translations)} images") # Store the body with images for later merging c["body_with_images"] = body_with_images # For chapters with only images and title, we still need to translate the title # Extract clean text for translation from ORIGINAL body from bs4 import BeautifulSoup soup_clean = BeautifulSoup(original_body, 'html.parser') # Remove images from the original to get pure text for img in soup_clean.find_all('img'): img.decompose() # Set clean text for translation - use prettify() or str() on the full document c["body"] = str(soup_clean) if soup_clean.body else original_body # If there's no meaningful text content after removing images, # the text translation will just translate the title, which is correct print(f" 📝 Clean text for translation: {len(c['body'])} chars") # Update text_size to reflect actual text to translate text_size = len(c["body"]) # Recalculate the actual token count for clean text actual_text_tokens = chapter_splitter.count_tokens(c["body"]) print(f" 📊 Actual text tokens: {actual_text_tokens} (was counting {original_chapter_tokens} with images)") # IMPORTANT: use the cleaned text for downstream chunking/translation chapter_body = c["body"] # If render mode is image and there's essentially no text, skip text translation render_mode = os.getenv("PDF_RENDER_MODE", "xhtml").lower() stripped_text_len = len(soup_clean.get_text(strip=True)) if render_mode == "image" and image_translations and stripped_text_len < 20: print("🖼️ Image-rendered page with no meaningful text — skipping text translation.") fname = FileUtilities.create_chapter_filename(c, actual_num) with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(body_with_images) progress_manager.update(idx, actual_num, content_hash, fname, status="completed_image_only", chapter_obj=c) progress_manager.save() chapters_completed += 1 continue else: print(f"ℹ️ No translatable text found in images") # Keep original body if no image translations c["body"] = original_body print(f"📖 Translating text content ({text_size} characters)") # Determine output filename for tracking fname = FileUtilities.create_chapter_filename(c, actual_num) progress_manager.update(idx, actual_num, content_hash, fname, status="in_progress", chapter_obj=c) progress_manager.save() # REQUEST MERGING: If this is a parent chapter, merge content from child chapters merge_info = None # Will store info for response splitting if idx in merge_groups: group = merge_groups[idx] if len(group) > 1: print(f"\n🔗 MERGING {len(group)} chapters into single request...") # Mark all chapters in the group as in_progress for g_idx, g_chapter, g_actual_num, g_content_hash in group: if g_idx != idx: # Parent already marked above g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update(g_idx, g_actual_num, g_content_hash, g_fname, status="in_progress", chapter_obj=g_chapter) progress_manager.save() # Build merged content with separators chapters_data = [] for g_idx, g_chapter, g_actual_num, g_content_hash in group: chapters_data.append((g_actual_num, g_chapter["body"], g_chapter)) if g_idx != idx: # Don't print for parent print(f" → Including chapter {g_actual_num}") # Merge the content original_body = c["body"] # Save original for later c["body"] = RequestMerger.merge_chapters(chapters_data) # Store merge info for response splitting merge_info = { 'group': group, 'expected_chapters': [g[2] for g in group], # actual_nums 'original_body': original_body } merged_char_count = len(c["body"]) print(f" 📊 Merged content: {merged_char_count:,} characters") # Apply ignore filtering to the content before chunk splitting # IMPORTANT: Skip header removal if request merging is active, because # synthetic merge headers are critical for split-the-merge functionality batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Don't remove headers if this is a merged request if merge_info is not None: ignore_header_tags = False if (not use_title_tag or ignore_header_tags) and c["body"]: from bs4 import BeautifulSoup content_soup = BeautifulSoup(c["body"], 'html.parser') # Remove title tags if ignored if not use_title_tag: for title_tag in content_soup.find_all('title'): title_tag.decompose() # Remove header tags if ignored if ignore_header_tags: for header_tag in content_soup.find_all(['h1', 'h2', 'h3']): header_tag.decompose() c["body"] = str(content_soup) # Update the chapter body # Check if this chapter is already a chunk from text file splitting if c.get('is_chunk', False): # This is already a pre-split chunk, but still check if it needs further splitting # Calculate based on effective OUTPUT limit only max_output_tokens = config.get_effective_output_limit() safety_margin_output = 500 # CJK to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) print(f"📊 Max Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") chapter_tokens = chapter_splitter.count_tokens(c["body"]) # Get filename for content type detection (prefer source_file for PDFs) chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '') if chapter_tokens > available_tokens: # Even pre-split chunks might need further splitting chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename) print(f"📄 Section {c['num']} (pre-split from text file) needs further splitting into {len(chunks)} chunks") else: chunks = [(c["body"], 1, 1)] print(f"📄 Section {c['num']} (pre-split from text file)") else: # Normal splitting logic for non-text files # Calculate based on effective OUTPUT limit only max_output_tokens = config.get_effective_output_limit() safety_margin_output = 500 # CJK to English typically compresses to 0.7-0.9x compression_factor = config.COMPRESSION_FACTOR available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor) # Ensure minimum available_tokens = max(available_tokens, 1000) print(f"📊 Max Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})") # Get filename for content type detection (prefer source_file for PDFs) chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '') chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename) # Use consistent terminology is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f"📄 {terminology} will be processed in {len(chunks)} chunk(s)") # Recalculate tokens on the actual text to be translated actual_chapter_tokens = chapter_splitter.count_tokens(c["body"]) if len(chunks) > 1: is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (limit: {available_tokens:,} tokens per chunk)") else: is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (within limit of {available_tokens:,} tokens)") chapter_key_str = str(idx) if chapter_key_str not in progress_manager.prog["chapter_chunks"]: progress_manager.prog["chapter_chunks"][chapter_key_str] = { "total": len(chunks), "completed": [], "chunks": {} } progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) translated_chunks = [] chunk_abort = False # Flag to abort chapter processing on QA failures for chunk_idx_enumerate, (chunk_html, chunk_idx, total_chunks) in enumerate(chunks): # Apply thread delay before processing chunk (including first, when multiple chunks) if total_chunks > 1: thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) if thread_delay > 0: print(f"🧵 Chapter {actual_num}: Delaying {thread_delay}s before processing chunk {chunk_idx}/{total_chunks}") # Interruptible sleep - check stop flag every 0.1s # But respect WAIT_FOR_CHUNKS setting during graceful stop elapsed = 0 check_interval = 0.1 chunk_delay_interrupted = False while elapsed < thread_delay: # Read env vars INSIDE loop to catch stop pressed mid-delay graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' if check_stop(): if graceful_stop_active and wait_for_chunks: pass # Continue processing all chunks elif graceful_stop_active: # Graceful stop without wait_for_chunks: break to save partial print(f"⏳ Graceful stop — saving completed chunks...") chunk_delay_interrupted = True break else: # No graceful stop: return immediately print(f"🛑 Chunk delay interrupted") return sleep_chunk = min(check_interval, thread_delay - elapsed) time.sleep(sleep_chunk) elapsed += sleep_chunk if chunk_delay_interrupted: break # Exit the chunk loop to save partial results chapter_key_str = content_hash old_key_str = str(idx) if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}): progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str] del progress_manager.prog["chapter_chunks"][old_key_str] #print(f"[PROGRESS] Migrated chunks for chapter {chap_num} to new tracking system") if chapter_key_str not in progress_manager.prog["chapter_chunks"]: progress_manager.prog["chapter_chunks"][chapter_key_str] = { "total": len(chunks), "completed": [], "chunks": {} } progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks) # Get chapter status to check for qa_failed chapter_info = progress_manager.prog["chapters"].get(chapter_key_str, {}) chapter_status = chapter_info.get("status") if chapter_status == "qa_failed": # Force retranslation of qa_failed chapters print(f" [RETRY] Chunk {chunk_idx}/{total_chunks} - retranslating due to QA failure") # Check stop - but if graceful stop + wait_for_chunks is enabled, skip this check # to allow all chunks of the current chapter to complete graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' # Check if stop was requested (use stop_callback directly to avoid premature logging) stop_requested = (stop_callback and stop_callback()) or is_stop_requested() if stop_requested: if graceful_stop_active and wait_for_chunks and total_chunks > 1: # Don't stop yet - let chunks complete print(f"⏳ Graceful stop — waiting for remaining chunks ({chunk_idx}/{total_chunks}) of chapter {actual_num}...") elif graceful_stop_active and total_chunks > 1 and len(translated_chunks) > 0: # Graceful stop without wait_for_chunks, but we have some chunks: save partial print(f"⏳ Graceful stop — saving {len(translated_chunks)} completed chunk(s), skipping remaining...") break # Exit chunk loop to save partial results elif graceful_stop_active and total_chunks == 1: # Single chunk chapter with graceful stop - already completed, continue to save pass else: # No graceful stop - actually stop immediately log_stop_once() print(f"❌ Translation stopped during chapter {actual_num}, chunk {chunk_idx}") # Mark any in_progress chapter(s) as failed so the UI reflects the stop if merge_info is not None: for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="failed", chapter_obj=g_chapter, ) progress_manager.save() else: fname = FileUtilities.create_chapter_filename(c, actual_num) progress_manager.update( idx, actual_num, content_hash, fname, status="failed", chapter_obj=c, ) progress_manager.save() return current_chunk_number += 1 progress_percent = (current_chunk_number / total_chunks_needed) * 100 if total_chunks_needed > 0 else 0 if chunks_completed > 0: elapsed_time = time.time() - translation_start_time avg_time_per_chunk = elapsed_time / chunks_completed remaining_chunks = total_chunks_needed - current_chunk_number + 1 eta_seconds = remaining_chunks * avg_time_per_chunk eta_hours = int(eta_seconds // 3600) eta_minutes = int((eta_seconds % 3600) // 60) eta_str = f"{eta_hours}h {eta_minutes}m" if eta_hours > 0 else f"{eta_minutes}m" else: eta_str = "calculating..." # For logging, strip data URIs so inline images don't explode char counts display_len = len(re.sub(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+', 'data:image;base64,', chunk_html)) if total_chunks > 1: print(f" 🔄 Translating chunk {chunk_idx}/{total_chunks} for #{idx+1} (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str})") print(f" ⏳ Chunk size: {display_len:,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)") else: # Determine terminology and file reference is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" # Consistent file reference if c.get('is_chunk', False): file_ref = f"Section_{c['num']}" else: file_ref = c.get('original_basename', f'{terminology}_{actual_num}') chunk_tokens = chapter_splitter.count_tokens(chunk_html) print(f" 📄 {terminology} {actual_num} [{display_len:,} chars, {chunk_tokens:,} tokens]") print(f" ℹ️ This may take 30-60 seconds. Stop will take effect after completion.") if log_callback: if hasattr(log_callback, '__self__') and hasattr(log_callback.__self__, 'append_chunk_progress'): if total_chunks == 1: # Determine terminology based on source type is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" log_callback.__self__.append_chunk_progress( 1, 1, "text", f"{terminology} {actual_num}", overall_current=current_chunk_number, overall_total=total_chunks_needed, extra_info=f"{display_len:,} chars" ) else: log_callback.__self__.append_chunk_progress( chunk_idx, total_chunks, "text", f"{terminology} {actual_num}", overall_current=current_chunk_number, overall_total=total_chunks_needed ) else: # Determine terminology based on source type is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False) terminology = "Section" if is_text_source else "Chapter" terminology_lower = "section" if is_text_source else "chapter" if total_chunks == 1: log_callback(f"📄 Processing {terminology} {actual_num} ({chapters_completed + 1}/{chapters_to_process}) - {progress_percent:.1f}% complete") else: log_callback(f"📄 processing chunk {chunk_idx}/{total_chunks} for {terminology_lower} {actual_num} - {progress_percent:.1f}% complete") # Get custom chunk prompt template from environment; send as a separate assistant message chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]") chunk_prompt_msg = [] if total_chunks > 1: chunk_prompt_msg = [{ "role": "assistant", "content": chunk_prompt_template.format( chunk_idx=chunk_idx, total_chunks=total_chunks, chunk_html="" # Provide empty string for backward compatibility ) }] user_prompt = chunk_html if config.CONTEXTUAL: history = history_manager.load_history() trimmed = history[-config.HIST_LIMIT*2:] chunk_context = chunk_context_manager.get_context_messages(limit=2) include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1" model_name = getattr(config, 'MODEL', '').lower() is_gemini_3 = ('gemini-3' in model_name) or ('gemini-exp-' in model_name) memory_msgs = [] if is_gemini_3: # Pass-through for Gemini 3 (raw objects preserved) for h in trimmed: if not isinstance(h, dict): continue role = h.get('role', 'user') raw_obj = h.get('_raw_content_object') content = h.get('content') or "" if (not content) and raw_obj: content = extract_text_from_raw_content(raw_obj) if role == 'user' and not include_source: continue if (not content) and raw_obj is None: continue msg = {'role': role} if content: msg['content'] = content if raw_obj is not None: msg['_raw_content_object'] = raw_obj memory_msgs.append(msg) else: # Prefix+content+footer for non-Gemini models memory_blocks = [] for h in trimmed: if not isinstance(h, dict): continue role = h.get('role', 'user') content = h.get('content', '') if not content: continue if role == 'user' and not include_source: continue if role == 'user': prefix = ( "[MEMORY - PREVIOUS SOURCE TEXT]\\n" "This is prior source content provided for context only.\\n" "Do NOT translate or repeat this text directly in your response.\\n\\n" ) else: prefix = ( "[MEMORY - PREVIOUS TRANSLATION]\\n" "This is prior translated content provided for context only.\\n" "Do NOT repeat or re-output this translation.\\n\\n" ) footer = "\\n\\n[END MEMORY BLOCK]\\n" memory_blocks.append(prefix + content + footer) if memory_blocks: combined_memory = "\\n".join(memory_blocks) memory_msgs = [{'role': 'assistant', 'content': combined_memory}] else: memory_msgs = [] else: history = [] # Set empty history when not contextual trimmed = [] chunk_context = [] memory_msgs = [] # Build the current system prompt from the original each time. # Apply per-chunk glossary compression if enabled # Use get_system_prompt() with actual merge count to conditionally include split marker instruction actual_merge_count = len(merge_info['group']) if merge_info else 1 base_prompt = config.get_system_prompt(actual_merge_count=actual_merge_count) if os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1" and glossary_path and os.path.exists(glossary_path): # Rebuild system prompt with compressed glossary for THIS SPECIFIC CHUNK current_system_content = build_system_prompt(base_prompt, glossary_path, source_text=chunk_html) else: # Use base prompt with glossary from original_system_prompt but without stale split marker current_system_content = build_system_prompt(base_prompt, glossary_path, source_text=None) current_base = [{"role": "system", "content": current_system_content}] # Inject rolling_summary.txt verbatim as an assistant message. # IMPORTANT: Do NOT parse, re-header, or otherwise modify rolling_summary.txt here. summary_msgs_list = [] if config.USE_ROLLING_SUMMARY: rolling_summary_text = "" try: summary_file = os.path.join(out, "rolling_summary.txt") if os.path.exists(summary_file): with open(summary_file, "r", encoding="utf-8") as sf: rolling_summary_text = (sf.read() or "") except Exception: rolling_summary_text = "" # Only inject if the file has content if isinstance(rolling_summary_text, str) and rolling_summary_text: summary_content = ( "CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n" "[MEMORY] Previous context summary:\n\n" + rolling_summary_text + "\n\n" "[END MEMORY]\n" "END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:" ) summary_msgs_list = [{"role": "assistant", "content": summary_content}] # Build optional assistant prefill message if configured assistant_prefill_msgs = [] if getattr(config, 'ASSISTANT_PROMPT', '') and config.ASSISTANT_PROMPT.strip(): assistant_prefill_msgs = [{"role": "assistant", "content": config.ASSISTANT_PROMPT.strip()}] # Build final message list for this chunk msgs = current_base + summary_msgs_list + chunk_context + memory_msgs + chunk_prompt_msg + assistant_prefill_msgs + [{"role": "user", "content": user_prompt}] c['__index'] = idx c['__progress'] = progress_manager.prog c['history_manager'] = history_manager # Prepare merge_group_len and merged_chapters if this is a merged request merge_group_len = len(merge_info['group']) if merge_info else None merged_chapters = merge_info['expected_chapters'] if merge_info else None result, finish_reason, raw_obj = translation_processor.translate_with_retry( msgs, chunk_html, c, chunk_idx, total_chunks, merge_group_len=merge_group_len, merged_chapters=merged_chapters ) # If this chunk was blocked/prohibited, stop remaining chunks and mark QA fail if finish_reason in ("content_filter", "prohibited_content", "error"): fname = FileUtilities.create_chapter_filename(c, actual_num) save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(config, 'save_prohibited_results', False)) if save_prohibited_results: # Do NOT preserve original; save AI output if any, otherwise empty try: with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(result if isinstance(result, str) else "") except Exception: pass progress_manager.update( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["PROHIBITED_CONTENT"], chapter_obj=c ) progress_manager.save() print(f"❌ Chunk {chunk_idx}/{total_chunks} hit content filter/prohibited; aborting chapter {actual_num}") chunk_abort = True break # Handle graceful-stop skipped chunks if finish_reason == "graceful_stop": fname = FileUtilities.create_chapter_filename(c, actual_num) save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(config, 'save_partial_results', False)) if save_partial_results: # If we have a truncated partial response, save it and mark TRUNCATED partial_content = None try: tls = translation_processor.client._get_thread_local_client() partial_content = getattr(tls, '_last_truncated_content', None) except Exception: partial_content = getattr(translation_processor.client, '_last_truncated_content', None) if isinstance(partial_content, str) and partial_content: try: with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(partial_content) except Exception: pass progress_manager.update( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TRUNCATED"], chapter_obj=c ) progress_manager.save() print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — saved truncated output") else: progress_manager.update( idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["PARTIAL"], chapter_obj=c ) progress_manager.save() print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — marked QA failed (PARTIAL)") else: progress_manager.update(idx, actual_num, content_hash, fname, status="pending") progress_manager.save() print(f"⏸️ Chapter {actual_num} skipped (graceful stop)") chunk_abort = True break # Check if result is None or contains failure markers # Only check for failure markers if response is short (< 50 chars) # Longer responses are likely legitimate translations even if they contain error keywords is_failed = result is None or (len(str(result).strip()) < 50 and is_qa_failed_response(result)) if is_failed: fname = FileUtilities.create_chapter_filename(c, actual_num) # Check if it's a timeout failure if result == "[TIMEOUT]" or finish_reason == "timeout": progress_manager.update(idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TIMEOUT"], chapter_obj=c) print(f"❌ Chunk {chunk_idx}/{total_chunks} timed out; aborting chapter {actual_num}") chunk_abort = True else: progress_manager.update(idx, actual_num, content_hash, fname, status="failed") print(f"❌ Translation failed for chapter {actual_num} - marked as failed, aborting chapter") chunk_abort = True progress_manager.save() break # ENHANCED TRUNCATION CHECK: Compare input vs output character counts # Skip this check if base64 images are present (they skew the character count) has_base64_image = 'data:image' in chunk_html or 'base64,' in chunk_html # Check if this result came from a fallback key used_fallback = hasattr(translation_processor.client, '_used_fallback_key') and translation_processor.client._used_fallback_key # Check if we're already in a nested truncation retry (prevents infinite loops) already_in_retry = c.get('__in_truncation_retry', False) char_ratio_retry_count = c.get('__char_ratio_retry_count', 0) # Char-ratio truncation settings (silent truncation detector) char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1" try: char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50")) except Exception: char_ratio_threshold_pct = 50.0 try: char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1")) except Exception: char_ratio_retry_limit = 1 try: char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100")) except Exception: char_ratio_min_output_chars = 100 # Sanitize if char_ratio_threshold_pct < 0: char_ratio_threshold_pct = 0.0 if char_ratio_threshold_pct > 100: char_ratio_threshold_pct = 100.0 char_ratio_threshold = char_ratio_threshold_pct / 100.0 if char_ratio_retry_limit < 1: char_ratio_retry_limit = 1 if char_ratio_min_output_chars < 0: char_ratio_min_output_chars = 0 # Char-ratio retry loop while char_ratio_enabled and not has_base64_image and not already_in_retry: # Check for stop signal before each retry if os.environ.get('GRACEFUL_STOP') != '1' and check_stop(): print("❌ Char-ratio retry stopped by user") break input_char_count = len(chunk_html) output_char_count = len(result) char_ratio = output_char_count / input_char_count if input_char_count > 0 else 0 # If output is much shorter than input, likely silently truncated if char_ratio < char_ratio_threshold and output_char_count > char_ratio_min_output_chars: # Only check if output has substance if used_fallback: # For fallback keys, just warn - don't retry (would go back to refusing model) print(f" ⚠️ Truncated output from fallback key - accepting as-is") break else: # Override finish_reason to trigger retry logic WITHIN translate_with_retry # This will be caught by the internal retry loop if RETRY_TRUNCATED is enabled if finish_reason != "length" and finish_reason != "max_tokens" and finish_reason not in ["content_filter", "prohibited_content"]: retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1" if not retry_truncated_enabled: break # Check if we've hit the retry limit if char_ratio_retry_count >= char_ratio_retry_limit: # All retries exhausted - mark as QA_failed with TRUNCATED print(f" ❌ All char-ratio retries ({char_ratio_retry_limit}) exhausted for Chapter {actual_num} Chunk {chunk_idx}/{total_chunks} - marking as QA_failed") fname = FileUtilities.create_chapter_filename(c, actual_num) save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(config, 'save_partial_results', False)) if save_partial_results: try: with open(os.path.join(out, fname), 'w', encoding='utf-8') as f: f.write(result if isinstance(result, str) else "") except Exception: pass progress_manager.update(idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TRUNCATED"], chapter_obj=c) progress_manager.save() # Set flag to skip further processing of this chapter chunk_abort = True break # Log truncation detection on first attempt if char_ratio_retry_count == 0: print( f" ⚠️ TRUNCATION DETECTED (char comparison) Chapter {actual_num} Chunk {chunk_idx}/{total_chunks}: " f"Input={input_char_count:,} chars, Output={output_char_count:,} chars ({char_ratio:.1%} ratio, threshold={char_ratio_threshold:.0%}) " f"- {char_ratio_retry_limit} retry attempt(s) available" ) char_ratio_retry_count += 1 c['__char_ratio_retry_count'] = char_ratio_retry_count print(f" 🔄 Character ratio retry attempt {char_ratio_retry_count}/{char_ratio_retry_limit} [Chapter {actual_num} Chunk {chunk_idx}/{total_chunks}]") # Set flag to prevent nested retries at BOTH levels c['__in_truncation_retry'] = True # CRITICAL: Set thread-local flag to prevent unified_api_client from doing its own truncation retries if hasattr(translation_processor.client, '_get_thread_local_client'): tls = translation_processor.client._get_thread_local_client() tls._in_truncation_retry = True original_max = config.MAX_OUTPUT_TOKENS target_tokens = config.MAX_RETRY_TOKENS if config.MAX_RETRY_TOKENS > 0 else original_max config.MAX_OUTPUT_TOKENS = max(original_max, target_tokens) result_retry, finish_reason_retry, raw_obj_retry = translation_processor.translate_with_retry( msgs, chunk_html, c, chunk_idx, total_chunks ) # Clear retry flags and restore original token limit c.pop('__in_truncation_retry', None) if hasattr(translation_processor.client, '_get_thread_local_client'): tls = translation_processor.client._get_thread_local_client() tls._in_truncation_retry = False config.MAX_OUTPUT_TOKENS = original_max # Check if retry improved the output retry_output_count = len(result_retry) if result_retry else 0 if result_retry and retry_output_count > output_char_count: print(f" ✅ Char-ratio retry succeeded: {output_char_count:,} → {retry_output_count:,} chars") result = result_retry finish_reason = finish_reason_retry raw_obj = raw_obj_retry # Don't break - check if this new result is STILL truncated # Loop will continue and check char_ratio again else: print(f" ⚠️ Char-ratio retry did not improve output ({output_char_count} chars)") # Continue to next retry attempt else: # finish_reason is already 'length' - unified_api_client already retried break else: # Not truncated - exit loop break # If truncation retries were exhausted, skip further processing if chunk_abort: break if config.REMOVE_AI_ARTIFACTS: result = ContentProcessor.clean_ai_artifacts(result, True) if config.EMERGENCY_RESTORE: result = ContentProcessor.emergency_restore_paragraphs(result, chunk_html) if config.REMOVE_AI_ARTIFACTS: lines = result.split('\n') json_line_count = 0 for i, line in enumerate(lines[:5]): if line.strip() and any(pattern in line for pattern in [ '"role":', '"content":', '"messages":', '{"role"', '{"content"', '[{', '}]' ]): json_line_count = i + 1 else: break if json_line_count > 0 and json_line_count < len(lines): remaining = '\n'.join(lines[json_line_count:]) if remaining.strip() and len(remaining) > 100: result = remaining print(f"✂️ Removed {json_line_count} lines of JSON artifacts") result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE) translated_chunks.append((result, chunk_idx, total_chunks)) chunk_context_manager.add_chunk(user_prompt, result, chunk_idx, total_chunks) progress_manager.prog["chapter_chunks"][chapter_key_str]["completed"].append(chunk_idx) progress_manager.prog["chapter_chunks"][chapter_key_str]["chunks"][str(chunk_idx)] = result progress_manager.save() chunks_completed += 1 will_reset = history_manager.will_reset_on_next_append( config.HIST_LIMIT if config.CONTEXTUAL else 0, config.TRANSLATION_HISTORY_ROLLING ) # Check if we captured thought signatures if raw_obj: # print("🧠 Captured thought signature for history") pass # Add microsecond delay before history append to prevent race conditions time.sleep(0.000001) # 1 microsecond delay history = history_manager.append_to_history( user_prompt, result, config.HIST_LIMIT if config.CONTEXTUAL else 0, reset_on_limit=True, rolling_window=config.TRANSLATION_HISTORY_ROLLING, raw_assistant_object=raw_obj ) if chunk_idx < total_chunks: # Handle float delays while checking for stop full_seconds = int(config.DELAY) fractional_second = config.DELAY - full_seconds # Check stop signal every second for full seconds # During graceful stop, skip these checks to complete all chunks for i in range(full_seconds): if os.environ.get('GRACEFUL_STOP') != '1' and check_stop(): print("❌ Translation stopped during delay") # Mark any in_progress chapter(s) as failed so the UI reflects the stop if merge_info is not None: for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="failed", chapter_obj=g_chapter, ) progress_manager.save() else: fname = FileUtilities.create_chapter_filename(c, actual_num) progress_manager.update( idx, actual_num, content_hash, fname, status="failed", chapter_obj=c, ) progress_manager.save() return time.sleep(1) # Handle the fractional part if any if fractional_second > 0: if os.environ.get('GRACEFUL_STOP') != '1' and check_stop(): print("❌ Translation stopped during delay") # Mark any in_progress chapter(s) as failed so the UI reflects the stop if merge_info is not None: for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="failed", chapter_obj=g_chapter, ) progress_manager.save() else: fname = FileUtilities.create_chapter_filename(c, actual_num) progress_manager.update( idx, actual_num, content_hash, fname, status="failed", chapter_obj=c, ) progress_manager.save() return time.sleep(fractional_second) # During graceful stop, skip this check to save the completed API response if os.environ.get('GRACEFUL_STOP') != '1' and check_stop(): print(f"❌ Translation stopped before saving chapter {actual_num}") # Mark any in_progress chapter(s) as failed so the UI reflects the stop if merge_info is not None: for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="failed", chapter_obj=g_chapter, ) progress_manager.save() else: fname = FileUtilities.create_chapter_filename(c, actual_num) progress_manager.update( idx, actual_num, content_hash, fname, status="failed", chapter_obj=c, ) progress_manager.save() return # Check for partial results (graceful stop during multi-chunk processing) is_partial_result = False expected_total = len(chunks) if 'chunks' in dir() else 1 if len(translated_chunks) < expected_total and len(translated_chunks) > 0: graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' if graceful_stop_active: print(f"⚠️ Chapter {actual_num}: partial translation ({len(translated_chunks)}/{expected_total} chunks) due to graceful stop") is_partial_result = True if len(translated_chunks) > 1: print(f" 📎 Merging {len(translated_chunks)} chunks...") translated_chunks.sort(key=lambda x: x[1]) merged_result = chapter_splitter.merge_translated_chunks(translated_chunks) else: merged_result = translated_chunks[0][0] if translated_chunks else "" if config.CONTEXTUAL and len(translated_chunks) > 1: user_summary, assistant_summary = chunk_context_manager.get_summary_for_history() if user_summary and assistant_summary: # Add microsecond delay before summary append time.sleep(0.000001) # 1 microsecond delay history_manager.append_to_history( user_summary, assistant_summary, config.HIST_LIMIT, reset_on_limit=False, rolling_window=config.TRANSLATION_HISTORY_ROLLING ) print(f" 📝 Added chapter summary to history") chunk_context_manager.clear() # For text file chunks, ensure we pass the decimal number if is_text_file and c.get('is_chunk', False) and isinstance(c.get('num'), float): fname = FileUtilities.create_chapter_filename(c, c['num']) # Use the decimal num directly print(f"[DEBUG] Text file chunk - using decimal num {c['num']} -> filename: {fname}") else: fname = FileUtilities.create_chapter_filename(c, actual_num) if is_text_file: print(f"[DEBUG] Text file - using actual_num {actual_num} -> filename: {fname}") client.set_output_filename(fname) cleaned = re.sub(r"^```(?:html)?\s*\n?", "", merged_result, count=1, flags=re.MULTILINE) cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE) cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=config.REMOVE_AI_ARTIFACTS) # Post-process: Remove duplicate H1+P pairs from translated OUTPUT if enabled remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1' if remove_duplicate_h1_p and cleaned: # First: HTML-based duplicate removal from bs4 import BeautifulSoup output_soup = BeautifulSoup(cleaned, 'html.parser') for h1_tag in output_soup.find_all('h1'): h1_id = h1_tag.get('id', '') if h1_id and h1_id.startswith('split-'): continue h1_text = h1_tag.get_text(strip=True) if 'SPLIT MARKER' in h1_text: continue # Check next sibling (P after H1) next_sibling = h1_tag.find_next_sibling() if next_sibling and next_sibling.name == 'p': if h1_text == next_sibling.get_text(strip=True): next_sibling.decompose() continue # Check previous sibling (P before H1) prev_sibling = h1_tag.find_previous_sibling() if prev_sibling and prev_sibling.name == 'p': if h1_text == prev_sibling.get_text(strip=True): prev_sibling.decompose() cleaned = str(output_soup) # Second: Markdown-based duplicate removal (for enhanced extraction mode) # Pattern: "Title Text\n\n# Title Text" - remove the plain text line before markdown header def remove_markdown_duplicate_headers(text): lines = text.split('\n') result = [] i = 0 while i < len(lines): line = lines[i] # Check if this is a non-empty line followed by blank lines and then a markdown header if line.strip() and not line.strip().startswith('#'): # Look ahead for pattern: [blank lines] [# header with same text] j = i + 1 # Skip blank lines while j < len(lines) and not lines[j].strip(): j += 1 # Check if next non-blank line is a markdown header if j < len(lines): next_line = lines[j] header_match = re.match(r'^(#{1,6})\s+(.+)$', next_line) if header_match: header_text = header_match.group(2).strip() # Compare with current line (stripped) if line.strip() == header_text: # Skip this duplicate line, keep blanks and header i += 1 continue result.append(line) i += 1 return '\n'.join(result) cleaned = remove_markdown_duplicate_headers(cleaned) # If the cleaned translation is empty/whitespace, treat as failure and skip file write if not cleaned or not str(cleaned).strip(): print(f"❌ Translation empty for chapter {actual_num} — skipping file write") chapter_key = progress_manager._get_chapter_key(actual_num, FileUtilities.create_chapter_filename(c, actual_num), c, content_hash) existing = progress_manager.prog.get("chapters", {}).get(chapter_key, {}) # If already qa_failed (e.g., prohibited content), keep that; otherwise mark qa_failed with EMPTY_OUTPUT new_status = existing.get("status") if existing.get("status") == "qa_failed" else "qa_failed" qa_issues = existing.get("qa_issues_found") or [] if "EMPTY_OUTPUT" not in qa_issues: qa_issues = qa_issues + ["EMPTY_OUTPUT"] progress_manager.update( idx, actual_num, content_hash, FileUtilities.create_chapter_filename(c, actual_num), status=new_status, qa_issues_found=qa_issues, chapter_obj=c, ) progress_manager.save() # Move to next chapter without writing a file continue if is_mixed_content and image_translations: print(f"🔀 Merging {len(image_translations)} image translations with text...") from bs4 import BeautifulSoup # Parse the translated text (which has the translated title/header) soup_translated = BeautifulSoup(cleaned, 'html.parser') # For each image translation, insert it into the document for img_path, translation_html in image_translations.items(): if translation_html and ']*id="split-\d+"[^>]*>.*?

    \s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) with open(os.path.join(out, parent_fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save) except Exception: pass # Mark ALL chapters in the merge group as qa_failed using # their own expected filenames so we overwrite existing # in_progress entries instead of creating composite keys. for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="qa_failed", chapter_obj=g_chapter, ) progress_manager.save() print(f" ⚠️ Merged group marked as qa_failed") continue # Check if Split the Merge is enabled split_the_merge = os.getenv('SPLIT_THE_MERGE', '0') == '1' disable_fallback = os.getenv('DISABLE_MERGE_FALLBACK', '0') == '1' split_sections = None if split_the_merge and len(merge_info['group']) > 1: # Try to split by invisible markers split_sections = RequestMerger.split_by_markers(cleaned, len(merge_info['group'])) # If disable fallback is enabled and split failed, mark as qa_failed if split_the_merge and disable_fallback and (not split_sections or len(split_sections) != len(merge_info['group'])): print(f" ⚠️ Split failed and fallback disabled - marking merged group as qa_failed") # Update watchdog: Record this as a "split_failed" event before the request technically finishes # Using the first request ID from the group if possible, though this runs post-request. # Since the original request is finished by now, we can't update its watchdog state directly. # But we can log it clearly. # Only save file for debugging if it contains meaningful content beyond error markers cleaned_stripped = cleaned.strip() is_only_error_marker = cleaned_stripped in [ "[TRANSLATION FAILED]", "[Content Blocked]", "[IMAGE TRANSLATION FAILED]", "[EXTRACTION FAILED]", "[RATE LIMITED]", "[]" ] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]") if not is_only_error_marker: parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) try: cleaned_to_save = cleaned if split_the_merge: cleaned_to_save = re.sub( r']*id="split-\d+"[^>]*>.*?\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) with open(os.path.join(out, parent_fname), 'w', encoding='utf-8') as f: f.write(cleaned_to_save) except Exception: pass # Mark ALL chapters in the merge group as qa_failed for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) progress_manager.update( g_idx, g_actual_num, g_content_hash, g_fname, status="qa_failed", chapter_obj=g_chapter, qa_issues_found=["SPLIT_FAILED"], ) progress_manager.save() print(f" ⚠️ Merged group ({len(merge_info['group'])} chapters) marked as qa_failed with SPLIT_FAILED") continue if split_sections and len(split_sections) == len(merge_info['group']): # Split successful - save each section as individual file print(f" ✂️ Splitting merged content into {len(split_sections)} individual files") saved_files = [] for i, (g_idx, g_chapter, g_actual_num, g_content_hash) in enumerate(merge_info['group']): section_content = split_sections[i] # Generate filename for this chapter using content.opf naming split_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num) # Handle text file mode if is_text_file: split_fname = split_fname.replace('.html', '.txt') from bs4 import BeautifulSoup soup = BeautifulSoup(section_content, 'html.parser') section_content = soup.get_text(strip=True) # Save the section split_output_path = os.path.join(out, split_fname) with open(split_output_path, 'w', encoding='utf-8') as f: f.write(section_content) # Verify file was written successfully if os.path.exists(split_output_path): saved_files.append((g_idx, g_chapter, g_actual_num, g_content_hash, split_fname)) print(f" 💾 Saved Chapter {g_actual_num}: {split_fname} ({len(section_content)} chars)") else: print(f" ⚠️ ERROR: Failed to write file {split_fname} - file does not exist after write") # Mark all chapters as completed or qa_failed (for truncated) for g_idx, g_chapter, g_actual_num, g_content_hash, split_fname in saved_files: chapter_status = "qa_failed" if was_truncated else "completed" qa_issues = ["TRUNCATED"] if was_truncated else None progress_manager.update( g_idx, g_actual_num, g_content_hash, split_fname, status=chapter_status, chapter_obj=g_chapter, qa_issues_found=qa_issues ) chapters_completed += 1 # Save once after all updates progress_manager.save() print(f" ✅ Split the Merge complete: {len(saved_files)} files created") continue # Normal merged behavior (split not enabled or header count mismatch) # Save entire merged response to parent chapter's file cleaned_to_save = cleaned if split_the_merge and len(merge_info['group']) > 1: cleaned_to_save = re.sub( r']*id="split-\d+"[^>]*>.*?\s*', '', cleaned_to_save, flags=re.IGNORECASE | re.DOTALL, ) if is_text_file and not is_pdf_file: parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num).replace('.html', '.txt') from bs4 import BeautifulSoup soup = BeautifulSoup(cleaned_to_save, 'html.parser') text_content = soup.get_text(strip=True) parent_output_path = os.path.join(out, parent_fname) with open(parent_output_path, 'w', encoding='utf-8') as f: f.write(text_content) else: parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num) parent_output_path = os.path.join(out, parent_fname) with open(parent_output_path, 'w', encoding='utf-8') as f: f.write(cleaned_to_save) # Verify file was actually written before marking as completed if not os.path.exists(parent_output_path): print(f" ⚠️ ERROR: Failed to write merged file {parent_fname} - file does not exist after write") # Mark all chapters in the group as failed since parent file wasn't written for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']: progress_manager.update(g_idx, g_actual_num, g_content_hash, None, status="failed", chapter_obj=g_chapter) progress_manager.save() continue print(f" 💾 Saved merged content to Chapter {parent_actual_num}: {parent_fname} ({len(cleaned_to_save)} chars)") if was_truncated: # For truncated merged responses, mark ALL chapters as qa_failed qa_issues = ["TRUNCATED"] progress_manager.update( parent_idx, parent_actual_num, parent_content_hash, parent_fname, status="qa_failed", chapter_obj=parent_chapter, qa_issues_found=qa_issues ) for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group'][1:]: progress_manager.update( g_idx, g_actual_num, g_content_hash, None, status="qa_failed", chapter_obj=g_chapter, qa_issues_found=qa_issues ) chapters_completed += len(merge_info['group']) # Save once after all updates progress_manager.save() print(f" ⚠️ Merged group marked as qa_failed due to truncation") else: # Normal success path: parent completed, children marked as merged progress_manager.update( parent_idx, parent_actual_num, parent_content_hash, parent_fname, status="completed", chapter_obj=parent_chapter, merged_chapters=merged_child_nums ) chapters_completed += 1 # Mark child chapters as merged (point to parent's output file) - atomically after parent for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group'][1:]: progress_manager.mark_as_merged(g_idx, g_actual_num, g_content_hash, parent_actual_num, g_chapter, parent_output_file=parent_fname) chapters_completed += 1 # Save once after all updates progress_manager.save() print(f" 📊 Saved merged content for {len(merge_info['group'])} chapters") # Skip normal save since we handled it above and exit this translation run continue # CRITICAL: Unescape img tags that were converted to HTML entities (applies to ALL HTML) # Pattern matches: <img ... /> where the tag ends with / img_count = len(re.findall(r'<img\s[^>]*?/>', cleaned, flags=re.IGNORECASE)) if img_count > 0: print(f"🖼️ Unescaping {img_count} img tag(s) from HTML entities (post-processing)") cleaned = re.sub( r'<(img\s[^>]*?/)>', r'<\1>', cleaned, flags=re.IGNORECASE ) if is_text_file and not is_pdf_file: # For text files (but NOT PDFs), save as plain text instead of HTML fname_txt = fname.replace('.html', '.txt') # Change extension to .txt # Extract text from HTML from bs4 import BeautifulSoup soup = BeautifulSoup(cleaned, 'html.parser') text_content = soup.get_text(strip=True) # Write plain text file output_path = os.path.join(out, fname_txt) with open(output_path, 'w', encoding='utf-8') as f: f.write(text_content) # Verify file was actually written before marking as completed if not os.path.exists(output_path): print(f"⚠️ ERROR: Failed to write file {fname_txt} - file does not exist after write") # Keep status as in_progress or mark as failed progress_manager.save() # Save current in_progress state continue print(f"💾 Saved text file: {fname_txt} (Chapter {actual_num})") final_title = c['title'] or make_safe_filename(c['title'], actual_num) # Don't print individual "Processed" messages - these are redundant with the main progress display if os.getenv('DEBUG_CHAPTER_SAVES', '0') == '1': print(f"[Processed {idx+1}/{total_chapters}] ✅ Saved Chapter {actual_num}: {final_title}") # Determine status based on comprehensive failure detection qa_issues = None if is_qa_failed_response(cleaned): chapter_status = "qa_failed" failure_reason = get_failure_reason(cleaned) print(f"⚠️ Chapter {actual_num} marked as qa_failed: {failure_reason}") elif finish_reason in ["length", "max_tokens"]: chapter_status = "qa_failed" qa_issues = ["TRUNCATED"] print(f"⚠️ Chapter {actual_num} marked as qa_failed: truncated (finish_reason: {finish_reason})") elif is_partial_result: chapter_status = "qa_failed" qa_issues = ["PARTIAL"] print(f"⚠️ Chapter {actual_num} marked as qa_failed: partial translation (graceful stop)") else: chapter_status = "completed" progress_manager.update(idx, actual_num, content_hash, fname_txt, status=chapter_status, chapter_obj=c, qa_issues_found=qa_issues) # Clear any stale watchdog entries for this chapter try: import unified_api_client if hasattr(unified_api_client, '_api_watchdog_clear_chapter'): unified_api_client._api_watchdog_clear_chapter(actual_num) except Exception: pass else: # For EPUB files, keep original HTML behavior output_path = os.path.join(out, fname) with open(output_path, 'w', encoding='utf-8') as f: f.write(cleaned) # Verify file was actually written before marking as completed if not os.path.exists(output_path): print(f"⚠️ ERROR: Failed to write file {fname} - file does not exist after write") # Keep status as in_progress or mark as failed progress_manager.save() # Save current in_progress state continue final_title = c['title'] or make_safe_filename(c['title'], actual_num) # Don't print individual "Processed" messages - these are redundant with the main progress display if os.getenv('DEBUG_CHAPTER_SAVES', '0') == '1': print(f"[Processed {idx+1}/{total_chapters}] ✅ Saved Chapter {actual_num}: {final_title}") # Determine status based on comprehensive failure detection qa_issues = None if is_qa_failed_response(cleaned): chapter_status = "qa_failed" failure_reason = get_failure_reason(cleaned) print(f"⚠️ Chapter {actual_num} marked as qa_failed: {failure_reason}") elif finish_reason in ["length", "max_tokens"]: chapter_status = "qa_failed" qa_issues = ["TRUNCATED"] print(f"⚠️ Chapter {actual_num} marked as qa_failed: truncated (finish_reason: {finish_reason})") elif is_partial_result: chapter_status = "qa_failed" qa_issues = ["PARTIAL"] print(f"⚠️ Chapter {actual_num} marked as qa_failed: partial translation (graceful stop)") else: chapter_status = "completed" progress_manager.update(idx, actual_num, content_hash, fname, status=chapter_status, chapter_obj=c, qa_issues_found=qa_issues) # Clear any stale watchdog entries for this chapter try: import unified_api_client if hasattr(unified_api_client, '_api_watchdog_clear_chapter'): unified_api_client._api_watchdog_clear_chapter(actual_num) except Exception: pass progress_manager.save() # After completing this chapter, check if we should stop graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1' wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1' stop_requested = (stop_callback and stop_callback()) or is_stop_requested() # Stop after saving if: partial result OR graceful stop + wait_for_chunks completed if is_partial_result: print(f"\n✅ Partial chapter {actual_num} saved. Stopping as requested (graceful stop).") log_stop_once() return if stop_requested and graceful_stop_active and wait_for_chunks: print(f"\n✅ Chapter {actual_num} completed. Stopping as requested (wait for chunks).") log_stop_once() return # After completing this chapter, produce a rolling summary and store it for the NEXT chapter if config.USE_ROLLING_SUMMARY: # Use the original system prompt to build the summary system prompt base_system_content = original_system_prompt summary_mode = str(getattr(config, 'ROLLING_SUMMARY_MODE', 'replace') or 'replace').strip().lower() def _load_previous_rolling_summary_text(*, full_file: bool = False) -> str: """Load rolling_summary.txt to use as assistant context (no parsing).""" try: summary_file = os.path.join(out, "rolling_summary.txt") if not os.path.exists(summary_file): return "" with open(summary_file, "r", encoding="utf-8") as f: content = f.read().strip() return content except Exception: return "" def _get_last_translated_outputs(n: int) -> str: """Build the user text from the last N translated chapter outputs (by completed_list).""" try: n = int(n or 0) if n <= 0: return cleaned # completed_list is saved (sorted) by ProgressManager.save() completed_list = progress_manager.prog.get("completed_list") or [] if not isinstance(completed_list, list) or not completed_list: return cleaned last_items = completed_list[-n:] blocks = [] for item in last_items: try: chap_num = item.get("num") rel_file = item.get("file") if not rel_file: continue fp = os.path.join(out, rel_file) if not os.path.exists(fp): continue with open(fp, "r", encoding="utf-8") as f: txt = f.read().strip() if not txt: continue blocks.append( f"=== Previous Translated Text: Chapter {chap_num} ===\n" f"{txt}\n" f"=== End Previous Translated Text ===" ) except Exception: continue return "\n\n".join(blocks) if blocks else cleaned except Exception: return cleaned if summary_mode == 'replace': # In replace mode, update the rolling summary using: # - assistant: previous rolling summary (from rolling_summary.txt) # - user: last N translated chapter outputs (configured by ROLLING_SUMMARY_EXCHANGES) prev_summary = _load_previous_rolling_summary_text() n = int(getattr(config, 'ROLLING_SUMMARY_EXCHANGES', 5) or 5) user_text = _get_last_translated_outputs(n) summary_text = translation_processor.generate_rolling_summary( history_manager, actual_num, base_system_content, source_text=user_text, previous_summary_text=prev_summary, previous_summary_chapter_num=None, prefer_translations_only_user=True, ) else: # append (and any unknown value): summarize ONLY this chapter's translated output. # Do NOT send the previous rolling summary in append mode. summary_text = translation_processor.generate_rolling_summary( history_manager, actual_num, base_system_content, source_text=cleaned, previous_summary_text=None, previous_summary_chapter_num=None, ) if summary_text: last_summary_block_text = summary_text last_summary_chapter_num = actual_num chapters_completed += 1 # Check if PDF should output as PDF or EPUB pdf_output_format = os.getenv('PDF_OUTPUT_FORMAT', 'pdf').lower() should_create_pdf = is_text_file or (is_pdf_file and pdf_output_format == 'pdf') if should_create_pdf: print("📄 Text file translation complete!") try: # Collect all translated chapters with their metadata translated_chapters = [] for chapter in chapters: # Look for .txt files for text files, .html for PDFs fname_base = FileUtilities.create_chapter_filename(chapter, chapter['num']) if is_pdf_file: fname_to_check = fname_base # PDFs use .html files else: fname_to_check = fname_base.replace('.html', '.txt') # Text files use .txt if os.path.exists(os.path.join(out, fname_to_check)): with open(os.path.join(out, fname_to_check), 'r', encoding='utf-8') as f: content = f.read() translated_chapters.append({ 'num': chapter['num'], 'title': chapter['title'], 'content': content, 'is_chunk': chapter.get('is_chunk', False), 'chunk_info': chapter.get('chunk_info', {}), 'filename': fname_to_check # Store filename for debugging }) elif os.path.exists(os.path.join(out, fname_base)): # Fallback to HTML if txt doesn't exist with open(os.path.join(out, fname_base), 'r', encoding='utf-8') as f: content = f.read() # For PDFs, keep HTML content; for text files, extract text if is_pdf_file: # Keep the HTML as-is for PDFs text = content else: # Extract text from HTML for text files from bs4 import BeautifulSoup soup = BeautifulSoup(content, 'html.parser') text = soup.get_text(strip=True) translated_chapters.append({ 'num': chapter['num'], 'title': chapter['title'], 'content': text, 'is_chunk': chapter.get('is_chunk', False), 'chunk_info': chapter.get('chunk_info', {}), 'filename': fname_base # Store filename for debugging }) # Sort chapters by number to ensure correct order # Handle both integer and float chapter numbers (e.g., 1.0, 1.1, etc.) translated_chapters.sort(key=lambda x: float(x['num'])) print(f"✅ Translation complete! {len(translated_chapters)} section files created:") for chapter_data in translated_chapters: print(f" • Section {chapter_data['num']}: {chapter_data['title']} (from {chapter_data.get('filename', 'unknown')})") # Create a combined file with proper section structure if input_path.lower().endswith('.pdf'): # Check if content is HTML or plain text is_html_content = any('' in chapter_data.get('content', '') or 'PDF renderers. try: from bs4 import BeautifulSoup frag = BeautifulSoup(content, 'html.parser') for tag in frag.find_all(id='page0'): tag['id'] = f'mupdf-page0-{i + 1}' content = str(frag) except Exception: pass # Always insert a page break before every combined page after the first. if i > 0: html_parts.append('
    \n') if chapter_data.get('is_chunk'): chunk_info = chapter_data.get('chunk_info', {}) original_chapter = chunk_info.get('original_chapter') chunk_idx = chunk_info.get('chunk_idx', 1) total_chunks = chunk_info.get('total_chunks', 1) if original_chapter != current_main_chapter: current_main_chapter = original_chapter html_parts.append(content) if chunk_idx < total_chunks: html_parts.append('\n') else: current_main_chapter = chapter_data['num'] html_parts.append(content) full_html_body = "".join(html_parts) # Post-process: merge paragraphs that span across pages full_html_body = _merge_split_paragraphs(full_html_body) # Post-process: merge image-only page containers into the previous page # (reduces wasted whitespace for "image-only" pages) full_html_body = _merge_image_only_pages(full_html_body) # Post-process: wrap last text block with a following image to reduce image-only pages full_html_body = _keep_text_with_following_image(full_html_body) # Replace/insert a clean Table of Contents built from h1/h2 headers full_html_body = _generate_and_replace_toc(full_html_body) # Wrap in full HTML document with CSS css_path = os.path.join(out, 'styles.css') css_link = '' if os.path.exists(css_path) else '' # Extra inline CSS for PDF-derived HTML: # - h3 is used as body text in our PDF extraction; normalize it to paragraph-like styling # - reduce margins around images # - keep-with-image wrapper helps reduce image-only PDF pages extra_css = """ """ full_html = f""" {txt_processor.file_base} - Translated {css_link} {extra_css} {full_html_body} """ # Save HTML file for reference html_path = os.path.join(out, f"{txt_processor.file_base}_translated.html") with open(html_path, 'w', encoding='utf-8') as f: f.write(full_html) print(f" • Created HTML file: {html_path}") # Convert HTML to PDF try: from pdf_extractor import create_pdf_from_html images_dir = os.path.join(out, 'images') css_arg = css_path if os.path.exists(css_path) else None images_arg = images_dir if os.path.exists(images_dir) else None # Check if images directory exists and has images has_images = False if images_arg and os.path.exists(images_arg): image_files = [f for f in os.listdir(images_arg) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp'))] has_images = len(image_files) > 0 if has_images: print(f" • Found {len(image_files)} images to include in PDF") if create_pdf_from_html(full_html, combined_path, css_path=css_arg, images_dir=images_arg): print(f" • Created translated PDF file: {combined_path}") if has_images: print(f" • PDF includes images from images folder") else: print("⚠️ Failed to create PDF from HTML, using HTML file") combined_path = html_path except Exception as e: print(f"⚠️ Error creating PDF from HTML: {e}") import traceback traceback.print_exc() print(f" • Using HTML file instead: {html_path}") combined_path = html_path else: # Plain text content - use text-based PDF creation combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.pdf") print(f"📄 Creating PDF from plain text...") # Build full text content full_text_parts = [] current_main_chapter = None # Note: translated_chapters is already sorted at this point for i, chapter_data in enumerate(translated_chapters): content = chapter_data['content'] if chapter_data.get('is_chunk'): chunk_info = chapter_data.get('chunk_info', {}) original_chapter = chunk_info.get('original_chapter') chunk_idx = chunk_info.get('chunk_idx', 1) total_chunks = chunk_info.get('total_chunks', 1) if original_chapter != current_main_chapter: current_main_chapter = original_chapter if i > 0: full_text_parts.append(f"\n\n{'='*50}\n\n") full_text_parts.append(content) if chunk_idx < total_chunks: full_text_parts.append("\n") else: current_main_chapter = chapter_data['num'] if i > 0: full_text_parts.append(f"\n\n{'='*50}\n\n") full_text_parts.append(content) full_text = "".join(full_text_parts) from pdf_extractor import create_pdf_from_text if create_pdf_from_text(full_text, combined_path): print(f" • Created translated PDF file: {combined_path}") else: print("⚠️ Failed to create PDF, falling back to text output") combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.txt") with open(combined_path, 'w', encoding='utf-8') as f: f.write(full_text) print(f" • Created fallback text file: {combined_path}") else: combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.txt") with open(combined_path, 'w', encoding='utf-8') as combined: current_main_chapter = None # Note: translated_chapters is already sorted at this point for i, chapter_data in enumerate(translated_chapters): content = chapter_data['content'] # Check if this is a chunk of a larger chapter if chapter_data.get('is_chunk'): chunk_info = chapter_data.get('chunk_info', {}) original_chapter = chunk_info.get('original_chapter') chunk_idx = chunk_info.get('chunk_idx', 1) total_chunks = chunk_info.get('total_chunks', 1) # Only add the chapter header for the first chunk if original_chapter != current_main_chapter: current_main_chapter = original_chapter # Add separator if not first chapter if i > 0: combined.write(f"\n\n{'='*50}\n\n") # Add the chunk content combined.write(content) # Add spacing between chunks of the same chapter if chunk_idx < total_chunks: combined.write("\n") else: # This is a standalone chapter current_main_chapter = chapter_data['num'] # Add separator if not first chapter if i > 0: combined.write(f"\n\n{'='*50}\n\n") # Add the content combined.write(content) print(f" • Combined file with preserved sections: {combined_path}") total_time = time.time() - translation_start_time hours = int(total_time // 3600) minutes = int((total_time % 3600) // 60) seconds = int(total_time % 60) print(f"\n⏱️ Total translation time: {hours}h {minutes}m {seconds}s") print(f"📊 Chapters completed: {chapters_completed}") print(f"✅ Text file translation complete!") if log_callback: log_callback(f"✅ Text file translation complete! Created {combined_path}") # Exit here for text files and PDFs - don't fall through to EPUB generation print("TRANSLATION_COMPLETE_SIGNAL") return except Exception as e: print(f"❌ Error creating combined text file: {e}") if log_callback: log_callback(f"❌ Error creating combined text file: {e}") print("TRANSLATION_COMPLETE_SIGNAL") return else: # Skip EPUB building if graceful stop was triggered graceful_stop_triggered = os.environ.get('GRACEFUL_STOP') == '1' or os.environ.get('GRACEFUL_STOP_COMPLETED') == '1' if graceful_stop_triggered: print("⏳ Graceful stop triggered - skipping EPUB building") print("TRANSLATION_COMPLETE_SIGNAL") return print("🔍 Checking for translated chapters...") # Respect retain extension toggle: if enabled, don't look for response_ prefix if should_retain_source_extension(): response_files = [f for f in os.listdir(out) if f.endswith('.html') and not f.startswith('chapter_')] else: response_files = [f for f in os.listdir(out) if f.startswith('response_') and f.endswith('.html')] chapter_files = [f for f in os.listdir(out) if f.startswith('chapter_') and f.endswith('.html')] if not response_files and chapter_files: if should_retain_source_extension(): print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist") print("ℹ️ Retain-source-extension mode is ON: skipping placeholder creation and using original files for EPUB compilation.") else: print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist") print("📝 Creating placeholder response files for EPUB compilation...") for chapter_file in chapter_files: response_file = chapter_file.replace('chapter_', 'response_', 1) src = os.path.join(out, chapter_file) dst = os.path.join(out, response_file) try: with open(src, 'r', encoding='utf-8') as f: content = f.read() soup = BeautifulSoup(content, 'html.parser') notice = soup.new_tag('p') notice.string = "[Note: This chapter could not be translated - showing original content]" notice['style'] = "color: red; font-style: italic;" if soup.body: soup.body.insert(0, notice) with open(dst, 'w', encoding='utf-8') as f: f.write(str(soup)) except Exception as e: print(f"⚠️ Error processing {chapter_file}: {e}") try: shutil.copy2(src, dst) except: pass print(f"✅ Created {len(chapter_files)} placeholder response files") print("⚠️ Note: The EPUB will contain untranslated content") print("📘 Building final EPUB…") try: from epub_converter import fallback_compile_epub fallback_compile_epub(out, log_callback=log_callback) print("✅ All done: your final EPUB is in", out) total_time = time.time() - translation_start_time hours = int(total_time // 3600) minutes = int((total_time % 3600) // 60) seconds = int(total_time % 60) print(f"\n📊 Translation Statistics:") print(f" • Total chunks processed: {chunks_completed}") print(f" • Total time: {hours}h {minutes}m {seconds}s") if chunks_completed > 0: avg_time = total_time / chunks_completed print(f" • Average time per chunk: {avg_time:.1f} seconds") stats = progress_manager.get_stats(out) print(f"\n📊 Progress Tracking Summary:") print(f" • Total chapters tracked: {stats['total_tracked']}") print(f" • Successfully completed: {stats['completed']}") print(f" • Missing files: {stats['missing_files']}") print(f" • In progress: {stats['in_progress']}") except Exception as e: print("❌ EPUB build failed:", e) print("TRANSLATION_COMPLETE_SIGNAL") if __name__ == "__main__": from shutdown_utils import run_cli_main run_cli_main(main)