# TransateKRtoEN.py
# -*- coding: utf-8 -*-
import json
import logging
import shutil
import threading
import queue
import uuid
import inspect
import os, sys, io, zipfile, time, re, mimetypes, subprocess, tiktoken
import builtins
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup, NavigableString
try:
from bs4 import XMLParsedAsHTMLWarning
import warnings
# Suppress the warning since we handle both HTML and XHTML content
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except ImportError:
# Older versions of BeautifulSoup might not have this warning
pass
from collections import Counter
from unified_api_client import UnifiedClient, UnifiedClientError
# Translation thread submission throttling (batch) to align queued logs with actual delay.
# Both names are module-level state; their readers/writers are outside this part of the file.
# NOTE(review): presumably the lock guards updates to the timestamp below - confirm at the call sites.
_translation_thread_submit_lock = threading.Lock()
_translation_last_thread_submit = 0.0
import hashlib
import tempfile
import unicodedata
from difflib import SequenceMatcher
import unicodedata
import re
import time
from history_manager import HistoryManager
from chapter_splitter import ChapterSplitter
from image_translator import ImageTranslator
from typing import Dict, List, Tuple
from txt_processor import TextFileProcessor
from ai_hunter_enhanced import ImprovedAIHunterDetection
import GlossaryManager # Module with glossary functions
import csv
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
# Module-level functions for ProcessPoolExecutor compatibility
from tqdm import tqdm
class ProgressBar:
    """Simple in-place progress bar for terminal output.

    The bar is redrawn on the current terminal line using a leading carriage
    return; ``finish()`` moves the cursor to the next line.
    """

    # Visible length of the most recently drawn line (used to blank leftovers).
    _last_line_length = 0

    @classmethod
    def update(cls, current, total, prefix="Progress", bar_length=30):
        """Redraw the progress bar in place.

        Args:
            current: Current progress value.
            total: Value that corresponds to 100% completion. Nothing is
                printed when this is 0.
            prefix: Text shown before the bar.
            bar_length: Width of the bar in characters.
        """
        if total == 0:
            return

        # Clamp both figures so that current > total (or a negative value)
        # can never draw a bar wider than bar_length or a percentage outside
        # the 0-100 range.
        percent = max(0, min(100, int(100 * current / total)))
        filled = max(0, min(bar_length, int(bar_length * current / total)))
        bar = '█' * filled + '░' * (bar_length - filled)

        line = f"\r{prefix}: [{bar}] {current}/{total} ({percent}%)"
        visible_length = len(line)

        # Overwrite whatever is left of a longer previous line.
        if visible_length < cls._last_line_length:
            line += ' ' * (cls._last_line_length - visible_length)

        # Remember the unpadded length so the padding can shrink again once
        # the leftovers have been blanked.
        cls._last_line_length = visible_length

        print(line, end='', flush=True)

    @classmethod
    def finish(cls):
        """Terminate the bar by moving to the next line and resetting state."""
        print()
        cls._last_line_length = 0
def is_traditional_translation_api(model: str) -> bool:
    """Return True when *model* names a traditional (non-LLM) translation API.

    A model qualifies when it is exactly one of the known service names or
    when it carries a ``deepl/`` or ``google-translate/`` prefix.
    """
    known_services = ('deepl', 'google-translate', 'google-translate-free')
    if model in known_services:
        return True
    return model.startswith(('deepl/', 'google-translate/'))
def get_chapter_terminology(is_text_file, chapter_data=None):
    """Pick the display word for a unit of content.

    Returns "Section" when the source is a text file, or when *chapter_data*
    has a filename ending in ``.txt`` or a truthy ``is_chunk`` entry;
    otherwise returns "Chapter".
    """
    if not is_text_file:
        if not chapter_data:
            return "Chapter"
        filename = chapter_data.get('filename', '')
        if not filename.endswith('.txt') and not chapter_data.get('is_chunk', False):
            return "Chapter"
    return "Section"
def extract_text_from_raw_content(raw_obj) -> str:
    """Collect the human-readable text of a Gemini raw_content_object.

    *raw_obj* may expose ``parts`` as an attribute or as a dict key; each part
    may likewise be an object or a dict. Parts flagged ``thought`` are skipped.
    The remaining non-empty texts are joined with newlines and stripped.
    Any error while walking the structure yields an empty string.
    """
    def _read(part, name, fallback):
        # Attribute access wins over dict access, mirroring the raw-object shape.
        if hasattr(part, name):
            return getattr(part, name, fallback)
        if isinstance(part, dict):
            return part.get(name, fallback)
        return fallback

    try:
        if hasattr(raw_obj, 'parts'):
            part_list = raw_obj.parts or []
        elif isinstance(raw_obj, dict):
            part_list = raw_obj.get('parts', []) or []
        else:
            part_list = []

        collected = []
        for part in part_list:
            reasoning_only = bool(_read(part, 'thought', False))
            text_value = _read(part, 'text', None)
            if reasoning_only or not text_value:
                continue
            collected.append(str(text_value))
        return "\n".join(collected).strip()
    except Exception:
        return ""
def build_gemini_model_message(content: str = "", raw_obj=None) -> dict:
    """Build an assistant-role message made of Gemini-style ``parts``.

    The message contains:
      - one ``{"text": ...}`` part per non-empty text part found on *raw_obj*,
        or a single text part built from *content* when *raw_obj* supplied none;
      - one ``{"thought_signature": {"_type": "bytes", "data": <base64>}}``
        part when a signature is found on *raw_obj* or on one of its parts.

    Args:
        content: Fallback text used when *raw_obj* carries no text parts.
        raw_obj: Optional raw response object or dict. ``parts`` and the
            signature may be exposed as attributes or dict keys, and the
            signature may be spelled ``thought_signature`` or
            ``thoughtSignature``.

    Returns:
        ``{"role": "assistant", "parts": [...]}``; ``parts`` is empty when
        neither text nor signature is available.
    """
    import base64

    def _parts_of(obj):
        # Accept both object-style and dict-style containers.
        if hasattr(obj, "parts"):
            return obj.parts or []
        if isinstance(obj, dict):
            return obj.get("parts", []) or []
        return []

    def _signature_of(obj):
        # Try snake_case first, then camelCase. An unset/empty spelling must
        # never hide a populated one (previously a None ``thoughtSignature``
        # attribute overwrote a valid ``thought_signature``).
        for key in ("thought_signature", "thoughtSignature"):
            if isinstance(obj, dict):
                value = obj.get(key)
            else:
                value = getattr(obj, key, None)
            if value:
                return value
        return None

    parts = []
    candidate_parts = _parts_of(raw_obj) if raw_obj else []

    # Prefer text from raw_obj parts if present; else use provided content.
    for part in candidate_parts:
        if hasattr(part, "text") and getattr(part, "text", None):
            parts.append({"text": str(getattr(part, "text"))})
        elif isinstance(part, dict) and part.get("text"):
            parts.append({"text": str(part.get("text"))})
    if content and not parts:
        parts.append({"text": str(content)})

    # Look for the signature on the top-level object first, then on its parts.
    signature = None
    if raw_obj:
        signature = _signature_of(raw_obj)
        if signature is None:
            for part in candidate_parts:
                signature = _signature_of(part)
                if signature is not None:
                    break

    if signature is not None:
        if isinstance(signature, dict) and signature.get("_type") == "bytes" and signature.get("data"):
            # Already in the serialized {"_type": "bytes", "data": b64} form.
            data_b64 = signature.get("data")
        elif isinstance(signature, (bytes, bytearray)):
            data_b64 = base64.b64encode(signature).decode("utf-8")
        else:
            # If provided as string (already b64) keep as-is.
            data_b64 = str(signature)
        parts.append({"thought_signature": {"_type": "bytes", "data": data_b64}})

    return {"role": "assistant", "parts": parts}
def _merge_split_paragraphs(html_body: str) -> str:
"""Merge paragraphs that were artificially split across PDF pages.
PDFs are extracted page-by-page, which can split paragraphs mid-sentence.
This function merges consecutive justified paragraphs that don't end with
sentence-ending punctuation, creating more natural paragraph breaks.
Only affects PDFs, not EPUBs.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_body, 'html.parser')
# Find all
tags
paragraphs = soup.find_all('p')
if len(paragraphs) < 2:
return html_body # Nothing to merge
# Process paragraphs and merge when appropriate
i = 0
while i < len(paragraphs) - 1:
current_p = paragraphs[i]
next_p = paragraphs[i + 1]
# Skip if either is None or not a tag
if not current_p or not next_p:
i += 1
continue
# Get paragraph classes - only merge justified paragraphs
current_class = current_p.get('class', [])
next_class = next_p.get('class', [])
current_is_justified = 'align-justify' in current_class if current_class else False
next_is_justified = 'align-justify' in next_class if next_class else False
# Only merge if both are justified (regular body text)
if not (current_is_justified and next_is_justified):
i += 1
continue
# Get text content of current paragraph
current_text = current_p.get_text().strip()
# Check if current paragraph ends with sentence-ending punctuation
ends_with_sentence = bool(re.search(r'[.!?]\s*$', current_text))
# Check if next paragraph looks like continuation (doesn't start with capital)
next_text = next_p.get_text().strip()
starts_with_capital = bool(re.match(r'^[A-Z"\(]', next_text)) if next_text else False
# Merge if:
# - Current doesn't end with sentence punctuation, OR
# - Current ends with sentence but next doesn't start with capital (likely continuation)
should_merge = not ends_with_sentence or (ends_with_sentence and not starts_with_capital)
if should_merge:
# Merge next paragraph's content into current
# Add a space between them
current_p.append(' ')
for content in list(next_p.contents):
try:
current_p.append(content.extract())
except Exception:
current_p.append(content)
# Remove the next paragraph
next_p.decompose()
# Update list and continue without increment to consider further merges
paragraphs = soup.find_all('p')
continue
else:
# Can't merge, move to next pair
i += 1
# Use decode() instead of str() to preserve original formatting and attributes
return soup.decode(formatter='minimal')
def _merge_image_only_pages(html_body: str) -> str:
"""Merge image-only extracted PDF page containers into the previous container.
Motivation: When PDFs are extracted page-by-page, some pages contain only a single image.
Keeping them as a standalone container often produces large wasted whitespace in the final
PDF/HTML output. This pass moves the image(s) into the previous page container.
We treat a container as "image-only" if:
- it contains at least one
- its visible text (after stripping whitespace/nbsp) is empty
This is a best-effort layout hint; the renderer may still paginate based on available space.
"""
try:
from bs4 import BeautifulSoup
import re as _re
soup = BeautifulSoup(html_body, 'html.parser')
# Common page wrapper IDs produced by our pipeline / MuPDF
id_pat = _re.compile(r'^(?:mupdf-page0-\d+|page\d+|page0)$')
def _is_image_only(div) -> bool:
if not div:
return False
imgs = div.find_all('img')
if not imgs:
return False
txt = (div.get_text(' ', strip=True) or '').replace('\xa0', '').strip()
return txt == ''
changed = True
while changed:
changed = False
divs = soup.find_all('div', id=id_pat)
for idx in range(1, len(divs)):
div = divs[idx]
if not _is_image_only(div):
continue
prev = div.find_previous('div', id=id_pat)
if not prev:
continue
# Move children into previous container
for child in list(div.contents):
try:
prev.append(child.extract())
except Exception:
prev.append(child)
div.decompose()
changed = True
break # restart scan since tree changed
return soup.decode(formatter='minimal')
except Exception:
return html_body
def _keep_text_with_following_image(html_body: str, *, min_text_chars: int = 40) -> str:
"""Reduce image-only PDF pages by keeping the last text block together with the following image.
If an image doesn't fit at the bottom of a page, renderers will push it to the next page,
sometimes resulting in a page that contains only the image. By wrapping the last text block
immediately before an image together with that image in a container that avoids page breaks
inside, the renderer will move BOTH to the next page when needed.
This intentionally trades some extra whitespace on the previous page to avoid image-only pages.
"""
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_body, 'html.parser')
# Target
blocks (most of your extracted images are in this shape)
for p in soup.find_all('p'):
imgs = p.find_all('img')
if len(imgs) != 1:
continue
# Ensure this
is basically image-only
txt = (p.get_text(' ', strip=True) or '').replace('\xa0', '').strip()
if txt:
continue
# Find a preceding text block sibling (h1-h6 or p with text)
prev = p.find_previous_sibling(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
if not prev:
continue
prev_txt = (prev.get_text(' ', strip=True) or '').replace('\xa0', '').strip()
if len(prev_txt) < min_text_chars:
continue
# Wrap prev + image-paragraph together
wrapper = soup.new_tag('div')
wrapper['class'] = (wrapper.get('class', []) or []) + ['keep-with-image']
wrapper['style'] = 'break-inside:avoid; page-break-inside:avoid;'
prev.insert_before(wrapper)
wrapper.append(prev.extract())
wrapper.append(p.extract())
return soup.decode(formatter='minimal')
except Exception:
return html_body
def _generate_and_replace_toc(html_body: str) -> str:
"""Generate a proper table of contents from headers and replace any existing broken TOC.
Only affects PDFs, not EPUBs.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_body, 'html.parser')
# Find all h1 and h2 headers (skip those in first 3 pages/divs as they're likely title page)
headers = []
all_divs = soup.find_all('div', id=lambda x: x and x.startswith('page'))
# Start collecting headers after the first 3 pages
for div in all_divs[3:] if len(all_divs) > 3 else []:
for header in div.find_all(['h1', 'h2']):
header_text = header.get_text().strip()
if header_text and len(header_text) > 2: # Skip very short headers
# Create anchor ID
if not header.get('id'):
anchor_id = re.sub(r'[^a-zA-Z0-9]+', '-', header_text[:50].lower()).strip('-')
header['id'] = anchor_id
else:
anchor_id = header['id']
headers.append({
'text': header_text,
'id': anchor_id,
'level': int(header.name[1]) # h1 -> 1, h2 -> 2
})
# If we found headers, generate TOC
if headers:
# Build TOC HTML
toc_html = '
\n'
toc_html += '
Table of Contents
\n'
for h in headers:
indent = '' if h['level'] == 1 else ' '
toc_html += f'
\n'
# Search for existing TOC by looking for "Table of Contents" or "Contents" text
toc_replaced = False
# Method 1: Search for any element containing "Table of Contents" text
for element in soup.find_all(string=re.compile(r'table of contents|^contents$', re.IGNORECASE)):
# Find the containing page div
page_div = element.find_parent('div', id=lambda x: x and x.startswith('page'))
if page_div:
page_div.clear()
page_div.append(BeautifulSoup(toc_html, 'html.parser'))
toc_replaced = True
print(f" • Replaced broken TOC with generated TOC ({len(headers)} entries)")
break
# Method 2: If not found by text, check page divs for TOC-like content
if not toc_replaced:
for i, div in enumerate(all_divs[:10]): # Check first 10 pages
div_text = div.get_text().lower().strip()
# Check if this looks like a TOC page (has "contents" early in the page)
if ('table of contents' in div_text or
(div_text.startswith('contents') or 'contents' in div_text[:100])):
# Replace entire div content with new TOC
div.clear()
div.append(BeautifulSoup(toc_html, 'html.parser'))
toc_replaced = True
print(f" • Replaced broken TOC on page {i+1} with generated TOC ({len(headers)} entries)")
break
# Use decode() instead of str() to preserve original formatting and attributes
return soup.decode(formatter='minimal')
# =====================================================
# CONFIGURATION AND ENVIRONMENT MANAGEMENT
# =====================================================
class TranslationConfig:
    """Centralized configuration management.

    All settings are read from environment variables once, when the object
    is constructed. Boolean settings are enabled by the literal value "1".
    """

    @staticmethod
    def _env_flag(name, default="0"):
        """Return True when environment variable *name* equals "1"."""
        return os.getenv(name, default) == "1"

    @staticmethod
    def _env_int_or(name, default):
        """Read an int setting, falling back to *default* on a bad value."""
        try:
            return int(os.getenv(name, str(default)))
        except Exception:
            return default

    @staticmethod
    def _enabled_key_limits(keys):
        """Collect positive ``individual_output_token_limit`` values of enabled keys.

        Entries that are not dicts, are disabled (``enabled`` false), or carry
        an empty / zero / non-numeric limit are ignored.
        """
        limits = []
        try:
            for key_data in keys or []:
                if not isinstance(key_data, dict):
                    continue
                # Skip disabled keys
                if not key_data.get('enabled', True):
                    continue
                raw = key_data.get('individual_output_token_limit')
                if raw in (None, "", 0):
                    continue
                try:
                    val = int(raw)
                except Exception:
                    continue
                if val > 0:
                    limits.append(val)
        except Exception:
            pass
        return limits

    def __init__(self):
        flag = self._env_flag
        int_or = self._env_int_or

        self.MODEL = os.getenv("MODEL", "gemini-1.5-flash")
        self.input_path = os.getenv("input_path", "default.epub")
        self.PROFILE_NAME = os.getenv("PROFILE_NAME", "korean").lower()
        self.CONTEXTUAL = flag("CONTEXTUAL", "1")
        self.DELAY = float(os.getenv("SEND_INTERVAL_SECONDS", "1"))

        # Use large_env to bypass Windows 32,767-char env var limit for large prompts
        try:
            import large_env
            self.SYSTEM_PROMPT = (large_env.get_env("SYSTEM_PROMPT", "") or "").strip()
        except Exception:
            self.SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "").strip()
        self.ASSISTANT_PROMPT = os.getenv("ASSISTANT_PROMPT", "").strip()  # Optional assistant prefill

        self.REQUEST_MERGING_ENABLED = flag("REQUEST_MERGING_ENABLED")
        self.REQUEST_MERGE_COUNT = int(os.getenv("REQUEST_MERGE_COUNT", "3"))

        # Always strip the split-marker placeholder from the base prompt; the
        # actual instruction is appended per request by get_system_prompt().
        if self.SYSTEM_PROMPT:
            self.SYSTEM_PROMPT = re.sub(r'\s*\{split_marker_instruction\}\s*', '', self.SYSTEM_PROMPT)

        self.REMOVE_AI_ARTIFACTS = flag("REMOVE_AI_ARTIFACTS")
        self.TEMP = float(os.getenv("TRANSLATION_TEMPERATURE", "0.3"))
        self.HIST_LIMIT = int(os.getenv("TRANSLATION_HISTORY_LIMIT", "20"))
        self.MAX_OUTPUT_TOKENS = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
        self.EMERGENCY_RESTORE = flag("EMERGENCY_PARAGRAPH_RESTORE", "1")
        self.BATCH_TRANSLATION = flag("BATCH_TRANSLATION")
        self.BATCH_SIZE = int(os.getenv("BATCH_SIZE", "10"))
        self.BATCHING_MODE = os.getenv("BATCHING_MODE", "aggressive")
        self.BATCH_GROUP_SIZE = int(os.getenv("BATCH_GROUP_SIZE", os.getenv("CONSERVATIVE_BATCH_GROUP_SIZE", "3")))

        # Synthetic header injection for merged requests (Split-the-Merge helper)
        self.SYNTHETIC_MERGE_HEADERS = flag("SYNTHETIC_MERGE_HEADERS", "1")

        self.ENABLE_IMAGE_TRANSLATION = flag("ENABLE_IMAGE_TRANSLATION", "1")
        # Auto-disable image translation for html2text and BeautifulSoup
        # profiles (text-extraction profiles; PROFILE_NAME is already lower-case).
        if self.ENABLE_IMAGE_TRANSLATION and self.PROFILE_NAME:
            if 'html2text' in self.PROFILE_NAME or 'beautifulsoup' in self.PROFILE_NAME:
                self.ENABLE_IMAGE_TRANSLATION = False
                print(f"ℹ️ Image translation disabled for {self.PROFILE_NAME} profile")

        self.TRANSLATE_BOOK_TITLE = flag("TRANSLATE_BOOK_TITLE", "1")
        self.DISABLE_ZERO_DETECTION = flag("DISABLE_ZERO_DETECTION")
        self.ENABLE_AUTO_GLOSSARY = flag("ENABLE_AUTO_GLOSSARY")
        self.COMPREHENSIVE_EXTRACTION = flag("COMPREHENSIVE_EXTRACTION")
        self.MANUAL_GLOSSARY = os.getenv("MANUAL_GLOSSARY")

        self.RETRY_TRUNCATED = flag("RETRY_TRUNCATED", "1")
        self.TRUNCATION_RETRY_ATTEMPTS = int_or("TRUNCATION_RETRY_ATTEMPTS", 1)

        # Char-ratio truncation detection (silent truncation)
        self.CHAR_RATIO_TRUNCATION_ENABLED = flag("CHAR_RATIO_TRUNCATION_ENABLED", "1")
        try:
            self.CHAR_RATIO_TRUNCATION_PERCENT = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50"))
        except Exception:
            self.CHAR_RATIO_TRUNCATION_PERCENT = 50.0
        self.CHAR_RATIO_TRUNCATION_ATTEMPTS = int_or("CHAR_RATIO_TRUNCATION_ATTEMPTS", 1)
        self.CHAR_RATIO_MIN_OUTPUT_CHARS = int_or("CHAR_RATIO_MIN_OUTPUT_CHARS", 100)

        self.RETRY_SPLIT_FAILED = flag("RETRY_SPLIT_FAILED")
        self.SPLIT_FAILED_RETRY_ATTEMPTS = int_or("SPLIT_FAILED_RETRY_ATTEMPTS", 1)
        self.RETRY_DUPLICATE_BODIES = flag("RETRY_DUPLICATE_BODIES", "1")
        self.RETRY_TIMEOUT = flag("RETRY_TIMEOUT")
        self.CHUNK_TIMEOUT = int(os.getenv("CHUNK_TIMEOUT", "1800"))
        self.DISABLE_MERGE_FALLBACK = flag("DISABLE_MERGE_FALLBACK")
        self.MAX_RETRY_TOKENS = int(os.getenv("MAX_RETRY_TOKENS", "16384"))
        self.DUPLICATE_LOOKBACK_CHAPTERS = int(os.getenv("DUPLICATE_LOOKBACK_CHAPTERS", "3"))

        self.USE_ROLLING_SUMMARY = flag("USE_ROLLING_SUMMARY")
        self.ROLLING_SUMMARY_EXCHANGES = int(os.getenv("ROLLING_SUMMARY_EXCHANGES", "5"))
        self.ROLLING_SUMMARY_MODE = os.getenv("ROLLING_SUMMARY_MODE", "replace")
        # Maximum number of rolling summary entries to retain when in append mode (0 = unlimited)
        self.ROLLING_SUMMARY_MAX_ENTRIES = int(os.getenv("ROLLING_SUMMARY_MAX_ENTRIES", "10"))

        self.DUPLICATE_DETECTION_MODE = os.getenv("DUPLICATE_DETECTION_MODE", "basic")
        self.AI_HUNTER_THRESHOLD = int(os.getenv("AI_HUNTER_THRESHOLD", "75"))
        self.TRANSLATION_HISTORY_ROLLING = flag("TRANSLATION_HISTORY_ROLLING")

        # First non-empty key wins, in this order.
        self.API_KEY = (os.getenv("API_KEY") or
                        os.getenv("OPENAI_API_KEY") or
                        os.getenv("OPENAI_OR_Gemini_API_KEY") or
                        os.getenv("GEMINI_API_KEY"))

        # Simple chapter number offset
        self.CHAPTER_NUMBER_OFFSET = int(os.getenv("CHAPTER_NUMBER_OFFSET", "0"))

        self.ENABLE_WATERMARK_REMOVAL = flag("ENABLE_WATERMARK_REMOVAL", "1")
        self.SAVE_CLEANED_IMAGES = flag("SAVE_CLEANED_IMAGES", "1")
        self.EMERGENCY_IMAGE_RESTORE = flag("EMERGENCY_IMAGE_RESTORE")
        self.WATERMARK_PATTERN_THRESHOLD = int(os.getenv("WATERMARK_PATTERN_THRESHOLD", "10"))
        self.WATERMARK_CLAHE_LIMIT = float(os.getenv("WATERMARK_CLAHE_LIMIT", "3.0"))
        self.COMPRESSION_FACTOR = float(os.getenv("COMPRESSION_FACTOR", "2.0"))

        # Multi API key support
        self.use_multi_api_keys = os.environ.get('USE_MULTI_API_KEYS', '0') == '1'
        self.multi_api_keys = []
        if self.use_multi_api_keys:
            multi_keys_json = os.environ.get('MULTI_API_KEYS', '[]')
            try:
                if multi_keys_json and str(multi_keys_json).strip() not in ('', '[]', 'null', 'None'):
                    loaded = json.loads(multi_keys_json)
                    if not isinstance(loaded, list):
                        raise ValueError("MULTI_API_KEYS must be a JSON list")
                else:
                    # Fallback: UnifiedClient may have the keys stored in-memory
                    # to avoid Windows env var limits
                    try:
                        from unified_api_client import UnifiedClient
                        with UnifiedClient._in_memory_multi_keys_lock:
                            loaded = UnifiedClient._in_memory_multi_keys or []
                    except Exception:
                        loaded = []
                self.multi_api_keys = loaded
                print(f"Loaded {len(self.multi_api_keys)} API keys for multi-key mode")
                if not self.multi_api_keys:
                    self.use_multi_api_keys = False
            except Exception as e:
                print(f"Failed to load multi API keys: {e}")
                self.use_multi_api_keys = False
                # Never leave a half-loaded / wrongly-typed value behind.
                self.multi_api_keys = []

        # Fallback keys (for direct fallback retries)
        self.use_fallback_keys = os.environ.get('USE_FALLBACK_KEYS', '0') == '1'
        self.fallback_keys = []
        if self.use_fallback_keys:
            fk_json = os.environ.get('FALLBACK_KEYS', '[]')
            try:
                loaded = json.loads(fk_json)
                if not isinstance(loaded, list):
                    raise ValueError("FALLBACK_KEYS must be a JSON list")
                self.fallback_keys = loaded
            except Exception as e:
                print(f"Failed to load fallback keys: {e}")
                self.use_fallback_keys = False

    def get_effective_output_limit(self) -> int:
        """Return the effective output token limit, considering per-key overrides.

        Starts from ``MAX_OUTPUT_TOKENS``, lowers it to the limit cached for
        the current model on ``UnifiedClient`` (when one exists and is
        smaller), then lowers it again to the smallest positive
        ``individual_output_token_limit`` among the enabled multi-key and
        fallback-key entries.
        """
        effective = self.MAX_OUTPUT_TOKENS

        # Check if a model limit has been discovered via auto-adjustment
        try:
            from unified_api_client import UnifiedClient
            with UnifiedClient._model_limits_lock:
                cached_limit = UnifiedClient._model_token_limits.get(self.MODEL)
            if cached_limit and cached_limit < effective:
                effective = cached_limit
        except Exception:
            pass

        per_key_limits = (
            self._enabled_key_limits(self.multi_api_keys)
            + self._enabled_key_limits(self.fallback_keys)
        )
        if per_key_limits:
            effective = min(effective, min(per_key_limits))
        return effective

    def get_system_prompt(self, actual_merge_count: int = 1) -> str:
        """Return the system prompt, optionally with the split marker instruction.

        Args:
            actual_merge_count: Number of chapters merged into this request.
                The instruction is appended only when this is greater than 1
                and request merging is enabled.

        Returns:
            The system prompt string, with or without the instruction. An
            empty prompt is returned as-is.
        """
        if not self.SYSTEM_PROMPT:
            return self.SYSTEM_PROMPT

        # Only add split marker instruction if actually merging multiple chapters
        if actual_merge_count > 1 and self.REQUEST_MERGING_ENABLED:
            # NOTE(review): the example tag was unreadable in the reviewed copy
            # of this file; it is restored here in the h1 id="split-N" marker
            # shape described elsewhere in this module - confirm.
            split_instr = ("- CRITICAL Requirement: If you see any HTML tags containing 'SPLIT MARKER' "
                           "(Example: <h1 id=\"split-1\">SPLIT MARKER: Do Not Remove This Tag</h1>), "
                           "you MUST preserve them EXACTLY as they appear. Do not translate, modify, or remove these markers.")
            # Append to end of system prompt
            return self.SYSTEM_PROMPT + "\n\n" + split_instr
        return self.SYSTEM_PROMPT
# =====================================================
# REQUEST MERGING UTILITIES
# =====================================================
class RequestMerger:
"""Handles merging multiple chapters into a single request"""
@classmethod
def merge_chapters(cls, chapters_data, log_injections=True):
    """Merge multiple chapters into a single content block.

    Used both for request-size estimation and for the actual merged request.
    When the ``SPLIT_THE_MERGE`` environment variable is "1", an H1 split
    marker carrying ``id="split-<chapter_num>"`` is placed in front of each
    string chapter so the merged translation can later be cut apart at the
    markers.

    Args:
        chapters_data: List of tuples (chapter_num, content, chapter_obj).
        log_injections: If False, perform marker injection silently
            (no console logging). Used for size-estimation previews to
            avoid duplicate log lines.

    Returns:
        The chapter contents joined by blank lines; "" for empty input.
        Non-string content is included as ``str(content)`` (``None`` as "")
        without a marker, so the final join cannot fail on it.
    """
    if not chapters_data:
        return ""

    # Split markers are only needed when split-the-merge is enabled
    split_markers_enabled = os.getenv('SPLIT_THE_MERGE', '0') == '1'

    merged_parts = []
    for chapter_num, content, _chapter_obj in chapters_data:
        if not isinstance(content, str):
            merged_parts.append("" if content is None else str(content))
            continue

        if not split_markers_enabled:
            merged_parts.append(content)
            continue

        # Defensive: if marker construction fails, fall back to the original
        # content rather than breaking the whole merge.
        try:
            # NOTE(review): the marker literal was unreadable in the reviewed
            # copy of this file; restored as an H1 with id="split-N", the
            # shape the splitter's own documentation describes - confirm.
            split_marker = f'<h1 id="split-{chapter_num}">SPLIT MARKER: Do Not Remove This Tag</h1>\n'
            marked_content = split_marker + content
        except Exception as e:
            marked_content = content
            if log_injections:
                try:
                    print(f" ⚠️ Request Merging: Failed to inject split marker for chapter {chapter_num}: {e}")
                except Exception:
                    pass
        else:
            if log_injections:
                # Logging is best-effort: a console/encoding error must not
                # cost the chapter its split marker.
                try:
                    preview = marked_content[:120].replace('\n', ' ')
                    print(
                        f" ℹ️ Request Merging: Injected H1 split marker for "
                        f"chapter {chapter_num}: {preview}..."
                    )
                except Exception:
                    pass
        merged_parts.append(marked_content)

    return "\n\n".join(merged_parts)
@classmethod
def create_merge_groups(cls, chapters_to_translate, merge_count):
    """Partition chapters into merge groups of neighbouring chapters.

    Chapters are taken in the given order. A chapter joins the current group
    only while the group holds fewer than *merge_count* chapters and its
    reading-order position is at most 1 away from the previous chapter's;
    otherwise it starts a new group. This keeps, for example, chapter 7 from
    being merged with chapter 29 when everything in between was skipped.

    Args:
        chapters_to_translate: List of tuples. Supported shapes:
            - (idx, chapter_obj)
            - (idx, chapter_obj, actual_num, ...)
        merge_count: Maximum number of chapters to merge per request.

    Returns:
        List of groups; each group is a list of the original tuples. With
        ``merge_count <= 1`` (or no chapters) every chapter is its own group.
    """
    if merge_count <= 1 or not chapters_to_translate:
        # No merging, return each chapter as its own group
        return [[entry] for entry in chapters_to_translate]

    def _logical_number(entry):
        """Best-effort logical chapter number (last-resort ordering key).

        Tried in order: a numeric third tuple element, the chapter dict's
        ``actual_chapter_num`` (when the key exists) or ``num``, then idx.
        """
        try:
            if len(entry) >= 3 and isinstance(entry[2], (int, float)):
                return entry[2]
        except Exception:
            pass

        try:
            info = entry[1]
            if isinstance(info, dict):
                wanted = 'actual_chapter_num' if 'actual_chapter_num' in info else 'num'
                return info.get(wanted)
        except Exception:
            pass

        try:
            return entry[0]
        except Exception:
            return None

    def _reading_position(entry):
        """Numeric reading-order key used for the proximity test.

        Prefers ``spine_order`` / ``opf_spine_position`` from the chapter
        dict, then the tuple's idx, then the logical chapter number, so that
        files sharing a chapter label are still ordered by where they sit
        in the book.
        """
        try:
            info = entry[1]
            if isinstance(info, dict):
                position = info.get('spine_order')
                if position is None:
                    position = info.get('opf_spine_position')
                if position is not None:
                    return float(position)
        except Exception:
            pass

        try:
            return float(entry[0])
        except Exception:
            pass

        return _logical_number(entry)

    merge_groups = []
    bucket = []
    last_position = None

    for entry in chapters_to_translate:
        position = _reading_position(entry)

        # Joining requires an open, non-full group and two known positions.
        joinable = (
            bool(bucket)
            and len(bucket) < merge_count
            and position is not None
            and last_position is not None
        )
        if joinable:
            try:
                distance = abs(float(position) - float(last_position))
            except Exception:
                distance = None
            # Adjacent means 1→2→3 merges, 1→4 does not.
            joinable = distance is not None and distance <= 1

        if joinable:
            bucket.append(entry)
        else:
            if bucket:
                merge_groups.append(bucket)
            bucket = [entry]

        last_position = position

    if bucket:
        merge_groups.append(bucket)

    return merge_groups
@classmethod
def split_by_markers(cls, content, expected_count):
    """
    Split merged translation output by split markers.

    Strategies are tried in order of reliability; a strategy is accepted only
    when it finds exactly ``expected_count`` markers:
    1. Perfect split markers: ``<h1 ... id="split-N" ...>...</h1>``
    2. Any h1 tag with "split" in its id (tolerates broken closing tags)
    3. Any h1 tag containing "SPLIT MARKER" text

    No "any h1 tag at all" fallback is implemented in this method.

    Args:
        content: The translated HTML content
        expected_count: Expected number of sections (should match merged chapter count)

    Returns:
        The list produced by ``cls._split_by_positions`` if we can reliably split,
        or None if splitting is not possible (fallback to normal merged behavior)
    """
    import re

    # Strategy 1: Perfect markers
    perfect_pattern = r'<h1[^>]*id="split-\d+"[^>]*>.*?</h1>'
    perfect_markers = list(re.finditer(perfect_pattern, content, flags=re.DOTALL | re.IGNORECASE))
    if len(perfect_markers) == expected_count:
        print(f" ✓️ Split the Merge: Found {len(perfect_markers)} perfect split markers")
        return cls._split_by_positions(content, [m.start() for m in perfect_markers])
    print(f" ⚠️ Split the Merge: Found {len(perfect_markers)} perfect markers, expected {expected_count}. Trying fallback strategies...")

    def _locate_opening_tags(tags):
        """Map parsed h1 tags back to the offsets of their opening tags in `content`."""
        offsets = []
        cursor = 0
        for tag in tags:
            # Only the opening tag is searched for, so a broken or missing
            # closing tag does not prevent the marker from being located.
            opening_tag = re.escape(str(tag).split('>')[0] + '>')
            # Tags arrive in document order; searching forward from the previous
            # hit keeps identical opening tags from all resolving to the first one.
            match = re.compile(opening_tag, flags=re.IGNORECASE).search(content, cursor)
            if match:
                offsets.append(match.start())
                cursor = match.end()
        return offsets

    # Each fallback: (predicate selecting marker tags, success wording, failure wording)
    fallbacks = (
        # Strategy 2: Broken markers with "split" in id (handles broken closing tags, etc.)
        (
            lambda tag: 'split' in (tag.get('id') or '').lower(),
            "h1 tags with 'split' in id (broken marker format)",
            "BeautifulSoup fallback failed",
        ),
        # Strategy 3: H1 tags containing "SPLIT MARKER" text
        (
            lambda tag: 'split marker' in tag.get_text().lower(),
            "h1 tags with 'SPLIT MARKER' text",
            "Text marker fallback failed",
        ),
    )
    for is_marker, found_label, failure_label in fallbacks:
        try:
            # Imported here so Strategy 1 works even when bs4 is unavailable;
            # an ImportError is reported like any other fallback failure.
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(content, 'html.parser')
            marker_tags = [tag for tag in soup.find_all('h1') if is_marker(tag)]
            if len(marker_tags) == expected_count:
                print(f" ✓️ Split the Merge: Found {len(marker_tags)} {found_label}")
                positions = _locate_opening_tags(marker_tags)
                # Only split when every marker could be located in the raw text
                if len(positions) == expected_count:
                    return cls._split_by_positions(content, sorted(positions))
        except Exception as e:
            print(f" ⚠️ Split the Merge: {failure_label}: {e}")

    # All strategies failed
    print(" ❌ Split the Merge: Could not reliably split content (found varying marker counts across strategies)")
    return None
@classmethod
def _split_by_positions(cls, content, positions):
    """
    Helper to split content at specific character positions.

    Each position is treated as the start of a marker's opening ``<h1`` tag.
    The marker itself (up to and including its closing ``</h1>``) is left out
    of the section that follows it.

    Args:
        content: Full content string
        positions: List of character positions where splits should occur (sorted)

    Returns:
        List of stripped content sections: the text before the first marker
        (only when non-empty) followed by one section per marker.
        ``[content]`` when ``positions`` is empty.
    """
    import re

    if not positions:
        return [content]

    closing_h1 = re.compile(r'</h1\s*>', re.IGNORECASE)
    sections = []

    # First section is before the first marker (usually empty/whitespace)
    first_section = content[:positions[0]].strip()
    if first_section:  # Only include if non-empty
        sections.append(first_section)

    # Every marker's section runs up to the next marker, or to the end of the content
    section_ends = list(positions[1:]) + [len(content)]
    for start_pos, end_pos in zip(positions, section_ends):
        # Skip past the marker's closing tag; the search is bounded to this
        # section so a marker with a missing closing tag cannot borrow the
        # next marker's </h1> and swallow its own content.
        closing = closing_h1.search(content, start_pos, end_pos)
        if closing:
            content_start = closing.end()
        else:
            # Broken closing tag, try to skip past the opening tag at least
            next_close_bracket = content.find('>', start_pos, end_pos)
            content_start = next_close_bracket + 1 if next_close_bracket != -1 else start_pos
        sections.append(content[content_start:end_pos].strip())

    print(f" ✓️ Split the Merge: Successfully split into {len(sections)} sections")
    return sections
# =====================================================
# UNIFIED PATTERNS AND CONSTANTS
# =====================================================
class PatternManager:
    """Centralized pattern management

    Holds the module's shared lookup tables as class attributes: chapter-title
    regexes, filename number-extraction regexes, honorific/title vocabularies
    per language, Chinese numeral values, and a set of common filler words.
    The class defines no methods; everything is read as ``PatternManager.NAME``.
    """

    # Chapter-heading detectors. Each entry is a (regex, re flags, label) tuple;
    # every regex has one capture group holding the chapter number. Entries run
    # from language-specific forms down to a bare "any number" fallback.
    # NOTE(review): presumably consumers try these in order and stop at the first
    # hit — confirm against the callers.
    CHAPTER_PATTERNS = [
        # English patterns
        (r'chapter[\s_-]*(\d+)', re.IGNORECASE, 'english_chapter'),
        (r'\bch\.?\s*(\d+)\b', re.IGNORECASE, 'english_ch'),
        (r'part[\s_-]*(\d+)', re.IGNORECASE, 'english_part'),
        (r'episode[\s_-]*(\d+)', re.IGNORECASE, 'english_episode'),
        # Chinese patterns
        (r'第\s*(\d+)\s*[章节話话回]', 0, 'chinese_chapter'),
        (r'第\s*([一二三四五六七八九十百千万]+)\s*[章节話话回]', 0, 'chinese_chapter_cn'),
        (r'(\d+)[章节話话回]', 0, 'chinese_short'),
        # Japanese patterns
        (r'第\s*(\d+)\s*話', 0, 'japanese_wa'),
        (r'第\s*(\d+)\s*章', 0, 'japanese_chapter'),
        (r'その\s*(\d+)', 0, 'japanese_sono'),
        (r'(\d+)話目', 0, 'japanese_wame'),
        # Korean patterns
        (r'제\s*(\d+)\s*[장화권부편]', 0, 'korean_chapter'),
        (r'(\d+)\s*[장화권부편]', 0, 'korean_short'),
        (r'에피소드\s*(\d+)', 0, 'korean_episode'),
        # Generic numeric patterns
        (r'^\s*(\d+)\s*[-–—.\:]', re.MULTILINE, 'generic_numbered'),
        (r'_(\d+)\.x?html?$', re.IGNORECASE, 'filename_number'),
        (r'/(\d+)\.x?html?$', re.IGNORECASE, 'path_number'),
        (r'(\d+)', 0, 'any_number'),
    ]

    # Regexes for pulling a chapter number out of a file name. This is the
    # default ``patterns`` list of FileUtilities.extract_actual_chapter_number,
    # which walks it in order and takes the first match — so ordering matters.
    FILENAME_EXTRACT_PATTERNS = [
        # IMPORTANT: More specific patterns MUST come first
        r'^\d{3}(\d)_(\d{2})_\.x?html?$',  # Captures both parts for decimal: group1.group2
        r'^\d{4}_(\d+)\.x?html?$',  # "0000_1.xhtml" - extracts 1, not 0000
        r'^\d+_(\d+)[_\.]',  # Any digits followed by underscore then capture next digits
        r'^(\d+)[_\.]',  # Standard: "0249_" or "0249."
        r'response_(\d+)_',  # Standard pattern: response_001_
        r'response_(\d+)\.',  # Pattern: response_001.
        r'(\d{3,5})[_\.]',  # 3-5 digit pattern with padding
        r'[Cc]hapter[_\s]*(\d+)',  # Chapter word pattern
        r'[Cc]h[_\s]*(\d+)',  # Ch abbreviation
        r'No(\d+)Chapter',  # No prefix with Chapter - matches "No00013Chapter.xhtml"
        r'No(\d+)Section',  # No prefix with Section - matches "No00013Section.xhtml"
        r'No(\d+)(?=\.|_|$)',  # No prefix followed by end, dot, or underscore (not followed by text)
        r'第(\d+)[章话回]',  # Chinese chapter markers
        r'_(\d+)(?:_|\.|$)',  # Number between underscores or at end
        r'^(\d+)(?:_|\.|$)',  # Starting with number
        r'(\d+)',  # Any number (fallback)
    ]

    # Honorific suffixes, address forms and polite verb endings, keyed by
    # language. The 'english' list holds romanized forms, mostly written with a
    # leading hyphen. Some entries are repeated within a list (e.g. '어르신',
    # '시', '해요' in the Korean list).
    CJK_HONORIFICS = {
        'korean': [
            # Modern honorifics
            '님', '씨', '선배', '후배', '동기', '형', '누나', '언니', '오빠', '동생',
            '선생님', '교수님', '박사님', '사장님', '회장님', '부장님', '과장님', '대리님',
            '팀장님', '실장님', '이사님', '전무님', '상무님', '부사장님', '고문님',
            # Classical/formal honorifics
            '공', '옹', '군', '양', '낭', '랑', '생', '자', '부', '모', '시', '제', '족하',
            # Royal/noble address forms
            '마마', '마노라', '대감', '영감', '나리', '도령', '낭자', '아씨', '규수',
            '각하', '전하', '폐하', '저하', '합하', '대비', '대왕', '왕자', '공주',
            # Buddhist/religious
            '스님', '사부님', '조사님', '큰스님', '화상', '대덕', '대사', '법사',
            '선사', '율사', '보살님', '거사님', '신부님', '목사님', '장로님', '집사님',
            # Confucian/scholarly
            '부자', '선생', '대인', '어른', '어르신', '존자', '현자', '군자', '대부',
            '학사', '진사', '문하생', '제자',
            # Kinship honorifics
            '어르신', '할아버님', '할머님', '아버님', '어머님', '형님', '누님',
            '아주버님', '아주머님', '삼촌', '이모님', '고모님', '외삼촌', '장인어른',
            '장모님', '시아버님', '시어머님', '처남', '처형', '매형', '손님',
            # Verb-based honorific endings and speech levels
            '습니다', 'ㅂ니다', '습니까', 'ㅂ니까', '시다', '세요', '셔요', '십시오', '시오',
            '이에요', '예요', '이예요', '에요', '어요', '아요', '여요', '해요', '이세요', '으세요',
            '으시', '시', '으십니다', '십니다', '으십니까', '십니까', '으셨', '셨',
            '드립니다', '드려요', '드릴게요', '드리겠습니다', '올립니다', '올려요',
            '사옵니다', '사뢰', '여쭙니다', '여쭤요', '아뢰', '뵙니다', '뵈요', '모십니다',
            '시지요', '시죠', '시네요', '시는군요', '시는구나', '으실', '실',
            '드시다', '잡수시다', '주무시다', '계시다', '가시다', '오시다',
            # Common verb endings with 있다/없다/하다
            '있어요', '있습니다', '있으세요', '있으십니까', '없어요', '없습니다', '없으세요',
            '해요', '합니다', '하세요', '하십시오', '하시죠', '하시네요', '했어요', '했습니다',
            '되세요', '되셨어요', '되십니다', '됩니다', '되요', '돼요',
            '이야', '이네', '이구나', '이군', '이네요', '인가요', '인가', '일까요', '일까',
            '거예요', '거에요', '겁니다', '건가요', '게요', '을게요', '을까요', '었어요', '었습니다',
            '겠습니다', '겠어요', '겠네요', '을겁니다', '을거예요', '을거에요',
            # Common endings
            '요', '죠', '네요', '는데요', '거든요', '니까', '으니까', '는걸요', '군요', '구나',
            '는구나', '는군요', '더라고요', '더군요', '던데요', '나요', '가요', '까요',
            '라고요', '다고요', '냐고요', '자고요', '란다', '단다', '냔다', '잔다',
            # Formal archaic endings
            '나이다', '사옵나이다', '옵니다', '오', '소서', '으오', '으옵소서', '사이다',
            '으시옵니다', '시옵니다', '으시옵니까', '시옵니까', '나이까', '리이까', '리이다',
            '옵소서', '으소서', '소이다', '로소이다', '이옵니다', '이올시다', '하옵니다'
        ],
        'japanese': [
            # Modern honorifics
            'さん', 'ちゃん', '君', 'くん', '様', 'さま', '先生', 'せんせい', '殿', 'どの', '先輩', 'せんぱい',
            # Classical/historical
            '氏', 'し', '朝臣', 'あそん', '宿禰', 'すくね', '連', 'むらじ', '臣', 'おみ', '君', 'きみ',
            '真人', 'まひと', '道師', 'みちのし', '稲置', 'いなぎ', '直', 'あたい', '造', 'みやつこ',
            # Court titles
            '卿', 'きょう', '大夫', 'たいふ', '郎', 'ろう', '史', 'し', '主典', 'さかん',
            # Buddhist titles
            '和尚', 'おしょう', '禅師', 'ぜんじ', '上人', 'しょうにん', '聖人', 'しょうにん',
            '法師', 'ほうし', '阿闍梨', 'あじゃり', '大和尚', 'だいおしょう',
            # Shinto titles
            '大宮司', 'だいぐうじ', '宮司', 'ぐうじ', '禰宜', 'ねぎ', '祝', 'はふり',
            # Samurai era
            '守', 'かみ', '介', 'すけ', '掾', 'じょう', '目', 'さかん', '丞', 'じょう',
            # Keigo (honorific language) verb forms
            'です', 'ます', 'ございます', 'いらっしゃる', 'いらっしゃいます', 'おっしゃる', 'おっしゃいます',
            'なさる', 'なさいます', 'くださる', 'くださいます', 'いただく', 'いただきます',
            'おります', 'でございます', 'ございません', 'いたします', 'いたしました',
            '申す', '申します', '申し上げる', '申し上げます', '存じる', '存じます', '存じ上げる',
            '伺う', '伺います', '参る', '参ります', 'お目にかかる', 'お目にかかります',
            '拝見', '拝見します', '拝聴', '拝聴します', '承る', '承ります',
            # Respectful prefixes/suffixes
            'お', 'ご', '御', 'み', '美', '貴', '尊'
        ],
        'chinese': [
            # Modern forms
            '先生', '小姐', '夫人', '公子', '大人', '老师', '师父', '师傅', '同志', '同学',
            # Ancient/classical forms
            '子', '丈', '翁', '公', '侯', '伯', '叔', '仲', '季', '父', '甫', '卿', '君', '生',
            # Imperial court
            '陛下', '殿下', '千岁', '万岁', '圣上', '皇上', '天子', '至尊', '御前', '爷',
            # Nobility/officials
            '阁下', '大人', '老爷', '相公', '官人', '郎君', '娘子', '夫子', '足下',
            # Religious titles
            '上人', '法师', '禅师', '大师', '高僧', '圣僧', '神僧', '活佛', '仁波切',
            '真人', '天师', '道长', '道友', '仙长', '上仙', '祖师', '掌教',
            # Scholarly/Confucian
            '夫子', '圣人', '贤人', '君子', '大儒', '鸿儒', '宗师', '泰斗', '巨擘',
            # Martial arts
            '侠士', '大侠', '少侠', '女侠', '英雄', '豪杰', '壮士', '义士',
            # Family/kinship
            '令尊', '令堂', '令郎', '令爱', '贤弟', '贤侄', '愚兄', '小弟', '家父', '家母',
            # Humble forms
            '在下', '小人', '鄙人', '不才', '愚', '某', '仆', '妾', '奴', '婢',
            # Polite verbal markers
            '请', '请问', '敢问', '恭请', '敬请', '烦请', '有请', '请教', '赐教',
            '惠顾', '惠赐', '惠存', '笑纳', '雅正', '指正', '斧正', '垂询',
            '拜', '拜见', '拜访', '拜读', '拜托', '拜谢', '敬上', '谨上', '顿首'
        ],
        'english': [
            # Modern Korean romanizations (Revised Romanization of Korean - 2000)
            '-nim', '-ssi', '-seonbae', '-hubae', '-donggi', '-hyeong', '-nuna',
            '-eonni', '-oppa', '-dongsaeng', '-seonsaengnim', '-gyosunim',
            '-baksanim', '-sajangnim', '-hoejangnim', '-bujangnim', '-gwajangnim',
            '-daerim', '-timjangnim', '-siljangnim', '-isanim', '-jeonmunim',
            '-sangmunim', '-busajangnim', '-gomunnim',
            # Classical/formal Korean romanizations
            '-gong', '-ong', '-gun', '-yang', '-nang', '-rang', '-saeng', '-ja',
            '-bu', '-mo', '-si', '-je', '-jokha',
            # Royal/noble Korean romanizations
            '-mama', '-manora', '-daegam', '-yeonggam', '-nari', '-doryeong',
            '-nangja', '-assi', '-gyusu', '-gakha', '-jeonha', '-pyeha', '-jeoha',
            '-hapka', '-daebi', '-daewang', '-wangja', '-gongju',
            # Buddhist/religious Korean romanizations
            '-seunim', '-sabunim', '-josanim', '-keunseunim', '-hwasang',
            '-daedeok', '-daesa', '-beopsa', '-seonsa', '-yulsa', '-bosalnim',
            '-geosanim', '-sinbunim', '-moksanim', '-jangnonim', '-jipsanim',
            # Confucian/scholarly Korean romanizations
            '-buja', '-seonsaeng', '-daein', '-eoreun', '-eoreusin', '-jonja',
            '-hyeonja', '-gunja', '-daebu', '-haksa', '-jinsa', '-munhasaeng', '-jeja',
            # Kinship Korean romanizations
            '-harabeonim', '-halmeonim', '-abeonim', '-eomeonim', '-hyeongnim',
            '-nunim', '-ajubeonim', '-ajumeonim', '-samchon', '-imonim', '-gomonim',
            '-oesamchon', '-jangineoreun', '-jangmonim', '-siabeonim', '-sieomeonim',
            '-cheonam', '-cheohyeong', '-maehyeong', '-sonnim',
            # Korean verb endings romanized (Revised Romanization)
            '-seumnida', '-mnida', '-seumnikka', '-mnikka', '-sida', '-seyo',
            '-syeoyo', '-sipsio', '-sio', '-ieyo', '-yeyo', '-iyeyo', '-eyo',
            '-eoyo', '-ayo', '-yeoyo', '-haeyo', '-iseyo', '-euseyo',
            '-eusi', '-si', '-eusimnida', '-simnida', '-eusimnikka', '-simnikka',
            '-eusyeot', '-syeot', '-deurimnida', '-deuryeoyo', '-deurilgeyo',
            '-deurigesseumnida', '-ollimnida', '-ollyeoyo', '-saomnida', '-saroe',
            '-yeojjumnida', '-yeojjwoyo', '-aroe', '-boemnida', '-boeyo', '-mosimnida',
            '-sijiyo', '-sijyo', '-sineyo', '-sineungunyo', '-sineunguna', '-eusil', '-sil',
            '-deusida', '-japsusida', '-jumusida', '-gyesida', '-gasida', '-osida',
            # Common Korean verb endings romanized
            '-isseoyo', '-isseumnida', '-isseuseyo', '-isseusimnikka',
            '-eopseoyo', '-eopseumnida', '-eopseuseyo', '-hamnida', '-haseyo',
            '-hasipsio', '-hasijyo', '-hasineyo', '-haesseoyo', '-haesseumnida',
            '-doeseyo', '-doesyeosseoyo', '-doesimnida', '-doemnida', '-doeyo', '-dwaeyo',
            '-iya', '-ine', '-iguna', '-igun', '-ineyo', '-ingayo', '-inga',
            '-ilkkayo', '-ilkka', '-geoyeyo', '-geoeyo', '-geomnida', '-geongayo',
            '-geyo', '-eulgeyo', '-eulkkayo', '-eosseoyo', '-eosseumnida',
            '-gesseumnida', '-gesseoyo', '-genneyo', '-eulgeommida', '-eulgeoyeyo', '-eulgeoeyo',
            # Common Korean endings romanized
            '-yo', '-jyo', '-neyo', '-neundeyo', '-geodeunyo', '-nikka',
            '-eunikka', '-neungeolyo', '-gunyo', '-guna', '-neunguna', '-neungunyo',
            '-deoragoyo', '-deogunyo', '-deondeyo', '-nayo', '-gayo', '-kkayo',
            '-ragoyo', '-dagoyo', '-nyagoyo', '-jagoyo', '-randa', '-danda',
            '-nyanda', '-janda',
            # Formal archaic Korean romanized
            '-naida', '-saomnaida', '-omnida', '-o', '-soseo', '-euo',
            '-euopsoseo', '-saida', '-eusiomnida', '-siomnida', '-eusiomnikka',
            '-siomnikka', '-naikka', '-riikka', '-riida', '-opsoseo', '-eusoseo',
            '-soida', '-rosoida', '-iomnida', '-iolsida', '-haomnida',
            # Japanese keigo romanized (keeping existing)
            '-san', '-chan', '-kun', '-sama', '-sensei', '-senpai', '-dono',
            '-shi', '-tan', '-chin', '-desu', '-masu', '-gozaimasu',
            '-irassharu', '-irasshaimasu', '-ossharu', '-osshaimasu',
            '-nasaru', '-nasaimasu', '-kudasaru', '-kudasaimasu', '-itadaku',
            '-itadakimasu', '-orimasu', '-degozaimasu', '-gozaimasen',
            '-itashimasu', '-itashimashita', '-mousu', '-moushimasu',
            '-moushiageru', '-moushiagemasu', '-zonjiru', '-zonjimasu',
            '-ukagau', '-ukagaimasu', '-mairu', '-mairimasu', '-haiken',
            '-haikenshimasu',
            # Chinese romanizations (keeping existing)
            '-xiong', '-di', '-ge', '-gege', '-didi', '-jie', '-jiejie',
            '-meimei', '-shixiong', '-shidi', '-shijie', '-shimei', '-gongzi',
            '-guniang', '-xiaojie', '-daren', '-qianbei', '-daoyou', '-zhanglao',
            '-shibo', '-shishu', '-shifu', '-laoshi', '-xiansheng', '-daxia',
            '-shaoxia', '-nvxia', '-jushi', '-shanren', '-dazhang', '-zhenren',
            # Ancient Chinese romanizations
            '-zi', '-gong', '-hou', '-bo', '-jun', '-qing', '-weng', '-fu',
            '-sheng', '-lang', '-langjun', '-niangzi', '-furen', '-gege',
            '-jiejie', '-yeye', '-nainai',
            # Chinese politeness markers romanized
            '-qing', '-jing', '-gong', '-hui', '-ci', '-bai', '-gan', '-chui',
            'qingwen', 'ganwen', 'gongjing', 'jingjing', 'baijian', 'baifang',
            'baituo'
        ]
    }

    # Regex sources (uncompiled strings) for titles, ranks and honorific verb
    # endings, keyed by language.
    # NOTE(review): Python's \b is Unicode-aware and CJK characters count as word
    # characters, so the \b(...)\b patterns only match a title that is delimited
    # by non-word characters on both sides — confirm this is the intent for text
    # where titles are directly followed by particles or names.
    TITLE_PATTERNS = {
        'korean': [
            # Modern titles
            r'\b(왕|여왕|왕자|공주|황제|황후|대왕|대공|공작|백작|자작|남작|기사|장군|대장|원수|제독|함장|대신|재상|총리|대통령|시장|지사|검사|판사|변호사|의사|박사|교수|신부|목사|스님|도사)\b',
            r'\b(폐하|전하|각하|예하|님|대감|영감|나리|도련님|아가씨|부인|선생)\b',
            # Historical/classical titles
            r'\b(대왕|태왕|왕비|왕후|세자|세자빈|대군|군|옹주|공주|부마|원자|원손)\b',
            r'\b(영의정|좌의정|우의정|판서|참판|참의|정승|판사|사또|현령|군수|목사|부사)\b',
            r'\b(대제학|제학|대사간|사간|대사헌|사헌|도승지|승지|한림|사관|내시|환관)\b',
            r'\b(병조판서|이조판서|호조판서|예조판서|형조판서|공조판서)\b',
            r'\b(도원수|부원수|병마절도사|수군절도사|첨절제사|만호|천호|백호)\b',
            r'\b(정일품|종일품|정이품|종이품|정삼품|종삼품|정사품|종사품|정오품|종오품)\b',
            # Korean honorific verb endings patterns
            r'(습니다|ㅂ니다|습니까|ㅂ니까|세요|셔요|십시오|시오)$',
            r'(이에요|예요|이예요|에요|어요|아요|여요|해요)$',
            r'(으시|시)(었|겠|ㄹ|을|는|던)*(습니다|ㅂ니다|어요|아요|세요)',
            r'(드립니다|드려요|드릴게요|드리겠습니다|올립니다|올려요)$',
            r'(사옵니다|여쭙니다|여쭤요|뵙니다|뵈요|모십니다)$',
            r'(나이다|사옵나이다|옵니다|으오|으옵소서|사이다)$'
        ],
        'japanese': [
            # Modern titles
            r'\b(王|女王|王子|姫|皇帝|皇后|天皇|皇太子|大王|大公|公爵|伯爵|子爵|男爵|騎士|将軍|大将|元帥|提督|艦長|大臣|宰相|総理|大統領|市長|知事|検事|裁判官|弁護士|医者|博士|教授|神父|牧師|僧侶|道士)\b',
            r'\b(陛下|殿下|閣下|猊下|様|大人|殿|卿|君|氏)\b',
            # Historical titles
            r'\b(天皇|皇后|皇太子|親王|内親王|王|女王|太政大臣|左大臣|右大臣|内大臣|大納言|中納言|参議)\b',
            r'\b(関白|摂政|征夷大将軍|管領|執権|守護|地頭|代官|奉行|与力|同心)\b',
            r'\b(太政官|神祇官|式部省|治部省|民部省|兵部省|刑部省|大蔵省|宮内省)\b',
            r'\b(大僧正|僧正|大僧都|僧都|律師|大法師|法師|大禅師|禅師)\b',
            r'\b(正一位|従一位|正二位|従二位|正三位|従三位|正四位|従四位|正五位|従五位)\b',
            r'\b(大和守|山城守|摂津守|河内守|和泉守|伊賀守|伊勢守|尾張守|三河守|遠江守)\b',
            # Japanese keigo (honorific language) patterns
            r'(です|ます|ございます)$',
            r'(いらっしゃ|おっしゃ|なさ|くださ)(います|いました|る|った)$',
            r'(いただ|お|ご|御)(き|きます|きました|く|ける|けます)',
            r'(申し上げ|申し|存じ上げ|存じ|伺い|参り)(ます|ました|る)$',
            r'(拝見|拝聴|承り|承)(します|しました|いたします|いたしました)$',
            # NOTE(review): the trailing [...] below is a character class, not an
            # alternation group, so it matches ONE character out of that set
            # (including '|') — confirm whether (?:になる|になります|...) was intended.
            r'お[^あ-ん]+[になる|になります|くださる|くださいます]'
        ],
        'chinese': [
            # Modern titles
            r'\b(王|女王|王子|公主|皇帝|皇后|大王|大公|公爵|伯爵|子爵|男爵|骑士|将军|大将|元帅|提督|舰长|大臣|宰相|总理|大总统|市长|知事|检察官|法官|律师|医生|博士|教授|神父|牧师|和尚|道士)\b',
            r'\b(陛下|殿下|阁下|大人|老爷|夫人|小姐|公子|少爷|姑娘|先生)\b',
            # Imperial titles
            r'\b(天子|圣上|皇上|万岁|万岁爷|太上皇|皇太后|太后|皇后|贵妃|妃|嫔|贵人|常在|答应)\b',
            r'\b(太子|皇子|皇孙|亲王|郡王|贝勒|贝子|公主|格格|郡主|县主|郡君|县君)\b',
            # Ancient official titles
            r'\b(丞相|相国|太师|太傅|太保|太尉|司徒|司空|大司马|大司农|大司寇)\b',
            r'\b(尚书|侍郎|郎中|员外郎|主事|知府|知州|知县|同知|通判|推官|巡抚|总督)\b',
            r'\b(御史大夫|御史中丞|监察御史|给事中|都察院|翰林院|国子监|钦天监)\b',
            r'\b(大学士|学士|侍读|侍讲|编修|检讨|庶吉士|举人|进士|状元|榜眼|探花)\b',
            # Military ranks
            r'\b(大元帅|元帅|大将军|将军|都督|都指挥使|指挥使|千户|百户|总兵|副将|参将|游击|都司|守备)\b',
            r'\b(提督|总兵官|副总兵|参将|游击将军|都司|守备|千总|把总|外委)\b',
            # Religious titles
            r'\b(国师|帝师|法王|活佛|堪布|仁波切|大和尚|方丈|住持|首座|维那|知客)\b',
            r'\b(天师|真人|道长|掌教|监院|高功|都讲|总理|提点|知观)\b',
            # Nobility ranks
            r'\b(公|侯|伯|子|男|开国公|郡公|国公|郡侯|县侯|郡伯|县伯|县子|县男)\b',
            r'\b(一品|二品|三品|四品|五品|六品|七品|八品|九品|正一品|从一品|正二品|从二品)\b',
            # Chinese politeness markers
            r'(请|敢|恭|敬|烦|有)(问|请|赐|教|告|示)',
            r'(拜|惠|赐|垂|雅|笑)(见|访|读|托|谢|顾|赐|存|纳|正|询)',
            r'(敬|谨|顿)(上|呈|启|白|首)'
        ],
        'english': [
            # Western titles
            r'\b(King|Queen|Prince|Princess|Emperor|Empress|Duke|Duchess|Marquis|Marquess|Earl|Count|Countess|Viscount|Viscountess|Baron|Baroness|Knight|Lord|Lady|Sir|Dame|General|Admiral|Captain|Major|Colonel|Commander|Lieutenant|Sergeant|Minister|Chancellor|President|Mayor|Governor|Judge|Doctor|Professor|Father|Reverend|Master|Mistress)\b',
            r'\b(His|Her|Your|Their)\s+(Majesty|Highness|Grace|Excellency|Honor|Worship|Lordship|Ladyship)\b',
            # Romanized historical titles
            r'\b(Tianzi|Huangdi|Huanghou|Taizi|Qinwang|Junwang|Beile|Beizi|Gongzhu|Gege)\b',
            r'\b(Chengxiang|Zaixiang|Taishi|Taifu|Taibao|Taiwei|Situ|Sikong|Dasima)\b',
            r'\b(Shogun|Daimyo|Samurai|Ronin|Ninja|Tenno|Mikado|Kampaku|Sessho)\b',
            r'\b(Taewang|Wangbi|Wanghu|Seja|Daegun|Gun|Ongju|Gongju|Buma)\b'
        ]
    }

    # Expanded Chinese numbers including classical forms
    # Maps a numeral string to its integer value. This is a direct lookup table:
    # only the spellings listed here are covered (e.g. 26-29 are absent), and
    # '末' ("last") maps to -1.
    CHINESE_NUMS = {
        # Basic numbers
        '一': 1, '二': 2, '三': 3, '四': 4, '五': 5,
        '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
        '十一': 11, '十二': 12, '十三': 13, '十四': 14, '十五': 15,
        '十六': 16, '十七': 17, '十八': 18, '十九': 19, '二十': 20,
        '二十一': 21, '二十二': 22, '二十三': 23, '二十四': 24, '二十五': 25,
        '三十': 30, '四十': 40, '五十': 50, '六十': 60,
        '七十': 70, '八十': 80, '九十': 90, '百': 100,
        # Classical/formal numbers
        '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5,
        '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
        '佰': 100, '仟': 1000, '萬': 10000, '万': 10000,
        # Ordinal indicators
        '第一': 1, '第二': 2, '第三': 3, '第四': 4, '第五': 5,
        '首': 1, '次': 2, '初': 1, '末': -1,
    }

    # Common words - keeping the same for filtering
    # A set of frequent Korean, Japanese and Chinese function words, pronouns,
    # time words, numerals and single digits.
    # NOTE(review): presumably used to discard non-name candidates during term
    # extraction — confirm against the callers.
    COMMON_WORDS = {
        '이', '그', '저', '우리', '너희', '자기', '당신', '여기', '거기', '저기',
        '오늘', '내일', '어제', '지금', '아까', '나중', '먼저', '다음', '마지막',
        '모든', '어떤', '무슨', '이런', '그런', '저런', '같은', '다른', '새로운',
        '하다', '있다', '없다', '되다', '하는', '있는', '없는', '되는',
        '것', '수', '때', '년', '월', '일', '시', '분', '초',
        '은', '는', '이', '가', '을', '를', '에', '의', '와', '과', '도', '만',
        '에서', '으로', '로', '까지', '부터', '에게', '한테', '께', '께서',
        'この', 'その', 'あの', 'どの', 'これ', 'それ', 'あれ', 'どれ',
        'わたし', 'あなた', 'かれ', 'かのじょ', 'わたしたち', 'あなたたち',
        'きょう', 'あした', 'きのう', 'いま', 'あとで', 'まえ', 'つぎ',
        'の', 'は', 'が', 'を', 'に', 'で', 'と', 'も', 'や', 'から', 'まで',
        '这', '那', '哪', '这个', '那个', '哪个', '这里', '那里', '哪里',
        '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们',
        '今天', '明天', '昨天', '现在', '刚才', '以后', '以前', '后来',
        '的', '了', '在', '是', '有', '和', '与', '或', '但', '因为', '所以',
        '一', '二', '三', '四', '五', '六', '七', '八', '九', '十',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
    }
# =====================================================
# CHUNK CONTEXT MANAGER (unchanged - already optimal)
# =====================================================
class ChunkContextManager:
    """Manage context within a chapter separate from history

    Accumulates the (source, translation) pair of each chunk of the chapter
    currently being processed, so that recent chunks can be replayed as
    conversation context and the whole chapter can later be condensed into a
    single summary pair.
    """

    def __init__(self):
        self.current_chunks = []  # one dict per chunk, in the order added
        self.chapter_num = None
        self.chapter_title = None

    def start_chapter(self, chapter_num, chapter_title):
        """Start a new chapter context, discarding any chunks held so far"""
        self.current_chunks = []
        self.chapter_num = chapter_num
        self.chapter_title = chapter_title

    def add_chunk(self, user_content, assistant_content, chunk_idx, total_chunks):
        """Add a chunk to the current chapter context

        Args:
            user_content: Text sent for this chunk
            assistant_content: Translation returned for this chunk
            chunk_idx: Index of this chunk as supplied by the caller
            total_chunks: Chunk count for the chapter as supplied by the caller
        """
        self.current_chunks.append({
            "user": user_content,
            "assistant": assistant_content,
            "chunk_idx": chunk_idx,
            "total_chunks": total_chunks
        })

    def get_context_messages(self, limit=3):
        """Get last N chunks as messages for API context

        Args:
            limit: Maximum number of most recent chunks to include. A value of
                zero or less yields no context.

        Returns:
            Flat list of ``{"role", "content"}`` dicts, alternating user and
            assistant, oldest chunk first.
        """
        # Guard explicitly: a plain [-limit:] slice with limit == 0 is [0:],
        # which would return every chunk instead of none.
        if limit <= 0:
            return []
        context = []
        for chunk in self.current_chunks[-limit:]:
            context.append({"role": "user", "content": chunk["user"]})
            context.append({"role": "assistant", "content": chunk["assistant"]})
        return context

    @staticmethod
    def _excerpt(label, text, max_chars=200):
        """Return ``"label: text"``, cutting text to max_chars and adding "..." when longer"""
        if len(text) > max_chars:
            return f"{label}: {text[:max_chars]}..."
        return f"{label}: {text}"

    def get_summary_for_history(self):
        """Create a summary representation for the history

        Returns:
            ``(user_summary, assistant_summary)`` strings, or ``(None, None)``
            when no chunks have been added. The user summary carries the first
            500 characters of the first chunk's source; the assistant summary
            carries up to 200-character excerpts from the first, middle (more
            than two chunks) and last (more than one chunk) translations.
        """
        if not self.current_chunks:
            return None, None

        chunks = self.current_chunks
        total_chunks = len(chunks)

        user_summary = f"[Chapter {self.chapter_num}: {self.chapter_title}]\n"
        user_summary += f"[{total_chunks} chunks processed]\n"
        first_chunk = chunks[0]['user']
        if len(first_chunk) > 500:
            user_summary += first_chunk[:500] + "..."
        else:
            user_summary += first_chunk

        assistant_summary = f"[Chapter {self.chapter_num} Translation Complete]\n"
        assistant_summary += f"[Translated in {total_chunks} chunks]\n"
        samples = [self._excerpt("Beginning", chunks[0]['assistant'])]
        if total_chunks > 2:
            samples.append(self._excerpt("Middle", chunks[total_chunks // 2]['assistant']))
        if total_chunks > 1:
            samples.append(self._excerpt("End", chunks[-1]['assistant']))
        assistant_summary += "\n".join(samples)

        return user_summary, assistant_summary

    def clear(self):
        """Clear the current chapter context"""
        self.current_chunks = []
        self.chapter_num = None
        self.chapter_title = None
# =====================================================
# UNIFIED UTILITIES
# =====================================================
class FileUtilities:
    """Utilities for file and path operations"""

    @staticmethod
    def extract_actual_chapter_number(chapter, patterns=None, config=None):
        """Work out the chapter number a chapter dict should be known by.

        Resolution order:
        1. Pre-split ``.txt`` chunks keep their float ``num`` (1.1, 1.2, ...).
        2. ``extract_chapter_number_from_filename`` on the first non-empty
           name-like field of the chapter.
        3. The spine position, then ``file_chapter_num``.
        4. Pattern matching on ``original_basename`` (decimal forms only when
           ENABLE_DECIMAL_CHAPTERS=1).
        5. ``chapter['num']`` (default 0).

        Args:
            chapter: Chapter dict
            patterns: Regexes for step 4; defaults to
                PatternManager.FILENAME_EXTRACT_PATTERNS
            config: Not used by this function

        Returns:
            The chapter number (int or float, or whatever the fallback field holds)
        """
        # IMPORTANT: pre-split TEXT FILE chunks are checked before anything else
        is_text_chunk = (
            chapter.get('is_chunk', False)
            and 'num' in chapter
            and isinstance(chapter['num'], float)
            and chapter.get('filename', '').endswith('.txt')
        )
        if is_text_chunk:
            # For text file chunks only, preserve the decimal number
            return chapter['num']

        # First non-empty name-like field wins (broadened to match GUI/spine data)
        name_fields = (
            'original_basename', 'original_filename', 'filename',
            'source_filename', 'href', 'idref', 'id', 'name', 'key',
        )
        filename = ''
        for field in name_fields:
            candidate = chapter.get(field)
            if candidate:
                filename = candidate
                break

        spine_position = chapter.get('spine_order')
        if spine_position is None:
            spine_position = chapter.get('opf_spine_position')

        resolved, _method = extract_chapter_number_from_filename(
            filename, opf_spine_position=spine_position
        )

        # Extraction failed (no digits and no special): fall back to spine/file data
        if resolved is None:
            if spine_position is not None:
                resolved = spine_position
            elif chapter.get('file_chapter_num') is not None:
                resolved = chapter['file_chapter_num']

        if resolved is not None:
            return resolved

        # Fallback to the older basename heuristics for edge cases
        if patterns is None:
            patterns = PatternManager.FILENAME_EXTRACT_PATTERNS

        basename = chapter.get('original_basename')
        if basename:
            # Decimal chapter detection for EPUBs is opt-in
            decimal_enabled = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1'

            if decimal_enabled:
                # Standard decimal chapter numbers (e.g., Chapter_1.1, 1.2.html)
                dotted = re.search(r'(\d+)\.(\d+)', basename)
                if dotted:
                    return float(f"{dotted.group(1)}.{dotted.group(2)}")

                # XXXX_YY where it represents X.YY decimal chapters
                padded = re.match(r'^(\d{4})_(\d{1,2})(?:_|\.)?(?:x?html?)?$', basename)
                if padded:
                    first_part, second_part = padded.group(1), padded.group(2)
                    if len(second_part) == 2 and int(second_part) > 9:
                        return float(f"{int(first_part[-1])}.{second_part}")

            # Standard XXXX_Y format: the part after the underscore is the number
            paired = re.match(r'^(\d+)_(\d+)', basename)
            if paired:
                second_part = paired.group(2)
                if not decimal_enabled:
                    return int(second_part)
                if len(second_part) == 1 or (len(second_part) == 2 and int(second_part) <= 9):
                    return int(second_part)

            # Remaining patterns, minus the prefix-number ones covered above
            skipped = (r'^(\d+)[_\.]', r'(\d{3,5})[_\.]', r'^(\d+)_')
            for pattern in patterns:
                if pattern in skipped:
                    continue
                hit = re.search(pattern, basename, re.IGNORECASE)
                if hit:
                    resolved = int(hit.group(1))
                    break

        # Final fallback to chapter num
        if resolved is None:
            resolved = chapter.get("num", 0)
            print(f"[DEBUG] No pattern matched, using chapter num: {resolved}")
        return resolved

    @staticmethod
    def create_chapter_filename(chapter, actual_num=None):
        """Build the output filename for a chapter.

        Reads USE_HEADER_AS_OUTPUT and ENABLE_DECIMAL_CHAPTERS from the
        environment and the retain-source-extension toggle. When retaining,
        names carry no ``response_`` prefix and EPUB-derived names keep the
        original extension chain; otherwise names are ``response_<stem>`` with
        ``.html`` (or ``.txt`` for text files / chunks).

        Args:
            chapter: Chapter dict
            actual_num: Chapter number to use; looked up on the chapter when None

        Returns:
            The filename string
        """
        header_naming = os.getenv("USE_HEADER_AS_OUTPUT", "0") == "1"
        is_text_file = chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False)
        # Toggle: retain source extension and remove 'response_' prefix
        retain = should_retain_source_extension()
        prefix = '' if retain else 'response_'

        def _original_extension_chain():
            """Full extension chain of original_filename (e.g. '.html.xhtml'), '.html' if none."""
            source_name = chapter.get('original_filename')
            if not source_name:
                return '.html'
            stem, suffix = os.path.splitext(os.path.basename(source_name))
            suffixes = []
            while suffix:
                suffixes.append(suffix)
                stem, suffix = os.path.splitext(stem)
            return ''.join(reversed(suffixes)) or '.html'

        def _named(stem):
            """Apply the text/retain naming rules to a filename stem."""
            if is_text_file:
                return f"{prefix}{stem}.txt"
            if retain:
                # If retaining, use full original ext chain
                return f"{stem}{_original_extension_chain()}"
            return f"response_{stem}.html"

        if header_naming and chapter.get('title'):
            naming_num = actual_num or chapter.get('num', 0)
            safe_title = make_safe_filename(chapter['title'], naming_num)
            # Name the title is compared against; handles int and float chapter numbers
            if isinstance(naming_num, float):
                whole = int(naming_num)
                hundredths = int(round((naming_num - whole) * 100))
                if hundredths > 0:
                    default_name = f"chapter_{whole:03d}_{hundredths:02d}"
                else:
                    default_name = f"chapter_{whole:03d}"
            else:
                default_name = f"chapter_{naming_num:03d}"
            if safe_title and safe_title != default_name:
                return _named(safe_title)

        decimal_enabled = os.getenv('ENABLE_DECIMAL_CHAPTERS', '0') == '1'

        # EPUB handling - name after the original basename
        original_basename = chapter.get('original_basename')
        if original_basename:
            stem = os.path.splitext(original_basename)[0]
            if decimal_enabled and re.search(r'(\d+)\.(\d+)', original_basename):
                # Decimal pattern (e.g., Chapter_1.1): dots become underscores
                # for filesystem compatibility
                return _named(stem.replace('.', '_'))
            return _named(stem)

        # Text file handling (no original basename)
        if actual_num is None:
            actual_num = chapter.get('actual_chapter_num', chapter.get('num', 0))

        if not isinstance(actual_num, float):
            # Integer chapter numbers use standard formatting
            if is_text_file:
                return f"{prefix}section_{actual_num}.txt"
            return f"{prefix}{actual_num:03d}.html"

        # Decimal chapter numbers from text file splitting
        major = int(actual_num)
        minor = int(round((actual_num - major) * 10))  # *10 gives 0, 1, 2, ... from 1.0, 1.1, 1.2

        # PDF chunks keep the .html or .md extension of the chunk's own filename
        chunk_filename = chapter.get('filename', '')
        if chunk_filename and (chunk_filename.endswith('.html') or chunk_filename.endswith('.md')):
            file_ext = '.html' if chunk_filename.endswith('.html') else '.md'
            return f"{prefix}section_{major}_{minor}{file_ext}"
        if is_text_file:
            return f"{prefix}section_{major}_{minor}.txt"
        return f"{prefix}{major:03d}_{minor:02d}.html"
# =====================================================
# UNIFIED PROGRESS MANAGER
# =====================================================
class ProgressManager:
"""Unified progress management"""
def __init__(self, payloads_dir):
self.payloads_dir = payloads_dir
self.PROGRESS_FILE = os.path.join(payloads_dir, "translation_progress.json")
self.prog = self._init_or_load()
# Disable auto-dedup unless explicitly enabled; dedup can drop distinct chapters sharing filenames
if os.getenv("ENABLE_PROGRESS_DEDUP", "0") == "1":
self._dedup_by_output()
def _init_or_load(self):
    """Load the progress file, repairing or resetting it when it is unusable.

    Behaviour:
      * No file on disk: return a fresh structure (version "2.1").
      * File parses as a JSON object: return it, back-filling the
        ``chapters`` / ``chapter_chunks`` keys when they are absent. A legacy
        ``completed`` index list is converted into ``chapters`` entries.
      * File is malformed: strip trailing commas and retry; on success the
        repaired JSON is written back to disk.
      * File cannot be repaired, or its top-level value is not a JSON
        object: copy it to a timestamped backup (best effort) and return an
        empty structure (version "2.0").

    Returns:
        dict: The progress structure to store in ``self.prog``.
    """
    if not os.path.exists(self.PROGRESS_FILE):
        return {
            "chapters": {},
            "chapter_chunks": {},
            "image_chunks": {},
            "version": "2.1"
        }

    def _backup_and_reset(reason):
        """Copy the unusable file aside (best effort) and return an empty structure."""
        print(f"❌ Could not fix progress file: {reason}")
        print("🔄 Creating backup and starting fresh...")
        backup_name = f"translation_progress_backup_{int(time.time())}.json"
        backup_path = os.path.join(self.payloads_dir, backup_name)
        try:
            shutil.copy(self.PROGRESS_FILE, backup_path)
            print(f"📁 Backup saved to: {backup_name}")
        except OSError as backup_error:
            # A failed backup must not prevent starting fresh, but say so.
            print(f"⚠️ Warning: Could not back up progress file: {backup_error}")
        return {
            "chapters": {},
            "chapter_chunks": {},
            "version": "2.0"
        }

    def _require_object(data):
        """Reject JSON whose top-level value is not an object (dict)."""
        if not isinstance(data, dict):
            raise ValueError(
                f"expected a JSON object at the top level, got {type(data).__name__}"
            )
        return data

    try:
        with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf:
            prog = json.load(pf)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"⚠️ Warning: Progress file is corrupted: {e}")
        print("🔧 Attempting to fix JSON syntax...")
        try:
            with open(self.PROGRESS_FILE, "r", encoding="utf-8") as pf:
                content = pf.read()
            # Remove trailing commas before a closing bracket / brace.
            content = re.sub(r',\s*\]', ']', content)
            content = re.sub(r',\s*\}', '}', content)
            prog = _require_object(json.loads(content))
            with open(self.PROGRESS_FILE, "w", encoding="utf-8") as pf:
                json.dump(prog, pf, ensure_ascii=False, indent=2)
            print("✅ Successfully fixed and saved progress file")
        except Exception as fix_error:
            prog = _backup_and_reset(fix_error)
    else:
        # Syntactically valid JSON can still be the wrong shape (e.g. "[]");
        # indexing it like a dict below would crash the constructor.
        if not isinstance(prog, dict):
            print(
                "⚠️ Warning: Progress file is corrupted: "
                f"top-level value is {type(prog).__name__}, not an object"
            )
            prog = _backup_and_reset("unexpected JSON structure")

    if "chapters" not in prog:
        prog["chapters"] = {}
        # Legacy layout: a flat list of completed chapter indices.
        for idx in prog.get("completed", []):
            prog["chapters"][str(idx)] = {
                "status": "completed",
                "timestamp": None
            }
    if "chapter_chunks" not in prog:
        prog["chapter_chunks"] = {}
    return prog
def _dedup_by_output(self):
    """Collapse progress entries that point at the same output file.

    Entries are grouped by output filename with the ``response_`` prefix and
    the last extension removed (falling back to the entry's key when it has
    no output file). Within a group the entry with the highest status rank
    wins (qa_failed > completed/merged > pending > failed > in_progress >
    anything else); ties go to the larger ``last_updated``. Survivors are
    re-keyed by ``str(actual_num)`` when a number is known, and the same
    ranking resolves collisions between the new keys.

    The result replaces ``self.prog["chapters"]``; nothing is written to disk.
    """
    status_rank = {'qa_failed': 6, 'completed': 5, 'merged': 5, 'pending': 4, 'failed': 3, 'in_progress': 2, 'unknown': 0}

    def _stem(fname):
        """Basename without a leading ``response_`` and without its last extension."""
        if not fname:
            return None
        name = os.path.basename(fname)
        if name.startswith("response_"):
            name = name[len("response_"):]
        return os.path.splitext(name)[0]

    def _number_hint(fname):
        """First number in the name, or the last one when the first is 0."""
        if not fname:
            return None
        found = [int(digits) for digits in re.findall(r"\d+", fname)]
        if not found:
            return None
        first, last = found[0], found[-1]
        return last if (first == 0 and last > 0) else first

    def _outranks(challenger, holder):
        """True when *challenger* should replace *holder*."""
        challenger_rank = status_rank.get(challenger.get("status", "unknown"), 0)
        holder_rank = status_rank.get(holder.get("status", "unknown"), 0)
        if challenger_rank != holder_rank:
            return challenger_rank > holder_rank
        return challenger.get("last_updated", 0) > holder.get("last_updated", 0)

    # Pass 1: one winner per normalized output filename.
    by_output = {}
    for key, info in list(self.prog.get("chapters", {}).items()):
        out = info.get("output_file")
        group = _stem(out) or key
        if out and info.get("actual_num") in (None, 0):
            # Missing/zero chapter number: try to recover it from the filename.
            hint = _number_hint(out)
            if hint is not None:
                info["actual_num"] = hint
        holder = by_output.get(group)
        if not holder or _outranks(info, holder):
            by_output[group] = info

    # Pass 2: re-key by chapter number, resolving collisions the same way.
    rekeyed = {}
    for group, info in by_output.items():
        target = group if info.get("actual_num") is None else str(info["actual_num"])
        if target not in rekeyed or _outranks(info, rekeyed[target]):
            rekeyed[target] = info

    self.prog["chapters"] = rekeyed
    # NOTE: caller is responsible for saving after dedup
def _get_chapter_key(self, actual_num, output_file=None, chapter_obj=None, content_hash=None):
    """Pick the progress-dict key for a chapter, avoiding collisions.

    Three key shapes are produced:
      * simple:    ``str(actual_num)`` (decimals such as 1.5 keep their
        fractional part, so chunked chapters do not collide)
      * spine:     ``"<actual_num>@<spine position>"``
      * composite: ``"<actual_num>_<output basename without response_/ext>"``

    The output filename is *output_file* when given, otherwise it is derived
    from *chapter_obj*; with neither, the simple key is returned right away.

    Returns:
        str: The key to use for this chapter in ``self.prog["chapters"]``.
    """
    def _comparable_stem(path):
        """Basename minus ``response_`` prefix and last extension (None if empty)."""
        if not path:
            return None
        name = os.path.basename(path)
        if name.startswith('response_'):
            name = name[len('response_'):]
        # Only the stem is compared, so .html vs .xhtml do not diverge.
        stem, _ext = os.path.splitext(name)
        return stem

    def _filename_key(path):
        """Composite key built from the chapter number and the file's stem."""
        stem, _ext = os.path.splitext(os.path.basename(path))
        return f"{actual_num}_" + stem.replace("response_", "")

    # Spine position, when the chapter object carries one.
    spine_pos = None
    if chapter_obj:
        spine_pos = chapter_obj.get('spine_order')
        if spine_pos is None:
            spine_pos = chapter_obj.get('opf_spine_position')

    simple_key = str(actual_num)

    # Work out which output file this chapter maps to.
    if output_file:
        filename = output_file
    elif chapter_obj:
        from TransateKRtoEN import FileUtilities
        filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num)
    else:
        # Nothing to compare against: the simple key is all we can offer.
        return simple_key

    tracked = self.prog["chapters"]

    # An entry with the same content hash and no output file yet is the
    # in-progress record for this very chapter; keep updating it in place.
    if content_hash and simple_key in tracked:
        pending = tracked[simple_key]
        if pending.get("content_hash") == content_hash and not pending.get("output_file"):
            return simple_key

    spine_key = None if spine_pos is None else f"{actual_num}@{spine_pos}"

    # An existing spine entry is reused only on an exact filename match, so
    # notice/chapter files sharing a number are not mixed up.
    if spine_key and spine_key in tracked and tracked[spine_key].get("output_file") == filename:
        return spine_key

    if simple_key in tracked:
        current = tracked[simple_key]
        current_file = current.get("output_file")
        if current_file == filename:
            return simple_key

        # Same file apart from the response_ prefix / extension toggle.
        current_stem = _comparable_stem(current_file)
        wanted_stem = _comparable_stem(filename)
        if current_stem and wanted_stem and current_stem == wanted_stem:
            return simple_key

        # Merged entries point at the parent's output file, so the filename
        # never matches; the simple key is still how they are looked up.
        if current.get("status") == "merged":
            return simple_key

        # A different file owns the simple key: use a distinct key instead.
        return spine_key if spine_key else _filename_key(filename)

    # Simple key unused: reuse a spine/composite entry when one exists.
    filename_key = _filename_key(filename)
    if spine_key and spine_key in tracked:
        return spine_key
    if filename_key in tracked:
        return filename_key

    # Brand-new entry.
    return spine_key or simple_key
def save(self):
    """Write ``self.prog`` to the progress file.

    Before writing, ``self.prog["completed_list"]`` is rebuilt from every
    chapter whose status is ``"completed"`` and that has an output file,
    ordered by chapter number (entries without a numeric number go last).

    The JSON is written to ``<PROGRESS_FILE>.tmp`` and then moved over the
    real file with ``os.replace``, so there is never a moment at which the
    previous progress file has been removed but the new one is not yet in
    place.

    Saving is best effort: any failure is reported with a warning, the
    temporary file is cleaned up, and no exception is raised.
    """
    temp_file = self.PROGRESS_FILE + '.tmp'

    def _by_chapter_number(entry):
        # A missing or non-numeric chapter number must not abort the whole
        # save with a TypeError during sorting; such entries sort last.
        num = entry["num"]
        if isinstance(num, (int, float)):
            return (0, num)
        return (1, 0)

    try:
        completed = []
        for chapter_key, chapter_info in self.prog.get("chapters", {}).items():
            if chapter_info.get("status") == "completed" and chapter_info.get("output_file"):
                actual_num = chapter_info.get("actual_num", 0)
                completed.append({
                    "num": actual_num,
                    "idx": 0,  # idx is not used anymore
                    "title": f"Chapter {actual_num}",
                    "file": chapter_info.get("output_file", ""),
                    "key": chapter_key
                })
        completed.sort(key=_by_chapter_number)
        self.prog["completed_list"] = completed

        with open(temp_file, "w", encoding="utf-8") as pf:
            json.dump(self.prog, pf, ensure_ascii=False, indent=2)
        # Single step: overwrites an existing destination on every platform.
        os.replace(temp_file, self.PROGRESS_FILE)
    except Exception as e:
        print(f"⚠️ Warning: Failed to save progress: {e}")
        if os.path.exists(temp_file):
            try:
                os.remove(temp_file)
            except OSError:
                # Leaving a stale .tmp behind is harmless; the next save
                # overwrites it.
                pass
def update(self, idx, actual_num, content_hash, output_file, status="in_progress", ai_features=None, raw_num=None, chapter_obj=None, merged_chapters=None, qa_issues_found=None):
    """Record the latest state of a chapter in ``self.prog["chapters"]``.

    The entry is rebuilt from scratch on every call, so optional fields
    (merged-chapter list, QA fields) survive only when they are passed in
    again. The exception is ``ai_features``, which is carried over from the
    previous entry when no new value is supplied.

    When the new status is qa_failed / failed / pending / error and the
    existing entry lists merged children, every entry marked ``merged`` with
    this chapter number as its parent is deleted.

    Nothing is written to disk here.
    """
    chapter_key = self._get_chapter_key(actual_num, output_file, chapter_obj, content_hash)

    # Make it visible in the log when a non-trivial (composite) key is used.
    if chapter_key != str(actual_num) and "_" in chapter_key:
        print(f"📌 Using composite key for chapter {actual_num}: {chapter_key}")

    tracked = self.prog["chapters"]

    # A parent that has to be redone drags its merged children with it.
    if status in ("qa_failed", "failed", "pending", "error") and chapter_key in tracked:
        merged_child_nums = tracked[chapter_key].get("merged_chapters", [])
        if merged_child_nums:
            print(f"🔓 Clearing merged status from {len(merged_child_nums)} child chapters due to parent status: {status}")
            for child_key, child in list(tracked.items()):
                is_child_of_this = (
                    child.get("status") == "merged"
                    and child.get("merged_parent_chapter") == actual_num
                )
                if not is_child_of_this:
                    continue
                child_actual_num = child.get("actual_num")
                print(f" 🔓 Clearing merged status for chapter {child_actual_num}")
                # Removing the entry is what makes the child eligible again.
                del tracked[child_key]

    entry = {
        "actual_num": actual_num,
        "content_hash": content_hash,
        "output_file": output_file,
        "status": status,
        "last_updated": time.time()
    }

    # Source basename, stored for mapping source files to output files.
    if chapter_obj:
        basename = chapter_obj.get('original_basename')
        if basename:
            entry["original_basename"] = basename
        elif chapter_obj.get('original_filename'):
            entry["original_basename"] = os.path.basename(chapter_obj['original_filename'])

    if raw_num is not None:
        entry["raw_chapter_num"] = raw_num

    # zero_adjusted records whether the raw number was shifted; it is always
    # False when zero detection is switched off via the builtins flag.
    zero_detection_off = hasattr(builtins, '_DISABLE_ZERO_DETECTION') and builtins._DISABLE_ZERO_DETECTION
    if zero_detection_off or raw_num is None:
        entry["zero_adjusted"] = False
    else:
        entry["zero_adjusted"] = (raw_num != actual_num)

    # New AI features win; otherwise keep what the previous entry had.
    if ai_features is not None:
        entry["ai_features"] = ai_features
    elif chapter_key in tracked and "ai_features" in tracked[chapter_key]:
        entry["ai_features"] = tracked[chapter_key]["ai_features"]

    if merged_chapters is not None:
        entry["merged_chapters"] = merged_chapters

    # QA fields exist only when issues are supplied; because the entry is
    # rebuilt, stale QA markers from an earlier qa_failed state do not persist.
    if qa_issues_found is not None:
        entry["qa_issues"] = True
        entry["qa_timestamp"] = time.time()
        entry["qa_issues_found"] = qa_issues_found

    tracked[chapter_key] = entry
def mark_as_merged(self, idx, actual_num, content_hash, parent_chapter_num, chapter_obj=None, parent_output_file=None):
    """Record that a chapter's content was folded into a parent chapter.

    The stored entry has status ``"merged"``, names the parent chapter
    number, and uses the parent's output file as its own ``output_file``.
    Any existing entry under the same key is replaced. Nothing is written to
    disk here.
    """
    chapter_key = self._get_chapter_key(
        actual_num,
        output_file=None,
        chapter_obj=chapter_obj,
        content_hash=content_hash,
    )

    record = {
        "actual_num": actual_num,
        "content_hash": content_hash,
        "output_file": parent_output_file,  # the merged text lives in the parent's file
        "status": "merged",
        "merged_parent_chapter": parent_chapter_num,
        "last_updated": time.time()
    }

    # Source name for matching by source filename: prefer original_basename,
    # fall back to the chapter's filename.
    if chapter_obj:
        for field in ('original_basename', 'filename'):
            if field in chapter_obj:
                record["original_basename"] = chapter_obj[field]
                break

    self.prog["chapters"][chapter_key] = record
def update_merged_chapters_list(self, parent_chapter_num, merged_chapter_nums, parent_content_hash=None, parent_chapter_obj=None):
    """Store on the parent entry the list of chapters merged into it.

    Does nothing when the parent has no entry in ``self.prog["chapters"]``.
    """
    parent_key = self._get_chapter_key(
        parent_chapter_num,
        output_file=None,
        chapter_obj=parent_chapter_obj,
        content_hash=parent_content_hash,
    )
    try:
        parent_entry = self.prog["chapters"][parent_key]
    except KeyError:
        # Parent is not tracked (yet): there is nothing to annotate.
        return
    parent_entry["merged_chapters"] = merged_chapter_nums
def check_chapter_status(self, chapter_idx, actual_num, content_hash, output_dir, chapter_obj=None):
"""Check if a chapter needs translation.

Returns a 3-tuple ``(needs_translation, message, output_file)``:
- ``(True, None, None)`` when the chapter has to be (re)translated;
- ``(False, <explanatory text>, <output file or None>)`` when it can be
  skipped (already translated, already on disk, or merged into a parent);
- ``(True, None, <tracked output file>)`` when another entry already tracks
  the expected output file and the "already translated" return was not taken.

Side effects visible in this method: it may delete the chapter's entry (and
its ``chapter_chunks`` entry), rewrite a stored ``output_file``, add an
``auto_discovered`` entry for a file found on disk, and call ``self.save()``.
"""
# Define _norm helper early so it's available throughout this method
def _norm(fname: str):
"""
Normalize a filename for comparison:
- drop leading response_ prefix
- strip *all* extensions (handle .html.xhtml, .md.html, etc.)
- lowercase for case-insensitive matching on Windows
"""
if not fname:
return ""
base = os.path.basename(fname)
if base.startswith("response_"):
base = base[len("response_"):]
# Strip all extensions, not just the last one
while True:
base, ext = os.path.splitext(base)
if not ext:
break
return base.lower()
# If caller passed 0/None, recompute from filename/spine to avoid collapsing to chapter 0
if (actual_num is None or actual_num <= 0) and chapter_obj:
try:
from TransateKRtoEN import FileUtilities
recomputed = FileUtilities.extract_actual_chapter_number(chapter_obj, patterns=None, config=None)
if recomputed is not None:
actual_num = recomputed
except Exception:
# Best effort: on any failure the caller-supplied number is kept
pass
# Use helper method to get consistent key
chapter_key = self._get_chapter_key(actual_num, output_file=None, chapter_obj=chapter_obj, content_hash=content_hash)
# Check if we have tracking for this chapter
if chapter_key in self.prog["chapters"]:
chapter_info = self.prog["chapters"][chapter_key]
status = chapter_info.get("status")
# NOTE(review): status_l is assigned here but not read anywhere else in this method
status_l = status.lower() if isinstance(status, str) else status or ""
# Failed statuses ALWAYS trigger retranslation
if status in ["qa_failed", "failed", "error", "file_missing"]:
return True, None, None
# Merged status - skip translation, content is in parent chapter
if status == "merged":
parent_chapter = chapter_info.get("merged_parent_chapter")
return False, f"Chapter {actual_num} merged into chapter {parent_chapter}", None
# Completed - check file exists
if status in ["completed", "completed_empty", "completed_image_only"]:
output_file = chapter_info.get("output_file")
if output_file:
output_path = os.path.join(output_dir, output_file)
if os.path.exists(output_path):
return False, f"Chapter {actual_num} already translated: {output_file}", output_file
# Fallback: look for any file with same base name (ignore extensions)
expected_norm = _norm(output_file)
try:
for f in os.listdir(output_dir):
if _norm(f) == expected_norm:
alt_path = os.path.join(output_dir, f)
if os.path.exists(alt_path):
# Update stored filename to the discovered one
self.prog["chapters"][chapter_key]["output_file"] = f
self.save()
return False, f"Chapter {actual_num} already translated: {f}", f
except Exception:
# Directory listing failed: fall through and treat the file as missing
pass
# File missing - retranslate
# The entry and any chunk bookkeeping stored under the same key are dropped, then saved
del self.prog["chapters"][chapter_key]
if chapter_key in self.prog.get("chapter_chunks", {}):
del self.prog["chapter_chunks"][chapter_key]
self.save()
return True, None, None
# Any other status - retranslate
return True, None, None
# No entry in progress tracking - check if file exists on disk
# This handles the case where progress file was deleted but translated files remain
if chapter_obj:
from TransateKRtoEN import FileUtilities
output_filename = FileUtilities.create_chapter_filename(chapter_obj, actual_num)
output_path = os.path.join(output_dir, output_filename)
# If a differently-keyed entry already tracks this file, reuse it instead of auto-discovering
expected_norm = _norm(output_filename)
for k, info in self.prog.get("chapters", {}).items():
if _norm(info.get("output_file")) == expected_norm:
status = info.get("status")
if status in ["completed", "completed_empty", "completed_image_only"]:
if info.get("output_file"):
if os.path.exists(os.path.join(output_dir, info["output_file"])):
return False, f"Chapter {info.get('actual_num', actual_num)} already translated: {info['output_file']}", info["output_file"]
# If tracked with other status, treat as tracked (will retranslate if non-completed)
return True, None, info.get("output_file")
# Check if file exists for auto-discovery
if os.path.exists(output_path):
print(f"📁 Found existing file for chapter {actual_num}: {output_filename}")
# Register the discovered file as completed; last_updated is taken from the file's mtime
self.prog["chapters"][chapter_key] = {
"actual_num": actual_num,
"content_hash": content_hash,
"output_file": output_filename,
"status": "completed",
"last_updated": os.path.getmtime(output_path),
"auto_discovered": True
}
self.save()
return False, f"Chapter {actual_num} already exists: {output_filename}", output_filename
# No entry and no file - needs translation
return True, None, None
def cleanup_missing_files(self, output_dir):
"""Remove missing files and clear merged children of missing parents.

Works in two passes over ``self.prog["chapters"]``:
1. Entries whose ``output_file`` no longer exists in *output_dir* are deleted
   (together with their ``chapter_chunks`` entry), unless a renamed variant
   of the file is found, in which case the stored name is updated instead.
   Entries with status merged, qa_failed, failed, in_progress or pending*
   are left alone in this pass.
2. Entries with status ``merged`` whose parent chapter number was collected
   in pass 1 are deleted as well.

Mutates ``self.prog`` in place, prints a summary when something was removed,
and does not call ``self.save()`` itself.
"""
cleaned_count = 0
deleted_parents = set() # Track which parent chapters were deleted
parents_with_missing_files = set() # Track parents with missing files (for merged children clearing)
# First pass: Remove entries for missing files (except merged children and certain non-final states)
# Iterates over a list() snapshot because entries are deleted inside the loop
for chapter_key, chapter_info in list(self.prog["chapters"].items()):
output_file = chapter_info.get("output_file")
status = chapter_info.get("status")
# Lower-cased, stripped status text; non-string statuses are stringified, None becomes ""
status_l = status.lower().strip() if isinstance(status, str) else (str(status).lower().strip() if status is not None else "")
# MERGED CHAPTERS FIX: Don't delete merged children in first pass
# They will be handled in second pass if their parent was deleted
if status == "merged":
continue
# QA_FAILED / FAILED / IN_PROGRESS / PENDING FIX:
# Don't delete entries that are meant to be visible in the retranslation UI
# even when their output file is missing.
# - qa_failed/failed: should remain visible for investigation/retry
# - in_progress: file doesn't exist yet because translation is ongoing
# - pending: user explicitly marked for retranslation; file may have been deleted on purpose
if status_l.startswith("pending") or status_l in ["qa_failed", "failed", "in_progress"]:
continue
if output_file:
output_path = os.path.join(output_dir, output_file)
if not os.path.exists(output_path):
# Before deleting, check if the file was renamed (response_/extension toggle)
# NOTE(review): _html_exts and _norm_cleanup are re-created for every missing file,
# and the directory is listed again each time
_html_exts = {'.html', '.xhtml', '.htm', '.xml'}
def _norm_cleanup(fn):
# Basename without response_ prefix and without any trailing HTML-like extensions, lower-cased
b = os.path.basename(fn)
if b.startswith('response_'):
b = b[len('response_'):]
while True:
b2, e2 = os.path.splitext(b)
if e2.lower() in _html_exts:
b = b2
else:
break
return b.lower()
expected_norm = _norm_cleanup(output_file)
renamed_match = None
try:
# Only .html/.xhtml/.htm files are considered as rename candidates
for f in os.listdir(output_dir):
if f.lower().endswith(('.html', '.xhtml', '.htm')):
if _norm_cleanup(f) == expected_norm:
renamed_match = f
break
except Exception:
# Listing failed: behave as if no renamed file was found
pass
if renamed_match:
# File was renamed (retain toggle) – update the stored filename
chapter_info['output_file'] = renamed_match
continue
actual_num = chapter_info.get("actual_num")
if actual_num is not None:
# Track if this was a parent of merged chapters
deleted_parents.add(actual_num)
# Also track if this chapter has merged children (for later clearing)
if chapter_info.get("merged_chapters"):
parents_with_missing_files.add(actual_num)
# Delete the entry
del self.prog["chapters"][chapter_key]
# Remove chunk data
if chapter_key in self.prog.get("chapter_chunks", {}):
del self.prog["chapter_chunks"][chapter_key]
cleaned_count += 1
# Second pass: Clear merged children whose parents were deleted OR have missing files
if deleted_parents or parents_with_missing_files:
all_affected_parents = deleted_parents | parents_with_missing_files
for chapter_key, chapter_info in list(self.prog["chapters"].items()):
if chapter_info.get("status") == "merged":
parent_num = chapter_info.get("merged_parent_chapter")
if parent_num in all_affected_parents:
actual_num = chapter_info.get("actual_num")
print(f"🔓 Clearing merged child chapter {actual_num} (parent {parent_num} file is missing)")
del self.prog["chapters"][chapter_key]
cleaned_count += 1
if cleaned_count > 0:
print(f"🔄 Removed {cleaned_count} missing file entries")
def migrate_to_content_hash(self, chapters):
    """Re-key progress entries by chapter number, de-duplicate and sort them.

    Steps:
      1. Group entries by output filename (``response_`` prefix and last
         extension removed; the entry's key is used when it has no output
         file) and keep one winner per group.
      2. For entries whose ``actual_num`` is missing or 0, infer the chapter
         number from the digits in the output filename.
      3. Re-key the winners by ``str(actual_num)`` (or the normalized name
         when no number is known), resolving key collisions with the same
         ranking.
      4. Sort ``chapters`` and ``chapter_chunks`` by chapter number and save.

    Ranking: the higher status rank wins (qa_failed > completed/merged >
    pending > failed > in_progress > anything else); ties go to the larger
    ``last_updated``.

    Args:
        chapters: Not read by this method.
    """
    # Higher value wins when two entries compete for the same slot.
    severity_rank = {'qa_failed': 6, 'completed': 5, 'merged': 5, 'pending': 4, 'failed': 3, 'in_progress': 2, 'unknown': 0}

    def _normalize_out(fname):
        """Basename without a leading ``response_`` and without its last extension."""
        if not fname:
            return None
        base = os.path.basename(fname)
        if base.startswith('response_'):
            base = base[len('response_'):]
        return os.path.splitext(base)[0]

    def _infer_num_from_filename(fname):
        """First number in the name, or the last one when the first is 0."""
        if not fname:
            return None
        # r'\d+' (one backslash): the previous r'\\d+' looked for a literal
        # backslash followed by "d" and therefore never found any digits.
        nums = [int(n) for n in re.findall(r'\d+', fname)]
        if not nums:
            return None
        if nums[0] == 0 and nums[-1] > 0:
            return nums[-1]
        return nums[0]

    def _supersedes(candidate, incumbent):
        """True when *candidate* should replace *incumbent*."""
        cand_sev = severity_rank.get(candidate.get("status", "unknown"), 0)
        inc_sev = severity_rank.get(incumbent.get("status", "unknown"), 0)
        if cand_sev != inc_sev:
            return cand_sev > inc_sev
        return candidate.get("last_updated", 0) > incumbent.get("last_updated", 0)

    def sort_key(item):
        """Order by actual_num, then by numeric key; everything else goes last."""
        key, chapter_info = item
        actual_num = chapter_info.get("actual_num")
        if actual_num is not None:
            return actual_num
        try:
            return int(key)
        except ValueError:
            return float('inf')

    # Pass 1: one winner per normalized output filename.
    dedup = {}
    for old_key, chapter_info in self.prog["chapters"].items():
        out = chapter_info.get("output_file")
        # Fall back to the key so entries without an output file are not lost.
        norm = _normalize_out(out) or old_key
        if chapter_info.get("actual_num") in (None, 0) and out:
            hint = _infer_num_from_filename(out)
            if hint is not None:
                chapter_info["actual_num"] = hint
        current_best = dedup.get(norm)
        if not current_best or _supersedes(chapter_info, current_best):
            dedup[norm] = chapter_info

    # Pass 2: re-key by chapter number.
    new_chapters = {}
    migrated_count = 0
    for norm, chapter_info in dedup.items():
        actual_num = chapter_info.get("actual_num")
        new_key = str(actual_num) if actual_num is not None else norm
        if new_key not in new_chapters or _supersedes(chapter_info, new_chapters[new_key]):
            new_chapters[new_key] = chapter_info
        migrated_count += 1

    sorted_chapters = dict(sorted(new_chapters.items(), key=sort_key))

    if migrated_count > 0:
        # Chunk bookkeeping follows its chapter to the new key. The lookup
        # uses the still-unreplaced chapters dict, i.e. the old keys.
        if "chapter_chunks" in self.prog:
            new_chunks = {}
            for old_key, chunk_data in self.prog["chapter_chunks"].items():
                if not str(old_key).isdigit():
                    new_chunks[old_key] = chunk_data
                elif old_key in self.prog["chapters"] and "actual_num" in self.prog["chapters"][old_key]:
                    new_key = str(self.prog["chapters"][old_key]["actual_num"])
                    new_chunks[new_key] = chunk_data
                else:
                    new_chunks[old_key] = chunk_data
            self.prog["chapter_chunks"] = dict(sorted(new_chunks.items(), key=sort_key))
        self.prog["chapters"] = sorted_chapters
        self.save()
        print(f"✅ Migrated {migrated_count} entries to use actual_num as key and sorted by chapter number")
    else:
        # Nothing to migrate; still leave both dicts sorted.
        self.prog["chapters"] = sorted_chapters
        if "chapter_chunks" in self.prog:
            self.prog["chapter_chunks"] = dict(sorted(self.prog["chapter_chunks"].items(), key=sort_key))
        self.save()
        print("✅ Sorted chapters by chapter number")
def get_stats(self, output_dir):
    """Summarize tracked chapters as counts.

    Returns:
        dict with the keys ``total_tracked``, ``completed`` (status
        "completed" and the output file exists in *output_dir*),
        ``missing_files`` (status "completed" with a recorded but absent
        file, or status "file_missing") and ``in_progress``.
    """
    tracked = self.prog["chapters"]
    done = 0
    missing = 0
    running = 0

    for entry in tracked.values():
        state = entry.get("status")
        target = entry.get("output_file")

        if state == "completed" and target:
            # A completed entry only counts when its file is really there.
            if os.path.exists(os.path.join(output_dir, target)):
                done += 1
            else:
                missing += 1
        elif state == "in_progress":
            running += 1
        elif state == "file_missing":
            missing += 1

    return {
        "total_tracked": len(tracked),
        "completed": done,
        "missing_files": missing,
        "in_progress": running
    }
# =====================================================
# UNIFIED CONTENT PROCESSOR
# =====================================================
class ContentProcessor:
"""Unified content processing"""
@staticmethod
def clean_ai_artifacts(text, remove_artifacts=True):
"""Remove AI response artifacts from text - but ONLY when enabled.

When ``remove_artifacts`` is falsy the text is returned untouched. Otherwise
split markers are swapped for ``__SPLIT_MARKER_<n>__`` placeholders,
thinking tags are removed via ``ContentProcessor._remove_thinking_tags``,
leading blank lines are dropped, and the first non-empty line is checked for
artifact patterns before the placeholders are put back.
"""
if not remove_artifacts:
return text
# IMPORTANT: Protect split markers used by request merging
# These must NEVER be removed as they're critical for split-the-merge
# NOTE(review): the pattern literal below looks truncated (it starts mid-tag and spans
# several lines); the opening/closing tag text appears to be missing - restore from the
# intact source before relying on it.
split_marker_pattern = r'
]*id="split-\d+"[^>]*>.*?SPLIT MARKER.*?
'
has_split_markers = bool(re.search(split_marker_pattern, text, re.DOTALL | re.IGNORECASE))
if has_split_markers:
# Extract and preserve split markers temporarily
split_markers = []
def preserve_marker(match):
# Placeholder index is the marker's position in split_markers
marker_id = f"__SPLIT_MARKER_{len(split_markers)}__"
split_markers.append(match.group(0))
return marker_id
text = re.sub(split_marker_pattern, preserve_marker, text, flags=re.DOTALL | re.IGNORECASE)
# First, remove thinking tags if they exist
text = ContentProcessor._remove_thinking_tags(text)
# After removing thinking tags, re-analyze the text structure
# to catch AI artifacts that may now be at the beginning
lines = text.split('\n')
# Clean up empty lines at the beginning
while lines and not lines[0].strip():
lines.pop(0)
if not lines:
# Restore split markers before returning
if has_split_markers:
for i, marker in enumerate(split_markers):
text = text.replace(f"__SPLIT_MARKER_{i}__", marker)
return text
# Check the first non-empty line for AI artifacts
first_line = lines[0].strip()
# NOTE(review): the list below is cut off - its last entry runs straight into the tail of
# a condition that uses remaining_text, and the code in between is not present here.
ai_patterns = [
r'^(?:Sure|Okay|Understood|Of course|Got it|Alright|Certainly|Here\'s|Here is)',
r'^(?:I\'ll|I will|Let me) (?:translate|help|assist)',
r'^(?:System|Assistant|AI|User|Human|Model)\s*:',
r'^\[PART\s+\d+/\d+\]',
r'^(?:Translation note|Note|Here\'s the translation|I\'ve translated)',
r'^```(?:html|xml|text)?\s*$', # Enhanced code block detection
r'^', remaining_text, re.IGNORECASE) or
len(remaining_text.strip()) > 50): # Reduced from 100 to 50
print(f"✂️ Removed AI artifact: {first_line[:50]}...")
# NOTE(review): this return does not run the split-marker restore loop used by the other
# return paths - confirm against the intact source, since the lines above are damaged.
return remaining_text.lstrip()
# A first line consisting of just one of these words is dropped when text follows it
if first_line.lower() in ['html', 'text', 'content', 'translation', 'output']:
remaining_lines = lines[1:]
remaining_text = '\n'.join(remaining_lines)
if remaining_text.strip():
print(f"✂️ Removed single word artifact: {first_line}")
result = remaining_text.lstrip()
# Restore split markers
if has_split_markers:
for i, marker in enumerate(split_markers):
result = result.replace(f"__SPLIT_MARKER_{i}__", marker)
return result
# Nothing matched: re-join the lines (leading blank lines were already dropped above)
result = '\n'.join(lines)
# Restore split markers before returning
if has_split_markers:
for i, marker in enumerate(split_markers):
result = result.replace(f"__SPLIT_MARKER_{i}__", marker)
return result
@staticmethod
def _remove_thinking_tags(text):
    """Remove model "thinking" blocks and stray code-fence markers from *text*.

    Removed, case-insensitively and across newlines:
      * XML-style blocks for the tags thinking, think, thoughts, reasoning,
        analysis and reflection (opening tag through matching closing tag)
      * ``<|thinking|> ... </|thinking|>`` blocks
      * bracketed ``[thinking]...[/thinking]`` and ``[analysis]...[/analysis]``

    Code-fence marker lines (three backticks plus an optional language tag)
    are removed as well; the text between the fences is kept.

    Whitespace is normalized (runs of blank lines collapsed, outer
    whitespace stripped) only when something was actually removed; text
    without any of these markers is returned unchanged.

    Args:
        text: Text to clean. Falsy values are returned as-is.

    Returns:
        The cleaned text.
    """
    if not text:
        return text

    # Each XML-style pattern is built from its tag name so the opening and
    # closing tag cannot get out of sync. A bare '.*?' (no tags) would match
    # only empty strings: nothing would be removed while the match count
    # would still be non-zero.
    xml_style_tags = ('thinking', 'think', 'thoughts', 'reasoning', 'analysis', 'reflection')
    thinking_patterns = [rf'<{tag}>.*?</{tag}>' for tag in xml_style_tags]
    thinking_patterns += [
        r'<\|thinking\|>.*?</\|thinking\|>',   # pipe-delimited reasoning blocks
        r'\[thinking\].*?\[/thinking\]',       # bracketed (any letter case)
        r'\[analysis\].*?\[/analysis\]',
    ]

    removed_count = 0
    for pattern in thinking_patterns:
        # DOTALL so a block may span several lines.
        text, hits = re.subn(pattern, '', text, flags=re.DOTALL | re.IGNORECASE)
        removed_count += hits

    # Only the fence markers themselves go; the fenced content is kept.
    code_block_patterns = [
        (r'^```\w*\s*\n', '\n'),  # opening fence: keep the line break
        (r'\n```\s*$', ''),       # closing fence at end of a line
        (r'^```\w*\s*$', ''),     # fence alone on its own line
    ]
    code_block_removed = 0
    for pattern, replacement in code_block_patterns:
        text, hits = re.subn(pattern, replacement, text, flags=re.MULTILINE)
        code_block_removed += hits

    if removed_count + code_block_removed > 0:
        # Tidy up the gaps left behind by the removed blocks.
        text = re.sub(r'\n\s*\n\s*\n', '\n\n', text)
        text = text.strip()
        if removed_count > 0 and code_block_removed > 0:
            print(f"🧠 Removed {removed_count} thinking tag(s) and {code_block_removed} code block marker(s)")
        elif removed_count > 0:
            print(f"🧠 Removed {removed_count} thinking tag(s)")
        elif code_block_removed > 0:
            print(f"📝 Removed {code_block_removed} code block marker(s)")
    return text
@staticmethod
def clean_memory_artifacts(text):
    """Strip leaked memory/context-summary fragments from a translation.

    First every ``[MEMORY] ... [END MEMORY]`` span (possibly multi-line) is
    removed. Then any remaining line containing one of the marker phrases is
    dropped, together with a blank line that directly follows it.
    """
    leak_markers = (
        '[MEMORY]', '[END MEMORY]', 'Previous context summary:',
        'memory summary', 'context summary', '[Context]',
    )

    without_blocks = re.sub(r'\[MEMORY\].*?\[END MEMORY\]', '', text, flags=re.DOTALL)

    kept = []
    drop_following_blank = False
    for line in without_blocks.split('\n'):
        if any(marker in line for marker in leak_markers):
            # Marker line: discard it and remember to swallow one blank line.
            drop_following_blank = True
        elif drop_following_blank and line.strip() == '':
            drop_following_blank = False
        else:
            drop_following_blank = False
            kept.append(line)
    return '\n'.join(kept)
@staticmethod
def emergency_restore_paragraphs(text, original_html=None, verbose=True):
"""Emergency restoration when AI returns wall of text without proper paragraph tags

Tries progressively coarser ways of breaking the text up: blank-line
separated parts, breaks before a quotation mark that follows
sentence-ending punctuation, groups of up to three sentences, and
finally fixed-size word chunks. A separate branch re-splits very long
existing paragraphs into three-sentence chunks.

Args:
text: Translated text to repair.
original_html: Original source markup, if available.
verbose: When True, progress messages are printed.

Returns:
The restructured text, or ``text`` unchanged when no strategy applies.
"""
def log(message):
if verbose:
print(message)
if text.count('') >= 3:
return text
if original_html:
original_para_count = original_html.count('
' not in text and len(text) > 300:
log("❌ No paragraph tags found - applying emergency restoration")
# Strategy 1: blank lines already separate the parts
if '\n\n' in text:
parts = text.split('\n\n')
paragraphs = ['
' + part.strip() + '
' for part in parts if part.strip()]
return '\n'.join(paragraphs)
# Strategy 2: break where sentence-ending punctuation is followed by a quotation mark
dialogue_pattern = r'(?<=[.!?])\s+(?=[""\u201c\u201d])'
if re.search(dialogue_pattern, text):
parts = re.split(dialogue_pattern, text)
paragraphs = []
for part in parts:
part = part.strip()
if part:
if not part.startswith('
'):
part = '
' + part
if not part.endswith('
'):
part = part + ''
paragraphs.append(part)
return '\n'.join(paragraphs)
# Strategy 3: split into sentences (Latin capitals or CJK/kana/Hangul starts) and regroup
sentence_boundary = r'(?<=[.!?])\s+(?=[A-Z\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af])'
sentences = re.split(sentence_boundary, text)
if len(sentences) > 1:
paragraphs = []
current_para = []
for sentence in sentences:
sentence = sentence.strip()
if not sentence:
continue
current_para.append(sentence)
# Close the paragraph after three sentences, a closing quote, or a scene-break marker
should_break = (
len(current_para) >= 3 or
sentence.rstrip().endswith(('"', '"', '"')) or
'* * *' in sentence or
'***' in sentence or
'---' in sentence
)
if should_break:
para_text = ' '.join(current_para)
if not para_text.startswith('
'):
para_text = '
' + para_text
if not para_text.endswith('
'):
para_text = para_text + ''
paragraphs.append(para_text)
current_para = []
if current_para:
para_text = ' '.join(current_para)
if not para_text.startswith('
'):
para_text = '
' + para_text
if not para_text.endswith('
'):
para_text = para_text + ''
paragraphs.append(para_text)
result = '\n'.join(paragraphs)
log(f"✅ Restored {len(paragraphs)} paragraphs from wall of text")
return result
# Strategy 4: no sentence boundaries found - chunk by word count (at least 100 words per chunk)
words = text.split()
if len(words) > 100:
paragraphs = []
words_per_para = max(100, len(words) // 10)
for i in range(0, len(words), words_per_para):
chunk = ' '.join(words[i:i + words_per_para])
if chunk.strip():
paragraphs.append('
' + chunk.strip() + '
')
return '\n'.join(paragraphs)
elif '
' in text and text.count('
') < 3 and len(text) > 1000:
log("⚠️ Very few paragraphs for long text - checking if more breaks needed")
soup = BeautifulSoup(text, 'html.parser')
existing_paras = soup.find_all('p')
new_paragraphs = []
for para in existing_paras:
para_text = para.get_text()
# Only paragraphs longer than 500 chars with more than 5 sentences are re-split
if len(para_text) > 500:
sentences = re.split(r'(?<=[.!?])\s+', para_text)
if len(sentences) > 5:
chunks = []
current = []
for sent in sentences:
current.append(sent)
if len(current) >= 3:
chunks.append('
' + ' '.join(current) + '
')
current = []
if current:
chunks.append('
' + ' '.join(current) + '
')
new_paragraphs.extend(chunks)
else:
new_paragraphs.append(str(para))
else:
new_paragraphs.append(str(para))
return '\n'.join(new_paragraphs)
return text
@staticmethod
def emergency_restore_images(text, original_html=None, verbose=True):
"""Emergency restoration of images lost during translation - Filename Pattern Search
For every source image whose ``src`` is absent from the translation, the
image's filename is searched for in the source string; the proportional
offset of the first hit is mapped onto the translation and nudged to a
nearby tag boundary. Images whose filename cannot be located in the
source are appended in a new ``p`` element at the end instead.
Args:
text: Translated HTML or markdown text
original_html: Original HTML before translation (can be actual HTML or converted markdown)
verbose: Whether to print debug messages
Returns:
Text with restored image tags
"""
if not original_html or not text:
return text
def log(message):
if verbose:
print(message)
try:
import re
import os
# Parse both documents
soup_orig = BeautifulSoup(original_html, 'html.parser')
soup_text = BeautifulSoup(text, 'html.parser')
# Extract images from source
orig_images = soup_orig.find_all('img')
if not orig_images:
return text
# Extract images from translation
text_images = soup_text.find_all('img')
# If counts match, nothing to do
if len(orig_images) == len(text_images):
return text
# If translation has fewer images, try to restore them
if len(text_images) < len(orig_images):
log(f"🖼️ Image mismatch! Source: {len(orig_images)}, Translation: {len(text_images)}")
log("🔧 Attempting emergency image restoration (filename search method)...")
# Get the set of image sources present in translation
present_srcs = set()
for img in text_images:
src = img.get('src')
if src:
present_srcs.add(src)
# Collect missing images
missing_images = []
for img in orig_images:
src = img.get('src')
if src and src not in present_srcs:
missing_images.append((src, img))
if not missing_images:
return text
# Convert both to strings for searching
source_str = str(original_html)
text_str = str(text)
inserted_count = 0
# For each missing image, find where it appears in source and insert at same relative position in output
for src, orig_img in missing_images:
# Extract just the filename from the path
filename = os.path.basename(src)
log(f" 🔍 Processing missing image: {src}")
# Try to find with full filename first (most specific)
pattern = re.escape(filename)
source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE))
log(f" Searching for full filename '{filename}': {len(source_matches)} matches")
# If not found, try without response_ prefix
if not source_matches and filename.lower().startswith('response_'):
filename_no_prefix = filename[9:] # Remove 'response_'
pattern = re.escape(filename_no_prefix)
source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE))
log(f" Searching without response_ prefix '{filename_no_prefix}': {len(source_matches)} matches")
# If still not found, try core name without extension (least specific)
if not source_matches:
core_name = os.path.splitext(filename)[0]
if core_name.lower().startswith('response_'):
core_name = core_name[9:]
pattern = re.escape(core_name)
source_matches = list(re.finditer(pattern, source_str, re.IGNORECASE))
log(f" Searching for core name '{core_name}': {len(source_matches)} matches")
if source_matches:
# Found the filename in source! Calculate its relative position
source_pos = source_matches[0].start()
source_len = len(source_str)
# Calculate proportional position (0.0 to 1.0)
relative_pos = source_pos / source_len if source_len > 0 else 0.5
log(f" Position in source: {source_pos}/{source_len} ({relative_pos:.1%})")
# Calculate corresponding position in translation
# (text_str grows as images are inserted, so later images are mapped onto the updated string)
text_len = len(text_str)
insert_pos = int(relative_pos * text_len)
log(f" Initial insert position in translation: {insert_pos}/{text_len}")
# Find a good insertion point - prefer after closing tag or before opening tag
# Search backwards for '>' or forwards for '<' within reasonable distance
original_insert_pos = insert_pos
max_search_distance = 200
# Search backwards for closing tag
backward_pos = insert_pos
search_start = max(0, insert_pos - max_search_distance)
while backward_pos > search_start and text_str[backward_pos] != '>':
backward_pos -= 1
backward_found = (text_str[backward_pos] == '>')
# A distance of max_search_distance + 1 marks "not found" so the other direction wins the comparison
backward_distance = insert_pos - backward_pos if backward_found else max_search_distance + 1
# Search forwards for an opening tag (a '<' that is not immediately followed by '/')
forward_pos = insert_pos
search_end = min(len(text_str), insert_pos + max_search_distance)
forward_found = False
while forward_pos < search_end:
if text_str[forward_pos] == '<':
# Check if it's a closing tag
if forward_pos + 1 < len(text_str) and text_str[forward_pos + 1] != '/':
# It's an opening tag, use it
forward_found = True
break
# It's a closing tag, keep searching
forward_pos += 1
forward_distance = forward_pos - insert_pos if forward_found else max_search_distance + 1
# Use whichever is closer
if backward_found and backward_distance <= forward_distance:
insert_pos = backward_pos + 1 # After the '>'
log(f" Adjusted to after closing tag at position {insert_pos} (moved {original_insert_pos - insert_pos} chars back)")
elif forward_found:
insert_pos = forward_pos # Before the '<'
log(f" Adjusted to before opening tag at position {insert_pos} (moved {insert_pos - original_insert_pos} chars forward)")
else:
log(f" No nearby tags found within {max_search_distance} chars, using original position: {insert_pos}")
# Show context around insertion point
context_start = max(0, insert_pos - 30)
context_end = min(len(text_str), insert_pos + 30)
before_context = text_str[context_start:insert_pos]
after_context = text_str[insert_pos:context_end]
log(f" Context: ...{before_context}[INSERT HERE]{after_context}...")
# Create the image tag HTML
img_html = f'
'
log(f" Inserting: {img_html[:80]}...")
# Insert the image HTML at the calculated position
text_str = text_str[:insert_pos] + img_html + text_str[insert_pos:]
inserted_count += 1
log(f" ✅ Inserted successfully at position {insert_pos}")
else:
# Filename not found in source - append to end as fallback
soup_text = BeautifulSoup(text_str, 'html.parser')
body = soup_text.find('body')
if not body:
body = soup_text
new_p = soup_text.new_tag('p')
new_img = soup_text.new_tag('img', src=src)
# Carry over every other attribute of the original image
for attr, val in orig_img.attrs.items():
if attr != 'src':
new_img[attr] = val
new_p.append(new_img)
body.append(new_p)
text_str = str(soup_text)
inserted_count += 1
log(f"✅ Restored {inserted_count} missing images using filename search")
return text_str
except Exception as e:
# Best effort: any failure leaves the translation untouched
log(f"⚠️ Failed to restore images: {e}")
import traceback
traceback.print_exc()
return text
# Fallthrough (e.g. the translation has more images than the source): leave text unchanged
return text
@staticmethod
def get_content_hash(html_content):
"""Create a stable hash of content"""
try:
soup = BeautifulSoup(html_content, 'html.parser')
for tag in soup(['script', 'style', 'meta', 'link']):
tag.decompose()
text_content = soup.get_text(separator=' ', strip=True)
text_content = ' '.join(text_content.split())
return hashlib.sha256(text_content.encode('utf-8')).hexdigest()
except Exception as e:
print(f"[WARNING] Failed to create hash: {e}")
return hashlib.sha256(html_content.encode('utf-8')).hexdigest()
@staticmethod
def is_meaningful_text_content(html_content):
"""Check if chapter has meaningful text beyond just structure

Content that looks like plain text/markdown is judged by length alone
(more than 50 characters). Otherwise the content is parsed as HTML with
images removed, and it counts as meaningful when any of these hold:
there is at least one header plus more than 1 character of body text,
the body text exceeds 200 characters, or the header text exceeds 100
characters.

Args:
html_content: Chapter content, either HTML or already-extracted text.

Returns:
True when the content is considered meaningful. Also True when the
check itself raises, so that a failing check never hides a chapter.
"""
try:
# Check if this is plain text from enhanced extraction (html2text output)
# html2text output characteristics:
# - Often starts with # for headers
# - Contains markdown-style formatting
# - Doesn't have HTML tags
content_stripped = html_content.strip()
# Quick check for plain text/markdown content
is_plain_text = False
if content_stripped and (
not content_stripped.startswith('<') or # Doesn't start with HTML tag
content_stripped.startswith('#') or # Markdown header
'\n\n' in content_stripped[:500] or # Markdown paragraphs
not '
' in content_stripped[:500] and not '
' in content_stripped[:500] # No common HTML tags
):
# This looks like plain text or markdown from html2text
is_plain_text = True
if is_plain_text:
# For plain text, just check the length
text_length = len(content_stripped)
# Be more lenient with plain text since it's already extracted
return text_length > 50 # Much lower threshold for plain text
# Original HTML parsing logic
soup = BeautifulSoup(html_content, 'html.parser')
# Work on a re-parsed copy so removing images does not touch `soup`
soup_copy = BeautifulSoup(str(soup), 'html.parser')
for img in soup_copy.find_all('img'):
img.decompose()
# Body text is gathered from p/div/span elements; nested elements may contribute text more than once
text_elements = soup_copy.find_all(['p', 'div', 'span'])
text_content = ' '.join(elem.get_text(strip=True) for elem in text_elements)
headers = soup_copy.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
header_text = ' '.join(h.get_text(strip=True) for h in headers)
if headers and len(text_content.strip()) > 1:
return True
if len(text_content.strip()) > 200:
return True
if len(header_text.strip()) > 100:
return True
return False
except Exception as e:
print(f"Warning: Error checking text content: {e}")
return True
@staticmethod
def is_only_image_links(html_content):
"""Return True if content contains only image links/paths (no meaningful text)."""
try:
if not html_content:
return False
content_stripped = html_content.strip()
if not content_stripped:
return False
# Prefer HTML text extraction when it looks like HTML
if content_stripped.startswith('<') or '<' in content_stripped[:200]:
try:
soup = BeautifulSoup(html_content, 'html.parser')
text = soup.get_text(separator='\n', strip=True)
except Exception:
text = content_stripped
else:
text = content_stripped
if not text:
return False
image_ext = r'(?:png|jpe?g|gif|webp|svg|bmp)'
md_img = re.compile(r'!\[[^\]]*\]\(([^)]+)\)', re.IGNORECASE)
url_pat = re.compile(r'https?://[^\s)>\"]+\.' + image_ext + r'(?:\?[^\s)>\"]*)?', re.IGNORECASE)
path_pat = re.compile(r'(?:[A-Za-z]:)?[^\s)>\"]+\.' + image_ext + r'(?:\?[^\s)>\"]*)?', re.IGNORECASE)
found_any = False
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
if not lines:
return False
for line in lines:
line_work = line
# Markdown image syntax
for m in md_img.findall(line_work):
if url_pat.search(m) or path_pat.search(m):
found_any = True
line_work = md_img.sub(' ', line_work)
if url_pat.search(line_work):
found_any = True
line_work = url_pat.sub(' ', line_work)
if path_pat.search(line_work):
found_any = True
line_work = path_pat.sub(' ', line_work)
# If any meaningful text remains, it's not image-only
if re.sub(r'[\s\.,;:\(\)\[\]<>\"\'\-\u200b]+', '', line_work):
return False
return found_any
except Exception:
return False
# =====================================================
# UNIFIED TRANSLATION PROCESSOR
# =====================================================
# Module-wide latch: flips to True the first time a stop is reported.
STOP_LOGGED = False


def log_stop_once(message="❌ Translation stopped by user request."):
    """Record that a stop was reported, at most once per run.

    ``message`` is accepted for API compatibility; nothing is printed here
    because the stop exception already reports "Translation stopped by user".
    """
    global STOP_LOGGED
    if STOP_LOGGED:
        return
    # print(message)  # Redundant with "Translation stopped by user" from exception
    STOP_LOGGED = True
class TranslationProcessor:
"""Handles the translation of individual chapters"""
def __init__(self, config, client, out_dir, log_callback=None, stop_callback=None, uses_zero_based=False, is_text_file=False):
    """Store collaborators and report multi-key status when it is active.

    Args:
        config: Settings object; ``config.MODEL`` selects the chapter splitter model.
        client: API client used for translation requests.
        out_dir: Output directory for generated files.
        log_callback: Optional callable receiving log messages (defaults to print).
        stop_callback: Optional callable returning truthy when a stop is requested.
        uses_zero_based: Stored as-is on the instance.
        is_text_file: Stored as-is on the instance.
    """
    self.config, self.client, self.out_dir = config, client, out_dir
    self.log_callback, self.stop_callback = log_callback, stop_callback
    self.chapter_splitter = ChapterSplitter(model_name=config.MODEL)
    self.uses_zero_based, self.is_text_file = uses_zero_based, is_text_file

    # Check and log multi-key status
    if getattr(self.client, 'use_multi_keys', False):
        key_stats = self.client.get_stats()
        total = key_stats.get('total_keys', 0)
        self._log(f"🔑 Multi-key mode active: {total} keys")
        active = key_stats.get('active_keys', 0)
        self._log(f" Active keys: {active}")
def _log(self, message):
"""Log a message"""
if self.log_callback:
self.log_callback(message)
else:
print(message)
def report_key_status(self):
"""Report multi-key status if available"""
if hasattr(self.client, 'get_stats'):
stats = self.client.get_stats()
if stats.get('multi_key_mode', False):
self._log(f"\n📊 API Key Status:")
self._log(f" Active Keys: {stats.get('active_keys', 0)}/{stats.get('total_keys', 0)}")
self._log(f" Success Rate: {stats.get('success_rate', 0):.1%}")
self._log(f" Total Requests: {stats.get('total_requests', 0)}\n")
def check_stop(self):
"""Check if translation should stop"""
# During graceful stop, ALWAYS return False to let current chapter complete fully
# The main loop will check GRACEFUL_STOP at the START of each new chapter
if os.environ.get('GRACEFUL_STOP') == '1':
return False
if self.stop_callback and self.stop_callback():
log_stop_once()
return True
def check_duplicate_content(self, result, idx, prog, out, actual_num=None):
"""Check if translated content is duplicate - with mode selection"""
# Get detection mode from config
detection_mode = getattr(self.config, 'DUPLICATE_DETECTION_MODE', 'basic')
print(f" 🔍 DEBUG: Detection mode = '{detection_mode}'")
print(f" 🔍 DEBUG: Lookback chapters = {self.config.DUPLICATE_LOOKBACK_CHAPTERS}")
# Extract content_hash if available from progress
content_hash = None
if detection_mode == 'ai-hunter':
# Try to get content_hash from the current chapter info
# Use actual_num if provided, otherwise fallback to idx+1
if actual_num is not None:
chapter_key = str(actual_num)
else:
chapter_key = str(idx + 1)
if chapter_key in prog.get("chapters", {}):
chapter_info = prog["chapters"][chapter_key]
content_hash = chapter_info.get("content_hash")
print(f" 🔍 DEBUG: Found content_hash for chapter {idx}: {content_hash}")
if detection_mode == 'ai-hunter':
print(" 🤖 DEBUG: Routing to AI Hunter detection...")
# Check if AI Hunter method is available (injected by the wrapper)
if hasattr(self, '_check_duplicate_ai_hunter'):
return self._check_duplicate_ai_hunter(result, idx, prog, out, content_hash)
else:
print(" ⚠️ AI Hunter method not available, falling back to basic detection")
return self._check_duplicate_basic(result, idx, prog, out)
elif detection_mode == 'cascading':
print(" 🔄 DEBUG: Routing to Cascading detection...")
return self._check_duplicate_cascading(result, idx, prog, out)
else:
print(" 📋 DEBUG: Routing to Basic detection...")
return self._check_duplicate_basic(result, idx, prog, out)
def _check_duplicate_basic(self, result, idx, prog, out):
"""Original basic duplicate detection"""
try:
result_clean = re.sub(r'<[^>]+>', '', result).strip().lower()
result_sample = result_clean[:1000]
lookback_chapters = self.config.DUPLICATE_LOOKBACK_CHAPTERS
for prev_idx in range(max(0, idx - lookback_chapters), idx):
prev_key = str(prev_idx)
if prev_key in prog["chapters"] and prog["chapters"][prev_key].get("output_file"):
prev_file = prog["chapters"][prev_key]["output_file"]
prev_path = os.path.join(out, prev_file)
if os.path.exists(prev_path):
try:
with open(prev_path, 'r', encoding='utf-8') as f:
prev_content = f.read()
prev_clean = re.sub(r'<[^>]+>', '', prev_content).strip().lower()
prev_sample = prev_clean[:1000]
# Use SequenceMatcher for similarity comparison
similarity = SequenceMatcher(None, result_sample, prev_sample).ratio()
if similarity >= 0.85: # 85% threshold
print(f" 🚀 Basic detection: Duplicate found ({int(similarity*100)}%)")
return True, int(similarity * 100)
except Exception as e:
print(f" Warning: Failed to read {prev_path}: {e}")
continue
return False, 0
except Exception as e:
print(f" Warning: Failed to check duplicate content: {e}")
return False, 0
def _check_duplicate_cascading(self, result, idx, prog, out):
"""Cascading detection - basic first, then AI Hunter for borderline cases"""
# Step 1: Basic
is_duplicate_basic, similarity_basic = self._check_duplicate_basic(result, idx, prog, out)
if is_duplicate_basic:
return True, similarity_basic
# Step 2: If basic detection finds moderate similarity, use AI Hunter
if similarity_basic >= 60: # Configurable threshold
print(f" 🤖 Moderate similarity ({similarity_basic}%) - running AI Hunter analysis...")
if hasattr(self, '_check_duplicate_ai_hunter'):
is_duplicate_ai, similarity_ai = self._check_duplicate_ai_hunter(result, idx, prog, out)
if is_duplicate_ai:
return True, similarity_ai
else:
print(" ⚠️ AI Hunter method not available for cascading analysis")
return False, max(similarity_basic, 0)
def _extract_text_features(self, text):
"""Extract multiple features from text for AI Hunter analysis"""
features = {
'semantic': {},
'structural': {},
'characters': [],
'patterns': {}
}
# Semantic fingerprint
lines = text.split('\n')
# Character extraction (names that appear 3+ times)
words = re.findall(r'\b[A-Z][a-z]+\b', text)
word_freq = Counter(words)
features['characters'] = [name for name, count in word_freq.items() if count >= 3]
# Dialogue patterns
dialogue_patterns = re.findall(r'"([^"]+)"', text)
features['semantic']['dialogue_count'] = len(dialogue_patterns)
features['semantic']['dialogue_lengths'] = [len(d) for d in dialogue_patterns[:10]]
# Speaker patterns
speaker_patterns = re.findall(r'(\w+)\s+(?:said|asked|replied|shouted|whispered)', text.lower())
features['semantic']['speakers'] = list(set(speaker_patterns[:20]))
# Number extraction
numbers = re.findall(r'\b\d+\b', text)
features['patterns']['numbers'] = numbers[:20]
# Structural signature
para_lengths = []
dialogue_count = 0
for para in text.split('\n\n'):
if para.strip():
para_lengths.append(len(para))
if '"' in para:
dialogue_count += 1
features['structural']['para_count'] = len(para_lengths)
features['structural']['avg_para_length'] = sum(para_lengths) / max(1, len(para_lengths))
features['structural']['dialogue_ratio'] = dialogue_count / max(1, len(para_lengths))
# Create structural pattern string
pattern = []
for para in text.split('\n\n')[:20]: # First 20 paragraphs
if para.strip():
if '"' in para:
pattern.append('D') # Dialogue
elif len(para) > 300:
pattern.append('L') # Long
elif len(para) < 100:
pattern.append('S') # Short
else:
pattern.append('M') # Medium
features['structural']['pattern'] = ''.join(pattern)
return features
def _calculate_exact_similarity(self, text1, text2):
"""Calculate exact text similarity"""
return SequenceMatcher(None, text1.lower(), text2.lower()).ratio()
def _calculate_smart_similarity(self, text1, text2):
"""Smart similarity with length-aware sampling"""
# Check length ratio first
len_ratio = len(text1) / max(1, len(text2))
if len_ratio < 0.7 or len_ratio > 1.3:
return 0.0
# Smart sampling for large texts
if len(text1) > 10000:
sample_size = 3000
samples1 = [
text1[:sample_size],
text1[len(text1)//2 - sample_size//2:len(text1)//2 + sample_size//2],
text1[-sample_size:]
]
samples2 = [
text2[:sample_size],
text2[len(text2)//2 - sample_size//2:len(text2)//2 + sample_size//2],
text2[-sample_size:]
]
similarities = [SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
for s1, s2 in zip(samples1, samples2)]
return sum(similarities) / len(similarities)
else:
# Use first 2000 chars for smaller texts
return SequenceMatcher(None, text1[:2000].lower(), text2[:2000].lower()).ratio()
def _calculate_semantic_similarity(self, sem1, sem2):
"""Calculate semantic fingerprint similarity"""
score = 0.0
max_score = 0.0
# Compare dialogue counts
if 'dialogue_count' in sem1 and 'dialogue_count' in sem2:
max_score += 1.0
ratio = min(sem1['dialogue_count'], sem2['dialogue_count']) / max(1, max(sem1['dialogue_count'], sem2['dialogue_count']))
score += ratio * 0.3
# Compare speakers
if 'speakers' in sem1 and 'speakers' in sem2:
max_score += 1.0
if sem1['speakers'] and sem2['speakers']:
overlap = len(set(sem1['speakers']) & set(sem2['speakers']))
total = len(set(sem1['speakers']) | set(sem2['speakers']))
score += (overlap / max(1, total)) * 0.4
# Compare dialogue lengths pattern
if 'dialogue_lengths' in sem1 and 'dialogue_lengths' in sem2:
max_score += 1.0
if sem1['dialogue_lengths'] and sem2['dialogue_lengths']:
# Compare dialogue length patterns
len1 = sem1['dialogue_lengths'][:10]
len2 = sem2['dialogue_lengths'][:10]
if len1 and len2:
avg1 = sum(len1) / len(len1)
avg2 = sum(len2) / len(len2)
ratio = min(avg1, avg2) / max(1, max(avg1, avg2))
score += ratio * 0.3
return score / max(1, max_score)
def _calculate_structural_similarity(self, struct1, struct2):
"""Calculate structural signature similarity"""
score = 0.0
# Compare paragraph patterns
if 'pattern' in struct1 and 'pattern' in struct2:
pattern_sim = SequenceMatcher(None, struct1['pattern'], struct2['pattern']).ratio()
score += pattern_sim * 0.4
# Compare paragraph statistics
if all(k in struct1 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']) and \
all(k in struct2 for k in ['para_count', 'avg_para_length', 'dialogue_ratio']):
# Paragraph count ratio
para_ratio = min(struct1['para_count'], struct2['para_count']) / max(1, max(struct1['para_count'], struct2['para_count']))
score += para_ratio * 0.2
# Average length ratio
avg_ratio = min(struct1['avg_para_length'], struct2['avg_para_length']) / max(1, max(struct1['avg_para_length'], struct2['avg_para_length']))
score += avg_ratio * 0.2
# Dialogue ratio similarity
dialogue_diff = abs(struct1['dialogue_ratio'] - struct2['dialogue_ratio'])
score += (1 - dialogue_diff) * 0.2
return score
def _calculate_character_similarity(self, chars1, chars2):
"""Calculate character name similarity"""
if not chars1 or not chars2:
return 0.0
# Find overlapping characters
set1 = set(chars1)
set2 = set(chars2)
overlap = len(set1 & set2)
total = len(set1 | set2)
return overlap / max(1, total)
def _calculate_pattern_similarity(self, pat1, pat2):
"""Calculate pattern-based similarity"""
score = 0.0
# Compare numbers (they rarely change in translations)
if 'numbers' in pat1 and 'numbers' in pat2:
nums1 = set(pat1['numbers'])
nums2 = set(pat2['numbers'])
if nums1 and nums2:
overlap = len(nums1 & nums2)
total = len(nums1 | nums2)
score = overlap / max(1, total)
return score
def generate_rolling_summary(
self,
history_manager,
actual_num,
base_system_content=None,
source_text=None,
previous_summary_text=None,
previous_summary_chapter_num=None,
prefer_translations_only_user=False,
):
"""Generate rolling summary after a chapter for context continuity.
Uses a dedicated summary system prompt distinct from translation; the
APPEND_GLOSSARY environment variable is forced to "0" while that prompt is built.
Writes the summary to rolling_summary.txt and returns the summary string.
IMPORTANT: The SUMMARY_ROLE setting controls what is sent to the summary API:
- system: send system prompt + user message containing ONLY the translated text
- user: send ONLY a user message (configured prompt template + translated text)
- both: send system + user (current/legacy behavior)
Optional:
- previous_summary_text: when provided, it is sent as an assistant message for context.
- prefer_translations_only_user: when True, the user message will be ONLY the translated text
(even if SUMMARY_ROLE would otherwise use the configured user template).
Args:
history_manager: Object whose load_history() supplies past messages; assistant
messages from it are summarised when source_text is not given.
actual_num: Chapter number used in log messages and summary headers.
base_system_content: Not referenced in this method body.
source_text: Text to summarise directly; takes precedence over history.
previous_summary_text: Prior summary, see above.
previous_summary_chapter_num: Not referenced in this method body.
prefer_translations_only_user: See above.
Returns:
The stripped summary text, or None when rolling summaries are disabled,
there is nothing to summarise, or generation/saving fails.
"""
if not self.config.USE_ROLLING_SUMMARY:
return None
current_history = history_manager.load_history()
# Each exchange is counted as two messages (hence * 2)
messages_to_include = self.config.ROLLING_SUMMARY_EXCHANGES * 2
# Prefer directly provided source text (e.g., just-translated chapter) when available
assistant_responses = []
if source_text and isinstance(source_text, str) and source_text.strip():
assistant_responses = [source_text]
else:
if len(current_history) >= 2:
recent_messages = current_history[-messages_to_include:] if messages_to_include > 0 else current_history
for h in recent_messages:
if h.get("role") == "assistant":
assistant_responses.append(h["content"])
# If still empty, skip quietly
if not assistant_responses:
return None
# Build a dedicated summary system prompt (do NOT reuse main translation system prompt)
# The glossary file is located here and handed to build_system_prompt below
summary_system_template = os.getenv("ROLLING_SUMMARY_SYSTEM_PROMPT", "You create concise summaries for continuity.").strip()
try:
glossary_path = find_glossary_file(self.out_dir)
except Exception:
glossary_path = None
# Rolling summary generation is a summarization-only call; do NOT append glossary here.
# (This keeps prompts smaller and avoids glossary-compression logic for summaries.)
# The previous APPEND_GLOSSARY value is saved and restored in the finally block.
_prev_append_glossary_env = os.environ.get("APPEND_GLOSSARY")
try:
os.environ["APPEND_GLOSSARY"] = "0"
system_prompt = build_system_prompt(summary_system_template, glossary_path, source_text=source_text)
finally:
if _prev_append_glossary_env is None:
os.environ.pop("APPEND_GLOSSARY", None)
else:
os.environ["APPEND_GLOSSARY"] = _prev_append_glossary_env
# Add explicit instruction for clarity (glossary usage instructions come from APPEND_GLOSSARY_PROMPT).
system_prompt += "\n\n[Instruction: Update the rolling summary using any prior summary context provided, plus the newly provided translated text. Do not include warnings or explanations.]"
user_prompt_template = os.getenv(
"ROLLING_SUMMARY_USER_PROMPT",
"Summarize the key events, characters, tone, and important details from these translations. "
"Focus on: character names/relationships, plot developments, and any special terminology used.\n\n"
"{translations}"
)
translations_text = "\n---\n".join(assistant_responses)
user_prompt = user_prompt_template.replace("{translations}", translations_text)
# Optional: provide the previous rolling summary as an assistant message for context.
# IMPORTANT: This MUST NOT be duplicated into the user message.
prev_summary_msg = None
if previous_summary_text and isinstance(previous_summary_text, str) and previous_summary_text.strip():
prev_summary_msg = {
"role": "assistant",
"content": (
"[PREVIOUS ROLLING SUMMARY — UPDATE THIS]\n"
+ previous_summary_text.strip()
+ "\n[END PREVIOUS ROLLING SUMMARY]"
),
}
# SUMMARY_ROLE also controls the rolling-summary generation payload.
# Default to 'both' to preserve legacy behavior when the env var isn't set.
summary_role = (os.getenv("SUMMARY_ROLE", "both") or "both").strip().lower()
# When requested, force the user message to be ONLY the translated text.
if prefer_translations_only_user:
summary_role = "system" # ensures we include system prompt + translations-only user message
if summary_role == "system":
# System prompt + user content containing ONLY the translated text
summary_msgs = [{"role": "system", "content": system_prompt}]
if prev_summary_msg:
summary_msgs.append(prev_summary_msg)
summary_msgs.append({"role": "user", "content": translations_text})
elif summary_role == "user":
# User prompt only (as configured) with translated text inside it
summary_msgs = []
if prev_summary_msg:
summary_msgs.append(prev_summary_msg)
summary_msgs.append({"role": "user", "content": user_prompt})
else:
# both (current behavior); any unrecognised SUMMARY_ROLE value also lands here
summary_msgs = [{"role": "system", "content": system_prompt}]
if prev_summary_msg:
summary_msgs.append(prev_summary_msg)
summary_msgs.append({"role": "user", "content": f"[Rolling Summary of Chapter {actual_num}]\n" + user_prompt})
try:
# Get configurable rolling summary token limit
# -1 means: use the main MAX_OUTPUT_TOKENS value
raw_max = os.getenv('ROLLING_SUMMARY_MAX_TOKENS', '-1')
try:
rolling_summary_max_tokens = int(str(raw_max).strip())
except Exception:
rolling_summary_max_tokens = -1
if rolling_summary_max_tokens == -1:
rolling_summary_max_tokens = int(getattr(self.config, 'MAX_OUTPUT_TOKENS', 8192))
# The limit actually sent is capped at config.MAX_OUTPUT_TOKENS
send_result = send_with_interrupt(
summary_msgs, self.client, self.config.TEMP,
min(int(rolling_summary_max_tokens), self.config.MAX_OUTPUT_TOKENS),
self.check_stop,
context='summary'
)
# send_with_interrupt may return:
# - a plain string (content)
# - (content, finish_reason)
# - (content, finish_reason, raw_obj)
# We only need the content for rolling summaries.
if isinstance(send_result, tuple) and len(send_result) >= 1:
summary_resp = send_result[0]
else:
summary_resp = send_result
# Save the summary to the output folder
summary_file = os.path.join(self.out_dir, "rolling_summary.txt")
mode = "a" if self.config.ROLLING_SUMMARY_MODE == "append" else "w"
# Header formatting:
# - append mode: each appended block corresponds to a specific chapter → keep chapter-specific header
# - replace mode: file is overwritten and represents the current rolling window → label as "Last N Chapters"
if mode == "a":
header_title = f"=== Rolling Summary of Chapter {actual_num} ==="
else:
try:
_n = int(getattr(self.config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0) or 0)
except Exception:
_n = 0
header_title = f"=== Rolling Summary of Last {_n} Chapters ===" if _n > 0 else "=== Rolling Summary ==="
header = header_title + "\n"
# NOTE(review): the file is opened (and in "w" mode truncated) before summary_resp.strip()
# is evaluated; if summary_resp is not a string the write fails after the header - confirm acceptable
with open(summary_file, mode, encoding="utf-8") as sf:
if mode == "a":
sf.write("\n\n")
sf.write(header)
sf.write(summary_resp.strip())
# If in append mode, trim to retain only the last N entries if configured
try:
if self.config.ROLLING_SUMMARY_MODE == "append":
max_entries = int(getattr(self.config, "ROLLING_SUMMARY_MAX_ENTRIES", 0) or 0)
if max_entries > 0:
with open(summary_file, 'r', encoding='utf-8') as rf:
content = rf.read()
# Find the start of each summary block by header line
headers = [m.start() for m in re.finditer(r"(?m)^===\s*Rolling Summary.*$", content)]
if len(headers) > max_entries:
# Keep only the last max_entries blocks
keep_starts = headers[-max_entries:]
blocks = []
for i, s in enumerate(keep_starts):
e = keep_starts[i + 1] if i + 1 < len(keep_starts) else len(content)
block = content[s:e].strip()
if block:
blocks.append(block)
trimmed_content = ("\n\n".join(blocks) + "\n") if blocks else ""
with open(summary_file, 'w', encoding='utf-8') as wf:
wf.write(trimmed_content)
# Optional log showing retained count
try:
self._log(f"📚 Total summaries in memory: {len(blocks)} (trimmed to last {max_entries})")
except Exception:
pass
except Exception as _trim_err:
# Trimming is best effort; the summary written above is kept either way
try:
self._log(f"⚠️ Failed to trim rolling summaries: {_trim_err}")
except Exception:
pass
# Log to GUI if available, otherwise console
try:
self._log(f"📝 Generated rolling summary for Chapter {actual_num} ({'append' if mode=='a' else 'replace'} mode)")
self._log(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)")
except Exception:
print(f"📝 Generated rolling summary for Chapter {actual_num} ({'append' if mode=='a' else 'replace'} mode)")
print(f" ➜ Saved to: {summary_file} ({len(summary_resp.strip())} chars)")
return summary_resp.strip()
except Exception as e:
# Any failure while sending or saving is reported and turned into a None result
try:
self._log(f"⚠️ Failed to generate rolling summary: {e}")
except Exception:
print(f"⚠️ Failed to generate rolling summary: {e}")
return None
def translate_with_retry(self, msgs, chunk_html, c, chunk_idx, total_chunks, merge_group_len=None, merged_chapters=None):
    """Send one chunk to the translation API, retrying on recoverable failures.

    The request is re-sent in these situations:
      * the response of a merged request cannot be split back into
        ``merge_group_len`` sections (only when RETRY_SPLIT_FAILED is enabled);
      * the response is flagged by ``check_duplicate_content`` (only when the
        RETRY_DUPLICATE_BODIES env var is "1");
      * the client reports a cancellation or timeout (at most
        TIMEOUT_RETRY_ATTEMPTS times, default 2);
      * the client reports a rate limit (after an interruptible 60s wait).

    Truncated output (``finish_reason == "length"``) is deliberately NOT
    retried here; that is handled inside unified_api_client to avoid
    cascading retries.

    ``self.config.TEMP`` may be raised temporarily during duplicate retries.
    It is restored (together with ``MAX_OUTPUT_TOKENS``) on every exit path,
    including early returns and exceptions.

    Args:
        msgs: Chat messages to send. On a duplicate retry the last entry is
            replaced in place with a retry-specific user prompt.
        chunk_html: Source text of this chunk; used for retry prompts and as
            the fallback source for emergency image restoration.
        c: Chapter dict. Keys read here: ``num``, ``__index``,
            ``actual_chapter_num``, ``is_chunk``, ``filename``,
            ``original_basename``, ``enhanced_extraction``, ``__progress``,
            ``original_html`` and ``history_manager``.
        chunk_idx: Position of this chunk; equals ``total_chunks`` for the
            last chunk of the chapter.
        total_chunks: Number of chunks the chapter was split into.
        merge_group_len: Number of chapters merged into this request, if any.
        merged_chapters: Optional list of chapter numbers that were merged
            into this request.

    Returns:
        ``(result, finish_reason, raw_obj)`` on success.
        ``(None, None, None)`` when the user stopped the translation.
        ``(None, "graceful_stop", None)`` when a cancelled request is skipped
        during a graceful stop.
        ``("[TIMEOUT]", "timeout", None)`` when timeout retries are exhausted
        or a timeout happens during a graceful stop.

    Raises:
        UnifiedClientError: For API errors that are not retried.
        Exception: Any other unexpected error is logged and re-raised.
    """
    import random
    import traceback

    # Reset per-request client state left behind by a previous chunk.
    if hasattr(self.client, 'reset_cleanup_state'):
        self.client.reset_cleanup_state()
    if hasattr(self.client, '_in_cleanup'):
        self.client._in_cleanup = False
    if hasattr(self.client, '_cancelled'):
        self.client._cancelled = False
    # Reinitialize Gemini client if it was closed by a previous timeout
    if hasattr(self.client, 'gemini_client') and self.client.gemini_client is None:
        try:
            self.client._setup_client()
        except Exception:
            # Best effort: a failure here resurfaces on the actual API call.
            pass

    # AI Hunter settings: environment variable first, config attribute second.
    try:
        ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG')
        if ai_hunter_config_str:
            ai_config = json.loads(ai_hunter_config_str)
        else:
            ai_config = getattr(self.config, 'ai_hunter_config', {})
    except (json.JSONDecodeError, AttributeError):
        ai_config = {}
    if isinstance(ai_config, dict):
        max_duplicate_retries = ai_config.get('retry_attempts', 6)
        disable_temp_change = ai_config.get('disable_temperature_change', False)
    else:
        max_duplicate_retries = 6
        disable_temp_change = False

    try:
        split_failed_retry_limit = int(getattr(self.config, 'SPLIT_FAILED_RETRY_ATTEMPTS', 2))
    except Exception:
        split_failed_retry_limit = 2
    split_retry_enabled = (os.getenv("RETRY_SPLIT_FAILED", "0") == "1") or bool(getattr(self.config, "RETRY_SPLIT_FAILED", False))
    try:
        max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2"))
    except Exception:
        max_timeout_retries = 2

    split_failed_retry_count = 0
    duplicate_retry_count = 0
    timeout_retry_count = 0
    history_purged = False
    original_max_tokens = self.config.MAX_OUTPUT_TOKENS
    original_temp = self.config.TEMP

    # Stable chapter number for this chunk (used for payload metadata and logs)
    idx = c.get('__index', 0)
    actual_num = c.get('actual_chapter_num', c.get('num', idx + 1))
    chunk_label = f"Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}"

    # Chunk timeout: the RETRY_TIMEOUT env var overrides the config flag.
    # "0"/"false"/"off"/blank disables chunk timeouts entirely.
    env_retry = os.getenv("RETRY_TIMEOUT")
    if env_retry is not None:
        retry_timeout_enabled = env_retry.strip().lower() not in ("0", "false", "off", "")
    else:
        retry_timeout_enabled = bool(getattr(self.config, "RETRY_TIMEOUT", False))
    chunk_timeout = None
    if retry_timeout_enabled:
        env_ct = os.getenv("CHUNK_TIMEOUT")
        if env_ct and str(env_ct).strip().lower() not in ("", "none", "0"):
            try:
                chunk_timeout = int(float(env_ct))
            except Exception:
                chunk_timeout = getattr(self.config, "CHUNK_TIMEOUT", None)
        else:
            chunk_timeout = getattr(self.config, "CHUNK_TIMEOUT", None)
        # Treat non-positive timeouts as disabled
        try:
            if chunk_timeout is not None and float(chunk_timeout) <= 0:
                chunk_timeout = None
        except Exception:
            chunk_timeout = None

    def local_stop_cb():
        """Return True when the in-flight request should be aborted."""
        # Query stop_callback directly to bypass check_stop's graceful logic.
        if not (self.stop_callback and self.stop_callback()):
            return False
        if os.environ.get('GRACEFUL_STOP') != '1':
            # Force stop
            log_stop_once()
            return True
        # Graceful stop: cancel right away unless we were asked to wait for
        # chunks, or this is the last chunk and finishing it completes the chapter.
        wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
        is_last_chunk = (chunk_idx == total_chunks)
        if not wait_for_chunks and not is_last_chunk:
            log_stop_once()
            return True
        return False

    def wait_before_retry():
        """Sleep a random delay between half and full SEND_INTERVAL_SECONDS."""
        base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
        retry_delay = random.uniform(base_delay / 2, base_delay)
        print(f" ⏳ Waiting {retry_delay:.1f}s before retry...")
        time.sleep(retry_delay)

    result = None
    finish_reason = None
    raw_obj = None
    try:
        while True:
            if local_stop_cb():
                return None, None, None
            try:
                current_max_tokens = self.config.MAX_OUTPUT_TOKENS
                current_temp = self.config.TEMP

                # Token counts, separating assistant (memory/context) tokens
                total_tokens = 0
                assistant_tokens = 0
                for m in msgs:
                    tokens = self.chapter_splitter.count_tokens(m.get("content", ""))
                    total_tokens += tokens
                    if m.get("role") == "assistant":
                        assistant_tokens += tokens
                non_assistant_tokens = total_tokens - assistant_tokens

                # File reference shown in the prompt-size log line
                if c.get('is_chunk', False):
                    file_ref = f"Section_{c['num']}"
                else:
                    is_text_source = self.is_text_file or c.get('filename', '').endswith('.txt')
                    terminology = "Section" if is_text_source else "Chapter"
                    file_ref = c.get('original_basename', f"{terminology}_{c['num']}")

                # With contextual translation and assistant-role context
                # present, surface its token share explicitly.
                if getattr(self.config, 'CONTEXTUAL', False) and assistant_tokens > 0:
                    print(
                        f"💬 Chunk {chunk_idx}/{total_chunks} combined prompt: "
                        f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, "
                        f"assistant/memory: {assistant_tokens:,}) / {self.get_token_budget_str()} [File: {file_ref}]"
                    )
                else:
                    print(
                        f"💬 Chunk {chunk_idx}/{total_chunks} combined prompt: "
                        f"{total_tokens:,} tokens (system + user) / {self.get_token_budget_str()} [File: {file_ref}]"
                    )
                self.client.context = 'translation'

                # Output filename for this request
                if chunk_idx and total_chunks > 1:
                    # Prefer original_basename to preserve the source file's
                    # zero-padding (e.g., 0009_10)
                    original_basename = c.get('original_basename', '')
                    if original_basename:
                        base = os.path.splitext(original_basename)[0]
                        fname = f"response_{base}_chunk_{chunk_idx}.html"
                    else:
                        chapter_num = c['num']
                        if isinstance(chapter_num, float):
                            # Decimal chapters: 1.5 -> "response_001_50_chunk_N.html"
                            major = int(chapter_num)
                            minor = int(round((chapter_num - major) * 100))  # 1.5 -> 50, 1.1 -> 10
                            if minor > 0:
                                fname = f"response_{major:03d}_{minor:02d}_chunk_{chunk_idx}.html"
                            else:
                                # It's like 1.0, just use the integer part
                                fname = f"response_{major:03d}_chunk_{chunk_idx}.html"
                        else:
                            fname = f"response_{chapter_num:03d}_chunk_{chunk_idx}.html"
                else:
                    # Not a chunk - use regular naming
                    fname = FileUtilities.create_chapter_filename(c, c.get('actual_chapter_num', c['num']))

                # Set output filename BEFORE the API call
                if hasattr(self.client, 'set_output_filename'):
                    self.client.set_output_filename(fname)
                # Track the filename so truncation logs know which file this is
                if hasattr(self.client, '_current_output_file'):
                    self.client._current_output_file = fname

                chapter_ctx = {
                    'chapter': actual_num,
                    'chunk': chunk_idx,
                    'total_chunks': total_chunks,
                    'merged_chapters': merged_chapters,
                }
                result, finish_reason, raw_obj = send_with_interrupt(
                    msgs,
                    self.client,
                    current_temp,
                    current_max_tokens,
                    local_stop_cb,
                    chunk_timeout,
                    context='translation',
                    chapter_context=chapter_ctx,
                    bypass_graceful_stop=True
                )

                # Enhanced mode: the API translated markdown; convert it back to HTML.
                if result and c.get("enhanced_extraction", False):
                    print("🔄 Converting translated markdown back to HTML...")
                    result = convert_enhanced_text_to_html(result, c)

                # Emergency image restoration (enabled globally OR forced for
                # this chapter via the progress data attached to the chunk)
                prog_data = c.get('__progress', {})
                force_restore = prog_data.get('force_image_restore', False) if isinstance(prog_data, dict) else False
                if result and (self.config.EMERGENCY_IMAGE_RESTORE or force_restore):
                    # Use original HTML if available (enhanced extraction), otherwise chunk_html
                    source_html = c.get('original_html', chunk_html)
                    result = ContentProcessor.emergency_restore_images(result, source_html)

                retry_needed = False
                is_duplicate_retry = False

                # --- Split failures of merged requests ---
                split_failed_in_finish = bool(finish_reason and 'split' in str(finish_reason).lower())
                split_failed_in_body = isinstance(result, str) and 'SPLIT_FAILED' in result
                split_validation_failed = False
                if merge_group_len and merge_group_len > 1 and result and isinstance(result, str):
                    try:
                        split_sections = RequestMerger.split_by_markers(result, merge_group_len)
                        if not split_sections or len(split_sections) != merge_group_len:
                            print(f" ⚠️ Split validation failed: Expected {merge_group_len} sections")
                            split_validation_failed = True
                    except Exception as e:
                        print(f" ⚠️ Split validation error: {e}")
                        split_validation_failed = True
                if (split_failed_in_finish or split_failed_in_body or split_validation_failed) and split_retry_enabled:
                    if split_failed_retry_count < split_failed_retry_limit:
                        retry_needed = True
                        split_failed_retry_count += 1
                        print(f" 🔄 Split failed — retrying merged request (attempt {split_failed_retry_count}/{split_failed_retry_limit})")
                    else:
                        print(f" ⚠️ SPLIT FAILED: Max split-failed retries ({split_failed_retry_limit}) reached - accepting response")

                # --- Duplicate content ---
                if not retry_needed:
                    # Re-read the environment variable so a runtime toggle is honoured
                    duplicate_enabled = os.getenv("RETRY_DUPLICATE_BODIES", "0") == "1"
                    if duplicate_enabled and duplicate_retry_count < max_duplicate_retries:
                        prog = c.get('__progress', {})
                        print(" 🔍 Checking for duplicate content...")
                        is_duplicate, _similarity = self.check_duplicate_content(result, idx, prog, self.out_dir, actual_num)
                        if is_duplicate:
                            retry_needed = True
                            is_duplicate_retry = True
                            duplicate_retry_count += 1
                            if duplicate_retry_count >= 3 and not history_purged:
                                # Third attempt: drop the translation history once
                                # and start again from the original temperature.
                                print(" 🧹 Clearing history after 3 attempts...")
                                if 'history_manager' in c:
                                    c['history_manager'].save_history([])
                                history_purged = True
                                if not disable_temp_change:
                                    self.config.TEMP = original_temp
                                else:
                                    print(f" 🌡️ Temperature change disabled - keeping current temp: {self.config.TEMP}")
                            elif duplicate_retry_count == 1:
                                if disable_temp_change:
                                    print(" 🔄 First duplicate retry - temperature change disabled")
                                else:
                                    print(" 🔄 First duplicate retry - same temperature")
                            elif history_purged:
                                if not disable_temp_change:
                                    attempts_since_purge = duplicate_retry_count - 3
                                    self.config.TEMP = min(original_temp + (0.1 * attempts_since_purge), 1.0)
                                    print(f" 🌡️ Post-purge temp: {self.config.TEMP}")
                                else:
                                    print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}")
                            else:
                                if not disable_temp_change:
                                    self.config.TEMP = min(original_temp + (0.1 * (duplicate_retry_count - 1)), 1.0)
                                    print(f" 🌡️ Gradual temp increase: {self.config.TEMP}")
                                else:
                                    print(f" 🌡️ Temperature change disabled - keeping temp: {self.config.TEMP}")

                            # Vary the user prompt so the model does not repeat itself.
                            if duplicate_retry_count == 1:
                                user_prompt = f"[RETRY] Chapter {c['num']}: Ensure unique translation.\n{chunk_html}"
                            elif duplicate_retry_count <= 3:
                                user_prompt = f"[ATTEMPT {duplicate_retry_count}] Translate uniquely:\n{chunk_html}"
                            else:
                                user_prompt = f"Chapter {c['num']}:\n{chunk_html}"
                            msgs[-1] = {"role": "user", "content": user_prompt}

                if retry_needed:
                    if is_duplicate_retry:
                        print(f" 🔄 Duplicate retry {duplicate_retry_count}/{max_duplicate_retries}")
                    time.sleep(2)
                    continue
                break

            except UnifiedClientError as e:
                error_msg = str(e)
                if "stopped by user" in error_msg:
                    print("❌ Translation stopped by user during API call")
                    return None, None, None

                # Classify timeout-like failures. A cancelled request (client
                # closed underneath us) is treated as a timeout and takes
                # precedence over the other two categories.
                was_cancelled = "cancelled" in error_msg or "Gemini client not initialized" in error_msg
                took_too_long = (not was_cancelled) and "took" in error_msg and "timeout:" in error_msg
                timed_out = "timed out" in error_msg and "timeout:" not in error_msg

                if was_cancelled or took_too_long or timed_out:
                    # Check stop flag before retrying
                    if self.check_stop():
                        print("❌ Translation stopped by user during timeout retry")
                        return None, None, None
                    # During graceful stop, don't retry - skip this chunk
                    if os.environ.get('GRACEFUL_STOP') == '1':
                        if was_cancelled:
                            print(f"⏸️ {chunk_label}: Skipped (graceful stop)")
                            return None, "graceful_stop", None
                        print(f"⏸️ {chunk_label}: Timed out during graceful stop - skipping retry")
                        return "[TIMEOUT]", "timeout", None

                    if timeout_retry_count >= max_timeout_retries:
                        if took_too_long:
                            print(f" ❌ {chunk_label}: Max timeout retries reached - marking chunk as failed")
                        else:
                            print(f"❌ {chunk_label}: Max timeout retries ({max_timeout_retries}) reached - marking chunk as failed")
                        return "[TIMEOUT]", "timeout", None

                    timeout_retry_count += 1
                    if took_too_long:
                        print(f" ⏱️ {chunk_label}: Chunk took too long, retry {timeout_retry_count}/{max_timeout_retries}")
                    else:
                        print(f"⚠️ {chunk_label}: {error_msg}, retrying ({timeout_retry_count}/{max_timeout_retries})...")

                    if was_cancelled:
                        # Reinitialize the client if its handle was closed
                        # (which handle to inspect depends on the client type).
                        client_type = getattr(self.client, 'client_type', 'unknown')
                        handle_attr = {'gemini': 'gemini_client', 'openai': 'openai_client'}.get(client_type)
                        needs_reinit = (
                            handle_attr is not None
                            and hasattr(self.client, handle_attr)
                            and getattr(self.client, handle_attr) is None
                        )
                        if needs_reinit:
                            try:
                                print(f" 🔄 Reinitializing {client_type} client...")
                                self.client._setup_client()
                            except Exception as reinit_err:
                                print(f" ⚠️ Failed to reinitialize client: {reinit_err}")

                    wait_before_retry()
                    continue

                if getattr(e, "error_type", None) == "rate_limit" or getattr(e, "http_status", None) == 429:
                    # Rate limit errors - clean handling without traceback
                    print("⚠️ Rate limited, sleeping 60s…")
                    for _ in range(60):
                        if self.check_stop():
                            print("❌ Translation stopped during rate limit wait")
                            return None, None, None
                        time.sleep(1)
                    continue

                # Unexpected errors: show the message, suppress the traceback
                # for the known API-side categories.
                if getattr(e, "error_type", None) in ["api_error", "validation", "prohibited_content"]:
                    print(f"❌ API Error: {error_msg}")
                    raise UnifiedClientError(f"API Error: {error_msg}") from e
                raise
            except Exception as e:
                print(f"❌ Unexpected error during API call: {e}")
                print(f"Full traceback:\n{traceback.format_exc()}")
                raise
    finally:
        # Always undo temporary overrides, including on early return or
        # exception, so later chunks do not inherit a raised temperature.
        self.config.MAX_OUTPUT_TOKENS = original_max_tokens
        self.config.TEMP = original_temp

    total_simple_retries = split_failed_retry_count
    if duplicate_retry_count > 0:
        print(f" 🔄 Restored original temperature: {self.config.TEMP} (after {duplicate_retry_count} duplicate retries)")
    elif timeout_retry_count > 0:
        print(f" 🔄 Restored original settings after {timeout_retry_count} timeout retries")
    elif total_simple_retries > 0:
        print(f" 🔄 Restored original settings after {total_simple_retries} retries")
    if duplicate_retry_count >= max_duplicate_retries:
        print(f" ⚠️ WARNING: Duplicate content issue persists after {max_duplicate_retries} attempts")
    return result, finish_reason, raw_obj
def get_token_budget_str(self):
    """Return the display string for the configured input-token budget.

    Reads the MAX_INPUT_TOKENS environment variable (default "1000000"),
    strips surrounding whitespace and hands it to ``parse_token_limit``;
    only the string half of the returned pair is used.
    """
    raw_limit = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
    _limit_value, budget_label = parse_token_limit(raw_limit)
    return budget_label
# =====================================================
# BATCH TRANSLATION PROCESSOR
# =====================================================
class BatchTranslationProcessor:
"""Handles batch/parallel translation processing"""
def __init__(self, config, client, base_msg, out_dir, progress_lock,
save_progress_fn, update_progress_fn, check_stop_fn,
image_translator=None, is_text_file=False, history_manager=None):
self.config = config
self.client = client
self.base_msg = base_msg
self.out_dir = out_dir
self.progress_lock = progress_lock
self.save_progress_fn = save_progress_fn
self.update_progress_fn = update_progress_fn
self.check_stop_fn = check_stop_fn
self.image_translator = image_translator
self.chapters_completed = 0
self.chunks_completed = 0
self.is_text_file = is_text_file
# Optional shared HistoryManager for contextual translation across chapters
self.history_manager = history_manager
# Rolling summary support (batch mode): inject a snapshot per batch.
# This is updated by the main thread between batches.
import threading
self._batch_rolling_summary_lock = threading.Lock()
self._batch_rolling_summary_text = "" # exact rolling_summary.txt contents for current batch
# Optionally log multi-key status
if hasattr(self.client, 'use_multi_keys') and self.client.use_multi_keys:
stats = self.client.get_stats()
print(f"🔑 Batch processor using multi-key mode: {stats.get('total_keys', 0)} keys")
def set_batch_rolling_summary_text(self, text: str) -> None:
"""Set the rolling summary snapshot to be injected for the current batch."""
try:
if text is None:
text = ""
except Exception:
text = ""
with self._batch_rolling_summary_lock:
self._batch_rolling_summary_text = text
def get_batch_rolling_summary_text(self) -> str:
"""Get the rolling summary snapshot (thread-safe)."""
with self._batch_rolling_summary_lock:
return self._batch_rolling_summary_text
def process_single_chapter(self, chapter_data):
"""Process a single chapter (runs in thread)"""
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
idx, chapter = chapter_data
chap_num = chapter["num"]
# Use the pre-calculated actual_chapter_num from the main loop
actual_num = chapter.get('actual_chapter_num')
# Fallback if not set (common in batch mode where first pass might be skipped)
if actual_num is None:
# CHUNK FIX: For split text/PDF chunks with decimal numbering, use chap_num directly
# Chunks have 'is_chunk' flag and decimal 'num' values (1.0, 1.1, etc.)
if chapter.get('is_chunk', False) and isinstance(chap_num, float):
actual_num = chap_num
else:
# Try to extract it using the same logic as non-batch mode
raw_num = FileUtilities.extract_actual_chapter_number(chapter, patterns=None, config=self.config)
# Apply offset if configured
offset = self.config.CHAPTER_NUMBER_OFFSET if hasattr(self.config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
# Check if zero detection is disabled
if hasattr(self.config, 'DISABLE_ZERO_DETECTION') and self.config.DISABLE_ZERO_DETECTION:
actual_num = raw_num
elif hasattr(self.config, '_uses_zero_based') and self.config._uses_zero_based:
# This is a 0-based novel, adjust the number
actual_num = raw_num + 1
else:
# Default to raw number (1-based or unknown)
actual_num = raw_num
print(f" 📖 Extracted actual chapter number: {actual_num} (from raw: {raw_num})")
# APPLY INTERRUPTIBLE THREADING DELAY AFTER determining chapter number
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
if thread_delay > 0:
# Check if we need to wait (same logic as unified_api_client)
if hasattr(self.client, '_thread_submission_lock') and hasattr(self.client, '_last_thread_submission_time'):
with self.client._thread_submission_lock:
current_time = time.time()
time_since_last = current_time - self.client._last_thread_submission_time
if time_since_last < thread_delay:
sleep_time = thread_delay - time_since_last
thread_name = threading.current_thread().name
# Use actual_num now that it's been determined
# Only log if not during graceful stop (about to be cancelled)
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if not graceful_stop_active:
print(f"🧵 [{thread_name}] Applying thread delay: {sleep_time:.3f}s for Chapter {actual_num}")
# Interruptible sleep - check stop flag every 0.1 seconds
elapsed = 0
check_interval = 0.1
while elapsed < sleep_time:
if self.check_stop_fn():
# Only log if not during graceful stop (expected interruption)
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if not graceful_stop_active:
print(f"🛑 Threading delay interrupted by stop flag")
raise Exception("Translation stopped by user during threading delay")
sleep_chunk = min(check_interval, sleep_time - elapsed)
time.sleep(sleep_chunk)
elapsed += sleep_chunk
self.client._last_thread_submission_time = time.time()
if not hasattr(self.client, '_thread_submission_count'):
self.client._thread_submission_count = 0
self.client._thread_submission_count += 1
# Initialize variables that might be needed in except block
content_hash = None
ai_features = None
# Reinitialize Gemini client if it was closed by a previous timeout
if hasattr(self.client, 'gemini_client') and self.client.gemini_client is None:
try:
self.client._setup_client()
except Exception:
pass
try:
# Check if this is from a text file
is_text_source = self.is_text_file or chapter.get('filename', '').endswith('.txt') or chapter.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f"🔄 Starting #{idx+1} (Internal: {terminology} {chap_num}, Actual: {terminology} {actual_num}) (thread: {threading.current_thread().name}) [File: {chapter.get('original_basename', f'{terminology}_{chap_num}')}]")
content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"])
# Determine output filename early so we can track it in progress
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
with self.progress_lock:
self.update_progress_fn(idx, actual_num, content_hash, fname, status="in_progress")
self.save_progress_fn()
chapter_body = chapter["body"]
if chapter.get('has_images') and self.image_translator and self.config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Processing images for Chapter {actual_num}...")
self.image_translator.set_current_chapter(actual_num)
chapter_body, image_translations = process_chapter_images(
chapter_body,
actual_num,
self.image_translator,
self.check_stop_fn
)
if image_translations:
# Create a copy of the processed body
from bs4 import BeautifulSoup
c = chapter
soup_for_text = BeautifulSoup(c["body"], 'html.parser')
# Remove all translated content
for trans_div in soup_for_text.find_all('div', class_='translated-text-only'):
trans_div.decompose()
# Use this cleaned version for text translation
text_to_translate = str(soup_for_text)
final_body_with_images = c["body"]
else:
text_to_translate = c["body"]
image_translations = {}
print(f"✅ Processed {len(image_translations)} images for Chapter {actual_num}")
# Build chapter-specific system prompt with glossary compression
glossary_path = find_glossary_file(self.out_dir)
# Capture compression stats if enabled
compress_glossary_enabled = os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1"
if compress_glossary_enabled and glossary_path and os.path.exists(glossary_path):
try:
# Load glossary to get original size
with open(glossary_path, 'r', encoding='utf-8') as f:
if glossary_path.lower().endswith(('.csv', '.md', '.txt')):
original_glossary = f.read()
else:
try:
glossary_data = json.load(f)
original_glossary = json.dumps(glossary_data, ensure_ascii=False, indent=2)
except json.JSONDecodeError:
# If JSON parsing fails, treat as text
f.seek(0)
original_glossary = f.read()
original_length = len(original_glossary)
# Build system prompt with compression
# Use get_system_prompt(1) since this is a single chapter (no merging)
base_prompt = self.config.get_system_prompt(actual_merge_count=1)
chapter_system_prompt = build_system_prompt(base_prompt, glossary_path, source_text=chapter_body)
# Extract compressed glossary from system prompt to measure compression
# The glossary is appended after the prompt, so we can estimate the size
prompt_without_glossary = base_prompt
glossary_in_prompt = len(chapter_system_prompt) - len(prompt_without_glossary) if len(chapter_system_prompt) > len(prompt_without_glossary) else 0
if glossary_in_prompt > 0 and original_length > glossary_in_prompt:
reduction_pct = ((original_length - glossary_in_prompt) / original_length * 100)
# Calculate token savings
try:
import tiktoken
try:
enc = tiktoken.encoding_for_model(self.config.MODEL)
except:
enc = tiktoken.get_encoding('cl100k_base')
original_tokens = len(enc.encode(original_glossary))
compressed_tokens = len(enc.encode(chapter_system_prompt)) - len(enc.encode(prompt_without_glossary))
token_reduction_pct = ((original_tokens - compressed_tokens) / original_tokens * 100) if original_tokens > 0 else 0
print(f"🗜️ Glossary: {original_length:,}→{glossary_in_prompt:,} chars ({reduction_pct:.1f}%), {original_tokens:,}→{compressed_tokens:,} tokens ({token_reduction_pct:.1f}%)")
except ImportError:
print(f"🗜️ Glossary compressed: {original_length:,} → {glossary_in_prompt:,} chars ({reduction_pct:.1f}% reduction)")
except Exception as e:
print(f"⚠️ Failed to measure glossary compression: {e}")
chapter_system_prompt = build_system_prompt(self.config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=chapter_body)
else:
chapter_system_prompt = build_system_prompt(self.config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=chapter_body)
# Check if chapter needs chunking
from chapter_splitter import ChapterSplitter
chapter_splitter = ChapterSplitter(model_name=self.config.MODEL)
# Get token budget
token_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
if not token_env or token_env.lower() == "unlimited":
max_input_tokens = 1000000
budget_str = "unlimited"
elif token_env.isdigit():
max_input_tokens = int(token_env)
budget_str = f"{max_input_tokens:,}"
else:
max_input_tokens = 1000000
budget_str = "unlimited"
# Calculate available tokens for content based on effective OUTPUT limit (same as calculation phase)
# Use output token limit with compression factor, not input limit
max_output_tokens = self.config.get_effective_output_limit()
safety_margin_output = 500
compression_factor = self.config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
available_tokens = max(available_tokens, 1000) # Ensure minimum
# Split into chunks if needed
# Get filename for content type detection
chapter_filename = chapter.get('filename') or chapter.get('original_basename', '')
chunks = chapter_splitter.split_chapter(chapter_body, available_tokens, filename=chapter_filename)
total_chunks = len(chunks)
file_ref = chapter.get('original_basename', f'{terminology}_{chap_num}')
# Initialize shared structures for chunk processing (works for 1 or many chunks)
translated_chunks = [None] * total_chunks # Pre-allocate to maintain order
chunks_lock = threading.Lock()
if total_chunks > 1:
print(f"✂️ Chapter {actual_num} requires {total_chunks} chunks - processing in parallel")
def process_chunk(chunk_data):
"""Translate one chunk of the current chapter; submitted to the chunk executor as a worker task.

Args:
chunk_data: Sequence unpacked as (chunk_html, chunk_idx, chunk_total).
chunk_idx is treated as 1-based by the filename and logging logic below.

Returns:
A 5-tuple (result, chunk_idx, raw_obj, is_truncated, finish_reason).
Early exits use sentinel tuples instead of raising:
(None, chunk_idx, None, False, "cancelled") when a stop was requested,
(None, chunk_idx, None, False, "graceful_stop") when skipped during graceful stop,
("[TIMEOUT]", chunk_idx, None, False, "timeout") when cancel/timeout retries run out.

Raises:
UnifiedClientError: if chunk_abort_event is already set before sending, or
for any send error that is neither a cancellation nor a timeout.
Exception: if the API returned an empty result.
"""
chunk_html, chunk_idx, chunk_total = chunk_data
# Check if stop requested - but respect wait_for_chunks setting
if local_stop_cb():
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if not (graceful_stop_active and wait_for_chunks and chunk_total > 1):
# Return 5 values to match expected signature
return None, chunk_idx, None, False, "cancelled"
# If wait_for_chunks is enabled, continue processing
# Build user prompt for this chunk
if total_chunks > 1:
# Template comes from the environment; it is filled in with str.format, so it may
# reference {chunk_idx}, {total_chunks} and {chunk_html}.
chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]\\n{chunk_html}")
user_prompt = chunk_prompt_template.format(
chunk_idx=chunk_idx,
total_chunks=total_chunks,
chunk_html=chunk_html
)
else:
user_prompt = chunk_html
# Build history-based memory when contextual translation is enabled
memory_msgs = []
if (
self.config.CONTEXTUAL
and self.history_manager is not None
and getattr(self.config, 'HIST_LIMIT', 0) > 0
):
try:
# Thread-safe history access - load_history() already has internal locking
history = self.history_manager.load_history()
hist_limit = getattr(self.config, 'HIST_LIMIT', 0)
# Keep only the newest hist_limit * 2 entries
# (presumably one user + one assistant entry per exchange - TODO confirm)
trimmed = history[-hist_limit * 2:]
include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1"
model_lower = getattr(self.config, 'MODEL', '').lower()
is_gemini_3 = ('gemini-3' in model_lower) or ('gemini-exp-' in model_lower)
if is_gemini_3:
# Preserve raw content (thought signatures) and reconstruct text when missing
for h in trimmed:
if not isinstance(h, dict):
continue
role = h.get('role', 'user')
raw_obj = h.get('_raw_content_object')
content = h.get('content') or ""
if (not content) and raw_obj:
content = extract_text_from_raw_content(raw_obj)
# Skip empty entries unless raw content exists
if not content and raw_obj is None:
continue
if role == 'user' and not include_source:
continue
msg = {'role': role}
if content:
msg['content'] = content
if raw_obj is not None:
msg['_raw_content_object'] = raw_obj
memory_msgs.append(msg)
else:
# Original memory block approach for non-Gemini 3 models
memory_blocks = []
for h in trimmed:
if not isinstance(h, dict):
continue
role = h.get('role', 'user')
content = h.get('content', '')
if not content:
continue
# Optionally skip previous source text when disabled
if role == 'user' and not include_source:
continue
if role == 'user':
prefix = (
"[MEMORY - PREVIOUS SOURCE TEXT]\\n"
"This is prior source content provided for context only.\\n"
"Do NOT translate or repeat this text directly in your response.\\n\\n"
)
else:
prefix = (
"[MEMORY - PREVIOUS TRANSLATION]\\n"
"This is prior translated content provided for context only.\\n"
"Do NOT repeat or re-output this translation.\\n\\n"
)
footer = "\\n\\n[END MEMORY BLOCK]\\n"
memory_blocks.append(prefix + content + footer)
if memory_blocks:
combined_memory = "\\n".join(memory_blocks)
# Present history as an assistant message so the model
# treats it as prior context, not a new user instruction.
memory_msgs = [{
'role': 'assistant',
'content': combined_memory
}]
except Exception as e:
# Memory is optional context: on any failure, send the chunk without it
print(f"⚠️ Failed to build contextual memory for batch chunk: {e}")
memory_msgs = []
# Build messages for this chunk (system + optional rolling summary + optional memory + user)
rolling_summary_msgs = []
if getattr(self.config, 'USE_ROLLING_SUMMARY', False):
try:
rs_text = self.get_batch_rolling_summary_text()
except Exception:
rs_text = ""
if isinstance(rs_text, str) and rs_text:
# Do not strip/parse the file content. Only wrap to prevent accidental translation.
rolling_summary_msgs = [{
"role": "assistant",
"content": (
"CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n"
"[MEMORY] Previous context summary:\n\n"
+ rs_text + "\n\n"
"[END MEMORY]\n"
"END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:"
)
}]
# Build optional assistant prefill message if configured
assistant_prefill_msgs = []
if getattr(self.config, 'ASSISTANT_PROMPT', '') and self.config.ASSISTANT_PROMPT.strip():
assistant_prefill_msgs = [{"role": "assistant", "content": self.config.ASSISTANT_PROMPT.strip()}]
# Final message order: system, rolling summary, memory, assistant prefill, user
chapter_msgs = (
[{"role": "system", "content": chapter_system_prompt}]
+ rolling_summary_msgs
+ memory_msgs
+ assistant_prefill_msgs
+ [{"role": "user", "content": user_prompt}]
)
# Abort immediately if a prior chunk triggered prohibition (NOT for user stop)
if chunk_abort_event.is_set():
raise UnifiedClientError("Chunk aborted due to prohibited content", error_type="cancelled")
# Log combined prompt token count, including assistant/memory tokens when present
total_tokens = 0
assistant_tokens = 0
for msg in chapter_msgs:
content = msg.get("content", "")
tokens = chapter_splitter.count_tokens(content)
total_tokens += tokens
if msg.get("role") == "assistant":
assistant_tokens += tokens
non_assistant_tokens = total_tokens - assistant_tokens
if self.config.CONTEXTUAL and assistant_tokens > 0:
print(
f"💬 Chapter {actual_num}: Chunk {chunk_idx}/{total_chunks} combined prompt: "
f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, "
f"assistant/memory: {assistant_tokens:,}) / {budget_str} [File: {file_ref}]"
)
else:
print(
f"💬 Chapter {actual_num}: Chunk {chunk_idx}/{total_chunks} combined prompt: "
f"{total_tokens:,} tokens (system + user) / {budget_str} [File: {file_ref}]"
)
# Generate filename before API call
# Every chunk except the last gets a "_chunk_N" name; the last (or only) chunk
# gets the regular chapter filename.
if chunk_idx < total_chunks:
# This is a chunk - use chunk naming format
# Prefer original_basename to preserve source file's zero-padding (e.g., 0009_10)
original_basename = chapter.get('original_basename', '')
if original_basename:
# Use original basename (without extension) + _chunk_N
base = os.path.splitext(original_basename)[0]
fname = f"response_{base}_chunk_{chunk_idx}.html"
elif isinstance(actual_num, float):
# For decimal chapters like 1.5, use format like "response_001_5_chunk_1.html"
major = int(actual_num)
minor = int(round((actual_num - major) * 100)) # 1.5 -> 50, 1.1 -> 10
if minor > 0:
fname = f"response_{major:03d}_{minor:02d}_chunk_{chunk_idx}.html"
else:
# It's like 1.0, just use the integer part
fname = f"response_{major:03d}_chunk_{chunk_idx}.html"
else:
fname = f"response_{actual_num:03d}_chunk_{chunk_idx}.html"
else:
# Last chunk or single chunk - use regular naming
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
# Tell the client which output file this request belongs to (both hooks are optional)
if hasattr(self.client, 'set_output_filename'):
self.client.set_output_filename(fname)
if hasattr(self.client, '_current_output_file'):
self.client._current_output_file = fname
# Set thread-local label so downstream logs include chapter/chunk
try:
tls = self.client._get_thread_local_client()
tls.current_request_label = f"Chapter {actual_num} (chunk {chunk_idx}/{total_chunks})"
except Exception:
pass
# Log removed - unified_api_client._log_pre_stagger will log this
# print(f"📤 Sending Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks} to API...")
chapter_ctx = {
'chapter': actual_num,
'chunk': chunk_idx,
'total_chunks': total_chunks,
}
# Get chunk timeout from environment
# chunk_timeout stays None (no timeout passed) unless RETRY_TIMEOUT=1
retry_timeout_enabled = os.getenv("RETRY_TIMEOUT", "0") == "1"
chunk_timeout = int(os.getenv("CHUNK_TIMEOUT", "1800")) if retry_timeout_enabled else None
# Timeout retry logic (same as sequential mode)
try:
max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2"))
except Exception:
max_timeout_retries = 2
# One counter is shared by the "cancelled" and the "timed out" retry paths below
timeout_retry_count = 0
while True:
try:
result, finish_reason, raw_obj_from_send = send_with_interrupt(
chapter_msgs,
self.client,
self.config.TEMP,
self.config.MAX_OUTPUT_TOKENS,
local_stop_cb,
chunk_timeout=chunk_timeout,
context='translation',
chapter_context=chapter_ctx,
bypass_graceful_stop=True
)
break # Success, exit retry loop
except UnifiedClientError as e:
# Errors are classified by substring match on the message text
error_msg = str(e)
# Treat cancelled errors (from client being closed) as timeout
if "cancelled" in error_msg or "Gemini client not initialized" in error_msg:
# Check stop flag before retrying
if local_stop_cb():
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during timeout retry")
return None, chunk_idx, None, False, "cancelled"
# During graceful stop, don't retry - skip this chunk
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active:
print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Skipped (graceful stop)")
return None, chunk_idx, None, False, "graceful_stop"
if timeout_retry_count < max_timeout_retries:
timeout_retry_count += 1
print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: API cancelled/client closed, retrying ({timeout_retry_count}/{max_timeout_retries})...")
# Reinitialize the client if it was closed (check correct client based on type)
client_type = getattr(self.client, 'client_type', 'unknown')
needs_reinit = False
if client_type == 'gemini':
needs_reinit = hasattr(self.client, 'gemini_client') and self.client.gemini_client is None
elif client_type == 'openai':
needs_reinit = hasattr(self.client, 'openai_client') and self.client.openai_client is None
if needs_reinit:
try:
print(f" 🔄 Reinitializing {client_type} client...")
self.client._setup_client()
except Exception as reinit_err:
print(f" ⚠️ Failed to reinitialize client: {reinit_err}")
# Stagger retries to avoid simultaneous API calls
# Use SEND_INTERVAL_SECONDS as base, random from half to full
import random
base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
retry_delay = random.uniform(base_delay / 2, base_delay)
print(f" ⏳ Waiting {retry_delay:.1f}s before retry...")
time.sleep(retry_delay)
continue
else:
# Max retries reached, return timeout to trigger chapter abort
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached")
return "[TIMEOUT]", chunk_idx, None, False, "timeout"
# Check for timeout errors
elif "timed out" in error_msg:
# Check stop flag before retrying
if local_stop_cb():
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during timeout retry")
return None, chunk_idx, None, False, "cancelled"
# During graceful stop, don't retry - skip this chunk
# (unlike the cancelled branch above, this reports "timeout", not "graceful_stop")
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active:
print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timed out during graceful stop - skipping retry")
return "[TIMEOUT]", chunk_idx, None, False, "timeout"
if timeout_retry_count < max_timeout_retries:
timeout_retry_count += 1
print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: API call timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...")
# Stagger retries to avoid simultaneous API calls
# Use SEND_INTERVAL_SECONDS as base, random from half to full
import random
base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
retry_delay = random.uniform(base_delay / 2, base_delay)
print(f" ⏳ Waiting {retry_delay:.1f}s before retry...")
time.sleep(retry_delay)
continue
else:
# Max retries reached, return timeout to trigger chapter abort
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Max timeout retries ({max_timeout_retries}) reached")
return "[TIMEOUT]", chunk_idx, None, False, "timeout"
else:
# Not a timeout error, re-raise
raise
# Use the raw object directly from send_with_interrupt
raw_obj = raw_obj_from_send
# if raw_obj:
# print(f"🧠 Captured thought signature for chunk {chunk_idx}/{total_chunks}")
if total_chunks and int(total_chunks) > 1:
print(f"📥 Received Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks} response, finish_reason: {finish_reason}")
else:
print(f"📥 Received Chapter {actual_num} response, finish_reason: {finish_reason}")
# Char-ratio truncation retry (silent truncation)
# Heuristic: an output much shorter than its input (by character count) is treated
# as truncated even when finish_reason does not say so.
char_ratio_exhausted = False
try:
retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1"
except Exception:
retry_truncated_enabled = False
char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1"
if retry_truncated_enabled and char_ratio_enabled:
has_base64_image = ('data:image' in chunk_html) or ('base64,' in chunk_html)
used_fallback = getattr(self.client, '_used_fallback_key', False)
# Parse settings with sane bounds
try:
char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50"))
except Exception:
char_ratio_threshold_pct = 50.0
try:
char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1"))
except Exception:
char_ratio_retry_limit = 1
try:
char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100"))
except Exception:
char_ratio_min_output_chars = 100
char_ratio_threshold_pct = max(0.0, min(100.0, char_ratio_threshold_pct))
char_ratio_threshold = char_ratio_threshold_pct / 100.0
if char_ratio_retry_limit < 1:
char_ratio_retry_limit = 1
if char_ratio_min_output_chars < 0:
char_ratio_min_output_chars = 0
char_ratio_retry_count = 0
# The check is skipped entirely for chunks carrying inline base64 image data.
# Each pass either breaks (accept / give up) or re-sends and re-evaluates.
while not has_base64_image:
# Stop before any retries
if local_stop_cb():
break
input_char_count = len(chunk_html)
output_char_count = len(result) if result else 0
char_ratio = (output_char_count / input_char_count) if input_char_count > 0 else 0
# Only apply the char-ratio check when we didn't already see a truncation/prohibited-content signal
if finish_reason in ["length", "max_tokens", "content_filter", "prohibited_content"]:
break
if (char_ratio < char_ratio_threshold) and (output_char_count > char_ratio_min_output_chars):
# If the key fallback logic triggered, accept the output to avoid burning retries on worse keys
if used_fallback:
print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio suggests truncation but fallback key was used - accepting output")
break
if char_ratio_retry_count >= char_ratio_retry_limit:
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: All {char_ratio_retry_limit} char-ratio retries exhausted; marking as TRUNCATED")
char_ratio_exhausted = True
break
if char_ratio_retry_count == 0:
print(
f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio suggests truncation "
f"(Input chars: {input_char_count}, Output chars: {output_char_count}, Ratio: {char_ratio:.2f} < {char_ratio_threshold:.2f}). "
f"Attempting up to {char_ratio_retry_limit} retry(ies)..."
)
char_ratio_retry_count += 1
print(
f"🔄 Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry attempt "
f"{char_ratio_retry_count}/{char_ratio_retry_limit}"
)
# Force higher token limit on retries
try:
base_max_tokens = int(self.config.MAX_OUTPUT_TOKENS)
except Exception:
base_max_tokens = self.config.MAX_OUTPUT_TOKENS
try:
retry_cap = int(getattr(self.config, "MAX_RETRY_TOKENS", base_max_tokens))
except Exception:
retry_cap = base_max_tokens
if retry_cap <= 0:
retry_cap = base_max_tokens
# Never retry with fewer tokens than the normal limit
retry_max_tokens = max(base_max_tokens, retry_cap)
# Prevent nested truncation retries within the unified client during our char-ratio retries
try:
tls_retry_client = self.client._get_thread_local_client()
except Exception:
tls_retry_client = None
if tls_retry_client is not None:
setattr(tls_retry_client, "_in_truncation_retry", True)
try:
result_retry, finish_reason_retry, raw_obj_retry = send_with_interrupt(
chapter_msgs,
self.client,
self.config.TEMP,
retry_max_tokens,
local_stop_cb,
chunk_timeout=chunk_timeout,
context='translation',
chapter_context=chapter_ctx,
bypass_graceful_stop=True
)
except UnifiedClientError as e:
# Treat timeout during char-ratio retry as a timeout for the chunk
error_msg = str(e)
if "cancelled" in error_msg or "Gemini client not initialized" in error_msg:
if local_stop_cb():
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Translation stopped by user during char-ratio retry")
return None, chunk_idx, None, False, "cancelled"
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active:
print(f"⏸️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Skipped char-ratio retry (graceful stop)")
return None, chunk_idx, None, False, "graceful_stop"
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry failed due to API cancellation")
return "[TIMEOUT]", chunk_idx, None, False, "timeout"
if "timed out" in error_msg:
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry timed out after {chunk_timeout} seconds")
return "[TIMEOUT]", chunk_idx, None, False, "timeout"
# Any other client error: keep the output we already have
print(f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry error: {e}. Accepting current output.")
break
finally:
# Always clear the retry marker, including on the early returns above
if tls_retry_client is not None:
try:
setattr(tls_retry_client, "_in_truncation_retry", False)
except Exception:
pass
retry_output_chars = len(result_retry) if result_retry else 0
# A retry replaces the current output only when it is strictly longer
if result_retry and retry_output_chars > output_char_count:
print(
f"✅ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry improved output "
f"({output_char_count} → {retry_output_chars} chars)"
)
result = result_retry
finish_reason = finish_reason_retry
raw_obj = raw_obj_retry
# Re-check ratio / decide on further retries
continue
print(
f"⚠️ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Char-ratio retry did not improve output "
f"({output_char_count} → {retry_output_chars} chars). Trying again if attempts remain..."
)
continue
# Ratio OK (or output too short to be meaningful): stop checking
break
# Treat truncation retries exhaustion as truncation even if finish_reason changed
# In batch mode each worker has its own thread-local client; check that flag too
try:
tls_client = self.client._get_thread_local_client()
except Exception:
tls_client = None
truncation_exhausted = False
if tls_client is not None:
truncation_exhausted = getattr(tls_client, "_truncation_retries_exhausted", False)
if not truncation_exhausted:
truncation_exhausted = getattr(self.client, "_truncation_retries_exhausted", False)
# Clear the flag on whichever client had it so it doesn't bleed into later calls
try:
if tls_client is not None and getattr(tls_client, "_truncation_retries_exhausted", False):
tls_client._truncation_retries_exhausted = False
except Exception:
pass
try:
if getattr(self.client, "_truncation_retries_exhausted", False):
self.client._truncation_retries_exhausted = False
except Exception:
pass
# Truncated if the API said so, the client exhausted its own truncation retries,
# or the char-ratio retries above were exhausted
if finish_reason in ["length", "max_tokens"] or truncation_exhausted or char_ratio_exhausted:
print(f" ⚠️ Chunk {chunk_idx}/{total_chunks} response was TRUNCATED!")
# Track truncation status
is_truncated = True
else:
is_truncated = False
if result:
# Remove chunk markers from result
# (matches the "[PART i/n]" label of the default chunk prompt, case-insensitively)
result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE)
return result, chunk_idx, raw_obj, is_truncated, finish_reason
else:
raise Exception(f"Empty result for chunk {chunk_idx}/{total_chunks}")
# Use ThreadPoolExecutor to process chunks in parallel
# Use same batch size as chapter-level parallelism
max_chunk_workers = min(total_chunks, self.config.BATCH_SIZE)
# Shared abort flag for this chapter's chunks (set when a chunk hits prohibited content)
chunk_abort_event = threading.Event()
# Stop callback that also checks the per-chapter abort flag
def _user_stop_requested() -> bool:
try:
return (self.check_stop_fn() if hasattr(self, "check_stop_fn") else False)
except Exception:
return False
def local_stop_cb() -> bool:
    """Decide whether chunk work for this chapter should stop right now.

    Decision ladder, evaluated top to bottom:
      1. chapter abort event set                  -> stop
      2. no user stop request                     -> keep going
      3. user stop without GRACEFUL_STOP=1        -> stop (forced)
      4. graceful stop with WAIT_FOR_CHUNKS=1     -> keep going
      5. graceful stop, every chunk sent or done  -> keep going
      6. anything else                            -> stop
    """
    # Chapter-local abort takes priority over everything else
    if chunk_abort_event.is_set():
        return True

    if not _user_stop_requested():
        return False

    # From here on the user has asked to stop; find out which kind of stop
    if os.environ.get('GRACEFUL_STOP') != '1':
        return True

    if os.environ.get('WAIT_FOR_CHUNKS') == '1':
        # Explicit request to let this chapter's chunks finish
        return False

    # Graceful stop with WAIT_FOR_CHUNKS off: only keep going when nothing
    # is left to submit. A failing check counts as "not everything was sent".
    try:
        everything_dispatched = _all_chunks_sent_or_done()
    except Exception:
        everything_dispatched = False
    return not everything_dispatched
# WAIT_FOR_CHUNKS semantics (batch translation):
# - WAIT_FOR_CHUNKS=1: always wait for remaining chunks of this chapter
# - WAIT_FOR_CHUNKS=0: ONLY wait if all chunks have already been *sent to the API* (post-stagger)
# Otherwise, cancel this chapter (do not write partial output).
sent_or_done_chunks = set() # chunk indices (1-based) that were in-flight or completed
def _update_sent_or_done_from_watchdog() -> None:
try:
import unified_api_client
st = unified_api_client.get_api_watchdog_state() if hasattr(unified_api_client, 'get_api_watchdog_state') else {}
entries = st.get('in_flight_entries', []) if isinstance(st, dict) else []
if not isinstance(entries, list):
return
chap_key = str(actual_num)
for e in entries:
if not isinstance(e, dict):
continue
if e.get('status') != 'in_flight':
continue
if str(e.get('chapter')) != chap_key:
continue
try:
tot = int(e.get('total_chunks') or 0)
except Exception:
tot = 0
if tot and int(total_chunks or 0) and tot != int(total_chunks or 0):
continue
try:
ch = int(e.get('chunk') or 0)
except Exception:
ch = 0
if ch:
sent_or_done_chunks.add(ch)
except Exception:
return
def _all_chunks_sent_or_done() -> bool:
    """Return True when every chunk of this chapter is in flight or finished.

    Refreshes ``sent_or_done_chunks`` from the API watchdog first (best
    effort), then compares its size with ``total_chunks``. A chapter with
    no chunks never counts as fully sent.
    """
    try:
        _update_sent_or_done_from_watchdog()
    except Exception:
        pass

    expected = int(total_chunks or 0)
    if expected <= 0:
        return False
    return len(sent_or_done_chunks) >= expected
def _cancel_chapter_due_to_stop(reason: str):
    """Cancel the whole chapter and raise; this function never returns normally.

    Steps:
      1. Set ``chunk_abort_event`` so sibling chunk workers bail out quickly.
      2. Log the cancellation together with ``reason``.
      3. Flip whichever stop/cancel switches ``unified_api_client`` exposes,
         so in-flight requests are stopped as well.
      4. Raise ``UnifiedClientError`` with ``error_type="cancelled"``.

    Args:
        reason: Human-readable explanation included in the log line.

    Raises:
        UnifiedClientError: the normal outcome.
        Exception: any error raised while importing ``unified_api_client`` or
            flipping its switches propagates unchanged in place of the
            cancellation error.
    """
    # Steps 1 and 2 are best effort; a failure there must not block the cancel
    try:
        chunk_abort_event.set()
    except Exception:
        pass
    try:
        print(f"🛑 Chapter {actual_num}: cancelling chapter (WAIT_FOR_CHUNKS=0) — {reason}")
    except Exception:
        pass

    # Force a real cancel so requests already sent are stopped too
    from unified_api_client import UnifiedClientError
    import unified_api_client as api_module

    if hasattr(api_module, 'set_stop_flag'):
        api_module.set_stop_flag(True)
    if hasattr(api_module, 'global_stop_flag'):
        api_module.global_stop_flag = True
    if hasattr(api_module, 'UnifiedClient'):
        api_module.UnifiedClient._global_cancelled = True
    if hasattr(api_module, 'hard_cancel_all'):
        api_module.hard_cancel_all()

    raise UnifiedClientError("Operation cancelled by user", error_type="cancelled")
last_chunk_raw_obj = None
chapter_truncated = False # Track if any chunk was truncated
with ThreadPoolExecutor(max_workers=max_chunk_workers, thread_name_prefix=f"Ch{actual_num}Chunk") as chunk_executor:
# Submit chunks with staggered delay to prevent simultaneous starts
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
future_to_chunk = {}
for idx, chunk_data in enumerate(chunks):
# Sleep BEFORE submitting (apply to all chunks when multiple chunks exist)
if thread_delay > 0 and total_chunks > 1:
chunk_num = chunk_data[1] # Extract chunk number for logging
print(f"🧵 Chapter {actual_num}: Delaying {thread_delay}s before submitting chunk {chunk_num}/{total_chunks}")
# Interruptible sleep - check stop flag every 0.1s
# But respect WAIT_FOR_CHUNKS setting during graceful stop
elapsed = 0
check_interval = 0.1
while elapsed < thread_delay:
# Read env vars INSIDE loop to catch stop pressed mid-delay
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
user_stop = _user_stop_requested()
if user_stop:
if graceful_stop_active and wait_for_chunks:
# Explicit wait-for-chunks: keep going (we will still submit remaining chunks)
pass
elif graceful_stop_active and (not wait_for_chunks):
# WAIT_FOR_CHUNKS disabled: only wait if every chunk is already sent.
if _all_chunks_sent_or_done():
# Should be rare here (we're still about to submit), but keep consistent.
pass
else:
_cancel_chapter_due_to_stop("stop requested before all chunks were sent")
else:
# Immediate stop
print(f"🛑 Chunk submission delay interrupted")
raise Exception("Translation stopped by user during chunk submission delay")
if chunk_abort_event.is_set():
raise Exception("Translation stopped (chapter abort)")
sleep_chunk = min(check_interval, thread_delay - elapsed)
time.sleep(sleep_chunk)
elapsed += sleep_chunk
# Now submit the chunk
future = chunk_executor.submit(process_chunk, chunk_data)
future_to_chunk[future] = chunk_data[1] # Store chunk index
# Collect results as they complete
completed_chunks = 0
for future in as_completed(future_to_chunk):
# Read env vars INSIDE loop to catch stop pressed mid-chunk
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
stop_requested = local_stop_cb()
# FIRST: Get the result (future already completed, so this is instant)
# With graceful stop ON, we should save completed work before stopping
try:
result, chunk_idx, raw_obj, is_truncated, finish_reason = future.result()
# Handle graceful-stop skipped chunks
if finish_reason == "graceful_stop":
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False))
if save_partial_results:
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
partial_content = None
try:
tls = self.client._get_thread_local_client()
partial_content = getattr(tls, '_last_truncated_content', None)
except Exception:
partial_content = getattr(self.client, '_last_truncated_content', None)
if isinstance(partial_content, str) and partial_content:
try:
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(partial_content)
except Exception:
pass
with self.progress_lock:
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["TRUNCATED"],
chapter_obj=chapter
)
self.save_progress_fn()
print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — saved truncated output")
else:
with self.progress_lock:
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["PARTIAL"],
chapter_obj=chapter
)
self.save_progress_fn()
print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — marked QA failed (PARTIAL)")
chunk_abort_event.set()
chunk_executor.shutdown(wait=False, cancel_futures=True)
# Let the outer handler mark the chapter as pending/skipped
raise UnifiedClientError(
"Graceful stop active - not starting new API call",
error_type="cancelled"
)
# Handle cancelled chunks (skipped due to stop request)
if finish_reason == "cancelled" or (result is None and finish_reason != "stop"):
print(f"⏭️ Chunk {chunk_idx}/{total_chunks} cancelled (stop requested)")
chunk_executor.shutdown(wait=False, cancel_futures=True)
raise Exception("Translation stopped by user")
# Immediate QA fail: stop remaining chunks and mark chapter
if finish_reason in ("content_filter", "prohibited_content", "error"):
# Signal other chunk workers to abort quickly (chapter-local only)
chunk_abort_event.set()
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False))
if save_prohibited_results:
# Do NOT preserve original; save AI output if any, otherwise empty
try:
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(result if isinstance(result, str) else "")
except Exception:
pass
with self.progress_lock:
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["PROHIBITED_CONTENT"],
chapter_obj=chapter
)
self.save_progress_fn()
chunk_executor.shutdown(wait=False, cancel_futures=True)
return False, actual_num, None, None, None
# Handle timeout failures - abort chapter and mark as failed
if finish_reason == "timeout":
chunk_abort_event.set()
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Timeout - aborting chapter")
with self.progress_lock:
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["TIMEOUT"],
chapter_obj=chapter
)
self.save_progress_fn()
chunk_executor.shutdown(wait=False, cancel_futures=True)
return False, actual_num, None, None, None
# Handle truncation - abort chapter and mark as failed
# Check if RETRY_TRUNCATED is enabled - if so, truncation should abort chapter
retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1"
if is_truncated and retry_truncated_enabled:
chunk_abort_event.set()
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
print(f"❌ Chapter {actual_num}, Chunk {chunk_idx}/{total_chunks}: Truncated - aborting chapter")
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False))
if save_partial_results:
try:
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(result if isinstance(result, str) else "")
except Exception:
pass
with self.progress_lock:
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["TRUNCATED"],
chapter_obj=chapter
)
self.save_progress_fn()
chunk_executor.shutdown(wait=False, cancel_futures=True)
return False, actual_num, None, None, None
if result:
# Store result at correct index to maintain order
with chunks_lock:
translated_chunks[chunk_idx - 1] = result # chunk_idx is 1-based
self.chunks_completed += 1
completed_chunks += 1
# Store the raw object if it's the last chunk (or the only chunk)
if chunk_idx == total_chunks:
last_chunk_raw_obj = raw_obj
# Track if any chunk was truncated
if is_truncated:
chapter_truncated = True
# Log redundant with "Received Chapter X, Chunk Y" above
# print(f"✅ Chunk {chunk_idx}/{total_chunks} completed ({completed_chunks}/{total_chunks})")
# Mark this chunk as done (if we got a real result)
try:
if isinstance(chunk_idx, int) and chunk_idx > 0:
sent_or_done_chunks.add(int(chunk_idx))
except Exception:
pass
# AFTER storing result: check if we should stop
if stop_requested and total_chunks > 1:
if graceful_stop_active and (wait_for_chunks or _all_chunks_sent_or_done()):
# Wait for remaining chunks - continue processing
print(f"⏳ Graceful stop — waiting for remaining chunks of chapter {actual_num}...")
else:
# WAIT_FOR_CHUNKS disabled and not all chunks were actually sent:
# cancel this chapter entirely (no partial output).
try:
chunk_executor.shutdown(wait=False, cancel_futures=True)
except Exception:
pass
_cancel_chapter_due_to_stop("stop requested before all chunks were sent")
except Exception as e:
chunk_idx = future_to_chunk[future]
# Don't print chunk error - will be printed at chapter level
raise
# Verify chunks - handle partial completion
is_partial_result = False
if None in translated_chunks:
missing = [i+1 for i, chunk in enumerate(translated_chunks) if chunk is None]
completed = [i+1 for i, chunk in enumerate(translated_chunks) if chunk is not None]
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and completed:
# Only allow partial output when WAIT_FOR_CHUNKS is explicitly enabled.
# When WAIT_FOR_CHUNKS is disabled, we cancel the whole chapter instead.
if wait_for_chunks:
print(f"⚠️ Chapter {actual_num}: partial translation ({len(completed)}/{total_chunks} chunks) due to graceful stop")
translated_chunks = [c for c in translated_chunks if c is not None]
is_partial_result = True
else:
_cancel_chapter_due_to_stop(f"missing chunks {missing} (WAIT_FOR_CHUNKS=0)")
else:
raise Exception(f"Failed to translate chunks: {missing}")
# Combine all chunks
if total_chunks > 1:
result = '\n'.join(translated_chunks)
print(f"🔗 Combined {total_chunks} chunks for Chapter {actual_num}")
else:
result = translated_chunks[0] if translated_chunks else None
if not result:
raise Exception("No translation result produced")
# Enhanced mode workflow (same as non-batch):
# 1. Original HTML -> html2text -> Markdown/plain text (during extraction)
# 2. Markdown sent to translation API (better for translation quality)
# 3. Translated markdown -> HTML conversion (here)
if result and chapter.get("enhanced_extraction", False):
print(f"🔄 Converting translated markdown back to HTML...")
result = convert_enhanced_text_to_html(result, chapter)
# Emergency Image Restoration (if enabled)
if result and self.config.EMERGENCY_IMAGE_RESTORE:
# Use original HTML if available (for enhanced extraction), otherwise use chapter_body
source_html = chapter.get('original_html', chapter_body)
result = ContentProcessor.emergency_restore_images(result, source_html)
if self.config.REMOVE_AI_ARTIFACTS:
result = ContentProcessor.clean_ai_artifacts(result, True)
result = ContentProcessor.clean_memory_artifacts(result)
cleaned = re.sub(r"^```(?:html)?\s*\n?", "", result, count=1, flags=re.MULTILINE)
cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE)
cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=self.config.REMOVE_AI_ARTIFACTS)
# Post-process: Remove duplicate H1+P pairs from translated OUTPUT if enabled
remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1'
if remove_duplicate_h1_p and cleaned:
# First: HTML-based duplicate removal
from bs4 import BeautifulSoup
output_soup = BeautifulSoup(cleaned, 'html.parser')
for h1_tag in output_soup.find_all('h1'):
h1_id = h1_tag.get('id', '')
if h1_id and h1_id.startswith('split-'):
continue
h1_text = h1_tag.get_text(strip=True)
if 'SPLIT MARKER' in h1_text:
continue
# Check next sibling (P after H1)
next_sibling = h1_tag.find_next_sibling()
if next_sibling and next_sibling.name == 'p':
if h1_text == next_sibling.get_text(strip=True):
next_sibling.decompose()
continue
# Check previous sibling (P before H1)
prev_sibling = h1_tag.find_previous_sibling()
if prev_sibling and prev_sibling.name == 'p':
if h1_text == prev_sibling.get_text(strip=True):
prev_sibling.decompose()
cleaned = str(output_soup)
# Second: Markdown-based duplicate removal (for enhanced extraction mode)
# Pattern: "Title Text\n\n# Title Text" - remove the plain text line before markdown header
def remove_markdown_duplicate_headers_batch(text):
    """Drop plain-text lines that merely repeat the markdown header that follows.

    A line is removed when it is non-blank, does not itself start with '#',
    and the first non-blank line after it is a markdown header (1-6 '#'
    characters followed by whitespace) whose text equals the line once both
    are stripped. Blank lines in between and the header itself are kept.

    Args:
        text: Newline-separated text to filter.

    Returns:
        The filtered text, re-joined with '\\n'.
    """
    heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
    source_lines = text.split('\n')
    total = len(source_lines)
    kept = []
    for pos, current in enumerate(source_lines):
        stripped = current.strip()
        # Only plain (non-header, non-blank) lines are candidates for removal.
        if stripped and not stripped.startswith('#'):
            # First non-blank line after the current one, if any.
            follower = next(
                (source_lines[k] for k in range(pos + 1, total) if source_lines[k].strip()),
                None,
            )
            if follower is not None:
                heading = heading_re.match(follower)
                if heading is not None and heading.group(2).strip() == stripped:
                    # Duplicate of the upcoming header: leave it out.
                    continue
        kept.append(current)
    return '\n'.join(kept)
cleaned = remove_markdown_duplicate_headers_batch(cleaned)
# Check for empty or failed response BEFORE writing to disk
if not cleaned or not str(cleaned).strip():
print(f"❌ Batch: Translation empty for chapter {actual_num} — skipping file write")
with self.progress_lock:
self.update_progress_fn(idx, actual_num, content_hash, None, status="qa_failed", qa_issues_found=["EMPTY_OUTPUT"])
self.save_progress_fn()
return False, actual_num, None, None, None
if is_qa_failed_response(cleaned):
failure_reason = get_failure_reason(cleaned)
print(f"❌ Batch: Translation failed for chapter {actual_num} - marked as failed, no output file created (reason: {failure_reason})")
with self.progress_lock:
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False))
save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False))
should_save = (save_prohibited_results if is_prohibited_failure(cleaned, failure_reason) else save_partial_results)
if should_save:
try:
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(cleaned if isinstance(cleaned, str) else "")
except Exception:
pass
self.update_progress_fn(idx, actual_num, content_hash, fname, status="qa_failed", ai_features=ai_features)
self.save_progress_fn()
return False, actual_num, None, None, None
# NOTE: We no longer append to translation history here in the worker thread.
# History is now written in the main thread per batch, in a stable order.
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
# CRITICAL: Unescape img tags that were converted to HTML entities (applies to ALL HTML)
# Pattern matches: <img ... /> where the tag ends with /
# Post-process: Fix empty attribute tags for BeautifulSoup mode
if os.getenv('FIX_EMPTY_ATTR_TAGS_BS', '0') == '1' and not chapter.get('enhanced_extraction', False):
cleaned = _fix_empty_attr_tags_bs(cleaned)
img_count = len(re.findall(r'<img\s[^>]*?/>', cleaned, flags=re.IGNORECASE))
if img_count > 0:
print(f"🖼️ Unescaping {img_count} img tag(s) from HTML entities (post-processing)")
cleaned = re.sub(
r'<(img\s[^>]*?/)>',
r'<\1>',
cleaned,
flags=re.IGNORECASE
)
if self.is_text_file:
# For text files, save as plain text
fname_txt = fname.replace('.html', '.txt') if fname.endswith('.html') else fname
# Extract text from HTML
from bs4 import BeautifulSoup
soup = BeautifulSoup(cleaned, 'html.parser')
text_content = soup.get_text(strip=True)
# Merge image translations back with text translation
if 'final_body_with_images' in locals() and image_translations:
# Parse both versions
soup_with_images = BeautifulSoup(final_body_with_images, 'html.parser')
soup_with_text = BeautifulSoup(cleaned, 'html.parser')
# Get the translated text content (without images)
body_content = soup_with_text.body
# Add image translations to the translated content
for trans_div in soup_with_images.find_all('div', class_='translated-text-only'):
body_content.insert(0, trans_div)
final_html = str(soup_with_text)
cleaned = final_html
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(cleaned)
# Update with .txt filename
with self.progress_lock:
self.update_progress_fn(idx, actual_num, content_hash, fname_txt, status="completed", ai_features=ai_features)
self.save_progress_fn()
else:
# Original code for EPUB files
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(cleaned)
print(f"💾 Saved Chapter {actual_num}: {fname} ({len(cleaned)} chars)")
# Initialize ai_features at the beginning to ensure it's always defined
if ai_features is None:
ai_features = None
# Extract and save AI features for future duplicate detection
if (self.config.RETRY_DUPLICATE_BODIES and
hasattr(self.config, 'DUPLICATE_DETECTION_MODE') and
self.config.DUPLICATE_DETECTION_MODE in ['ai-hunter', 'cascading']):
try:
# Extract features from the translated content
cleaned_text = re.sub(r'<[^>]+>', '', cleaned).strip()
# Note: self.translator doesn't exist, so we can't extract features here
# The features will need to be extracted during regular processing
print(f" ⚠️ AI features extraction not available in batch mode")
except Exception as e:
print(f" ⚠️ Failed to extract AI features: {e}")
with self.progress_lock:
# Check for truncation or partial result first
if chapter_truncated:
chapter_status = "qa_failed"
print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: Response was truncated")
self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features, qa_issues_found=["TRUNCATED"])
self.save_progress_fn()
return False, actual_num, None, None, None
elif is_partial_result:
chapter_status = "qa_failed"
print(f"⚠️ Batch: Chapter {actual_num} marked as qa_failed: Partial translation (graceful stop)")
self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features, qa_issues_found=["PARTIAL"])
self.save_progress_fn()
return False, actual_num, None, None, None
else:
chapter_status = "completed"
self.update_progress_fn(idx, actual_num, content_hash, fname, status=chapter_status, ai_features=ai_features)
self.save_progress_fn()
self.chapters_completed += 1
# Log removed - executor loop will log "Chapter X done"
# print(f"✅ Chapter {actual_num} completed successfully")
# Return chapter body and final cleaned translation so the main thread
# can append to translation history in a stable batch order.
return True, actual_num, chapter_body, cleaned, last_chunk_raw_obj
except Exception as e:
# Graceful-stop pre-send cancellations are expected (they prevent queued calls from starting).
# Do not spam per-chapter "failed" logs, and do not mark these chapters as failed.
error_msg = str(e)
is_graceful_stop_skip = (
"graceful stop active - not starting new api call" in (error_msg or "").lower()
or (hasattr(e, 'error_type') and getattr(e, 'error_type', None) == 'cancelled' and os.environ.get('GRACEFUL_STOP') == '1')
)
if is_graceful_stop_skip:
# Keep a concise log so the user understands why the chapter didn't run.
# (Do NOT include the original error text, since it is noisy and is suppressed in the GUI logger.)
print(f"⏭️ Chapter {actual_num} skipped (graceful stop)")
try:
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
with self.progress_lock:
# Reset back to pending so it can be resumed later.
self.update_progress_fn(idx, actual_num, content_hash, fname, status="pending")
self.save_progress_fn()
except Exception:
pass
return False, actual_num, None, None, None
with self.progress_lock:
# Use the same output filename so we can track failed chapters properly
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
# Check if it's a timeout failure
if "[TIMEOUT]" in error_msg or (hasattr(e, 'error_type') and e.error_type == 'timeout'):
self.update_progress_fn(idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TIMEOUT"], chapter_obj=chapter)
else:
self.update_progress_fn(idx, actual_num, content_hash, fname, status="failed")
self.save_progress_fn()
# Print consolidated error message
if total_chunks > 1:
print(f"❌ Chapter {actual_num} failed (chunk {chunk_idx}/{total_chunks}): {e}")
else:
print(f"❌ Chapter {actual_num} failed: {e}")
# No history for failed chapters
return False, actual_num, None, None, None
def process_merged_group(self, merge_group, progress_manager):
"""
Process a merge group (multiple chapters merged into a single API request).
Args:
merge_group: List of (idx, chapter) tuples to merge
progress_manager: ProgressManager instance for updating merged chapter status
Returns:
List of results, each in format: (success, actual_num, hist_user, hist_assistant, raw_obj)
"""
import threading
if len(merge_group) == 1:
# Single chapter, process normally
result = self.process_single_chapter(merge_group[0])
return [result]
# Get info for all chapters in the group
chapters_data = [] # List of (chapter_num, content, idx, chapter_obj, content_hash)
parent_idx, parent_chapter = merge_group[0]
parent_actual_num = parent_chapter.get('actual_chapter_num', parent_chapter['num'])
# Check for graceful stop before starting work
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active or self.check_stop_fn():
# Return failed results for all chapters in the group
results = []
for idx, chapter in merge_group:
actual_num = chapter.get('actual_chapter_num', chapter['num'])
results.append((False, actual_num, None, None, None))
raise Exception("Translation stopped by user")
# Only log if not about to be stopped
thread_name = threading.current_thread().name
print(f"\n🔗 [{thread_name}] Processing MERGED group: Chapters {[c.get('actual_chapter_num', c['num']) for _, c in merge_group]}")
# Double-check stop after logging but before doing real work
if self.check_stop_fn():
# Return failed results for all chapters in the group
results = []
for idx, chapter in merge_group:
actual_num = chapter.get('actual_chapter_num', chapter['num'])
results.append((False, actual_num, None, None, None))
raise Exception("Translation stopped by user")
# Check ignore settings for filtering
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1'
for idx, chapter in merge_group:
actual_num = chapter.get('actual_chapter_num', chapter['num'])
content_hash = chapter.get("content_hash") or ContentProcessor.get_content_hash(chapter["body"])
# Get chapter body and apply ignore filters if needed
chapter_body = chapter["body"]
if (not use_title_tag or ignore_header_tags or remove_duplicate_h1_p) and chapter_body:
from bs4 import BeautifulSoup
body_soup = BeautifulSoup(chapter_body, 'html.parser')
# Remove title tags if ignored (including those in <head>)
if not use_title_tag:
for title_tag in body_soup.find_all('title'):
title_tag.decompose()
# Remove header tags if ignored
if ignore_header_tags:
for header_tag in body_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
# Remove duplicate H1+P pairs (where P is adjacent to H1 with same text)
if remove_duplicate_h1_p:
for h1_tag in body_soup.find_all('h1'):
# Skip split marker H1 tags
h1_id = h1_tag.get('id', '')
if h1_id and h1_id.startswith('split-'):
continue
h1_text = h1_tag.get_text(strip=True)
if 'SPLIT MARKER' in h1_text:
continue
# Check next sibling (P after H1)
next_sibling = h1_tag.find_next_sibling()
if next_sibling and next_sibling.name == 'p':
p_text = next_sibling.get_text(strip=True)
if h1_text == p_text:
next_sibling.decompose()
continue
# Check previous sibling (P before H1)
prev_sibling = h1_tag.find_previous_sibling()
if prev_sibling and prev_sibling.name == 'p':
p_text = prev_sibling.get_text(strip=True)
if h1_text == p_text:
prev_sibling.decompose()
chapter_body = str(body_soup)
# Also apply markdown-based duplicate removal (for enhanced extraction mode)
if remove_duplicate_h1_p and chapter_body:
def remove_markdown_duplicate_headers_input(text):
    """Strip plain-text lines that duplicate the markdown header coming next.

    A non-blank line that does not start with '#' is discarded when the first
    non-blank line after it is a markdown header (1-6 '#' characters followed
    by whitespace) with the same stripped text. Intervening blank lines and
    the header are preserved.

    Args:
        text: Newline-separated text to filter.

    Returns:
        The filtered text, re-joined with '\\n'.
    """
    rows = text.split('\n')

    # following[k] is the first non-blank row located after index k
    # (None when nothing but blank rows, or nothing at all, follows).
    following = [None] * len(rows)
    nearest = None
    for k in range(len(rows) - 1, -1, -1):
        following[k] = nearest
        if rows[k].strip():
            nearest = rows[k]

    def duplicates_header(row, successor):
        # True when `row` is plain text repeated by the header in `successor`.
        bare = row.strip()
        if not bare or bare.startswith('#') or successor is None:
            return False
        heading = re.match(r'^(#{1,6})\s+(.+)$', successor)
        return bool(heading) and heading.group(2).strip() == bare

    return '\n'.join(
        row for row, successor in zip(rows, following)
        if not duplicates_header(row, successor)
    )
chapter_body = remove_markdown_duplicate_headers_input(chapter_body)
chapters_data.append((actual_num, chapter_body, idx, chapter, content_hash))
try:
# Mark all chapters as in_progress
for actual_num, _, idx, chapter, content_hash in chapters_data:
with self.progress_lock:
# Determine output filename for tracking (consistent with process_single_chapter)
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
self.update_progress_fn(idx, actual_num, content_hash, fname, status="in_progress", chapter_obj=chapter)
self.save_progress_fn()
# Merge chapter contents
merge_input = [(cn, content, ch) for cn, content, _, ch, _ in chapters_data]
merged_content = RequestMerger.merge_chapters(merge_input)
expected_chapters = [cn for cn, _, _, _, _ in chapters_data]
print(f" 📊 Merged {len(merge_group)} chapters ({len(merged_content):,} chars total)")
# Build system prompt with glossary
# Use get_system_prompt() with actual merge count to conditionally include split marker instruction
glossary_path = find_glossary_file(self.out_dir)
base_system_prompt = self.config.get_system_prompt(actual_merge_count=len(merge_group))
chapter_system_prompt = build_system_prompt(
base_system_prompt,
glossary_path,
source_text=merged_content
)
# Build messages
rolling_summary_msgs = []
if getattr(self.config, 'USE_ROLLING_SUMMARY', False):
try:
rs_text = self.get_batch_rolling_summary_text()
except Exception:
rs_text = ""
if isinstance(rs_text, str) and rs_text:
rolling_summary_msgs = [{
"role": "assistant",
"content": (
"CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n"
"[MEMORY] Previous context summary:\n\n"
+ rs_text + "\n\n"
"[END MEMORY]\n"
"END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:"
)
}]
memory_msgs = []
if (self.config.CONTEXTUAL
and self.history_manager is not None
and getattr(self.config, 'HIST_LIMIT', 0) > 0):
try:
history = self.history_manager.load_history()
hist_limit = getattr(self.config, 'HIST_LIMIT', 0)
trimmed = history[-hist_limit * 2:]
include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1"
for h in trimmed:
if not isinstance(h, dict):
continue
role = h.get('role', 'user')
raw_obj = h.get('_raw_content_object')
content = h.get('content') or ""
if role == 'user' and not include_source:
continue
if (not content) and raw_obj is None:
continue
msg = {'role': role}
if content:
msg['content'] = content
if raw_obj is not None:
msg['_raw_content_object'] = raw_obj
memory_msgs.append(msg)
except Exception as e:
print(f" ⚠️ Failed to load history for merged group: {e}")
# Build optional assistant prefill message if configured
assistant_prefill_msgs = []
if getattr(self.config, 'ASSISTANT_PROMPT', '') and self.config.ASSISTANT_PROMPT.strip():
assistant_prefill_msgs = [{"role": "assistant", "content": self.config.ASSISTANT_PROMPT.strip()}]
msgs = [{"role": "system", "content": chapter_system_prompt}] + rolling_summary_msgs + memory_msgs + assistant_prefill_msgs + [
{"role": "user", "content": merged_content}
]
# Prepare split-failed retry controls
try:
split_retry_limit = int(getattr(self.config, 'SPLIT_FAILED_RETRY_ATTEMPTS', 2))
except Exception:
split_retry_limit = 2
disable_fallback_flag = (os.getenv('DISABLE_MERGE_FALLBACK', '0') == '1') or bool(getattr(self.config, 'DISABLE_MERGE_FALLBACK', False))
# Use toggle/config for split retries (works in batch and non-batch)
split_retry_enabled = (os.getenv('RETRY_SPLIT_FAILED', '0') == '1') or bool(getattr(self.config, 'RETRY_SPLIT_FAILED', False))
split_retry_attempts = 0
print(f" [DEBUG] Split retry enabled={split_retry_enabled}, limit={split_retry_limit}, disable_fallback={disable_fallback_flag}")
# Log combined prompt token count for merged request (treated as Chunk 1/1).
try:
# Use the same token counter as regular batch splitting.
# Instantiate a lightweight ChapterSplitter here for counting only.
chapter_splitter = ChapterSplitter(model_name=self.config.MODEL)
# Count tokens for system+assistant(user/memory) messages
total_tokens = 0
assistant_tokens = 0
for m in msgs:
content = m.get("content", "")
tokens = chapter_splitter.count_tokens(content)
total_tokens += tokens
if m.get("role") == "assistant":
assistant_tokens += tokens
non_assistant_tokens = total_tokens - assistant_tokens
# Determine a stable file reference based on parent chapter
parent_file_ref = (
parent_chapter.get('original_basename')
or parent_chapter.get('filename')
or f"Chapter_{parent_actual_num}"
)
# Get budget string from MAX_INPUT_TOKENS
token_env = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
_, budget_str = parse_token_limit(token_env)
if self.config.CONTEXTUAL and assistant_tokens > 0:
print(
f"💬 Chunk 1/1 combined prompt: "
f"{total_tokens:,} tokens (system + user: {non_assistant_tokens:,}, "
f"assistant/memory: {assistant_tokens:,}) / {budget_str} [File: {parent_file_ref}]"
)
else:
print(
f"💬 Chunk 1/1 combined prompt: "
f"{total_tokens:,} tokens (system + user) / {budget_str} [File: {parent_file_ref}]"
)
except Exception as e:
# Never break translation due to logging issues.
print(f" ⚠️ Failed to log combined prompt tokens for merged group: {e}")
# Get max output tokens
env_max_output = os.getenv("MAX_OUTPUT_TOKENS", "")
if env_max_output.isdigit() and int(env_max_output) > 0:
mtoks = int(env_max_output)
else:
mtoks = self.config.MAX_OUTPUT_TOKENS
# Finite retry loop to avoid infinite re-requests when Split‑the‑Merge keeps failing.
max_merge_attempts = (max(1, split_retry_limit) + 1) if split_retry_enabled else 1
split_retry_attempts = 0
# Track char-ratio retries across the entire merged-group request sequence
# (don't reset per split-retry attempt)
char_ratio_attempts_used = 0
while split_retry_attempts < max_merge_attempts:
# Call API for merged content
print(f" 🌐 Sending merged request to API...")
# Build chapter context with merged chapter numbers for progress bar display
merged_chapter_nums_for_context = [cn for cn, _, _, _, _ in chapters_data]
chapter_ctx = {
'chapter': parent_actual_num,
'chunk': 1,
'total_chunks': 1,
'merged_chapters': merged_chapter_nums_for_context,
}
merged_response, finish_reason, raw_obj = send_with_interrupt(
msgs,
self.client,
self.config.TEMP,
mtoks,
self.check_stop_fn,
context='translation',
chapter_context=chapter_ctx,
)
# Preserve the finish reason from the merged API call for later status decisions.
merged_finish_reason = finish_reason
truncation_exhausted = getattr(self.client, "_truncation_retries_exhausted", False)
if truncation_exhausted:
try:
self.client._truncation_retries_exhausted = False
except Exception:
pass
# During graceful stop, let the split complete instead of aborting
if self.check_stop_fn() and os.environ.get('GRACEFUL_STOP') != '1':
raise Exception("Translation stopped by user")
if not merged_response:
raise Exception("Empty response from API for merged request")
# Char-ratio truncation retry (silent truncation)
char_ratio_exhausted = False
try:
retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1"
except Exception:
retry_truncated_enabled = False
char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1"
if retry_truncated_enabled and char_ratio_enabled and not truncation_exhausted:
has_base64_image = ('data:image' in merged_content) or ('base64,' in merged_content)
used_fallback = getattr(self.client, '_used_fallback_key', False)
# Parse settings with sane bounds
try:
char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50"))
except Exception:
char_ratio_threshold_pct = 50.0
try:
char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1"))
except Exception:
char_ratio_retry_limit = 1
try:
char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100"))
except Exception:
char_ratio_min_output_chars = 100
char_ratio_threshold_pct = max(0.0, min(100.0, char_ratio_threshold_pct))
char_ratio_threshold = char_ratio_threshold_pct / 100.0
if char_ratio_retry_limit < 1:
char_ratio_retry_limit = 1
if char_ratio_min_output_chars < 0:
char_ratio_min_output_chars = 0
char_ratio_retry_count = 0
while not has_base64_image:
if self.check_stop_fn():
break
# Only apply the char-ratio check when we didn't already see a truncation/prohibited-content signal
if merged_finish_reason in ["length", "max_tokens", "content_filter", "prohibited_content"]:
break
input_char_count = len(merged_content)
output_char_count = len(merged_response) if merged_response else 0
char_ratio = (output_char_count / input_char_count) if input_char_count > 0 else 0
if (char_ratio < char_ratio_threshold) and (output_char_count > char_ratio_min_output_chars):
if used_fallback:
print(f"⚠️ Merged group: Char-ratio suggests truncation but fallback key was used - accepting output")
break
# IMPORTANT: track retries across the whole merged-group request sequence
# so split-failed merge retries don't reset the char-ratio retry budget.
if char_ratio_attempts_used >= char_ratio_retry_limit:
print(f"❌ Merged group: All {char_ratio_retry_limit} char-ratio retries exhausted; marking as TRUNCATED")
char_ratio_exhausted = True
break
if char_ratio_retry_count == 0:
remaining = max(0, char_ratio_retry_limit - char_ratio_attempts_used)
print(
f"⚠️ Merged group: Char-ratio suggests truncation "
f"(Input chars: {input_char_count}, Output chars: {output_char_count}, Ratio: {char_ratio:.2f} < {char_ratio_threshold:.2f}). "
f"Attempting up to {remaining} retry(ies)..."
)
# Consume one attempt (global across this merged group)
char_ratio_attempts_used += 1
char_ratio_retry_count += 1
print(f"🔄 Merged group: Char-ratio retry attempt {char_ratio_attempts_used}/{char_ratio_retry_limit}")
# Force higher token limit on retries
try:
base_max_tokens = int(mtoks)
except Exception:
base_max_tokens = mtoks
try:
retry_cap = int(getattr(self.config, "MAX_RETRY_TOKENS", base_max_tokens))
except Exception:
retry_cap = base_max_tokens
if retry_cap <= 0:
retry_cap = base_max_tokens
retry_max_tokens = max(base_max_tokens, retry_cap)
# Prevent nested truncation retries within the unified client during our char-ratio retries
try:
tls_retry_client = self.client._get_thread_local_client()
except Exception:
tls_retry_client = None
if tls_retry_client is not None:
setattr(tls_retry_client, "_in_truncation_retry", True)
try:
merged_response_retry, finish_reason_retry, raw_obj_retry = send_with_interrupt(
msgs,
self.client,
self.config.TEMP,
retry_max_tokens,
self.check_stop_fn,
context='translation',
chapter_context=chapter_ctx,
)
finally:
if tls_retry_client is not None:
try:
setattr(tls_retry_client, "_in_truncation_retry", False)
except Exception:
pass
# Capture truncation exhaustion that might have occurred during the retry
try:
retry_trunc_exhausted = getattr(self.client, "_truncation_retries_exhausted", False)
if retry_trunc_exhausted:
truncation_exhausted = True
self.client._truncation_retries_exhausted = False
except Exception:
pass
if self.check_stop_fn() and os.environ.get('GRACEFUL_STOP') != '1':
raise Exception("Translation stopped by user")
retry_output_chars = len(merged_response_retry) if merged_response_retry else 0
if merged_response_retry and retry_output_chars > output_char_count:
print(f"✅ Merged group: Char-ratio retry improved output ({output_char_count} → {retry_output_chars} chars)")
merged_response = merged_response_retry
finish_reason = finish_reason_retry
raw_obj = raw_obj_retry
merged_finish_reason = finish_reason_retry
continue
print(
f"⚠️ Merged group: Char-ratio retry did not improve output "
f"({output_char_count} → {retry_output_chars} chars). Trying again if attempts remain..."
)
continue
break
# Check for truncation (use preserved finish reason so retries/merges don't lose the flag)
merged_truncated = merged_finish_reason in ["length", "max_tokens"] or truncation_exhausted or char_ratio_exhausted
if merged_truncated:
print(f" ⚠️ Merged response was TRUNCATED!")
# Clean the merged response
cleaned = merged_response
if self.config.REMOVE_AI_ARTIFACTS:
cleaned = ContentProcessor.clean_ai_artifacts(cleaned, True)
cleaned = ContentProcessor.clean_memory_artifacts(cleaned)
cleaned = re.sub(r"^```(?:html)?\s*\n?", "", cleaned, count=1, flags=re.MULTILINE)
cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE)
# Post-process: Fix empty attribute tags for BeautifulSoup mode
if os.getenv('FIX_EMPTY_ATTR_TAGS_BS', '0') == '1':
try:
enhanced_group_check = any(bool(ch.get('enhanced_extraction')) for _, _, _, ch, _ in chapters_data)
except Exception:
enhanced_group_check = False
if not enhanced_group_check:
cleaned = _fix_empty_attr_tags_bs(cleaned)
# Get parent chapter info
parent_actual_num, parent_content, parent_idx, parent_chapter, parent_content_hash = chapters_data[0]
merged_child_nums = [cn for cn, _, _, _, _ in chapters_data[1:]]
# Check if enhanced extraction was used
try:
enhanced_group = any(bool(ch.get('enhanced_extraction')) for _, _, _, ch, _ in chapters_data)
except Exception:
enhanced_group = False
# Check if Split the Merge is enabled
split_the_merge = os.getenv('SPLIT_THE_MERGE', '0') == '1'
# If Split the Merge is enabled, SKIP markdown→HTML conversion here
# We'll do it AFTER splitting so markers are preserved
if not split_the_merge and enhanced_group and isinstance(cleaned, str):
print(" 🔄 Converting merged enhanced text back to HTML...")
try:
cleaned = convert_enhanced_text_to_html(cleaned, parent_chapter)
except Exception as conv_err:
print(f" ⚠️ Enhanced HTML conversion failed: {conv_err} — saving raw content")
# Emergency Image Restoration (if enabled)
if self.config.EMERGENCY_IMAGE_RESTORE:
cleaned = ContentProcessor.emergency_restore_images(cleaned, merged_content)
# Optionally restore paragraphs if the output lacks structure
if getattr(self.config, 'EMERGENCY_RESTORE', False):
try:
if cleaned and cleaned.count('
') < 3 and len(cleaned) > 300:
cleaned = ContentProcessor.emergency_restore_paragraphs(cleaned)
except Exception:
pass
# Check for truncation / QA failures first
results = []
if is_qa_failed_response(cleaned):
# Only save file for debugging if it contains meaningful content beyond error markers
cleaned_stripped = cleaned.strip()
is_only_error_marker = cleaned_stripped in [
"[TRANSLATION FAILED]",
"[Content Blocked]",
"[IMAGE TRANSLATION FAILED]",
"[EXTRACTION FAILED]",
"[RATE LIMITED]",
"[]"
] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]")
failure_reason = get_failure_reason(cleaned)
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_partial_results', False))
save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(self.config, 'save_prohibited_results', False))
should_save = (save_prohibited_results if is_prohibited_failure(cleaned, failure_reason) else save_partial_results)
if should_save:
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
try:
cleaned_to_save = cleaned
if split_the_merge:
cleaned_to_save = re.sub(
r'
]*id=\"split-\\d+\"[^>]*>.*?
\\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save if isinstance(cleaned_to_save, str) else "")
except Exception:
pass
elif not is_only_error_marker and cleaned_stripped:
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
try:
cleaned_to_save = cleaned
if split_the_merge:
cleaned_to_save = re.sub(
r'
]*id=\"split-\\d+\"[^>]*>.*?
\\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
except Exception:
pass
# Use each chapter's own expected filename so we overwrite the existing in_progress entry
for actual_num, _, idx, chapter, content_hash in chapters_data:
chapter_fname = FileUtilities.create_chapter_filename(chapter, actual_num)
with self.progress_lock:
self.update_progress_fn(
idx,
actual_num,
content_hash,
chapter_fname,
status="qa_failed",
chapter_obj=chapter,
)
self.save_progress_fn()
results.append((False, actual_num, None, None, None))
return results
# Now handle split-the-merge
disable_fallback = disable_fallback_flag
split_sections = None
if split_the_merge and len(chapters_data) > 1:
# Try to split by invisible markers
split_sections = RequestMerger.split_by_markers(cleaned, len(chapters_data))
# If split failed, optionally retry; if retries exhausted, mark qa_failed when fallback disabled
if split_the_merge and (not split_sections or len(split_sections) != len(chapters_data)):
if split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts:
split_retry_attempts += 1
print(f" 🔄 Split failed — retrying merged request (attempt {split_retry_attempts}/{max_merge_attempts - 1})")
continue
if disable_fallback:
print(f" ⚠️ Split failed and fallback disabled - marking merged group as qa_failed")
# Only save file for debugging if it contains meaningful content beyond error markers
cleaned_stripped = cleaned.strip()
is_only_error_marker = cleaned_stripped in [
"[TRANSLATION FAILED]",
"[Content Blocked]",
"[IMAGE TRANSLATION FAILED]",
"[EXTRACTION FAILED]",
"[RATE LIMITED]",
"[]"
] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]")
if not is_only_error_marker and cleaned_stripped:
# Save for debugging - contains actual translation attempt that failed split
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
try:
cleaned_to_save = cleaned
if split_the_merge:
cleaned_to_save = re.sub(
r'
]*id=\"split-\\d+\"[^>]*>.*?
\\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
except Exception:
pass
# IMPORTANT:
# Use each chapter's own expected filename so we overwrite the
# existing in_progress entry instead of creating composite keys.
for actual_num, _, idx, chapter, content_hash in chapters_data:
chapter_fname = FileUtilities.create_chapter_filename(chapter, actual_num)
with self.progress_lock:
self.update_progress_fn(
idx,
actual_num,
content_hash,
chapter_fname,
status="qa_failed",
qa_issues_found=["SPLIT_FAILED"],
chapter_obj=chapter,
)
self.save_progress_fn()
results.append((False, actual_num, None, None, None))
return results
# If split failed and fallback is allowed, optionally retry merged translation
if split_the_merge and (not split_sections or len(split_sections) != len(chapters_data)) and split_retry_enabled:
if split_retry_attempts < split_retry_limit:
split_retry_attempts += 1
attempt_no = split_retry_attempts
print(f" 🔄 Split failed retry {attempt_no}/{split_retry_limit} — requesting new merged translation")
time.sleep(1)
# Try a fresh merged request on next loop iteration
continue
else:
print(f" ⚠️ Split failed after {split_retry_limit} retries, falling back to merged output")
if split_sections and len(split_sections) == len(chapters_data):
# Split successful - save each section as individual file
print(f" ✂️ Splitting merged content into {len(split_sections)} individual files")
saved_files = []
for i, (actual_num, content, idx, chapter, content_hash) in enumerate(chapters_data):
section_content = split_sections[i]
# NOW convert markdown→HTML for each section if enhanced extraction was used
if enhanced_group and isinstance(section_content, str):
try:
section_content = convert_enhanced_text_to_html(section_content, chapter)
except Exception as conv_err:
print(f" ⚠️ Enhanced HTML conversion failed for chapter {actual_num}: {conv_err}")
# Generate filename for this chapter using content.opf naming
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
# Handle text file mode
if getattr(self, 'is_text_file', False):
fname = fname.replace('.html', '.txt')
from bs4 import BeautifulSoup
soup = BeautifulSoup(section_content, 'html.parser')
section_content = soup.get_text(strip=True)
# Save the section
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(section_content)
saved_files.append((actual_num, fname, idx, chapter, content_hash))
print(f" 💾 Saved Chapter {actual_num}: {fname} ({len(section_content)} chars)")
# Mark all chapters as completed or qa_failed (for truncated)
with self.progress_lock:
for actual_num, fname, idx, chapter, content_hash in saved_files:
chapter_status = "qa_failed" if merged_truncated else "completed"
qa_issues = ["TRUNCATED"] if merged_truncated else None
self.update_progress_fn(
idx, actual_num, content_hash, fname,
status=chapter_status, qa_issues_found=qa_issues, chapter_obj=chapter
)
self.chapters_completed += 1
# Save once after all updates
self.save_progress_fn()
# Build results - if truncated, treat as failure for all chapters
if merged_truncated:
for actual_num, _, idx, chapter, content_hash in chapters_data:
results.append((False, actual_num, None, None, None))
else:
results.append((True, chapters_data[0][0], merged_content, merged_response, raw_obj))
for actual_num, _, idx, chapter, content_hash in chapters_data[1:]:
results.append((True, actual_num, None, None, None))
print(f" ✅ Split the Merge complete: {len(saved_files)} files created")
return results
# Normal merged behavior (split not enabled or header count mismatch)
# Save entire merged response to parent chapter's file
fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
# If Split-the-Merge was enabled but we couldn't split reliably, remove injected markers
cleaned_to_save = cleaned
if split_the_merge and len(chapters_data) > 1:
cleaned_to_save = re.sub(
r'
]*id=\"split-\\d+\"[^>]*>.*?
\\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
# If translating a plain text source, mirror non-merged behavior and write .txt
if getattr(self, 'is_text_file', False):
parent_fname = fname.replace('.html', '.txt')
from bs4 import BeautifulSoup
soup = BeautifulSoup(cleaned_to_save, 'html.parser')
text_content = soup.get_text(strip=True)
with open(os.path.join(self.out_dir, parent_fname), 'w', encoding='utf-8') as f:
f.write(text_content)
saved_name = parent_fname
else:
with open(os.path.join(self.out_dir, fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
saved_name = fname
print(f" 💾 Saved merged content to Chapter {parent_actual_num}: {saved_name} ({len(cleaned_to_save)} chars)")
with self.progress_lock:
if merged_truncated:
# Truncated merged response: mark ALL chapters as qa_failed
# Check if we can retry this truncation failure as a general merge failure
if (not char_ratio_exhausted) and split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts:
split_retry_attempts += 1
print(f" 🔄 Truncated merged response — retrying request (attempt {split_retry_attempts}/{max_merge_attempts - 1})")
time.sleep(2)
continue
# Check if we can retry this truncation failure as a general merge failure
if (not char_ratio_exhausted) and split_retry_enabled and split_retry_attempts + 1 < max_merge_attempts:
split_retry_attempts += 1
print(f" 🔄 Truncated merged response — retrying request (attempt {split_retry_attempts}/{max_merge_attempts - 1})")
time.sleep(2)
continue
qa_issues = ["TRUNCATED"]
self.update_progress_fn(
parent_idx, parent_actual_num, parent_content_hash, saved_name,
status="qa_failed", qa_issues_found=qa_issues, chapter_obj=parent_chapter
)
for actual_num, _, idx, chapter, content_hash in chapters_data[1:]:
self.update_progress_fn(
idx, actual_num, content_hash, None,
status="qa_failed", qa_issues_found=qa_issues, chapter_obj=chapter
)
self.chapters_completed += len(chapters_data)
else:
# Normal success path: parent completed, children merged
self.update_progress_fn(
parent_idx, parent_actual_num, parent_content_hash, saved_name,
status="completed",
merged_chapters=merged_child_nums,
chapter_obj=parent_chapter
)
self.chapters_completed += 1
# Then mark all child chapters as merged (only after parent is completed)
for actual_num, _, idx, chapter, content_hash in chapters_data[1:]:
progress_manager.mark_as_merged(idx, actual_num, content_hash, parent_actual_num, chapter, parent_output_file=saved_name)
self.chapters_completed += 1
# Save once after all updates
self.save_progress_fn()
# Build results based on truncation status
if merged_truncated:
for actual_num, _, idx, chapter, content_hash in chapters_data:
results.append((False, actual_num, None, None, None))
else:
results.append((True, parent_actual_num, merged_content, merged_response, raw_obj))
for actual_num, _, idx, chapter, content_hash in chapters_data[1:]:
results.append((True, actual_num, None, None, None))
return results
# Should never hit this line; guard to prevent infinite loop
raise RuntimeError("Merged translation exited retry loop without returning a result")
except Exception as e:
print(f"❌ Merged group failed: {e} (NOTE: API Error triggered cancellation logic)")
# Mark all chapters as failed
results = []
for actual_num, _, idx, chapter, content_hash in chapters_data:
with self.progress_lock:
fname = FileUtilities.create_chapter_filename(chapter, actual_num)
self.update_progress_fn(idx, actual_num, content_hash, fname, status="failed", chapter_obj=chapter)
self.save_progress_fn()
results.append((False, actual_num, None, None, None))
return results
# =====================================================
# UNIFIED UTILITIES
# =====================================================
def sanitize_resource_filename(filename):
    """Return *filename* rewritten so it is safe to use as a resource name on disk.

    The name is NFC-normalised; path separators, characters reserved on
    common filesystems and line breaks become underscores; NUL and every
    other control character (code point below 32) are dropped. If nothing is
    left in front of the extension, the stem falls back to ``resource``.
    """
    # One translation table does the same job as a chain of str.replace calls.
    substitutions = str.maketrans({
        '/': '_', '\\': '_', ':': '_', '*': '_',
        '?': '_', '"': '_', '<': '_', '>': '_',
        '|': '_', '\0': None, '\n': '_', '\r': '_',
    })
    cleaned = unicodedata.normalize('NFC', filename).translate(substitutions)
    printable = [ch for ch in cleaned if ord(ch) >= 32]
    stem, extension = os.path.splitext(''.join(printable))
    return (stem or 'resource') + extension
def should_retain_source_extension():
    """Tell whether the original extension should be kept (and no 'response_' prefix added).

    The GUI toggle is bridged through the environment: ``RETAIN_SOURCE_EXTENSION``
    is consulted first and the lowercase ``retain_source_extension`` is the
    fallback. Only the exact values '1', 'true' and 'True' count as enabled.
    """
    lowercase_setting = os.getenv('retain_source_extension', '0')
    setting = os.getenv('RETAIN_SOURCE_EXTENSION', lowercase_setting)
    return setting in ('1', 'true', 'True')
def make_safe_filename(title, actual_num):
    """Derive a filesystem-safe name from a chapter title.

    A falsy title, or one that is empty after cleaning, yields
    ``chapter_NNN`` (``actual_num`` zero-padded to three digits). Otherwise
    the title is NFC-normalised, reserved characters become underscores,
    control characters are dropped, whitespace runs collapse to a single
    underscore, and leading/trailing underscores, dots, bullets, spaces and
    tabs are trimmed.
    """
    if not title:
        return f"chapter_{actual_num:03d}"

    unsafe = str.maketrans({
        '/': '_', '\\': '_', ':': '_', '*': '_', '?': '_',
        '"': '_', '<': '_', '>': '_', '|': '_', '\0': None,
        '\n': ' ', '\r': ' ', '\t': ' ',
    })
    safe = unicodedata.normalize('NFC', str(title)).translate(unsafe)
    safe = ''.join(ch for ch in safe if ord(ch) >= 32)
    safe = re.sub(r'\s+', '_', safe).strip('_.• \t')

    # After the strip a non-empty result cannot consist solely of
    # underscores, so testing for emptiness covers the fallback case.
    if not safe:
        return f"chapter_{actual_num:03d}"
    return safe
def get_content_hash(html_content):
    """Module-level shortcut for ``ContentProcessor.get_content_hash``.

    Passes *html_content* through unchanged and returns whatever hash the
    ContentProcessor implementation produces.
    """
    digest = ContentProcessor.get_content_hash(html_content)
    return digest
def clean_ai_artifacts(text, remove_artifacts=True):
    """Module-level shortcut for ``ContentProcessor.clean_ai_artifacts``.

    Both arguments are forwarded positionally and the processor's result is
    returned as-is.
    """
    cleaned = ContentProcessor.clean_ai_artifacts(text, remove_artifacts)
    return cleaned
def find_glossary_file(output_dir):
    """Locate the glossary file inside *output_dir*.

    Candidates are probed in order of preference - CSV, then MD, then TXT,
    then JSON - and the path of the first one that exists is returned.
    Returns None when none of them is present.
    """
    preferred_names = ("glossary.csv", "glossary.md", "glossary.txt", "glossary.json")
    candidate_paths = [os.path.join(output_dir, name) for name in preferred_names]
    return next((path for path in candidate_paths if os.path.exists(path)), None)
def clean_memory_artifacts(text):
    """Module-level shortcut for ``ContentProcessor.clean_memory_artifacts``.

    Forwards *text* and returns the processor's result (intended to strip
    memory/summary artifacts).
    """
    cleaned = ContentProcessor.clean_memory_artifacts(text)
    return cleaned
def emergency_restore_paragraphs(text, original_html=None, verbose=True):
    """Module-level shortcut for ``ContentProcessor.emergency_restore_paragraphs``.

    Intended for the case where the model returns one unbroken wall of text.
    All three arguments are forwarded positionally and the processor's
    result is returned unchanged.
    """
    restored = ContentProcessor.emergency_restore_paragraphs(text, original_html, verbose)
    return restored
def _fix_empty_attr_tags_bs(text: str) -> str:
"""Post-process: escape hallucinated empty-attribute tags in BeautifulSoup output.
Transforms patterns like Content into <Tag Attr>Content
for non-standard HTML tags, preserving real HTML tags untouched.
"""
known_tags = {
'html','head','body','title','meta','link','style','script','noscript',
'p','div','span','br','hr','img','a','h1','h2','h3','h4','h5','h6',
'ul','ol','li','dl','dt','dd',
'pre','code','em','strong','b','i','u','s','strike','del','ins','mark','small','sub','sup',
'table','thead','tbody','tr','td','th','caption','col','colgroup',
'blockquote','q','cite',
'section','article','header','footer','nav','main','aside','details','summary',
'figure','figcaption',
'form','input','button','select','option','textarea','label','fieldset','legend',
'iframe','canvas','svg','math',
'video','audio','source','track','embed','object','param',
'map','area',
'center', 'font', 'base'
}
def _repl_pair(m):
tagname = m.group(1)
if tagname.lower() in known_tags:
return m.group(0)
attrname = m.group(2)
content = m.group(3)
return f"<{tagname} {attrname}>{content}"
text = re.sub(r'<([a-zA-Z0-9_\-]+)\s+([a-zA-Z0-9_\-]+)=""\s*>(.*?)\1>', _repl_pair, text, flags=re.DOTALL)
return text
def is_meaningful_text_content(html_content):
    """Module-level shortcut for ``ContentProcessor.is_meaningful_text_content``.

    Forwards *html_content* and returns the processor's verdict on whether
    the chapter holds meaningful text beyond bare structure.
    """
    verdict = ContentProcessor.is_meaningful_text_content(html_content)
    return verdict
# =====================================================
# GLOBAL SETTINGS AND FLAGS
# =====================================================
# Import-time setup: configure root logging at DEBUG level.
logging.basicConfig(level=logging.DEBUG)
# Switch stdout to UTF-8 and silently drop characters it cannot encode, so
# emoji-heavy log lines do not raise UnicodeEncodeError on legacy consoles.
try:
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8', errors='ignore')
except AttributeError:
    # NOTE(review): because of the hasattr() guard above, this handler only
    # runs if reconfigure() itself raises AttributeError; a stdout that is
    # None simply skips the try body, so the None branch below looks
    # unreachable - confirm whether that fallback is still wanted.
    if sys.stdout is None:
        # No stdout at all: install a UTF-8 text wrapper over the null device.
        devnull = open(os.devnull, "wb")
        sys.stdout = io.TextIOWrapper(devnull, encoding='utf-8', errors='ignore')
    elif hasattr(sys.stdout, 'buffer'):
        # Re-wrap the underlying binary buffer with a UTF-8 text layer.
        try:
            sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore')
        except:
            # Best effort only: keep the existing stdout if wrapping fails.
            pass
# Module-wide cancellation flag; written by set_stop_flag() and read by
# is_stop_requested().
_stop_requested = False


def set_stop_flag(value):
    """Store *value* as the module-wide stop flag."""
    global _stop_requested
    _stop_requested = value


def is_stop_requested():
    """Return the current value of the module-wide stop flag."""
    # Reading a module global needs no ``global`` declaration.
    return _stop_requested
def set_output_redirect(log_callback=None):
    """Send everything written to ``sys.stdout`` to *log_callback* (GUI integration).

    When *log_callback* is falsy nothing is changed. Otherwise ``sys.stdout``
    is replaced by a small writer object that hands each non-blank write,
    stripped of surrounding whitespace, to the callback.
    """
    if not log_callback:
        return

    import threading

    class CallbackWriter:
        """Minimal file-like object that forwards writes to a callback."""

        def __init__(self, callback):
            self.callback = callback
            self.main_thread = threading.main_thread()

        def write(self, text):
            message = text.strip()
            if message:
                # Called directly from whichever thread is printing; per the
                # original author's note the callback is expected to be
                # thread-safe on its own.
                self.callback(message)

        def flush(self):
            """Nothing is buffered, so there is nothing to flush."""

    sys.stdout = CallbackWriter(log_callback)
# =====================================================
# EPUB AND FILE PROCESSING
# =====================================================
def extract_chapter_number_from_filename(filename, opf_spine_position=None, opf_spine_data=None):
    """Work out a chapter number from a file name.

    The directory, the extension and a leading ``response_`` prefix are
    ignored. Resolution order:

    1. The rightmost run of ASCII digits (reported as ``filename_zero`` when
       it evaluates to 0, otherwise ``filename_digits``).
    2. No digits but a front/back-matter keyword in the name -> chapter 0
       (``special_file``).
    3. Legacy regex patterns, tried in order.

    ``opf_spine_position`` and ``opf_spine_data`` are accepted but not used
    by this implementation.

    Returns:
        ``(number, method)``, or ``(None, None)`` when nothing matched.
    """
    stem = os.path.splitext(os.path.basename(filename))[0]
    if stem.lower().startswith('response_'):
        stem = stem[len('response_'):]

    # Priority 1: rightmost digit run in the name (mirrors the GUI column).
    digit_runs = re.findall(r'[0-9]+', stem)
    if digit_runs:
        number = int(digit_runs[-1])
        return (number, 'filename_digits') if number else (0, 'filename_zero')

    # Priority 2: digit-free special files are treated as chapter 0.
    special_keywords = (
        'title', 'toc', 'cover', 'index', 'copyright', 'preface', 'nav',
        'message', 'info', 'notice', 'colophon', 'dedication', 'epigraph',
        'foreword', 'acknowledgment', 'author', 'appendix', 'glossary',
        'bibliography',
    )
    lowered = stem.lower()
    if any(keyword in lowered for keyword in special_keywords):
        return 0, 'special_file'

    # Priority 3: legacy fallback patterns. Step 1 has already consumed every
    # name containing ASCII digits, so these can only fire on digits that
    # ``\d`` accepts but ``[0-9]`` does not (e.g. full-width numerals).
    fallback_patterns = (
        (r'^response_(\d+)[_\.]', 'response_prefix'),
        (r'[Cc]hapter[_\s]*(\d+)', 'chapter_word'),
        (r'[Cc]h[_\s]*(\d+)', 'ch_abbreviation'),
        (r'No(\d+)', 'no_prefix'),
        (r'第(\d+)[章话回]', 'chinese_chapter'),
        (r'-h-(\d+)', 'h_suffix'),  # names shaped like "...-h-16"
        (r'_(\d+)', 'underscore_suffix'),
        (r'-(\d+)', 'dash_suffix'),
        (r'(\d+)', 'trailing_number'),
    )
    for pattern, method in fallback_patterns:
        hit = re.search(pattern, stem, re.IGNORECASE)
        if hit:
            return int(hit.group(1)), method
    return None, None
def process_chapter_images(chapter_html: str, actual_num: int, image_translator: ImageTranslator,
                           check_stop_fn=None) -> Tuple[str, Dict[str, str]]:
    """Translate the images referenced by a chapter and splice the results into its HTML.

    For each image reported by ``image_translator.extract_images_from_chapter``
    (capped by the ``MAX_IMAGES_PER_CHAPTER`` environment variable, default
    10) the image file is located on disk, or decoded from a ``data:`` URI
    into ``image_translator.images_dir``. It is then passed to
    ``image_translator.translate_image`` and the matching ``img`` tag is
    replaced by a container holding the translation. Every translation is
    also written to ``image_translator.translated_images_dir`` as a small
    stand-alone HTML file.

    Args:
        chapter_html: Chapter markup to process.
        actual_num: Chapter number, used in log lines and output file names.
        image_translator: Translator object providing the directories,
            ``extract_images_from_chapter``, ``translate_image``,
            ``load_progress``/``save_progress`` and ``save_translation_log``.
        check_stop_fn: Optional callable; a truthy return value aborts the
            loop before the next image.

    Returns:
        ``(html, translations)``. ``html`` is the modified markup and
        ``translations`` maps image path -> translation text when at least
        one image was integrated; otherwise the original ``chapter_html``
        and an empty dict.
    """
    from bs4 import BeautifulSoup

    # Opening tag of the translation block, as emitted by the translator and
    # as looked up below via find('div', class_='image-translation').
    translation_div_marker = '<div class="image-translation">'

    images = image_translator.extract_images_from_chapter(chapter_html)
    if not images:
        return chapter_html, {}
    print(f"🖼️ Found {len(images)} images in chapter {actual_num}")
    soup = BeautifulSoup(chapter_html, 'html.parser')
    image_translations = {}
    translated_count = 0
    max_images_per_chapter = int(os.getenv('MAX_IMAGES_PER_CHAPTER', '10'))
    if len(images) > max_images_per_chapter:
        print(f" ⚠️ Chapter has {len(images)} images - processing first {max_images_per_chapter} only")
        images = images[:max_images_per_chapter]
    for idx, img_info in enumerate(images, 1):
        if check_stop_fn and check_stop_fn():
            print("❌ Image translation stopped by user")
            break
        img_src = img_info['src']
        original_img_src = img_src  # keep for DOM matching
        img_path = None
        # Handle inline data URI images (e.g., PDF image render mode)
        if img_src.startswith('data:image'):
            try:
                import base64, uuid, mimetypes
                header, b64data = img_src.split(',', 1)
                mime = 'image/png'
                if ':' in header and ';' in header:
                    mime = header.split(';')[0].split(':')[1] or mime
                ext = mimetypes.guess_extension(mime) or '.png'
                os.makedirs(image_translator.images_dir, exist_ok=True)
                temp_name = f"datauri_{actual_num}_{idx}_{uuid.uuid4().hex}{ext}"
                img_path = os.path.join(image_translator.images_dir, temp_name)
                with open(img_path, 'wb') as f:
                    f.write(base64.b64decode(b64data))
                # Keep img_src pointing to original so DOM match works; translator uses img_path
            except Exception as e:
                print(f" ❌ Failed to decode data URI image: {e}")
                continue
        # Resolve relative/absolute-looking sources against the output directory.
        if img_path is None and img_src.startswith('../'):
            img_path = os.path.join(image_translator.output_dir, img_src[3:])
        elif img_path is None and img_src.startswith('./'):
            img_path = os.path.join(image_translator.output_dir, img_src[2:])
        elif img_path is None and img_src.startswith('/'):
            img_path = os.path.join(image_translator.output_dir, img_src[1:])
        elif img_path is None:
            # Bare name or nested relative path: probe the likely locations.
            possible_paths = [
                os.path.join(image_translator.images_dir, os.path.basename(img_src)),
                os.path.join(image_translator.output_dir, img_src),
                os.path.join(image_translator.output_dir, 'images', os.path.basename(img_src)),
                os.path.join(image_translator.output_dir, os.path.basename(img_src)),
                os.path.join(image_translator.output_dir, os.path.dirname(img_src), os.path.basename(img_src))
            ]
            for path in possible_paths:
                if os.path.exists(path):
                    img_path = path
                    print(f" ✅ Found image at: {path}")
                    break
            if not img_path:
                print(f" ❌ Image not found in any location for: {img_src}")
                print(f" Tried: {possible_paths}")
                continue
        img_path = os.path.normpath(img_path)
        if not os.path.exists(img_path):
            print(f" ⚠️ Image not found: {img_path}")
            print(f" 📁 Images directory: {image_translator.images_dir}")
            print(f" 📁 Output directory: {image_translator.output_dir}")
            print(f" 📁 Working directory: {os.getcwd()}")
            if os.path.exists(image_translator.images_dir):
                files = os.listdir(image_translator.images_dir)
                print(f" 📁 Files in images dir: {files[:5]}...")
            continue
        print(f" 🔍 Processing image {idx}/{len(images)}: {os.path.basename(img_path)}")
        context = ""
        if img_info.get('alt'):
            context += f", Alt text: {img_info['alt']}"
        # Space out consecutive API calls once the first image has gone through.
        if translated_count > 0:
            delay = float(os.getenv('IMAGE_API_DELAY', '1.0'))
            time.sleep(delay)
        translation_result = image_translator.translate_image(img_path, context, check_stop_fn)
        print(f"\n🔍 DEBUG: Image {idx}/{len(images)}")
        print(f" Translation result: {'Success' if translation_result and '[Image Translation Error:' not in translation_result else 'Failed'}")
        if translation_result and "[Image Translation Error:" in translation_result:
            print(f" Error message: {translation_result}")
        if translation_result:
            img_tag = None
            for img in soup.find_all('img'):
                if img.get('src') == original_img_src:
                    img_tag = img
                    break
            if img_tag:
                hide_label = os.getenv("HIDE_IMAGE_TRANSLATION_LABEL", "0") == "1"
                print(f" 🔍 DEBUG: Integration Phase")
                print(f" 🏷️ Hide label mode: {hide_label}")
                src_display = img_tag.get('src', '')
                if src_display.startswith('data:image'):
                    src_display = src_display[:80] + '...'
                print(f" 📍 Found img tag: {src_display}")
                # Store the translation result in the dictionary FIRST
                image_translations[img_path] = translation_result
                # Parse the translation result to integrate into the chapter HTML
                if translation_div_marker in translation_result:
                    trans_soup = BeautifulSoup(translation_result, 'html.parser')
                    # Try to get the full container first
                    full_container = trans_soup.find('div', class_=['translated-text-only', 'image-with-translation'])
                    if full_container:
                        # Clone the container to avoid issues
                        new_container = BeautifulSoup(str(full_container), 'html.parser').find('div')
                        img_tag.replace_with(new_container)
                        print(f" ✅ Replaced image with full translation container")
                    else:
                        # Fallback: manually build the structure
                        trans_div = trans_soup.find('div', class_='image-translation')
                        if trans_div:
                            container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
                            img_tag.replace_with(container)
                            if not hide_label:
                                new_img = soup.new_tag('img', src=img_src)
                                if img_info.get('alt'):
                                    new_img['alt'] = img_info.get('alt')
                                container.append(new_img)
                            # Clone the translation div content
                            new_trans_div = soup.new_tag('div', **{'class': 'image-translation'})
                            # Copy all children from trans_div to new_trans_div
                            for child in trans_div.children:
                                if hasattr(child, 'name'):
                                    new_trans_div.append(BeautifulSoup(str(child), 'html.parser'))
                                else:
                                    new_trans_div.append(str(child))
                            container.append(new_trans_div)
                            print(f" ✅ Built container with translation div")
                        else:
                            print(f" ⚠️ No translation div found in result")
                            continue
                else:
                    # Plain text translation - build structure manually
                    container = soup.new_tag('div', **{'class': 'translated-text-only' if hide_label else 'image-with-translation'})
                    img_tag.replace_with(container)
                    if not hide_label:
                        new_img = soup.new_tag('img', src=img_src)
                        if img_info.get('alt'):
                            new_img['alt'] = img_info.get('alt')
                        container.append(new_img)
                    # Create translation div with content
                    translation_div = soup.new_tag('div', **{'class': 'image-translation'})
                    if not hide_label:
                        label_p = soup.new_tag('p')
                        label_em = soup.new_tag('em')
                        #label_em.string = "[Image text translation:]"
                        label_p.append(label_em)
                        translation_div.append(label_p)
                    trans_p = soup.new_tag('p')
                    trans_p.string = translation_result
                    translation_div.append(trans_p)
                    container.append(translation_div)
                    print(f" ✅ Created plain text translation structure")
                translated_count += 1
                # Save to translated_images folder
                trans_filename = f"ch{actual_num:03d}_img{idx:02d}_translation.html"
                os.makedirs(image_translator.translated_images_dir, exist_ok=True)
                trans_filepath = os.path.join(image_translator.translated_images_dir, trans_filename)
                # Extract just the translation content for saving
                save_soup = BeautifulSoup(translation_result, 'html.parser')
                save_div = save_soup.find('div', class_='image-translation')
                if not save_div:
                    # Create a simple div for plain text
                    save_div = f'{translation_div_marker}<p>{translation_result}</p></div>'
                # NOTE(review): the tags of this template are a reconstruction;
                # only its text lines are certain.
                with open(trans_filepath, 'w', encoding='utf-8') as f:
                    f.write(f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Chapter {actual_num} - Image {idx} Translation</title>
</head>
<body>
<h2>Chapter {actual_num} - Image {idx}</h2>
<p>Original: {os.path.basename(img_path)}</p>
{save_div}
</body>
</html>
""")
                print(f" ✅ Saved translation to: {trans_filename}")
            else:
                print(f" ⚠️ Could not find image tag in HTML for: {img_src}")
    if translated_count > 0:
        print(f" 🖼️ Successfully translated {translated_count} images")
        # Debug output
        final_html = str(soup)
        trans_count = final_html.count(translation_div_marker)
        print(f" 📊 Final HTML has {trans_count} translation divs")
        print(f" 📊 image_translations dict has {len(image_translations)} entries")
        # Drop per-image chunk progress for images whose chunks are all done.
        prog = image_translator.load_progress()
        if "image_chunks" in prog:
            completed_images = []
            for img_key, img_data in prog["image_chunks"].items():
                if len(img_data["completed"]) == img_data["total"]:
                    completed_images.append(img_key)
            for img_key in completed_images:
                del prog["image_chunks"][img_key]
            if completed_images:
                image_translator.save_progress(prog)
                print(f" 🧹 Cleaned up progress for {len(completed_images)} completed images")
        image_translator.save_translation_log(actual_num, image_translations)
        return str(soup), image_translations
    else:
        print(f" ℹ️ No images were successfully translated")
        return chapter_html, {}
def detect_novel_numbering(chapters):
    """Guess whether a novel numbers its chapters from 0 or from 1.

    Evidence is weighed in this order:

    1. File names shaped like ``<digits>_<digits>`` (prefix_suffix): the
       second number is trusted exclusively.
    2. "Chapter N" declarations found in the first 1000 characters of a
       chapter body.
    3. Numbers pulled from file names with
       ``PatternManager.FILENAME_EXTRACT_PATTERNS``, compared as consecutive
       runs starting at 0 and at 1.

    Args:
        chapters: Chapter dicts (``original_basename``/``filename`` and
            optionally ``body``), or a list of strings for plain-text input.

    Returns:
        True when the novel looks 0-based; False when it looks 1-based, the
        input is empty or plain text, or the evidence is inconclusive.
    """
    print("[DEBUG] Detecting novel numbering system...")
    if not chapters:
        return False
    if isinstance(chapters[0], str):
        print("[DEBUG] Text file detected, skipping numbering detection")
        return False

    patterns = PatternManager.FILENAME_EXTRACT_PATTERNS
    # Special shape such as "0000_1.xhtml": the chapter number is the suffix.
    prefix_suffix_pattern = r'^(\d+)_(\d+)[_\.]'

    filename_numbers = []
    content_numbers = []
    prefix_suffix_numbers = []

    for chapter in chapters:
        if 'original_basename' in chapter and chapter['original_basename']:
            filename = chapter['original_basename']
        elif 'filename' in chapter:
            filename = os.path.basename(chapter['filename'])
        else:
            # Without a file name the chapter contributes no evidence at all.
            continue

        extracted_num = None
        prefix_match = re.search(prefix_suffix_pattern, filename, re.IGNORECASE)
        if prefix_match:
            # Use the SECOND number (after the underscore).
            suffix_num = int(prefix_match.group(2))
            prefix_suffix_numbers.append(suffix_num)
            extracted_num = suffix_num
            print(f"[DEBUG] Prefix_suffix pattern matched: {filename} -> Chapter {suffix_num}")
        else:
            for pattern in patterns:
                match = re.search(pattern, filename)
                if match:
                    extracted_num = int(match.group(1))
                    break
        if extracted_num is not None:
            filename_numbers.append(extracted_num)

        # Also look for a "Chapter N" declaration near the top of the body.
        if 'body' in chapter:
            content_match = re.search(r'Chapter\s+(\d+)', chapter['body'][:1000], re.IGNORECASE)
            if content_match:
                content_num = int(content_match.group(1))
                content_numbers.append(content_num)
                print(f"[DEBUG] Found 'Chapter {content_num}' in content")

    # 1. prefix_suffix names are authoritative whenever any were seen.
    if prefix_suffix_numbers:
        min_suffix = min(prefix_suffix_numbers)
        if min_suffix >= 1:
            print(f"[DEBUG] ✅ 1-based novel detected (prefix_suffix pattern starts at {min_suffix})")
            return False
        print(f"[DEBUG] ✅ 0-based novel detected (prefix_suffix pattern starts at {min_suffix})")
        return True

    # 2. In-content declarations outrank numbers derived from file names.
    if content_numbers:
        if 0 in content_numbers and 1 in content_numbers:
            print(f"[DEBUG] ✅ 0-based novel detected (found both Chapter 0 and Chapter 1 in content)")
            return True
        if min(content_numbers) == 1:
            print(f"[DEBUG] ✅ 1-based novel detected (content chapters start at 1)")
            return False

    # 3. Fall back to file-name numbers: 0,1,2,... suggests 0-based while
    #    1,2,3,... suggests 1-based.
    if filename_numbers:
        seen = set(filename_numbers)

        def _run_length(start):
            """Count consecutive numbers present in ``seen`` beginning at ``start``."""
            length = 0
            for expected in range(start, start + len(chapters)):
                if expected not in seen:
                    break
                length += 1
            return length

        zero_sequence_count = _run_length(0)
        one_sequence_count = _run_length(1)
        print(f"[DEBUG] Zero-based sequence length: {zero_sequence_count}")
        print(f"[DEBUG] One-based sequence length: {one_sequence_count}")

        if one_sequence_count > zero_sequence_count and min(filename_numbers) >= 1:
            print(f"[DEBUG] ✅ 1-based novel detected (better sequence match starting from 1)")
            return False
        if 0 in seen and zero_sequence_count >= 3:
            print(f"[DEBUG] ✅ 0-based novel detected (found 0 in sequence)")
            return True

    # 4. Default to 1-based when the evidence is inconclusive.
    print(f"[DEBUG] ✅ Defaulting to 1-based novel (insufficient evidence for 0-based)")
    return False
def validate_chapter_continuity(chapters):
    """Print a continuity report for a list of chapters.

    Reports duplicate chapter numbers, numbers missing between the smallest
    and largest ``num`` (also condensed into gap ranges), and titles that
    repeat once lower-cased and stripped. Nothing is raised or returned for
    these findings; the function only prints.

    Args:
        chapters: Sequence of dicts, each with a ``num`` and a ``title`` key.
            An empty sequence prints a notice and returns immediately.

    Returns:
        None.
    """
    if not chapters:
        print("No chapters to translate")
        return

    issues = []
    chapter_nums = [c['num'] for c in chapters]

    # Duplicates: count every number once instead of calling list.count()
    # per element, which is quadratic in the number of chapters.
    occurrences = Counter(chapter_nums)
    duplicates = [num for num in chapter_nums if occurrences[num] > 1]
    if duplicates:
        issues.append(f"Duplicate chapter numbers found: {set(duplicates)}")

    # Gaps in the numeric sequence.
    min_num = min(chapter_nums)
    max_num = max(chapter_nums)
    missing = set(range(min_num, max_num + 1)) - set(chapter_nums)
    if missing:
        sorted_missing = sorted(missing)
        issues.append(f"Missing chapter numbers: {sorted_missing}")
        # Collapse consecutive missing numbers into "start-end" ranges.
        gaps = []
        start = end = sorted_missing[0]
        for num in sorted_missing[1:]:
            if num == end + 1:
                end = num
            else:
                gaps.append(f"{start}-{end}" if start != end else str(start))
                start = end = num
        gaps.append(f"{start}-{end}" if start != end else str(start))
        issues.append(f"Gap ranges: {', '.join(gaps)}")

    # Titles that repeat (case-insensitive, surrounding whitespace ignored).
    title_map = {}
    for c in chapters:
        title_map.setdefault(c['title'].lower().strip(), []).append(c['num'])
    for title, nums in title_map.items():
        if len(nums) > 1:
            issues.append(f"Duplicate title '{title}' in chapters: {nums}")

    # Print summary
    expected_count = max_num - min_num + 1
    print("\n" + "="*60)
    print("📚 CHAPTER VALIDATION SUMMARY")
    print("="*60)
    print(f"Total chapters: {len(chapters)}")
    print(f"Chapter range: {min_num} to {max_num}")
    print(f"Expected count: {expected_count}")
    print(f"Actual count: {len(chapters)}")
    if len(chapters) != expected_count:
        print(f"⚠️ Chapter count mismatch - missing {expected_count - len(chapters)} chapters")
    if issues:
        print("\n⚠️ Issues found:")
        for issue in issues:
            print(f" - {issue}")
    else:
        print("✅ No continuity issues detected")
    print("="*60 + "\n")
def validate_epub_structure(output_dir):
    """Check that *output_dir* holds the files needed to rebuild an EPUB.

    ``container.xml`` and at least one ``.opf`` file are critical; a missing
    ``.ncx`` file or the absence of translated ``response_*.html`` chapters
    only produces a warning. Each finding is printed.

    Returns:
        True when both critical items are present, otherwise False.
    """
    print("🔍 Validating EPUB structure...")
    missing_files = []

    if os.path.exists(os.path.join(output_dir, 'container.xml')):
        print(" ✅ container.xml - Found")
    else:
        missing_files.append('container.xml')
        print(" ❌ container.xml - Missing (CRITICAL)")

    # Extensions are matched case-insensitively.
    entries = os.listdir(output_dir)
    opf_files = [name for name in entries if name.lower().endswith('.opf')]
    ncx_files = [name for name in entries if name.lower().endswith('.ncx')]

    if opf_files:
        print(f" ✅ OPF file(s) - Found: {', '.join(opf_files)}")
    else:
        missing_files.append('*.opf')
        print(" ❌ OPF file - Missing (CRITICAL)")

    if ncx_files:
        print(f" ✅ NCX file(s) - Found: {', '.join(ncx_files)}")
    else:
        missing_files.append('*.ncx')
        print(" ⚠️ NCX file - Missing (navigation may not work)")

    html_files = [
        name for name in os.listdir(output_dir)
        if name.lower().endswith('.html') and name.startswith('response_')
    ]
    if html_files:
        print(f" ✅ Translated chapters - Found: {len(html_files)} files")
    else:
        print(" ⚠️ No translated chapter files found")

    critical_missing = [name for name in missing_files if name in ('container.xml', '*.opf')]
    if critical_missing:
        print("❌ EPUB structure validation FAILED")
        print(f" Missing critical files: {', '.join(critical_missing)}")
        print(" EPUB reconstruction may fail without these files")
        return False

    print("✅ EPUB structure validation PASSED")
    print(" All critical files present for EPUB reconstruction")
    return True
def check_epub_readiness(output_dir):
    """Report whether *output_dir* is ready to be compiled into an EPUB.

    Runs ``validate_epub_structure``, then checks for translated
    ``response_*.html`` chapters and for a readable ``metadata.json`` that
    contains a ``title``. Files under ``css``/``fonts``/``images`` are
    counted for information only and never cause a failure.

    Returns:
        True when no issue was recorded, otherwise False (the issues are
        printed).
    """
    print("📋 Checking EPUB compilation readiness...")
    problems = []

    if not validate_epub_structure(output_dir):
        problems.append("Missing critical EPUB structure files")

    translated = [
        name for name in os.listdir(output_dir)
        if name.lower().endswith('.html') and name.startswith('response_')
    ]
    if translated:
        print(f" ✅ Found {len(translated)} translated chapters")
    else:
        problems.append("No translated chapter files found")

    metadata_path = os.path.join(output_dir, 'metadata.json')
    if not os.path.exists(metadata_path):
        problems.append("Missing metadata.json file")
    else:
        print(" ✅ Metadata file present")
        try:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
            if 'title' not in metadata:
                problems.append("Metadata missing title")
        except Exception as e:
            # Unreadable or malformed metadata is reported, not raised.
            problems.append(f"Metadata file corrupted: {e}")

    # Resource files are informational only.
    found_resources = 0
    for res_dir in ('css', 'fonts', 'images'):
        res_path = os.path.join(output_dir, res_dir)
        if not os.path.exists(res_path):
            continue
        files = [name for name in os.listdir(res_path) if os.path.isfile(os.path.join(res_path, name))]
        if files:
            found_resources += len(files)
            print(f" ✅ Found {len(files)} {res_dir} files")
    if found_resources > 0:
        print(f" ✅ Total resources: {found_resources} files")
    else:
        print(" ⚠️ No resource files found (this may be normal)")

    if problems:
        print("⚠️ EPUB compilation readiness: ISSUES FOUND")
        for issue in problems:
            print(f" • {issue}")
        return False

    print("🎉 EPUB compilation readiness: READY")
    print(" All necessary files present for EPUB creation")
    return True
def cleanup_previous_extraction(output_dir):
    """Remove leftovers of an earlier extraction run from *output_dir*.

    Deletes the ``images`` directory, the EPUB structure files
    (``container.xml``, ``content.opf``, ``toc.ncx``), any other ``.opf`` or
    ``.ncx`` file, and the ``.resources_extracted`` marker. The ``css``
    directory is deliberately left alone. Failures are printed and skipped.

    Returns:
        Number of directories/files that were actually removed.
    """
    cleaned_count = 0

    # Extracted resource directories; 'css' is intentionally not listed.
    for dir_name in ('images',):
        dir_path = os.path.join(output_dir, dir_name)
        try:
            if os.path.isdir(dir_path):
                shutil.rmtree(dir_path)
                print(f"🧹 Removed directory: {dir_name}")
                cleaned_count += 1
        except Exception as e:
            print(f"⚠️ Could not remove directory {dir_name}: {e}")

    # Well-known EPUB structure files.
    for epub_file in ('container.xml', 'content.opf', 'toc.ncx'):
        target = os.path.join(output_dir, epub_file)
        try:
            if os.path.isfile(target):
                os.remove(target)
                print(f"🧹 Removed EPUB file: {epub_file}")
                cleaned_count += 1
        except Exception as e:
            print(f"⚠️ Could not remove {epub_file}: {e}")

    # Any other package/navigation file lying around under a different name.
    try:
        for entry in os.listdir(output_dir):
            if not entry.lower().endswith(('.opf', '.ncx')):
                continue
            target = os.path.join(output_dir, entry)
            if os.path.isfile(target):
                os.remove(target)
                print(f"🧹 Removed EPUB file: {entry}")
                cleaned_count += 1
    except Exception as e:
        print(f"⚠️ Error scanning for EPUB files: {e}")

    # Marker file that records a completed resource extraction.
    marker_path = os.path.join(output_dir, '.resources_extracted')
    try:
        if os.path.isfile(marker_path):
            os.remove(marker_path)
            print(f"🧹 Removed extraction marker")
            cleaned_count += 1
    except Exception as e:
        print(f"⚠️ Could not remove extraction marker: {e}")

    # Let the user know that existing stylesheets are being kept.
    css_path = os.path.join(output_dir, 'css')
    if os.path.exists(css_path):
        try:
            css_files = [name for name in os.listdir(css_path) if os.path.isfile(os.path.join(css_path, name))]
            if css_files:
                print(f"📚 Preserving {len(css_files)} CSS files")
        except Exception:
            pass

    if cleaned_count > 0:
        print(f"🧹 Cleaned up {cleaned_count} items from previous runs (CSS files preserved)")
    return cleaned_count
# =====================================================
# API AND TRANSLATION UTILITIES
# =====================================================
def send_with_interrupt(messages, client, temperature, max_tokens, stop_check_fn,
                        chunk_timeout=None, request_id=None, context=None,
                        chapter_context=None, bypass_graceful_stop=False):
    """Send an API request with interrupt capability and optional timeout.

    ``client.send`` runs on a daemon worker thread while this function polls a
    result queue every 0.5 seconds, checking ``stop_check_fn`` and the optional
    timeout between polls.

    Args:
        messages: Message list forwarded to ``client.send``.
        client: Object exposing ``send``. The optional hooks
            ``set_chapter_context``, ``get_last_response_object``,
            ``cancel_current_operation``, ``is_globally_cancelled`` and the
            ``_in_cleanup`` attribute are used only when present.
        temperature: Forwarded to ``client.send``.
        max_tokens: Forwarded to ``client.send``.
        stop_check_fn: Zero-argument callable; a truthy return means a stop
            was requested.
        chunk_timeout: Timeout in seconds. Only honored when the RETRY_TIMEOUT
            environment variable is set to something other than "0", "false"
            or "off"; otherwise it is forced to None (no timeout).
        request_id: Accepted for interface compatibility; not read here.
        context: Passed to ``client.send`` as ``context=`` when that parameter
            exists in its signature and the value is not None.
        chapter_context: Optional dict that may contain "chapter", "chunk",
            "total_chunks" and "merged_chapters". Applied inside the worker
            thread via ``client.set_chapter_context`` when available.
        bypass_graceful_stop: Accepted for interface compatibility; not read
            here.

    Returns:
        When ``client.send`` returns a tuple: that tuple extended with the raw
        response object (or None). Otherwise the send result itself, with
        ``_raw_obj`` attached when the object accepts new attributes.

    Raises:
        UnifiedClientError: On a stop request during the submission delay
            (error_type="cancelled"), on a stop/hard-cancel while waiting, on
            timeout, or when a worker exception looks like a rate limit
            (error_type="rate_limit").
        Exception: Any other exception raised in the worker is re-raised.
    """
    # Import UnifiedClientError at function level to avoid scoping issues
    from unified_api_client import UnifiedClientError

    # The client.send() call will handle multi-key rotation automatically
    result_queue = queue.Queue()
    cancel_event = threading.Event()

    # Honor RETRY_TIMEOUT toggle: when off, disable chunk timeout entirely.
    # Default: wrapper chunk timeout is OFF unless RETRY_TIMEOUT is explicitly truthy.
    retry_env = os.getenv("RETRY_TIMEOUT")
    retry_timeout_enabled = bool(retry_env) and retry_env.strip().lower() not in ("0", "false", "off", "")
    if not retry_timeout_enabled:
        chunk_timeout = None

    def _clear_watchdog_for_chapter_context() -> None:
        """Best-effort cleanup so the GUI watchdog doesn't stay stuck when this wrapper abandons a call."""
        try:
            import unified_api_client
            clear_fn = getattr(unified_api_client, '_api_watchdog_clear_chapter', None)
            if not callable(clear_fn):
                return
            chap = None
            merged = None
            if isinstance(chapter_context, dict):
                chap = chapter_context.get('chapter')
                merged = chapter_context.get('merged_chapters')
            if chap is not None:
                clear_fn(chap)
            if merged:
                try:
                    for mc in merged:
                        if mc is not None:
                            clear_fn(mc)
                except Exception:
                    pass
        except Exception:
            pass

    def api_call():
        """Worker body: run client.send and publish (result, elapsed, raw_obj) or the exception."""
        try:
            start_time = time.time()
            # Apply chapter/chunk context in THIS thread so UnifiedClient's
            # thread-local chapter_info is visible to payload saving.
            if chapter_context and hasattr(client, 'set_chapter_context'):
                try:
                    client.set_chapter_context(
                        chapter=chapter_context.get('chapter'),
                        chunk=chapter_context.get('chunk'),
                        total_chunks=chapter_context.get('total_chunks'),
                        merged_chapters=chapter_context.get('merged_chapters'),
                    )
                except Exception:
                    # Context is best-effort and should never break the call
                    pass
            # Build send parameters (context is optional)
            send_params = {
                'messages': messages,
                'temperature': temperature,
                'max_tokens': max_tokens,
            }
            sig = inspect.signature(client.send)
            if 'context' in sig.parameters and context is not None:
                send_params['context'] = context
            result = client.send(**send_params)
            # If the caller has already timed out/cancelled, do not publish a stale result.
            if cancel_event.is_set():
                return
            # Capture raw response object for thought signatures (if available)
            raw_obj = None
            if hasattr(client, 'get_last_response_object'):
                resp_obj = client.get_last_response_object()
                if resp_obj and hasattr(resp_obj, 'raw_content_object'):
                    raw_obj = resp_obj.raw_content_object
            elapsed = time.time() - start_time
            # Include raw_obj in the result tuple
            result_queue.put((result, elapsed, raw_obj))
        except Exception as e:
            # If already cancelled, suppress late exceptions from the abandoned call.
            if cancel_event.is_set():
                return
            result_queue.put(e)

    # Pre-send submission spacing to align staggered logs with actual delay
    try:
        thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", os.getenv("THREAD_SUBMISSION_DELAY", "0.1")))
    except Exception:
        thread_delay = 0.1
    try:
        api_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2"))
    except Exception:
        api_delay = 2.0
    enforce_delay = max(thread_delay, api_delay)
    if enforce_delay > 0:
        global _translation_thread_submit_lock, _translation_last_thread_submit
        # The lock is held while sleeping, so concurrent callers are released
        # one at a time, each at least enforce_delay after the previous one.
        with _translation_thread_submit_lock:
            now = time.time()
            remaining = enforce_delay - (now - _translation_last_thread_submit)
            if remaining > 0:
                elapsed = 0.0
                step = 0.1
                # Sleep in small steps so a stop request is noticed promptly.
                while elapsed < remaining:
                    if stop_check_fn():
                        raise UnifiedClientError("Translation stopped by user during threading delay", error_type="cancelled")
                    dt = min(step, remaining - elapsed)
                    time.sleep(dt)
                    elapsed += dt
                _translation_last_thread_submit = time.time()
            else:
                _translation_last_thread_submit = now

    api_thread = threading.Thread(target=api_call)
    api_thread.daemon = True
    api_thread.start()

    def _abandon_call() -> None:
        """Flag cleanup, cancel the in-flight request, clear the watchdog and briefly wait for the worker."""
        if hasattr(client, '_in_cleanup'):
            client._in_cleanup = True
        cancel_event.set()
        if hasattr(client, 'cancel_current_operation'):
            client.cancel_current_operation()
        # Clear watchdog entries for this chapter since we're abandoning the result.
        _clear_watchdog_for_chapter_context()
        # Give the background thread a brief chance to unwind after transport closure
        try:
            api_thread.join(timeout=2.0)
        except Exception:
            pass

    check_interval = 0.5
    elapsed = 0
    while True:
        try:
            result = result_queue.get(timeout=check_interval)
            if isinstance(result, Exception):
                # For expected errors like rate limits, preserve the error type without extra traceback
                if hasattr(result, 'error_type') and result.error_type == "rate_limit":
                    raise result
                elif "429" in str(result) or "rate limit" in str(result).lower():
                    # Convert generic exceptions to UnifiedClientError for rate limits
                    raise UnifiedClientError(str(result), error_type="rate_limit")
                else:
                    raise result
            if isinstance(result, tuple):
                # Unpack the tuple (now includes raw_obj)
                if len(result) == 3:
                    api_result, api_time, raw_obj = result
                    if isinstance(api_result, tuple):
                        # NOTE(review): this early return skips the post-hoc
                        # timeout check and the GRACEFUL_STOP_COMPLETED marker
                        # below - confirm that is intended for tuple results.
                        return (*api_result, raw_obj)
                    # Attach the raw object for later retrieval. Built-in
                    # results (str, dict, None, ...) cannot take attributes;
                    # that must not discard an otherwise successful response.
                    try:
                        api_result._raw_obj = raw_obj
                    except (AttributeError, TypeError):
                        pass
                else:
                    # Backward compatibility for old format
                    api_result, api_time = result
                if chunk_timeout is not None and api_time > chunk_timeout:
                    # The call finished but took longer than allowed: treat as a timeout.
                    _abandon_call()
                    raise UnifiedClientError(f"API call took {api_time:.1f}s (timeout: {chunk_timeout}s)")
                # If graceful stop was requested, mark that an API call completed
                if os.environ.get('GRACEFUL_STOP') == '1':
                    os.environ['GRACEFUL_STOP_COMPLETED'] = '1'
                return api_result
            return result
        except queue.Empty:
            # During graceful stop, don't cancel the API call - let it complete.
            should_stop = stop_check_fn()
            graceful_active = os.environ.get('GRACEFUL_STOP') == '1'
            # Hard cancellation (e.g. double-click force stop via hard_cancel_all)
            # overrides graceful stop protection for in-flight calls.
            hard_cancelled = hasattr(client, 'is_globally_cancelled') and client.is_globally_cancelled()
            # During graceful stop, protect in-flight calls unless hard-cancelled.
            should_cancel = hard_cancelled or (should_stop and not graceful_active)
            if should_cancel:
                _abandon_call()
                raise UnifiedClientError("Translation stopped by user")
            elapsed += check_interval
            if chunk_timeout is not None and elapsed >= chunk_timeout:
                _abandon_call()
                raise UnifiedClientError(f"API call timed out after {chunk_timeout} seconds")
def handle_api_error(processor, error, chunk_info=""):
    """Decide whether a failed API call should be retried.

    Rate-limit errors (message contains "429" or "rate limit", case-insensitive)
    always return True: with multi-key mode the key statistics are logged and the
    call returns almost immediately; in single-key mode the function sleeps 60
    seconds first. Any other error is logged and returns False.

    Args:
        processor: Object exposing ``config.use_multi_api_keys`` and
            ``client.get_stats()`` (only read on the rate-limit path).
        error: The exception (or any object) describing the failure.
        chunk_info: Optional label inserted into the log messages.

    Returns:
        True when the caller should retry, False otherwise.
    """
    message = str(error)
    rate_limited = "429" in message or "rate limit" in message.lower()

    # Anything that is not a rate limit is reported and not retried.
    if not rate_limited:
        print(f"❌ API Error {chunk_info}: {message}")
        return False

    if processor.config.use_multi_api_keys:
        print(f"⚠️ Rate limit hit {chunk_info}, client should rotate to next key")
        key_stats = processor.client.get_stats()
        active = key_stats.get('active_keys', 0)
        print(f"📊 API Stats - Active keys: {active}/{key_stats.get('total_keys', 0)}")
        if active == 0:
            print("⏳ All API keys are cooling down - will wait and retry")
        print(f"🔄 Multi-key error handling: Rate limit processed, preparing for key rotation...")
        time.sleep(0.1)  # Brief pause after rate limit detection for stability
        return True  # Always retry

    print(f"⚠️ Rate limit hit {chunk_info}, waiting before retry...")
    time.sleep(60)
    print(f"🔄 Single-key error handling: Rate limit wait completed, ready for retry...")
    time.sleep(0.1)  # Brief pause after rate limit wait for stability
    return True  # Always retry
def parse_token_limit(env_value):
    """Parse a token limit from an environment-variable style value.

    Args:
        env_value: Raw setting, normally a string; None, empty/blank values
            and "unlimited" (any case) mean no limit. Non-string values are
            converted with ``str()`` before parsing.

    Returns:
        A ``(limit, label)`` tuple: ``(None, "unlimited")`` when no limit is
        configured, ``(n, str(n))`` for a positive integer ``n``, and
        ``(1000000, "1000000 (default)")`` for anything unparseable
        (including zero and negative numbers).
    """
    if not env_value:
        return None, "unlimited"
    text = str(env_value).strip()
    if text == "" or text.lower() == "unlimited":
        return None, "unlimited"
    # isdecimal() rather than isdigit(): isdigit() accepts characters such as
    # superscripts ("²") that int() cannot parse, which raised ValueError.
    if text.isdecimal():
        limit = int(text)
        if limit > 0:
            return limit, str(limit)
    return 1000000, "1000000 (default)"
def build_system_prompt(user_prompt, glossary_path=None, source_text=None):
    """Build the system prompt, optionally appending the glossary file contents.

    Environment variables read:
        APPEND_GLOSSARY ("1" default): enable glossary appending.
        OUTPUT_LANGUAGE ("English" default): replaces ``{target_lang}`` in the prompt.
        APPEND_GLOSSARY_PROMPT: required header placed before the glossary text.
        COMPRESS_GLOSSARY_PROMPT ("0" default): compress the glossary against
            ``source_text`` via ``glossary_compressor.compress_glossary``.
        ADD_ADDITIONAL_GLOSSARY ("0" default): also append a
            ``glossary_extension.{csv,md,txt,json}`` file found next to the glossary.
        MODEL ("gpt-4" default): model name used to pick the tiktoken encoding
            for the token statistics that are logged.

    Args:
        user_prompt: Base prompt text; may be None or empty.
        glossary_path: Path to the glossary file (JSON is re-serialized with
            indentation, anything else is appended as raw text).
        source_text: Text used for glossary compression when enabled.

    Returns:
        The assembled system prompt string. Glossary loading problems are
        logged and leave the prompt without the glossary; they never raise.
    """

    def _token_counts(*texts):
        """Return a tuple of tiktoken token counts for texts, or None if counting is unavailable.

        Token counts are only used for log output, so every failure (tiktoken
        missing, encoding data unavailable, encode error) degrades to None.
        """
        try:
            import tiktoken
            try:
                enc = tiktoken.encoding_for_model(os.getenv("MODEL", "gpt-4"))
            except Exception:
                enc = tiktoken.get_encoding("cl100k_base")
            return tuple(len(enc.encode(text)) for text in texts)
        except Exception:
            return None

    append_glossary = os.getenv("APPEND_GLOSSARY", "1") == "1"
    actual_glossary_path = glossary_path

    # Replace {target_lang} placeholder if present
    target_lang = os.getenv("OUTPUT_LANGUAGE", "English")
    if user_prompt and "{target_lang}" in user_prompt:
        user_prompt = user_prompt.replace("{target_lang}", target_lang)
    system = user_prompt if user_prompt else ""

    if append_glossary and actual_glossary_path and os.path.exists(actual_glossary_path):
        try:
            print(f"✅ Loading glossary from: {os.path.abspath(actual_glossary_path)}")
            # Try to load as JSON first
            try:
                with open(actual_glossary_path, "r", encoding="utf-8") as gf:
                    glossary_data = json.load(gf)
                glossary_text = json.dumps(glossary_data, ensure_ascii=False, indent=2)
                print(f"Loaded as JSON")
            except json.JSONDecodeError:
                # If JSON fails, just read as raw text
                with open(actual_glossary_path, "r", encoding="utf-8") as gf:
                    glossary_text = gf.read()

            # Apply glossary compression if enabled and source text is provided
            compress_glossary_enabled = os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1"
            if compress_glossary_enabled and source_text:
                try:
                    from glossary_compressor import compress_glossary
                    original_glossary_text = glossary_text  # Store original for token counting
                    original_length = len(glossary_text)
                    glossary_text = compress_glossary(glossary_text, source_text, glossary_format='auto')
                    compressed_length = len(glossary_text)
                    reduction_pct = ((original_length - compressed_length) / original_length * 100) if original_length > 0 else 0
                    # Also report token savings when tiktoken can count them
                    counts = _token_counts(original_glossary_text, glossary_text)
                    if counts is not None:
                        original_tokens, compressed_tokens = counts
                        token_reduction = original_tokens - compressed_tokens
                        token_reduction_pct = (token_reduction / original_tokens * 100) if original_tokens > 0 else 0
                        print(f"🗜️ Glossary: {original_length:,}→{compressed_length:,} chars ({reduction_pct:.1f}%), {original_tokens:,}→{compressed_tokens:,} tokens ({token_reduction_pct:.1f}%)")
                    else:
                        # Token counting unavailable: just show character reduction
                        print(f"🗜️ Glossary compressed: {original_length:,} → {compressed_length:,} chars ({reduction_pct:.1f}% reduction)")
                except Exception as e:
                    print(f"⚠️ Glossary compression failed: {e}")
                    # Continue with uncompressed glossary

            # Validate the append format BEFORE touching `system`, so a missing
            # format does not leave a dangling "\n\n" on the returned prompt.
            custom_prompt = os.getenv("APPEND_GLOSSARY_PROMPT", "").strip()
            if not custom_prompt:
                raise ValueError(
                    "APPEND_GLOSSARY_PROMPT environment variable is not set!\n"
                    "Please configure your glossary append format in:\n"
                    "Glossary Manager → Automatic Glossary → Glossary Append Format"
                )
            if system:
                system += "\n\n"
            system += f"{custom_prompt}\n{glossary_text}"
            print(f"✅ Glossary appended ({len(glossary_text):,} characters)")

            # Check for glossary extension file (only if ADD_ADDITIONAL_GLOSSARY is enabled)
            add_additional_glossary = os.getenv("ADD_ADDITIONAL_GLOSSARY", "0") == "1"
            if add_additional_glossary:
                glossary_dir = os.path.dirname(actual_glossary_path)
                # Check for extension with any supported format; first match wins
                additional_glossary_path = None
                for ext in ['.csv', '.md', '.txt', '.json']:
                    candidate = os.path.join(glossary_dir, f"glossary_extension{ext}")
                    if os.path.exists(candidate):
                        additional_glossary_path = candidate
                        break
                if additional_glossary_path:
                    try:
                        print(f"✅ Loading glossary extension from: {os.path.basename(additional_glossary_path)}")
                        with open(additional_glossary_path, "r", encoding="utf-8") as af:
                            additional_glossary_text = af.read()
                        # Apply same compression logic if enabled
                        if compress_glossary_enabled and source_text:
                            try:
                                from glossary_compressor import compress_glossary
                                original_add_length = len(additional_glossary_text)
                                additional_glossary_text = compress_glossary(additional_glossary_text, source_text, glossary_format='auto')
                                compressed_add_length = len(additional_glossary_text)
                                add_reduction_pct = ((original_add_length - compressed_add_length) / original_add_length * 100) if original_add_length > 0 else 0
                                print(f"🗃️ Glossary extension compressed: {original_add_length:,} → {compressed_add_length:,} chars ({add_reduction_pct:.1f}% reduction)")
                            except Exception as e:
                                print(f"⚠️ Glossary extension compression failed: {e}")
                        # Append glossary extension
                        system += f"\n\n{additional_glossary_text}"
                        print(f"✅ Glossary extension appended ({len(additional_glossary_text):,} characters)")
                    except Exception as e:
                        print(f"⚠️ Failed to load glossary extension: {e}")
        except Exception as e:
            # Glossary problems are reported but never abort prompt building.
            print(f"[ERROR] Could not load glossary: {e}")
            import traceback
            print(f"[ERROR] Full traceback: {traceback.format_exc()}")
    else:
        if not append_glossary:
            pass
        elif not actual_glossary_path:
            # Check if we're translating CSV/JSON files (they typically don't need glossaries)
            input_path = os.getenv('EPUB_PATH', '')
            if not input_path.lower().endswith(('.csv', '.json')):
                print(f"[DEBUG] ❌ No glossary path provided")
        elif not os.path.exists(actual_glossary_path):
            print(f"[DEBUG] ❌ Glossary file does not exist: {actual_glossary_path}")

    # Log the size of the final system prompt (tokens when countable)
    final_counts = _token_counts(system)
    if final_counts is not None:
        system_tokens = final_counts[0]
        print(f"🎯 Final system prompt: {len(system):,} chars, {system_tokens:,} tokens")
    else:
        print(f"🎯 Final system prompt length: {len(system)} characters")
    return system
def translate_title(title, client, system_prompt, user_prompt, temperature=0.3):
"""Translate the book title using the configured settings"""
if not title or not title.strip():
return title
print(f"📚 Processing book title: {title}")
try:
if os.getenv("TRANSLATE_BOOK_TITLE", "1") == "0":
print(f"📚 Book title translation disabled - keeping original")
return title
# Check if we're using a translation service (not AI)
client_type = getattr(client, 'client_type', '')
is_translation_service = client_type in ['deepl', 'google_translate']
if is_translation_service:
# For translation services, send only the text without AI prompts
print(f"📚 Using translation service ({client_type}) - sending text directly")
messages = [
{"role": "user", "content": title}
]
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens)
else:
# For AI services, use prompts as before
book_title_prompt = os.getenv("BOOK_TITLE_PROMPT",
"Translate this book title to English while retaining any acronyms:")
# Get the system prompt for book titles, with fallback to default
book_title_system_prompt = os.getenv("BOOK_TITLE_SYSTEM_PROMPT",
"You are a translator. Respond with only the translated text, nothing else. Do not add any explanation or additional content.")
# Replace {target_lang} variable with output language
output_lang = os.getenv("OUTPUT_LANGUAGE", "English")
book_title_prompt = book_title_prompt.replace("{target_lang}", output_lang)
book_title_system_prompt = book_title_system_prompt.replace("{target_lang}", output_lang)
messages = [
{"role": "system", "content": book_title_system_prompt},
{"role": "user", "content": f"{book_title_prompt}\n\n{title}"}
]
max_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
translated_title, _ = client.send(messages, temperature=temperature, max_tokens=max_tokens)
print(f"[DEBUG] Raw API response: '{translated_title}'")
print(f"[DEBUG] Response length: {len(translated_title)} (original: {len(title)})")
newline = '\n'
print(f"[DEBUG] Has newlines: {repr(translated_title) if newline in translated_title else 'No'}")
translated_title = translated_title.strip()
if ((translated_title.startswith('"') and translated_title.endswith('"')) or
(translated_title.startswith("'") and translated_title.endswith("'"))):
translated_title = translated_title[1:-1].strip()
if '\n' in translated_title:
print(f"⚠️ API returned multi-line content, keeping original title")
return title
# Check for JSON-like structured content, but allow simple brackets like [END]
if (any(char in translated_title for char in ['{', '}']) or
'"role":' in translated_title or
'"content":' in translated_title or
('[[' in translated_title and ']]' in translated_title)): # Only flag double brackets
print(f"⚠️ API returned structured content, keeping original title")
return title
if any(tag in translated_title.lower() for tag in ['
', '
', '
', '
', '= 2:
return True
# Single strong error indicator in very short response
if len(content_str) < 50 and error_count >= 1:
return True
return False
# Additional helper function for debugging
def get_failure_reason(content):
    """Describe why a piece of content counts as a QA failure (debug/logging aid).

    Returns "Empty content" for falsy input and "QA marker checks disabled"
    when DISABLE_QA_MARKER_CHECKS is "1". Otherwise the stripped text is
    scanned against the marker table below (only when it is shorter than
    ``_qa_marker_limit``, a name resolved outside this function) and the first
    hit is reported as "<category>: <marker>". Each marker is tested against
    both the original and the lower-cased text.
    """
    if not content:
        return "Empty content"
    if os.getenv("DISABLE_QA_MARKER_CHECKS", "0") == "1":
        return "QA marker checks disabled"

    text = str(content).strip()
    lowered = text.lower()

    # Ordered (category, markers) pairs; the first matching marker wins.
    marker_table = (
        ("Explicit Failure Marker", (
            "[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]",
            "[IMAGE TRANSLATION FAILED]",
            "API response unavailable",
            "[]",
        )),
        ("HTTP Error", (
            "authentication_error", "rate_limit_error", "api_error",
        )),
        ("Content Filter", (
            "content_filter", "safety filter", "blocked by safety",
        )),
        ("Timeout", (
            "timeout", "timed out", "apitimeouterror",
        )),
        ("Rate Limit", (
            "rate limit exceeded", "quota exceeded", "too many requests",
        )),
        ("Refusal Pattern", (
            "i cannot", "i can't", "unable to process",
        )),
        ("Empty Response", (
            '"text": ""', "choices: [ { text: ''",
        )),
    )

    # Long content is exempt from marker scanning altogether.
    if len(text) < _qa_marker_limit:
        for label, needles in marker_table:
            for needle in needles:
                if needle in text or needle in lowered:
                    return f"{label}: {needle}"

    if len(text) < 50:
        return f"Short response with error indicators: {text[:30]}..."
    return "Unknown failure pattern"
def is_prohibited_failure(content, failure_reason=None):
    """Best-effort check for prohibited/blocked failures, used for save routing.

    The failure reason is inspected first, then the content itself; each is
    stringified, lower-cased and searched for its own set of keywords. Any
    error while inspecting one of them is ignored and the next one is tried.

    Returns:
        True if either value contains one of its keywords, otherwise False.
    """
    probes = (
        (failure_reason, ("content filter", "blocked", "prohibited")),
        (content, ("content_filter", "content blocked", "blocked by safety")),
    )
    for value, keywords in probes:
        try:
            haystack = str(value or "").lower()
            if any(keyword in haystack for keyword in keywords):
                return True
        except Exception:
            # Detection is best-effort: fall through to the next probe.
            continue
    return False
def convert_enhanced_text_to_html(plain_text, chapter_info=None):
"""Convert markdown/plain text back to HTML after translation (for enhanced mode)
This function handles the conversion of translated markdown back to HTML.
The input is the TRANSLATED text that was originally extracted using html2text.
"""
import re
preserve_structure = chapter_info.get('preserve_structure', False) if chapter_info else False
# Pre-process: Convert angle-bracket "tag-like" sequences into HTML entities.
# This prevents markdown converters from stripping/mangling them.
# IMPORTANT: Preserve real anchor tags ( and ) so EPUB TOC links remain clickable.
def _escape_tag_like(m):
inner = m.group(1) # e.g. 'a href="..."' or '/a'
try:
mname = re.match(r'\s*/?\s*([a-zA-Z0-9]+)', inner)
tag = (mname.group(1) if mname else "").lower()
except Exception:
tag = ""
# Allowlist: tags we must keep as real HTML in enhanced mode
if tag in ("a",):
return "<" + inner + ">"
return "<" + inner + ">"
plain_text = re.sub(r'<(/?[a-zA-Z][^>]*)>', _escape_tag_like, plain_text)
# Check if user prefers markdown2 (legacy behavior)
use_markdown2 = os.getenv('USE_MARKDOWN2_CONVERTER', '0') == '1'
if use_markdown2:
# Use markdown2 for conversion (legacy behavior)
try:
import markdown2
has_markdown = any([
'##' in plain_text,
'**' in plain_text,
'*' in plain_text and not '**' in plain_text,
'[' in plain_text and '](' in plain_text,
'```' in plain_text,
'> ' in plain_text,
'- ' in plain_text or '* ' in plain_text or '1. ' in plain_text
])
if has_markdown or preserve_structure:
html = markdown2.markdown(plain_text, extras=[
'cuddled-lists',
'fenced-code-blocks',
'break-on-newline',
'smarty-pants',
'tables',
])
# Post-process: Fix setext headers that were created from separator lines.
# These are NOT real headers—just text followed by ==== or ----.
# Restore both the text AND the underline so nothing is lost.
html = re.sub(r'
(.*?)
', r'
\1
\n
====
', html)
html = re.sub(r'
(.*?)
', r'
\1
\n
----
', html)
if not '
' in html:
lines = html.split('\n')
processed_lines = []
for line in lines:
line = line.strip()
if line and not line.startswith('<') and not line.endswith('>'):
processed_lines.append(f'
{line}
')
elif line:
processed_lines.append(line)
html = '\n'.join(processed_lines)
# CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities
# Logic:
# 1. Closing tags (e.g. ) -> Always unescape
# 2. Container tags (svg, picture, figure, figcaption) -> Unescape even if bare (often used without attrs)
# 3. Ambiguous tags (img, area, map, canvas, source, image) -> Unescape ONLY if they have attributes (space followed by content)
# This avoids false positives like "The of effect" in fantasy text.
img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE))
# if img_count > 0:
# print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (markdown2)")
html = re.sub(
r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>',
r'<\1>',
html,
flags=re.IGNORECASE | re.DOTALL
)
return html
except ImportError:
print("⚠️ markdown2 not available, falling back to markdown library")
# Use markdown library with setext headers disabled (default, recommended)
try:
import markdown
from markdown.extensions import Extension
from markdown.blockprocessors import BlockProcessor
# Custom extension to disable setext headers
class NoSetextHeadersExtension(Extension):
def extendMarkdown(self, md):
# Remove the setext header processor
if 'setextheader' in md.parser.blockprocessors:
md.parser.blockprocessors.deregister('setextheader')
# Check if the text contains markdown patterns
has_markdown = any([
'##' in plain_text, # Headers
'**' in plain_text, # Bold
'*' in plain_text and not '**' in plain_text, # Italic
'[' in plain_text and '](' in plain_text, # Links
'```' in plain_text, # Code blocks
'> ' in plain_text, # Blockquotes
'- ' in plain_text or '* ' in plain_text or '1. ' in plain_text # Lists
])
if has_markdown or preserve_structure:
# Use markdown with setext headers disabled
# Don't use 'extra' as it escapes parentheses and brackets
md = markdown.Markdown(extensions=[
'nl2br',
'sane_lists',
'fenced_code',
'tables',
NoSetextHeadersExtension()
])
html = md.convert(plain_text)
# Post-process to ensure proper paragraph structure
if not '
' in html:
# If markdown didn't create paragraphs, wrap content
lines = html.split('\n')
processed_lines = []
for line in lines:
line = line.strip()
if line and not line.startswith('<') and not line.endswith('>'):
processed_lines.append(f'
{line}
')
elif line:
processed_lines.append(line)
html = '\n'.join(processed_lines)
# CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities
# Logic:
# 1. Closing tags (e.g. ) -> Always unescape
# 2. Container tags (svg, picture, figure, figcaption) -> Unescape even if bare
# 3. Ambiguous tags (img, area, map, canvas, source, image) -> Unescape ONLY if they have attributes
img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE))
# if img_count > 0:
# print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (markdown)")
html = re.sub(
r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>',
r'<\1>',
html,
flags=re.IGNORECASE | re.DOTALL
)
return html
except ImportError:
print("⚠️ markdown not available, using fallback HTML conversion")
# Fallback: Manual markdown-to-HTML conversion
lines = plain_text.strip().split('\n')
html_parts = []
in_code_block = False
code_block_content = []
for line in lines:
# Handle code blocks
if line.strip().startswith('```'):
if in_code_block:
# End code block
html_parts.append('
' + '\n'.join(code_block_content) + '
')
code_block_content = []
in_code_block = False
else:
# Start code block
in_code_block = True
continue
if in_code_block:
code_block_content.append(line)
continue
line = line.strip()
if not line:
# Preserve empty lines as paragraph breaks
if html_parts and not html_parts[-1].endswith(''):
# Only add break if not already after a closing tag
html_parts.append(' ')
continue
# Check for markdown headers
if line.startswith('#'):
match = re.match(r'^(#+)\s*(.+)$', line)
if match:
level = min(len(match.group(1)), 6)
header_text = match.group(2).strip()
html_parts.append(f'{header_text}')
continue
# Check for blockquotes
if line.startswith('> '):
quote_text = line[2:].strip()
html_parts.append(f'
{quote_text}
')
continue
# Check for lists
if re.match(r'^[*\-+]\s+', line):
list_text = re.sub(r'^[*\-+]\s+', '', line)
html_parts.append(f'
')
continue
# Convert inline markdown
# Bold
line = re.sub(r'\*\*(.+?)\*\*', r'\1', line)
line = re.sub(r'__(.+?)__', r'\1', line)
# Italic
line = re.sub(r'\*(.+?)\*', r'\1', line)
line = re.sub(r'_(.+?)_', r'\1', line)
# Links
line = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', line)
# Code inline
line = re.sub(r'`([^`]+)`', r'\1', line)
# Regular paragraph
html_parts.append(f'
{line}
')
# Post-process lists to wrap in ul/ol tags
final_html = []
in_list = False
list_type = None
for part in html_parts:
if part.startswith('
'):
if not in_list:
# Determine list type based on context (simplified)
list_type = 'ul' # Default to unordered
final_html.append(f'<{list_type}>')
in_list = True
final_html.append(part)
else:
if in_list:
final_html.append(f'{list_type}>')
in_list = False
final_html.append(part)
# Close any open list
if in_list:
final_html.append(f'{list_type}>')
html = '\\n'.join(final_html)
# CRITICAL: Unescape img, svg, picture, figure, figcaption, canvas, map, area tags that were converted to HTML entities
# Logic:
# 1. Closing tags -> Always unescape
# 2. Container tags -> Unescape even if bare
# 3. Ambiguous tags -> Unescape ONLY if they have attributes
img_count = len(re.findall(r'</?(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)', html, flags=re.IGNORECASE))
# if img_count > 0:
# print(f"🖼️ Unescaping {img_count} image-related tag(s) from HTML entities (fallback)")
html = re.sub(
r'<((?:/(?:img|svg|picture|figure|figcaption|image|source|canvas|map|area)(?:\s.*?)?)|(?:(?:svg|picture|figure|figcaption)(?:\s.*?)?)|(?:(?:img|image|source|area|map|canvas)\s.*?))>',
r'<\1>',
html,
flags=re.IGNORECASE | re.DOTALL
)
return html
# =====================================================
# MAIN TRANSLATION FUNCTION
# =====================================================
def main(log_callback=None, stop_callback=None):
"""Main translation function with enhanced duplicate detection and progress tracking"""
global STOP_LOGGED, _stop_requested
STOP_LOGGED = False
_stop_requested = False # Reset stop flag for new translation instance
# Also reset unified_api_client global flags
try:
from unified_api_client import set_stop_flag as uac_set_stop_flag
uac_set_stop_flag(False)
except Exception:
pass
config = TranslationConfig()
builtins._DISABLE_ZERO_DETECTION = config.DISABLE_ZERO_DETECTION
if config.DISABLE_ZERO_DETECTION:
print("=" * 60)
print("⚠️ 0-BASED DETECTION DISABLED BY USER")
print("⚠️ All chapter numbers will be used exactly as found")
print("=" * 60)
args = None
chapters_completed = 0
chunks_completed = 0
args = None
chapters_completed = 0
chunks_completed = 0
input_path = config.input_path
if not input_path and len(sys.argv) > 1:
input_path = sys.argv[1]
is_text_file = input_path.lower().endswith(('.txt', '.csv', '.json', '.md'))
is_pdf_file = input_path.lower().endswith('.pdf')
if is_text_file:
os.environ["IS_TEXT_FILE_TRANSLATION"] = "1"
import json as _json
_original_load = _json.load
def debug_json_load(fp, *args, **kwargs):
result = _original_load(fp, *args, **kwargs)
if isinstance(result, list) and len(result) > 0:
if isinstance(result[0], dict) and 'original_name' in result[0]:
print(f"[DEBUG] Loaded glossary list with {len(result)} items from {fp.name if hasattr(fp, 'name') else 'unknown'}")
return result
_json.load = debug_json_load
if log_callback:
set_output_redirect(log_callback)
def check_stop():
if stop_callback and stop_callback():
# Don't log stop message if wait_for_chunks is active - let translation continue
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if not (graceful_stop_active and wait_for_chunks):
log_stop_once()
return True
return is_stop_requested()
if config.EMERGENCY_RESTORE:
print("✅ Emergency paragraph restoration is ENABLED")
else:
print("⚠️ Emergency paragraph restoration is DISABLED")
print(f"[DEBUG] REMOVE_AI_ARTIFACTS environment variable: {os.getenv('REMOVE_AI_ARTIFACTS', 'NOT SET')}")
print(f"[DEBUG] REMOVE_AI_ARTIFACTS parsed value: {config.REMOVE_AI_ARTIFACTS}")
if config.REMOVE_AI_ARTIFACTS:
print("⚠️ AI artifact removal is ENABLED - will clean AI response artifacts")
else:
print("✅ AI artifact removal is DISABLED - preserving all content as-is")
if '--epub' in sys.argv or (len(sys.argv) > 1 and sys.argv[1].endswith(('.epub', '.txt', '.csv', '.json', '.pdf', '.md'))):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('epub', help='Input EPUB or text file')
args = parser.parse_args()
input_path = args.epub
is_text_file = input_path.lower().endswith(('.txt', '.csv', '.json', '.md'))
is_pdf_file = input_path.lower().endswith('.pdf')
# Disable Break Split Count for EPUB files (only works with plain text files)
if input_path.lower().endswith('.epub'):
if os.getenv('BREAK_SPLIT_COUNT', ''):
print("⚠️ Break Split Count disabled for EPUB files (only works with .txt files)")
os.environ['BREAK_SPLIT_COUNT'] = ''
if is_text_file:
file_base = os.path.splitext(os.path.basename(input_path))[0]
else:
epub_base = os.path.splitext(os.path.basename(input_path))[0]
file_base = epub_base
# Allow callers (e.g. Discord bot) to control where outputs are written.
# This avoids relying on process-wide cwd changes (os.chdir), which is unsafe in multi-threaded apps.
output_root = (os.getenv("OUTPUT_DIRECTORY") or os.getenv("OUTPUT_DIR") or "").strip()
if output_root:
try:
os.makedirs(output_root, exist_ok=True)
except Exception:
# If we can't create the root, fall back to relative output.
output_root = ""
out = os.path.join(output_root, file_base) if output_root else file_base
os.makedirs(out, exist_ok=True)
print(f"[DEBUG] Created output folder → {out}")
cleanup_previous_extraction(out)
os.environ["EPUB_OUTPUT_DIR"] = out
payloads_dir = out
# Manage translation history persistence based on contextual + rolling settings
history_file = os.path.join(payloads_dir, "translation_history.json")
if os.path.exists(history_file):
if config.CONTEXTUAL and config.TRANSLATION_HISTORY_ROLLING:
# Preserve existing history across runs when using rolling window
print(f"[DEBUG] Preserving translation history (rolling window enabled) → {history_file}")
elif config.CONTEXTUAL:
# Contextual on but rolling disabled: start fresh each run
os.remove(history_file)
print(f"[DEBUG] CONTEXTUAL enabled without rolling - purged translation history → {history_file}")
else:
# Contextual off: never keep history
os.remove(history_file)
print("[DEBUG] CONTEXTUAL disabled - cleared translation history")
history_manager = HistoryManager(payloads_dir)
chapter_splitter = ChapterSplitter(model_name=config.MODEL)
chunk_context_manager = ChunkContextManager()
progress_manager = ProgressManager(payloads_dir)
# Prepare progress callback for chapter extraction
# Filter to show only every 10% progress update
chapter_progress_callback = None
_progress_state = {} # Track last shown percentage for each progress type
if log_callback:
def chapter_progress_callback(msg):
    """Relay an extraction message to ``log_callback``, thinning percentages.

    Messages without a ``(NN%)`` marker are always forwarded. Messages with
    one are grouped into a category (scan / extract / process / metadata /
    other) and forwarded only when the percentage enters a new 10% band for
    that category, or equals 100. The last forwarded percentage per
    category is kept in ``_progress_state``.
    """
    import re
    percent_match = re.search(r'\((\d+)%\)', msg)
    if percent_match is None:
        # Not a progress percentage message, always show
        log_callback(msg)
        return
    percent = int(percent_match.group(1))
    # Category rules in priority order; the first rule that hits wins.
    category_rules = (
        ('scan', '📂' in msg or 'Scanning' in msg),
        ('extract', '📦' in msg or 'Extracting' in msg),
        ('process', '📚' in msg or 'Processing chapters' in msg),
        ('metadata', '📊' in msg or 'metadata' in msg.lower()),
    )
    prog_type = next((name for name, hit in category_rules if hit), 'other')
    # -1 means nothing has been shown yet for this category.
    previously_shown = _progress_state.get(prog_type, -1)
    entered_new_band = percent // 10 > previously_shown // 10
    if entered_new_band or percent == 100:
        _progress_state[prog_type] = percent
        log_callback(msg)
# Import Chapter_Extractor module functions
import Chapter_Extractor
# GlossaryManager is now a module with functions, not a class
print("🔍 Checking for deleted output files...")
progress_manager.cleanup_missing_files(out)
progress_manager.save()
if check_stop():
return
# Check if model needs API key
model_needs_api_key = not (config.MODEL.lower() in ['google-translate', 'google-translate-free'] or
'@' in config.MODEL or config.MODEL.startswith('vertex/') or config.MODEL.startswith('authgpt/') or config.MODEL.startswith('antigravity/'))
if model_needs_api_key and not config.API_KEY:
print("❌ Error: Set API_KEY, OPENAI_API_KEY, or OPENAI_OR_Gemini_API_KEY in your environment.")
return
# Set dummy API key for models that don't need one
if not config.API_KEY:
config.API_KEY = 'dummy-key-not-required'
#print(f"[DEBUG] Found API key: {config.API_KEY[:10]}...")
print(f"[DEBUG] Using model = {config.MODEL}")
print(f"[DEBUG] Max output tokens = {config.MAX_OUTPUT_TOKENS}")
client = UnifiedClient(model=config.MODEL, api_key=config.API_KEY, output_dir=out)
if hasattr(client, 'use_multi_keys') and client.use_multi_keys:
stats = client.get_stats()
print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded")
print(f" Active keys: {stats.get('active_keys', 0)}")
else:
print(f"🔑 Single-key mode: Using {config.MODEL}")
# Reset cleanup state when starting new translation
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
if is_pdf_file:
print("📄 Processing PDF file...")
try:
txt_processor = TextFileProcessor(input_path, out)
chapters = txt_processor.extract_chapters()
txt_processor.save_original_structure()
metadata = {
"title": os.path.splitext(os.path.basename(input_path))[0],
"type": "pdf",
"chapter_count": len(chapters)
}
except ImportError as e:
print(f"❌ Error: PDF processor not available: {e}")
if log_callback:
log_callback(f"❌ Error: PDF processor not available: {e}")
return
except Exception as e:
print(f"❌ Error processing PDF file: {e}")
if log_callback:
log_callback(f"❌ Error processing PDF file: {e}")
return
elif is_text_file:
print("📄 Processing text file...")
try:
txt_processor = TextFileProcessor(input_path, out)
chapters = txt_processor.extract_chapters()
txt_processor.save_original_structure()
metadata = {
"title": os.path.splitext(os.path.basename(input_path))[0],
"type": "text",
"chapter_count": len(chapters)
}
except ImportError as e:
print(f"❌ Error: Text file processor not available: {e}")
if log_callback:
log_callback(f"❌ Error: Text file processor not available: {e}")
return
except Exception as e:
print(f"❌ Error processing text file: {e}")
if log_callback:
log_callback(f"❌ Error processing text file: {e}")
return
else:
# Check if we should use async extraction (for GUI mode)
use_async_extraction = os.getenv("USE_ASYNC_CHAPTER_EXTRACTION", "0") == "1"
if use_async_extraction and log_callback:
print("🚀 Using async chapter extraction (subprocess mode)...")
from chapter_extraction_manager import ChapterExtractionManager
# Create manager with log callback
extraction_manager = ChapterExtractionManager(log_callback=log_callback)
# Get extraction mode
extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower()
# Define completion callback
extraction_result = {"completed": False, "result": None}
def on_extraction_complete(result):
    """Completion callback handed to the async chapter extraction.

    Stores ``result`` in the shared ``extraction_result`` dict, marks the
    extraction as completed, and logs the outcome through ``log_callback``.

    Args:
        result: The extraction result; ``None`` is handled. Otherwise it is
            queried with ``.get`` for ``"success"``, ``"chapters"`` and
            ``"error"``.
    """
    # Publish the payload BEFORE raising the flag. The waiting loop reads
    # extraction_result["result"] as soon as it observes "completed", so with
    # the opposite order (and a callback invoked from another thread) it could
    # see a stale None and report a spurious extraction failure.
    extraction_result["result"] = result
    extraction_result["completed"] = True
    # Safety check for None result
    if result is None:
        log_callback("❌ Chapter extraction failed: No result returned")
        return
    if result.get("success"):
        log_callback(f"✅ Chapter extraction completed: {result.get('chapters', 0)} chapters")
    else:
        log_callback(f"❌ Chapter extraction failed: {result.get('error', 'Unknown error')}")
# Start async extraction
extraction_manager.extract_chapters_async(
input_path,
out,
extraction_mode=extraction_mode,
progress_callback=lambda msg: log_callback(f"📊 {msg}"),
completion_callback=on_extraction_complete
)
# Wait for completion
while not extraction_result["completed"]:
if check_stop():
extraction_manager.stop_extraction()
return
time.sleep(0.1) # Check every 100ms
# Check if extraction was successful
if not extraction_result["result"] or not extraction_result["result"].get("success"):
log_callback("❌ Chapter extraction failed")
return
# Load the extracted data
metadata_path = os.path.join(out, "metadata.json")
if os.path.exists(metadata_path):
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
else:
metadata = extraction_result["result"].get("metadata", {})
# The async extraction should have saved chapters directly, similar to the sync version
# We need to reconstruct the chapters list with body content
# Check whether the extraction wrote a chapters_full.json file with full chapter content
chapters_full_path = os.path.join(out, "chapters_full.json")
chapters_info_path = os.path.join(out, "chapters_info.json")
chapters = []
# First try to load full chapters if saved
if os.path.exists(chapters_full_path):
log_callback("Loading full chapters data...")
with open(chapters_full_path, 'r', encoding='utf-8') as f:
chapters = json.load(f)
log_callback(f"✅ Loaded {len(chapters)} chapters with content")
elif os.path.exists(chapters_info_path):
# Fall back to loading from individual files
log_callback("Loading chapter info and searching for content files...")
with open(chapters_info_path, 'r', encoding='utf-8') as f:
chapters_info = json.load(f)
# List all files in the output directory
all_files = os.listdir(out)
log_callback(f"Found {len(all_files)} files in output directory")
# Try to match chapter files
for info in chapters_info:
chapter_num = info['num']
found = False
# Try different naming patterns
patterns = [
f"chapter_{chapter_num:04d}_", # With leading zeros
f"chapter_{chapter_num}_", # Without leading zeros
f"ch{chapter_num:04d}_", # Shortened with zeros
f"ch{chapter_num}_", # Shortened without zeros
f"{chapter_num:04d}_", # Just number with zeros
f"{chapter_num}_" # Just number
]
for pattern in patterns:
# Find files matching this pattern (any extension)
matching_files = [f for f in all_files if f.startswith(pattern)]
if matching_files:
# Prefer HTML/XHTML files
html_files = [f for f in matching_files if f.endswith(('.html', '.xhtml', '.htm'))]
if html_files:
chapter_file = html_files[0]
else:
chapter_file = matching_files[0]
chapter_path = os.path.join(out, chapter_file)
try:
with open(chapter_path, 'r', encoding='utf-8') as f:
content = f.read()
chapters.append({
"num": chapter_num,
"title": info.get("title", f"Chapter {chapter_num}"),
"body": content,
"filename": info.get("original_filename", ""),
"has_images": info.get("has_images", False),
"file_size": len(content),
"content_hash": info.get("content_hash", "")
})
found = True
break
except Exception as e:
log_callback(f"⚠️ Error reading {chapter_file}: {e}")
if not found:
log_callback(f"⚠️ No file found for Chapter {chapter_num}")
# Log available files for debugging
if len(all_files) < 50:
similar_files = [f for f in all_files if str(chapter_num) in f]
if similar_files:
log_callback(f" Similar files: {similar_files[:3]}")
if not chapters:
log_callback("❌ No chapters could be loaded!")
log_callback(f"❌ Output directory: {out}")
log_callback(f"❌ Files in directory: {len(os.listdir(out))} files")
# Show first few files for debugging
sample_files = os.listdir(out)[:10]
log_callback(f"❌ Sample files: {sample_files}")
return
# Sort chapters by OPF spine order if available
opf_path = os.path.join(out, 'content.opf')
if os.path.exists(opf_path) and chapters:
log_callback("📋 Sorting chapters according to OPF spine order...")
# Call module-level function directly
chapters = Chapter_Extractor._sort_by_opf_spine(chapters, opf_path)
log_callback("✅ Chapters sorted according to OPF reading order")
else:
print("🚀 Using comprehensive chapter extraction with resource handling...")
with zipfile.ZipFile(input_path, 'r') as zf:
metadata = Chapter_Extractor._extract_epub_metadata(zf)
chapters = Chapter_Extractor.extract_chapters(zf, out, progress_callback=chapter_progress_callback)
print(f"\n📚 Extraction Summary:")
print(f" Total chapters extracted: {len(chapters)}")
if chapters:
nums = [c.get('num', 0) for c in chapters]
print(f" Chapter range: {min(nums)} to {max(nums)}")
# Check for gaps in the sequence
expected_count = max(nums) - min(nums) + 1
if len(chapters) < expected_count:
print(f"\n⚠️ Potential missing chapters detected:")
print(f" Expected {expected_count} chapters (from {min(nums)} to {max(nums)})")
print(f" Actually found: {len(chapters)} chapters")
print(f" Potentially missing: {expected_count - len(chapters)} chapters")
validate_chapter_continuity(chapters)
print("\n" + "="*50)
validate_epub_structure(out)
print("="*50 + "\n")
progress_manager.migrate_to_content_hash(chapters)
progress_manager.save()
if check_stop():
return
metadata_path = os.path.join(out, "metadata.json")
if os.path.exists(metadata_path):
with open(metadata_path, 'r', encoding='utf-8') as mf:
metadata = json.load(mf)
metadata["chapter_count"] = len(chapters)
metadata["chapter_titles"] = {str(c["num"]): c["title"] for c in chapters}
print(f"[DEBUG] Initializing client with model = {config.MODEL}")
client = UnifiedClient(api_key=config.API_KEY, model=config.MODEL, output_dir=out)
# Log translation anti-duplicate parameters usage
if os.getenv("ENABLE_ANTI_DUPLICATE", "0") == "1":
ad_top_p = os.getenv("TOP_P", "1.0")
ad_top_k = os.getenv("TOP_K", "0")
ad_freq = os.getenv("FREQUENCY_PENALTY", "0.0")
ad_pres = os.getenv("PRESENCE_PENALTY", "0.0")
ad_rep = os.getenv("REPETITION_PENALTY", "1.0")
print(f"🎯 Anti-duplicate enabled for translation (top_p={ad_top_p}, top_k={ad_top_k}, freq_penalty={ad_freq}, presence_penalty={ad_pres}, repetition_penalty={ad_rep})")
if hasattr(client, 'use_multi_keys') and client.use_multi_keys:
stats = client.get_stats()
print(f"🔑 Multi-key mode active: {stats.get('total_keys', 0)} keys loaded")
print(f" Active keys: {stats.get('active_keys', 0)}")
else:
print(f"🔑 Single-key mode: Using {config.MODEL}")
# Reset cleanup state when starting new translation
if hasattr(client, 'reset_cleanup_state'):
client.reset_cleanup_state()
if "title" in metadata and config.TRANSLATE_BOOK_TITLE and not metadata.get("title_translated", False):
original_title = metadata["title"]
print(f"📚 Original title: {original_title}")
if not check_stop():
translated_title = translate_title(
original_title,
client,
None,
None,
config.TEMP
)
metadata["original_title"] = original_title
metadata["title"] = translated_title
metadata["title_translated"] = True
print(f"📚 Translated title: {translated_title}")
else:
print("❌ Title translation skipped due to stop request")
# Translate other metadata fields if configured
translate_metadata_fields_str = os.getenv('TRANSLATE_METADATA_FIELDS', '{}')
metadata_translation_mode = os.getenv('METADATA_TRANSLATION_MODE', 'together')
try:
translate_metadata_fields = json.loads(translate_metadata_fields_str)
if translate_metadata_fields and any(translate_metadata_fields.values()):
# Filter out fields that should be translated (excluding already translated fields)
fields_to_translate = {}
skipped_fields = []
for field_name, should_translate in translate_metadata_fields.items():
if should_translate and field_name != 'title' and field_name in metadata:
# Check if already translated
if metadata.get(f"{field_name}_translated", False):
skipped_fields.append(field_name)
print(f"✓ Skipping {field_name} - already translated")
else:
fields_to_translate[field_name] = should_translate
if fields_to_translate:
print("\n" + "="*50)
print("📋 METADATA TRANSLATION PHASE")
print("="*50)
print(f"🌐 Translating {len(fields_to_translate)} metadata fields...")
# Get metadata system prompt from environment
system_prompt = os.getenv('METADATA_SYSTEM_PROMPT', '')
if system_prompt:
# Get field-specific prompts
field_prompts_str = os.getenv('METADATA_FIELD_PROMPTS', '{}')
try:
field_prompts = json.loads(field_prompts_str)
except:
field_prompts = {}
if not field_prompts and not field_prompts.get('_default'):
print("❌ No field prompts configured, skipping metadata translation")
else:
# Get language configuration
lang_behavior = os.getenv('LANG_PROMPT_BEHAVIOR', 'auto')
forced_source_lang = os.getenv('FORCED_SOURCE_LANG', 'Korean')
output_language = os.getenv('OUTPUT_LANGUAGE', 'English')
# Determine source language
source_lang = metadata.get('language', '').lower()
if lang_behavior == 'never':
lang_str = ""
elif lang_behavior == 'always':
lang_str = forced_source_lang
else: # auto
if 'zh' in source_lang or 'chinese' in source_lang:
lang_str = 'Chinese'
elif 'ja' in source_lang or 'japanese' in source_lang:
lang_str = 'Japanese'
elif 'ko' in source_lang or 'korean' in source_lang:
lang_str = 'Korean'
else:
lang_str = ''
# Check if batch translation is enabled for parallel processing
batch_translate_enabled = os.getenv('BATCH_TRANSLATION', '0') == '1'
batch_size = int(os.getenv('BATCH_SIZE', '50')) # Default batch size
if batch_translate_enabled and len(fields_to_translate) > 1:
print(f"⚡ Using parallel metadata translation mode ({len(fields_to_translate)} fields, batch size: {batch_size})...")
# Import ThreadPoolExecutor for parallel processing
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Thread-safe results storage
translation_results = {}
results_lock = threading.Lock()
def translate_metadata_field(field_name, original_value):
    """Translate one metadata field and record the outcome.

    The outcome is written to ``translation_results[field_name]`` while
    holding ``results_lock``. Returns the stripped translated text on
    success. Returns ``None`` when no prompt is configured for the field
    (nothing is recorded in that case) or when any exception occurs (a
    failure entry with the error text is recorded).
    """
    try:
        # Long values are previewed truncated; short ones are printed whole.
        if len(str(original_value)) > 100:
            announcement = f"\n📋 Translating {field_name}: {original_value[:100]}..."
        else:
            announcement = f"\n📋 Translating {field_name}: {original_value}"
        print(announcement)

        # Field-specific prompt, falling back to the '_default' entry.
        template = field_prompts.get(field_name, field_prompts.get('_default', ''))
        if not template:
            print(f"⚠️ No prompt configured for field '{field_name}', skipping")
            return None

        # Fill in the prompt placeholders one after another.
        substitutions = (
            ('{source_lang}', lang_str),
            ('{output_lang}', output_language),
            ('{target_lang}', output_language),
            ('{field_value}', str(original_value)),
        )
        field_prompt = template
        for placeholder, replacement in substitutions:
            field_prompt = field_prompt.replace(placeholder, replacement)

        # Translation services (not AI) receive only the raw field value.
        client_type = getattr(client, 'client_type', '')
        if client_type in ['deepl', 'google_translate']:
            print(f"🌐 Using translation service ({client_type}) - sending field directly")
            payload = [
                {"role": "user", "content": str(original_value)}
            ]
        else:
            # AI services get the system prompt plus the filled-in field prompt.
            payload = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"{field_prompt}\n\n{original_value}"}
            ]

        # Optional pause before the request, for rate limiting.
        if config.DELAY > 0:
            time.sleep(config.DELAY)

        content, finish_reason = client.send(
            payload,
            temperature=config.TEMP,
            max_tokens=config.MAX_OUTPUT_TOKENS
        )
        translated_value = content.strip()

        success_entry = {
            'original': original_value,
            'translated': translated_value,
            'success': True
        }
        # Shared dict is written under the lock.
        with results_lock:
            translation_results[field_name] = success_entry
        print(f"✅ Translated {field_name}: {translated_value}")
        return translated_value
    except Exception as e:
        print(f"❌ Failed to translate {field_name}: {e}")
        failure_entry = {
            'original': original_value,
            'translated': None,
            'success': False,
            'error': str(e)
        }
        with results_lock:
            translation_results[field_name] = failure_entry
        return None
# Execute parallel translations with limited workers
max_workers = min(len(fields_to_translate), batch_size)
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all translation tasks
futures = {}
for field_name in fields_to_translate:
if field_name in metadata and not check_stop():
original_value = metadata[field_name]
future = executor.submit(translate_metadata_field, field_name, original_value)
futures[future] = field_name
# Wait for completion
for future in as_completed(futures):
if check_stop():
print("❌ Metadata translation stopped by user")
break
# Apply results to metadata
for field_name, result in translation_results.items():
if result['success'] and result['translated']:
metadata[f"original_{field_name}"] = result['original']
metadata[field_name] = result['translated']
metadata[f"{field_name}_translated"] = True
else:
# Sequential translation mode (individual translation)
mode_desc = "sequential" if not batch_translate_enabled else "sequential (single field)"
print(f"📝 Using {mode_desc} translation mode...")
for field_name in fields_to_translate:
if not check_stop() and field_name in metadata:
original_value = metadata[field_name]
print(f"\n📋 Translating {field_name}: {original_value[:100]}..."
if len(str(original_value)) > 100 else f"\n📋 Translating {field_name}: {original_value}")
# Get field-specific prompt
prompt_template = field_prompts.get(field_name, field_prompts.get('_default', ''))
if not prompt_template:
print(f"⚠️ No prompt configured for field '{field_name}', skipping")
continue
# Replace variables in prompt
field_prompt = prompt_template.replace('{source_lang}', lang_str)
field_prompt = field_prompt.replace('{output_lang}', output_language)
field_prompt = field_prompt.replace('{target_lang}', output_language)
field_prompt = field_prompt.replace('{field_value}', str(original_value))
# Check if we're using a translation service (not AI)
client_type = getattr(client, 'client_type', '')
is_translation_service = client_type in ['deepl', 'google_translate']
if is_translation_service:
# For translation services, send only the field value without AI prompts
print(f"🌐 Using translation service ({client_type}) - sending field directly")
messages = [
{"role": "user", "content": str(original_value)}
]
else:
# For AI services, use prompts as before
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": f"{field_prompt}\n\n{original_value}"}
]
try:
# Add delay using the config instance from main()
if config.DELAY > 0: # ✅ FIXED - use config.DELAY instead of config.SEND_INTERVAL
time.sleep(config.DELAY)
# Use the same client instance from main()
# ✅ FIXED - Properly unpack tuple response and provide max_tokens
content, finish_reason = client.send(
messages,
temperature=config.TEMP,
max_tokens=config.MAX_OUTPUT_TOKENS # ✅ FIXED - provide max_tokens to avoid NoneType error
)
translated_value = content.strip() # ✅ FIXED - use content from unpacked tuple
metadata[f"original_{field_name}"] = original_value
metadata[field_name] = translated_value
metadata[f"{field_name}_translated"] = True
print(f"✅ Translated {field_name}: {translated_value}")
except Exception as e:
print(f"❌ Failed to translate {field_name}: {e}")
else:
if check_stop():
print("❌ Metadata translation stopped by user")
break
else:
print("📋 No additional metadata fields to translate")
except Exception as e:
print(f"⚠️ Error processing metadata translation settings: {e}")
import traceback
traceback.print_exc()
with open(metadata_path, 'w', encoding='utf-8') as mf:
json.dump(metadata, mf, ensure_ascii=False, indent=2)
print(f"💾 Saved metadata with {'translated' if metadata.get('title_translated', False) else 'original'} title")
print("\n" + "="*50)
print("📑 GLOSSARY GENERATION PHASE")
print("="*50)
# Skip glossary generation for CSV/JSON/MD files (they are typically glossaries themselves)
if input_path.lower().endswith(('.csv', '.json', '.md')):
print("📑 Skipping glossary generation for CSV/JSON/MD file")
print(" CSV/JSON/MD files are treated as plain text and typically don't need glossaries")
else:
print(f"📑 DEBUG: ENABLE_AUTO_GLOSSARY = '{os.getenv('ENABLE_AUTO_GLOSSARY', 'NOT SET')}'")
print(f"📑 DEBUG: MANUAL_GLOSSARY = '{config.MANUAL_GLOSSARY}'")
print(f"📑 DEBUG: Manual glossary exists? {os.path.isfile(config.MANUAL_GLOSSARY) if config.MANUAL_GLOSSARY else False}")
print(f"📑 DEBUG: APPEND_GLOSSARY = '{os.getenv('APPEND_GLOSSARY', '1')}'")
print(f"📑 DEBUG: APPEND_GLOSSARY_PROMPT = '{os.getenv('APPEND_GLOSSARY_PROMPT', 'NOT SET')}'")
print(f"📑 DEBUG: Duplicate algorithm = '{os.getenv('GLOSSARY_DUPLICATE_ALGORITHM', 'auto')}'")
print(f"📑 DEBUG: Fuzzy threshold = '{os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.90')}'")
print(f"📑 DEBUG: Include gender context = '{os.getenv('GLOSSARY_INCLUDE_GENDER_CONTEXT', '0')}'")
print(f"📑 DEBUG: Context window size = '{os.getenv('GLOSSARY_CONTEXT_WINDOW', '2')}'")
print(f"📑 DEBUG: Min frequency = '{os.getenv('GLOSSARY_MIN_FREQUENCY', '1')}'")
print(f"📑 DEBUG: Max names = '{os.getenv('GLOSSARY_MAX_NAMES', '50')}'")
print(f"📑 DEBUG: Max titles = '{os.getenv('GLOSSARY_MAX_TITLES', '50')}'")
print(f"📑 DEBUG: Translation batch = '{os.getenv('GLOSSARY_BATCH_SIZE', '50')}'")
print(f"📑 DEBUG: Max text size = '{os.getenv('GLOSSARY_MAX_TEXT_SIZE', '50000')}'")
print(f"📑 DEBUG: Max sentences = '{os.getenv('GLOSSARY_MAX_SENTENCES', '200')}'")
print(f"📑 DEBUG: Use smart filter = '{os.getenv('GLOSSARY_USE_SMART_FILTER', '1')}'")
print(f"📑 DEBUG: Chapter split threshold = '{os.getenv('GLOSSARY_CHAPTER_SPLIT_THRESHOLD', '50000')}'")
print(f"📑 DEBUG: Target language = '{os.getenv('GLOSSARY_TARGET_LANGUAGE', 'English')}'")
# Check whether glossary.csv or glossary.json already exists in the output folder
existing_glossary_csv = os.path.join(out, "glossary.csv")
existing_glossary_json = os.path.join(out, "glossary.json")
print(f"📑 DEBUG: Existing glossary.csv? {os.path.exists(existing_glossary_csv)}")
print(f"📑 DEBUG: Existing glossary.json? {os.path.exists(existing_glossary_json)}")
def _nonempty(path):
try:
return os.path.getsize(path) > 0
except Exception:
return False
def _has_glossary_data(path):
"""Return True only if the glossary file contains at least one entry."""
try:
ext = os.path.splitext(path)[1].lower()
if ext in [".csv", ".txt", ".md"]:
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
lines = [line for line in f.readlines() if line.strip()]
# Require at least one non-header data line
return len(lines) > 1
if ext == ".json":
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
if isinstance(data, dict):
if "entries" in data and isinstance(data["entries"], dict):
return len(data["entries"]) > 0
return len(data) > 0
if isinstance(data, list):
return len(data) > 0
# Unknown extension: fallback to non-empty size check
return _nonempty(path)
except Exception:
return False
# If manual glossary is present but empty/header-only, clear it so auto-gen can run
if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY) and not _has_glossary_data(config.MANUAL_GLOSSARY):
print("📑 Manual glossary is empty; ignoring to allow automatic generation.")
config.MANUAL_GLOSSARY = ""
os.environ.pop("MANUAL_GLOSSARY", None)
if config.MANUAL_GLOSSARY and os.path.isfile(config.MANUAL_GLOSSARY) and _has_glossary_data(config.MANUAL_GLOSSARY):
ext = os.path.splitext(config.MANUAL_GLOSSARY)[1].lower()
# .csv and .txt glossaries are stored as glossary.csv; .md and .json keep their own extension
if ext in [".csv", ".txt"]:
target_name = "glossary.csv"
elif ext == ".md":
target_name = "glossary.md"
elif ext == ".json":
target_name = "glossary.json"
else:
# Default to CSV for unknown extensions
target_name = "glossary.csv"
print(f"⚠️ Unknown glossary extension '{ext}', treating as CSV")
target_path = os.path.join(out, target_name)
if os.path.abspath(config.MANUAL_GLOSSARY) != os.path.abspath(target_path):
shutil.copy(config.MANUAL_GLOSSARY, target_path)
print("📑 Using manual glossary from:", config.MANUAL_GLOSSARY)
else:
print("📑 Using existing glossary:", config.MANUAL_GLOSSARY)
# Copy glossary extension if configured
if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1':
additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '')
if additional_glossary_path and os.path.exists(additional_glossary_path):
# Preserve original extension
ext = os.path.splitext(additional_glossary_path)[1]
additional_target = os.path.join(out, f"glossary_extension{ext}")
# Only copy if target doesn't already exist
if not os.path.exists(additional_target):
try:
shutil.copy(additional_glossary_path, additional_target)
print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}")
except Exception as e:
print(f"⚠️ Failed to copy glossary extension: {e}")
else:
print(f"📑 Using existing glossary extension in output folder")
# If existing glossaries in output are empty, delete them so they don't block auto-gen
if os.path.exists(existing_glossary_csv) and not _has_glossary_data(existing_glossary_csv):
try:
os.remove(existing_glossary_csv)
print("📑 Removed empty glossary.csv to allow automatic generation.")
except Exception as e:
print(f"⚠️ Could not remove empty glossary.csv: {e}")
if os.path.exists(existing_glossary_json) and not _has_glossary_data(existing_glossary_json):
try:
os.remove(existing_glossary_json)
print("📑 Removed empty glossary.json to allow automatic generation.")
except Exception as e:
print(f"⚠️ Could not remove empty glossary.json: {e}")
elif (os.path.exists(existing_glossary_csv) and _has_glossary_data(existing_glossary_csv)) or \
(os.path.exists(existing_glossary_json) and _has_glossary_data(existing_glossary_json)):
print("📑 Existing glossary file detected in source folder - skipping automatic generation")
target_glossary_path = None
if os.path.exists(existing_glossary_csv) and _has_glossary_data(existing_glossary_csv):
print(f"📑 Using existing glossary.csv: {existing_glossary_csv}")
target_glossary_path = existing_glossary_csv
elif os.path.exists(existing_glossary_json) and _has_glossary_data(existing_glossary_json):
print(f"📑 Using existing glossary.json: {existing_glossary_json}")
target_glossary_path = existing_glossary_json
# --- Check and inject book title if missing ---
if target_glossary_path and target_glossary_path.endswith('.csv'):
try:
include_title = os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1"
auto_inject = os.getenv("GLOSSARY_AUTO_INJECT_BOOK_TITLE", "0") == "1"
# Auto-inject applies only to already loaded existing glossary files (post-dedup context)
if include_title and auto_inject:
# Read existing content
with open(target_glossary_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
# Check if book entry exists
has_book_entry = False
for line in lines:
if line.lower().startswith("book,"):
has_book_entry = True
break
if not has_book_entry:
print("📑 Checking for missing book title entry in existing glossary...")
# Use GlossaryManager to find/translate title
import GlossaryManager
# Get raw title from input EPUB
epub_path_env = os.getenv("EPUB_PATH", "")
raw_title = GlossaryManager._extract_raw_title_from_epub(epub_path_env)
# Get translated title from output metadata
trans_title = GlossaryManager._extract_translated_title_from_metadata(out)
if raw_title or trans_title:
# Determine values (prefer distinct, fallback to what we have)
r_val = raw_title if raw_title else (trans_title if trans_title else "")
t_val = trans_title if trans_title else (raw_title if raw_title else "")
# Insert book entry in token-efficient format if detected, or standard CSV
is_token_format = any(l.strip().startswith("Glossary Columns:") for l in lines)
if is_token_format:
# Insert into token efficient format
# Find start of BOOKS section or create it at top
book_lines = [
f"=== BOOKS ===\n",
f"* {t_val} ({r_val})\n",
"\n"
]
# Find where to insert (after Glossary Columns)
insert_idx = 0
for i, l in enumerate(lines):
if l.strip().startswith("Glossary Columns:"):
insert_idx = i + 2 # Skip blank line
break
# Check if BOOKS section already exists to avoid duplication
has_books_section = any(l.strip() == "=== BOOKS ===" for l in lines)
if not has_books_section:
for bl in reversed(book_lines):
lines.insert(insert_idx, bl)
else:
# Standard CSV injection
book_line = f"book,{r_val},{t_val},,\n"
# Find insertion point (after header if present)
insert_idx = 0
if lines and "type," in lines[0].lower():
insert_idx = 1
lines.insert(insert_idx, book_line)
# Write back
with open(target_glossary_path, 'w', encoding='utf-8') as f:
f.writelines(lines)
print(f"📚 Auto-injected book title into existing glossary: {t_val} ({r_val})")
except Exception as e:
print(f"⚠️ Failed to inject book title: {e}")
# ----------------------------------------------
# Copy glossary extension if configured
if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1':
additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '')
if additional_glossary_path and os.path.exists(additional_glossary_path):
# Preserve original extension
ext = os.path.splitext(additional_glossary_path)[1]
additional_target = os.path.join(out, f"glossary_extension{ext}")
# Only copy if target doesn't already exist
if not os.path.exists(additional_target):
try:
shutil.copy(additional_glossary_path, additional_target)
print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}")
except Exception as e:
print(f"⚠️ Failed to copy glossary extension: {e}")
else:
print(f"📑 Using existing glossary extension in output folder")
elif os.getenv("ENABLE_AUTO_GLOSSARY", "0") == "1":
model = os.getenv("MODEL", "gpt-4")
if is_traditional_translation_api(model):
print("📑 Automatic glossary generation disabled")
print(f" {model} does not support glossary extraction")
print(" Traditional translation APIs cannot identify character names/terms")
else:
print("📑 Starting automatic glossary generation...")
try:
# Use the new process-safe glossary worker
from glossary_process_worker import generate_glossary_in_process
import concurrent.futures
import multiprocessing
instructions = ""
# Get extraction workers setting
extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1"))
if extraction_workers == 1:
# Auto-detect for better performance
extraction_workers = min(os.cpu_count() or 4, 4)
print(f"📑 Using {extraction_workers} CPU cores for glossary generation")
# Collect environment variables to pass to subprocess
env_vars = {}
important_vars = [
'EXTRACTION_WORKERS', 'GLOSSARY_MIN_FREQUENCY', 'GLOSSARY_MAX_NAMES',
'GLOSSARY_MAX_TITLES', 'GLOSSARY_BATCH_SIZE', 'GLOSSARY_STRIP_HONORIFICS',
'GLOSSARY_FUZZY_THRESHOLD', 'GLOSSARY_MAX_TEXT_SIZE', 'GLOSSARY_MAX_SENTENCES',
'AUTO_GLOSSARY_PROMPT', 'GLOSSARY_USE_SMART_FILTER', 'GLOSSARY_USE_LEGACY_CSV',
'GLOSSARY_PARALLEL_ENABLED', 'GLOSSARY_FILTER_MODE', 'GLOSSARY_SKIP_FREQUENCY_CHECK',
'GLOSSARY_SKIP_ALL_VALIDATION', 'MODEL', 'API_KEY', 'OPENAI_API_KEY', 'GEMINI_API_KEY',
'MAX_OUTPUT_TOKENS', 'GLOSSARY_TEMPERATURE', 'MANUAL_GLOSSARY', 'ENABLE_AUTO_GLOSSARY',
'GLOSSARY_DUPLICATE_ALGORITHM', 'GLOSSARY_INCLUDE_GENDER_CONTEXT', 'GLOSSARY_CONTEXT_WINDOW',
'GLOSSARY_INCLUDE_BOOK_TITLE', 'EPUB_PATH',
# Match GUI batching settings
'BATCH_TRANSLATION', 'BATCH_SIZE', 'BATCHING_MODE', 'BATCH_GROUP_SIZE',
# Keep submission staggering consistent with GUI
'THREAD_SUBMISSION_DELAY_SECONDS',
]
for var in important_vars:
if var in os.environ:
env_vars[var] = os.environ[var]
# NOTE: Avoid multiprocessing.Manager() here.
# Instead, have the subprocess append logs to a file and tail it from the parent.
log_queue = None
# Write subprocess logs to a central logs folder (not the output book folder)
try:
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
except Exception:
_project_root = os.path.abspath(".")
logs_dir = os.path.join(_project_root, "logs")
try:
os.makedirs(logs_dir, exist_ok=True)
except Exception:
pass
glossary_log_fp = os.path.join(
logs_dir,
f"glossary_subprocess_{int(time.time() * 1000)}_{os.getpid()}.log"
)
try:
# Ensure file exists (fresh per run)
with open(glossary_log_fp, "w", encoding="utf-8") as _f:
_f.write("")
except Exception:
pass
# Use ProcessPoolExecutor for true parallelism (completely bypasses GIL)
print("📑 Starting glossary generation in separate process...")
with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
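# Submit to a separate process. The log queue is disabled (None); the worker's
# output is read back by tailing glossary_log_fp from this process instead.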
future = executor.submit(
generate_glossary_in_process,
out,
chapters,
instructions,
env_vars,
log_queue, # Queue disabled (None)
glossary_log_fp # log_file_path for parent tailing
)
# Poll for completion and stream logs in real-time
poll_count = 0
graceful_stop_notice_shown = False
# Tail the subprocess log file for visibility
_log_pos = 0
_seen_worker_output = False
_submit_ts = time.time()
# Start wait logging after 5s (avoid a noisy "0s" line)
_last_wait_log_ts = _submit_ts
while not future.done():
poll_count += 1
# Tail subprocess log file (best-effort)
try:
if glossary_log_fp and os.path.exists(glossary_log_fp):
with open(glossary_log_fp, "r", encoding="utf-8", errors="ignore") as _lf:
_lf.seek(_log_pos)
new = _lf.read()
_log_pos = _lf.tell()
if new:
_seen_worker_output = True
# Print as-is (already line-delimited)
for _ln in new.splitlines():
if _ln.strip():
print(_ln)
except Exception:
pass
# If the subprocess hasn't produced any output yet, show periodic wait logs.
# This helps during Windows spawn/import/pickling startup delays.
if not _seen_worker_output:
now = time.time()
if (now - _last_wait_log_ts) >= 5.0:
_last_wait_log_ts = now
try:
elapsed = int(now - _submit_ts)
except Exception:
elapsed = 0
# Best-effort status: future.running() may become True once the worker begins executing.
try:
running = future.running()
except Exception:
running = False
state = "booting" if running else "spawning"
print(f"⏳ Waiting for glossary subprocess to start ({state})… {elapsed}s")
# Super short sleep to yield to GUI
time.sleep(0.01)
# Check for stop every 100 polls
if poll_count % 100 == 0:
if check_stop():
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
def _glossary_all_chunks_submitted() -> bool:
    """Tell whether the glossary status file reports every chunk as sent.

    The status file path comes from ``GLOSSARY_STATUS_FILE``; when that is
    unset it is derived from the stop-file path (``GLOSSARY_STOP_FILE`` or a
    default in the temp directory) with ``.status.json`` appended.

    IMPORTANT: a chunk only counts as "submitted" once it has been sent to
    the API (in-flight, after any stagger/delay). Merely handing it to an
    executor or thread is NOT sufficient.

    Returns:
        True when the status file carries ``all_sent: true`` or when
        ``sent_chunks`` has reached a positive ``total_chunks``. False in
        every other case, including a missing/unreadable/malformed file.
    """
    try:
        import json as _json
        import tempfile as _tempfile

        # Resolve the stop-file path first; the status path defaults to a sibling of it.
        default_stop_path = os.path.join(_tempfile.gettempdir(), 'glossarion_glossary.stop')
        stop_path = os.environ.get('GLOSSARY_STOP_FILE') or default_stop_path
        status_path = os.environ.get('GLOSSARY_STATUS_FILE') or f"{stop_path}.status.json"

        if not (status_path and os.path.exists(status_path)):
            return False

        with open(status_path, 'r', encoding='utf-8') as status_handle:
            status = _json.load(status_handle)

        if not isinstance(status, dict):
            return False

        # The explicit flag wins over the counters.
        if status.get('all_sent') is True:
            return True

        # Fallback: compare counters (sent_chunks is post-stagger / in-flight).
        total_chunks = int(status.get('total_chunks', 0) or 0)
        sent_chunks = int(status.get('sent_chunks', 0) or 0)
        if total_chunks <= 0:
            return False
        return sent_chunks >= total_chunks
    except Exception:
        # Best-effort probe: any failure means "cannot confirm", i.e. False.
        return False
def _glossary_any_in_flight() -> bool:
    """Tell whether any watchdog file reports an API call still in flight.

    Scans ``api_watchdog_*.json`` files in the directory named by the
    ``GLOSSARION_WATCHDOG_DIR`` environment variable.

    Returns:
        True as soon as one file holds a JSON object whose ``in_flight``
        value is greater than zero. False when the directory is unset or
        missing, when no file qualifies, or when scanning fails.
    """
    def _reports_pending(watchdog_path) -> bool:
        # A single unreadable or malformed file must not abort the scan.
        try:
            with open(watchdog_path, 'r', encoding='utf-8') as watchdog_handle:
                payload = _json.load(watchdog_handle)
            if not isinstance(payload, dict):
                return False
            return int(payload.get('in_flight', 0) or 0) > 0
        except Exception:
            return False

    try:
        import json as _json
        import glob as _glob

        watchdog_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR')
        if not (watchdog_dir and os.path.isdir(watchdog_dir)):
            return False

        pattern = os.path.join(watchdog_dir, 'api_watchdog_*.json')
        # any() short-circuits on the first file that reports in-flight work.
        return any(_reports_pending(path) for path in _glob.glob(pattern))
    except Exception:
        return False
should_wait = False
if graceful_stop_active:
if wait_for_chunks:
# WAIT_FOR_CHUNKS=1: allow in-flight work to complete.
should_wait = True
else:
# WAIT_FOR_CHUNKS=0: wait if all chunks were already sent to the API.
# Also check the watchdog directly — the status file written by
# the subprocess _sent_monitor thread polls at 0.1s intervals and
# may lag behind the actual API state. The watchdog file is
# updated synchronously when an API call goes in-flight, so it
# is the reliable ground truth.
should_wait = _glossary_all_chunks_submitted() or _glossary_any_in_flight()
if should_wait:
if not graceful_stop_notice_shown:
if wait_for_chunks:
print("⏳ Graceful stop — waiting for glossary generation to finish...")
else:
print("⏳ Graceful stop — waiting (glossary API call in flight)...")
graceful_stop_notice_shown = True
continue
print("📑 ❌ Glossary generation cancelled")
# Escalate to FULL STOP for glossary cancellation:
# 1) Touch the shared stop file so the glossary subprocess stops starting new work.
try:
import tempfile as _tempfile
stop_fp = os.environ.get('GLOSSARY_STOP_FILE') or os.path.join(_tempfile.gettempdir(), 'glossarion_glossary.stop')
os.environ['GLOSSARY_STOP_FILE'] = stop_fp
with open(stop_fp, 'w', encoding='utf-8') as f:
f.write('stop')
except Exception:
pass
# 2) Hard-cancel HTTP sessions in THIS process (best-effort).
try:
import unified_api_client
if hasattr(unified_api_client, 'set_stop_flag'):
unified_api_client.set_stop_flag(True)
if hasattr(unified_api_client, 'global_stop_flag'):
unified_api_client.global_stop_flag = True
if hasattr(unified_api_client, 'UnifiedClient'):
unified_api_client.UnifiedClient._global_cancelled = True
if hasattr(unified_api_client, 'hard_cancel_all'):
unified_api_client.hard_cancel_all()
except Exception:
pass
# 3) Terminate glossary subprocess(es) if they are still running.
# ProcessPoolExecutor cancellation does NOT reliably kill a running worker.
try:
import psutil
wd_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR')
if wd_dir and os.path.isdir(wd_dir):
import glob as _glob
import re as _re
for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json')):
try:
base = os.path.basename(fp)
m = _re.search(r"api_watchdog_(\d+)\.json$", base)
if not m:
continue
pid = int(m.group(1))
if pid == os.getpid():
continue
p = psutil.Process(pid)
# Best-effort terminate, then kill if needed.
try:
p.terminate()
except Exception:
pass
except Exception:
continue
# Give processes a brief moment to exit, then kill any remaining.
try:
time.sleep(0.2)
except Exception:
pass
if wd_dir and os.path.isdir(wd_dir):
import glob as _glob
import re as _re
for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json')):
try:
base = os.path.basename(fp)
m = _re.search(r"api_watchdog_(\d+)\.json$", base)
if not m:
continue
pid = int(m.group(1))
if pid == os.getpid():
continue
p = psutil.Process(pid)
try:
if p.is_running():
p.kill()
except Exception:
pass
except Exception:
continue
except Exception:
pass
# 4) Best-effort: clear watchdog files so the GUI progress bar doesn't stick.
try:
wd_dir = os.environ.get('GLOSSARION_WATCHDOG_DIR')
if wd_dir and os.path.isdir(wd_dir):
import glob as _glob
for fp in _glob.glob(os.path.join(wd_dir, 'api_watchdog_*.json*')):
try:
os.remove(fp)
except Exception:
pass
except Exception:
pass
try:
executor.shutdown(wait=False, cancel_futures=True)
except Exception:
pass
return
# Get any remaining logs from subprocess log file
try:
if glossary_log_fp and os.path.exists(glossary_log_fp):
with open(glossary_log_fp, "r", encoding="utf-8", errors="ignore") as _lf:
_lf.seek(_log_pos)
new = _lf.read()
_log_pos = _lf.tell()
if new:
for _ln in new.splitlines():
if _ln.strip():
print(_ln)
except Exception:
pass
# If Stop was requested at any point, treat as CANCELLED and do not continue.
try:
if check_stop():
print("📑 ❌ Automatic glossary generation CANCELLED")
return
except Exception:
pass
# Get result
if future.done():
try:
result = future.result(timeout=0.1)
if isinstance(result, dict):
if result.get('success'):
print(f"📑 ✅ Glossary generation completed successfully")
else:
print(f"📑 ❌ Glossary generation failed: {result.get('error')}")
if result.get('traceback'):
print(f"📑 Error details:\n{result.get('traceback')}")
except Exception as e:
print(f"📑 ❌ Error retrieving glossary result: {e}")
# Only mark completed when not stopping.
print("✅ Automatic glossary generation COMPLETED")
# If the user requested graceful stop (wait_for_chunks), stop here after glossary is done.
try:
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and wait_for_chunks and check_stop():
print("✅ Glossary generation finished. Stopping as requested (wait for chunks).")
return
except Exception:
pass
# Copy glossary extension if configured (after auto-glossary generation)
if os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1':
additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '')
if additional_glossary_path and os.path.exists(additional_glossary_path):
# Preserve original extension
ext = os.path.splitext(additional_glossary_path)[1]
additional_target = os.path.join(out, f"glossary_extension{ext}")
# Only copy if target doesn't already exist
if not os.path.exists(additional_target):
try:
shutil.copy(additional_glossary_path, additional_target)
print(f"📑 Copied glossary extension: {os.path.basename(additional_glossary_path)}")
except Exception as e:
print(f"⚠️ Failed to copy glossary extension: {e}")
else:
print(f"📑 Using existing glossary extension in output folder")
# Handle deferred glossary appending
if os.getenv('DEFER_GLOSSARY_APPEND') == '1':
print("📑 Processing deferred glossary append to system prompt...")
glossary_path = find_glossary_file(out)
if glossary_path and os.path.exists(glossary_path):
try:
glossary_block = None
if glossary_path.lower().endswith('.csv'):
with open(glossary_path, 'r', encoding='utf-8') as f:
glossary_block = f.read()
else:
with open(glossary_path, 'r', encoding='utf-8') as f:
glossary_data = json.load(f)
formatted_entries = {}
if isinstance(glossary_data, dict) and 'entries' in glossary_data:
formatted_entries = glossary_data['entries']
elif isinstance(glossary_data, dict):
formatted_entries = {k: v for k, v in glossary_data.items() if k != "metadata"}
if formatted_entries:
glossary_block = json.dumps(formatted_entries, ensure_ascii=False, indent=2)
else:
glossary_block = None
if glossary_block:
glossary_prompt = os.getenv('GLOSSARY_APPEND_PROMPT',
"Character/Term Glossary (use these translations consistently):")
current_prompt = config.PROMPT
if current_prompt:
current_prompt += "\n\n"
current_prompt += f"{glossary_prompt}\n{glossary_block}"
config.PROMPT = current_prompt
print(f"✅ Added auto-generated glossary to system prompt ({os.path.basename(glossary_path)})")
if 'DEFER_GLOSSARY_APPEND' in os.environ:
del os.environ['DEFER_GLOSSARY_APPEND']
if 'GLOSSARY_APPEND_PROMPT' in os.environ:
del os.environ['GLOSSARY_APPEND_PROMPT']
else:
print("⚠️ Auto-generated glossary has no entries - skipping append")
if 'DEFER_GLOSSARY_APPEND' in os.environ:
del os.environ['DEFER_GLOSSARY_APPEND']
if 'GLOSSARY_APPEND_PROMPT' in os.environ:
del os.environ['GLOSSARY_APPEND_PROMPT']
except Exception as e:
print(f"⚠️ Failed to append auto-generated glossary: {e}")
else:
print("⚠️ No glossary file found after automatic generation")
except Exception as e:
print(f"❌ Glossary generation failed: {e}")
else:
print("📑 Automatic glossary generation disabled")
# Don't create an empty glossary - let any existing manual glossary remain
glossary_file = find_glossary_file(out)
# Only show glossary details if append glossary is enabled
append_glossary_enabled = os.getenv("APPEND_GLOSSARY", "1") == "1"
add_additional_enabled = os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1'
if glossary_file and os.path.exists(glossary_file):
if append_glossary_enabled:
try:
if glossary_file.lower().endswith(('.csv', '.txt', '.md')):
# Quick CSV/TXT/MD stats
with open(glossary_file, 'r', encoding='utf-8') as f:
lines = [ln.strip() for ln in f.readlines() if ln.strip()]
entry_count = max(0, len(lines) - 1) if lines and ',' in lines[0] else len(lines)
if glossary_file.lower().endswith('.txt'):
file_type = "TXT"
elif glossary_file.lower().endswith('.md'):
file_type = "MD"
else:
file_type = "CSV"
print(f"📑 Glossary ready ({file_type}) with {entry_count} entries")
print("📑 Sample glossary lines:")
for ln in lines[1:6]:
print(f" • {ln}")
elif glossary_file.lower().endswith('.json'):
with open(glossary_file, 'r', encoding='utf-8') as f:
glossary_data = json.load(f)
if isinstance(glossary_data, dict):
if 'entries' in glossary_data and isinstance(glossary_data['entries'], dict):
entry_count = len(glossary_data['entries'])
sample_items = list(glossary_data['entries'].items())[:3]
else:
entry_count = len(glossary_data)
sample_items = list(glossary_data.items())[:3]
print(f"📑 Glossary ready with {entry_count} entries")
print("📑 Sample glossary entries:")
for key, value in sample_items:
print(f" • {key} → {value}")
elif isinstance(glossary_data, list):
print(f"📑 Glossary ready with {len(glossary_data)} entries")
print("📑 Sample glossary entries:")
for i, entry in enumerate(glossary_data[:3]):
if isinstance(entry, dict):
original = entry.get('original_name', '?')
translated = entry.get('name', original)
print(f" • {original} → {translated}")
else:
print(f"⚠️ Unexpected glossary format: {type(glossary_data)}")
# Check for glossary extension (after all glossary types)
if add_additional_enabled:
# Check for extension with any supported format
additional_glossary = None
for ext in ['.csv', '.md', '.txt', '.json']:
candidate = os.path.join(out, f"glossary_extension{ext}")
if os.path.exists(candidate):
additional_glossary = candidate
break
if additional_glossary:
try:
with open(additional_glossary, 'r', encoding='utf-8') as f:
add_lines = [ln.strip() for ln in f.readlines() if ln.strip()]
add_entry_count = max(0, len(add_lines) - 1) if add_lines and ',' in add_lines[0] else len(add_lines)
print(f"📑 Glossary extension loaded with {add_entry_count} entries")
print("📑 Sample glossary extension lines:")
for ln in add_lines[1:4]:
print(f" • {ln}")
except Exception as e:
print(f"⚠️ Failed to read glossary extension: {e}")
else:
# Check if extension file exists but toggle is disabled
for ext in ['.csv', '.md', '.txt', '.json']:
additional_glossary = os.path.join(out, f"glossary_extension{ext}")
if os.path.exists(additional_glossary):
print("⏩ Skipping glossary extension - toggle disabled")
break
except Exception as e:
print(f"⚠️ Failed to inspect glossary file: {e}")
else:
print("⏩ Skipping glossary - toggle disabled")
else:
if append_glossary_enabled:
print("📑 No glossary file found")
print("="*50)
print("🚀 STARTING MAIN TRANSLATION PHASE")
print("="*50 + "\n")
glossary_path = find_glossary_file(out)
if glossary_path and os.path.exists(glossary_path) and glossary_path.lower().endswith('.json'):
try:
with open(glossary_path, 'r', encoding='utf-8') as f:
g_data = json.load(f)
print(f"[DEBUG] Glossary type before translation: {type(g_data)}")
if isinstance(g_data, list):
print(f"[DEBUG] Glossary is a list")
except Exception as e:
print(f"[DEBUG] Error checking glossary: {e}")
glossary_path = find_glossary_file(out)
# Build system prompt without glossary compression initially
# Compression will happen per-chapter when enabled
# Use get_system_prompt(1) for initial setup (no merging)
system = build_system_prompt(config.get_system_prompt(actual_merge_count=1), glossary_path, source_text=None)
base_msg = [{"role": "system", "content": system}]
# Preserve the original system prompt to avoid in-place mutations
original_system_prompt = system
# Log assistant prompt if configured
assistant_prompt = getattr(config, 'ASSISTANT_PROMPT', '') or ''
if assistant_prompt and assistant_prompt.strip():
print(f"🤖 Assistant Prompt: {assistant_prompt}")
last_summary_block_text = None # Will hold the last rolling summary text for the NEXT chapter only
last_summary_chapter_num = None # Chapter number associated with last_summary_block_text
image_translator = None
if config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Image translation enabled for model: {config.MODEL}")
print("🖼️ Image translation will use your custom system prompt and glossary")
image_translator = ImageTranslator(
client,
out,
config.PROFILE_NAME,
system,
config.TEMP,
log_callback ,
progress_manager,
history_manager,
chunk_context_manager
)
# Models treated as vision-capable; any other model only triggers a warning and
# image translation is still attempted.
# NOTE: every entry needs its own trailing comma. Adjacent string literals are
# silently concatenated by Python, which previously fused 'gpt-4.1-mini' and
# 'gemini-3-pro-image-preview' into a single name that matched nothing.
known_vision_models = [
    'gemini-1.5-pro', 'gemini-1.5-flash', 'gemini-2.0-flash', 'gemini-2.5-flash', 'gemini-2.5-pro',
    'gpt-4-turbo', 'gpt-4o', 'gpt-4.1-mini', 'gpt-4.1-nano', 'o4-mini',
    'gemini-3-pro-image-preview',
]
if config.MODEL.lower() not in known_vision_models:
print(f"⚠️ Note: {config.MODEL} may not have vision capabilities. Image translation will be attempted anyway.")
else:
print("ℹ️ Image translation disabled by user")
total_chapters = len(chapters)
# Only detect numbering if the toggle is not disabled
if config.DISABLE_ZERO_DETECTION:
print(f"📊 0-based detection disabled by user setting")
uses_zero_based = False
# Important: Set a flag that can be checked throughout the codebase
config._force_disable_zero_detection = True
else:
if chapters:
uses_zero_based = detect_novel_numbering(chapters)
print(f"📊 Novel numbering detected: {'0-based' if uses_zero_based else '1-based'}")
else:
uses_zero_based = False
config._force_disable_zero_detection = False
# Store this for later use
config._uses_zero_based = uses_zero_based
rng = os.getenv("CHAPTER_RANGE", "")
start = None
end = None
if rng and re.match(r"^\d+\s*-\s*\d+$", rng):
start, end = map(int, rng.split("-", 1))
if config.DISABLE_ZERO_DETECTION:
print(f"📊 0-based detection disabled - using range as specified: {start}-{end}")
elif uses_zero_based:
print(f"📊 0-based novel detected")
print(f"📊 User range {start}-{end} will be used as-is (chapters are already adjusted)")
else:
print(f"📊 1-based novel detected")
print(f"📊 Using range as specified: {start}-{end}")
print("📊 Calculating total chunks needed...")
total_chunks_needed = 0
chunks_per_chapter = {}
chapters_to_process = 0
# Check if special files translation is disabled
translate_special = os.getenv('TRANSLATE_SPECIAL_FILES', '0') == '1'
# Helper: sequential numbering with zero-phase.
# Start at 0; only start incrementing once a digit >0 is seen in the filename.
def _assign_chapter_num(name_noext, seq_counter, zero_phase):
    """Pick the next sequential chapter number, honouring the leading zero phase.

    While ``zero_phase`` is truthy, files get number 0 until the first name
    containing a digit run greater than zero appears; from then on numbers
    come from ``seq_counter``, which is advanced by one per call.

    Args:
        name_noext: File name without extension (may be empty or None).
        seq_counter: Next sequential number to hand out.
        zero_phase: Whether no positive number has been seen yet.

    Returns:
        Tuple ``(num, seq_counter, zero_phase)`` with the assigned number and
        the updated counter and phase flag.
    """
    digit_runs = re.findall(r'\d+', name_noext) if name_noext else []
    saw_positive = any(int(run) > 0 for run in digit_runs)

    # Already counting: hand out the counter and advance it.
    if not zero_phase:
        return seq_counter, seq_counter + 1, zero_phase

    # Still in the zero phase and nothing positive in this name.
    if not saw_positive:
        return 0, seq_counter, zero_phase

    # First positive number seen: leave the zero phase, starting from at least 1.
    assigned = 1 if seq_counter == 0 else seq_counter
    return assigned, assigned + 1, False
# When setting actual chapter numbers (in the main function)
seq_counter = 0
zero_phase = True
for idx, c in enumerate(chapters):
chap_num = c["num"]
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Extract the raw chapter number from the file
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
#print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}")
# Spine position (reading order) fallback
spine_pos = c.get('spine_order')
if spine_pos is None:
spine_pos = c.get('opf_spine_position')
if spine_pos is None:
spine_pos = idx # ultimate fallback to list order
# Normalize chapter number using extracted number (spine/file aware)
normalized_num = raw_num if raw_num is not None else 0
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num = normalized_num + offset
# When toggle is disabled, use raw numbers without any 0-based adjustment
if config.DISABLE_ZERO_DETECTION:
c['actual_chapter_num'] = raw_num
# Store raw number for consistency
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply adjustment only if this is a 0-based novel
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
# Now we can safely use actual_num
actual_num = c['actual_chapter_num']
# Skip special files (chapter 0) if translation is disabled
# IMPORTANT: Do NOT treat files with digits (including 0) in their name as special.
if not translate_special and raw_num == 0:
name = c.get('original_basename') or os.path.basename(c.get('filename', ''))
name_noext = os.path.splitext(name)[0] if name else ''
has_digits_in_name = bool(re.search(r'\d', name_noext))
if not has_digits_in_name:
# Track skipped special files
if not hasattr(config, '_skipped_special_files'):
config._skipped_special_files = []
config._skipped_special_files.append(c.get('original_basename', f'Chapter {actual_num}'))
chunks_per_chapter[idx] = 0
continue
if start is not None:
if not (start <= c['actual_chapter_num'] <= end):
# Track skipped chapters for summary (don't print individually)
if not hasattr(config, '_range_skipped_chapters'):
config._range_skipped_chapters = []
config._range_skipped_chapters.append(c['actual_chapter_num'])
continue
# IMPORTANT: pass chapter_obj so ProgressManager can resolve composite keys
# (e.g. when multiple spine items share the same chapter number).
needs_translation, skip_reason, _ = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, chapter_obj=c
)
if not needs_translation:
chunks_per_chapter[idx] = 0
continue
chapters_to_process += 1
chapter_key = str(actual_num)
if chapter_key in progress_manager.prog["chapters"] and progress_manager.prog["chapters"][chapter_key].get("status") == "in_progress":
pass
# Calculate based on effective OUTPUT limit only
max_output_tokens = config.get_effective_output_limit()
safety_margin_output = 500
# Korean to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
# Debug output for first chapter
if os.getenv('DEBUG_CHUNK_SPLITTING', '0') == '1' and idx == 0:
print(f"\n[CHUNK CALC DEBUG] Configuration:")
print(f" MAX_OUTPUT_TOKENS: {max_output_tokens:,}")
print(f" safety_margin_output: {safety_margin_output:,}")
print(f" COMPRESSION_FACTOR: {compression_factor}")
print(f" Calculated available_tokens: {available_tokens:,}")
print(f" Formula: ({max_output_tokens:,} - {safety_margin_output:,}) / {compression_factor} = {available_tokens:,}\n")
#print(f"📊 Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
# For mixed-content chapters (images + text), the chunk count is estimated from the body as-is
# Get filename for content type detection (prefer source_file to detect PDF context)
chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '')
if c.get('has_images', False) and ContentProcessor.is_meaningful_text_content(c["body"]):
# Don't modify c["body"] at all during chunk calculation
# Just pass the body as-is, the chunking will be slightly off but that's OK
chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename)
else:
chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename)
chapter_key_str = content_hash
old_key_str = str(idx)
if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}):
progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str]
del progress_manager.prog["chapter_chunks"][old_key_str]
#print(f"[PROGRESS] Migrated chunks for chapter {actual_num} to new tracking system")
# Always count actual chunks - ignore "completed" tracking
chunks_per_chapter[idx] = len(chunks)
total_chunks_needed += chunks_per_chapter[idx]
# Print range skip summary if any chapters were skipped
if hasattr(config, '_range_skipped_chapters') and config._range_skipped_chapters:
skipped = config._range_skipped_chapters
print(f"📊 Skipped {len(skipped)} chapters outside range {start}-{end}")
if len(skipped) <= 10:
print(f" Skipped: {', '.join(map(str, sorted(skipped)))}")
else:
print(f" Range: {min(skipped)} to {max(skipped)}")
# Print special files skip summary
if hasattr(config, '_skipped_special_files') and config._skipped_special_files:
skipped = config._skipped_special_files
print(f"📊 Skipped {len(skipped)} special file(s) (TRANSLATE_SPECIAL_FILES is disabled)")
if len(skipped) <= 5:
for file in skipped:
print(f" • {file}")
# Check if no chapters will be processed and provide helpful error
if chapters_to_process == 0:
if start is not None and end is not None:
# Check if chapters in the range exist but are already completed
if chapters:
available_chapters = [c.get('actual_chapter_num', c['num']) for c in chapters]
chapters_in_range = [num for num in available_chapters if start <= num <= end]
if chapters_in_range:
# Chapters in range exist but are already completed
print(f"\n✅ All chapters in range {start}-{end} are already translated - nothing to do!")
else:
# No chapters exist in the specified range
min_chapter = min(available_chapters)
max_chapter = max(available_chapters)
print(f"\n❌ ERROR: Chapter range {start}-{end} doesn't match any chapters!")
print(f"📚 Available chapters in this EPUB: {min_chapter}-{max_chapter} ({len(chapters)} total)")
print(f"💡 Please adjust your chapter range in the settings to match the available chapters.")
if hasattr(config, '_range_skipped_chapters') and config._range_skipped_chapters:
print(f"\n📊 All {len(config._range_skipped_chapters)} chapters were outside the specified range.")
raise ValueError(f"Chapter range {start}-{end} doesn't match any available chapters ({min_chapter}-{max_chapter})")
else:
print(f"\n❌ ERROR: No chapters found in EPUB to translate!")
raise ValueError("No chapters found in EPUB")
elif not translate_special and total_chapters > 0:
print(f"\n⚠️ WARNING: All chapters are special files (chapter 0) and TRANSLATE_SPECIAL_FILES is disabled.")
print(f"💡 Enable 'Translate Special Files' in settings if you want to translate these files.")
elif total_chunks_needed == 0 and total_chapters > 0:
print(f"\n✅ All chapters already translated - nothing to do!")
else:
print(f"\n❌ ERROR: No chapters to process!")
terminology = "Sections" if is_text_file else "Chapters"
print(f"📊 Total chunks to translate: {total_chunks_needed}")
print(f"📚 {terminology} to process: {chapters_to_process}")
multi_chunk_chapters = [(idx, count) for idx, count in chunks_per_chapter.items() if count > 1]
if multi_chunk_chapters:
# Determine terminology based on file type
terminology = "Sections" if is_text_file else "Chapters"
print(f"📄 {terminology} requiring multiple chunks:")
for idx, chunk_count in multi_chunk_chapters:
chap = chapters[idx]
section_term = "Section" if is_text_file else "Chapter"
print(f" • {section_term} {idx+1} ({chap['title'][:30]}...): {chunk_count} chunks")
translation_start_time = time.time()
chunks_completed = 0
chapters_completed = 0
current_chunk_number = 0
if config.BATCH_TRANSLATION:
# Check if request merging is enabled (for PDF and EPUB files)
use_request_merging = config.REQUEST_MERGING_ENABLED and config.REQUEST_MERGE_COUNT > 1 and (is_pdf_file or not is_text_file)
if use_request_merging:
print(f"\n🔗 REQUEST MERGING + BATCH MODE ENABLED")
print(f"🔗 Merging {config.REQUEST_MERGE_COUNT} chapters per API request")
print(f"📦 Processing with up to {config.BATCH_SIZE} concurrent merged requests")
else:
print(f"\n📦 PARALLEL TRANSLATION MODE ENABLED")
print(f"📦 Processing chapters with up to {config.BATCH_SIZE} concurrent API calls")
import concurrent.futures
from threading import Lock
progress_lock = Lock()
chapters_to_translate = []
# FIX: First pass to set actual chapter numbers for ALL chapters
# This ensures batch mode has the same chapter numbering as non-batch mode
print("📊 Setting chapter numbers...")
seq_counter = 0
zero_phase = True
for idx, c in enumerate(chapters):
# PDF/TEXT CHUNK FIX: Skip extract_actual_chapter_number for chunks - preserve decimal from c['num']
if is_text_file and c.get('is_chunk', False):
# For text/PDF chunks, use the decimal number directly (1.0, 1.1, etc.)
c['actual_chapter_num'] = c['num']
c['raw_chapter_num'] = c['num']
c['zero_adjusted'] = False
continue
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
raw_num = raw_num if raw_num is not None else 0
# Apply offset if configured
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
if config.DISABLE_ZERO_DETECTION:
# Use raw numbers without adjustment
c['actual_chapter_num'] = raw_num
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply 0-based adjustment if detected
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
for idx, c in enumerate(chapters):
chap_num = c["num"]
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Check if this is a pre-split text chunk with decimal number
# IMPORTANT: Check is_chunk FIRST, then use c['num'] regardless of float value
# This handles cases like 1.0 where float equals integer but should still be preserved
if is_text_file and c.get('is_chunk', False):
actual_num = c['num'] # Preserve the decimal for text/PDF chunks
c['actual_chapter_num'] = actual_num # UPDATE THE CHAPTER DICT!
else:
actual_num = c.get('actual_chapter_num', c['num']) # Now this will exist!
# Skip special files (chapter 0) if translation is disabled
raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config))
if not translate_special and raw_num == 0:
name = c.get('original_basename') or os.path.basename(c.get('filename', ''))
name_noext = os.path.splitext(name)[0] if name else ''
has_digits_in_name = bool(re.search(r'\d', name_noext))
if not has_digits_in_name:
continue
# Skip chapters outside the range
if start is not None and not (start <= actual_num <= end):
continue
# Check if chapter needs translation
needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, c # Pass the chapter object
)
# Add explicit file check for supposedly completed chapters
if not needs_translation and existing_file:
file_path = os.path.join(out, existing_file)
if not os.path.exists(file_path):
print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}")
needs_translation = True
skip_reason = None
# Update status to file_missing
progress_manager.update(idx, actual_num, content_hash, None, status="file_missing", chapter_obj=c)
progress_manager.save()
# -------------------------------------------------------------------------
# BATCH PRE-PROCESSING
# -------------------------------------------------------------------------
if needs_translation and c.get("body"):
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
if (not use_title_tag or ignore_header_tags):
try:
from bs4 import BeautifulSoup
content_soup = BeautifulSoup(c["body"], 'html.parser')
modified = False
if not use_title_tag:
for title_tag in content_soup.find_all('title'):
title_tag.decompose()
modified = True
if ignore_header_tags:
for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
modified = True
if modified:
c["body"] = str(content_soup)
except Exception as e:
print(f"⚠️ Failed to filter batch content for chapter {actual_num}: {e}")
# -------------------------------------------------------------------------
if not needs_translation:
# Track skips for summary instead of printing each one
if not hasattr(config, '_batch_skipped_chapters'):
config._batch_skipped_chapters = []
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
config._batch_skipped_chapters.append((actual_num, terminology, skip_reason))
chapters_completed += 1
continue
# Check for empty or image-only chapters
has_images = c.get('has_images', False)
has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
text_size = c.get('file_size', 0)
is_image_link_only = ContentProcessor.is_only_image_links(c["body"])
is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only))
is_image_only_chapter = (has_images and not has_meaningful_text)
# Handle empty chapters
if is_empty_chapter:
print(f"📄 Empty chapter {actual_num} detected (preserving original content as-is)")
safe_title = make_safe_filename(c['title'], c['num'])
if isinstance(c['num'], float):
fname = FileUtilities.create_chapter_filename(c, c['num'])
else:
fname = FileUtilities.create_chapter_filename(c, c['num'])
# IMPORTANT: For completed_empty, preserve the ORIGINAL XHTML/HTML markup.
# In enhanced/html2text extraction modes, c['body'] may be empty because it contains only extracted text.
original_markup = (
c.get("original_html")
or c.get("source_html")
or c.get("raw_html")
or c.get("body")
or ""
)
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(original_markup)
progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty", chapter_obj=c)
progress_manager.save()
chapters_completed += 1
continue
# Add to chapters to translate
chapters_to_translate.append((idx, c))
# Print skip summary for batch mode
if hasattr(config, '_batch_skipped_chapters') and config._batch_skipped_chapters:
skipped = config._batch_skipped_chapters
print(f"\n📊 Skipped {len(skipped)} already completed chapters")
if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1' and len(skipped) <= 5:
for num, term, reason in skipped[:5]:
print(f" • {term} {num}: {reason.split('(')[0].strip()}")
print(f"📊 Found {len(chapters_to_translate)} chapters to translate in parallel")
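# Build the batch processor whose process_single_chapter / process_merged_group methods are submitted to the thread pool below; it is given the progress save/update callbacks and the stop check.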
batch_processor = BatchTranslationProcessor(
config, client, base_msg, out, progress_lock,
progress_manager.save,
lambda idx, actual_num, content_hash, output_file=None, status="completed", **kwargs: progress_manager.update(idx, actual_num, content_hash, output_file, status, **kwargs),
check_stop,
image_translator,
is_text_file=is_text_file,
history_manager=history_manager
)
# Batch-mode rolling summary: updated once per batch and injected into the NEXT batch.
rolling_summary_for_next_batch = "" # exact rolling_summary.txt contents
import threading
rolling_summary_update_lock = threading.Lock()
summary_translation_processor = None
if config.USE_ROLLING_SUMMARY:
# Dedicated processor for summarization between batches (no concurrency with translation threads).
summary_translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file)
total_to_process = len(chapters_to_translate)
processed = 0
# ==========================
# Batching mode selection
# ==========================
batching_mode = getattr(config, 'BATCHING_MODE', 'direct')
batch_group_size_cfg = max(1, int(getattr(config, 'BATCH_GROUP_SIZE', 3)))
if batching_mode not in ('direct', 'conservative', 'aggressive'):
batching_mode = 'direct'
# Backwards compatibility with CONSERVATIVE_BATCHING env
if os.getenv('CONSERVATIVE_BATCHING', '0') == '1':
batching_mode = 'conservative'
if batching_mode == 'conservative':
batch_group_size = config.BATCH_SIZE * batch_group_size_cfg
print(f"📦 Using conservative batching: group size {batch_group_size} (batch size {config.BATCH_SIZE}, multiplier {batch_group_size_cfg})")
elif batching_mode == 'direct':
batch_group_size = config.BATCH_SIZE # legacy behavior
print(f"📦 Using direct batching: group size {batch_group_size}, parallel {config.BATCH_SIZE}")
else: # aggressive
batch_group_size = batch_group_size_cfg # not used for throttling, only for logging/summary grouping
print(f"⚡ Using AGGRESSIVE batching: keeps {config.BATCH_SIZE} parallel calls, auto-refills when any finishes")
# Create merge groups if request merging is enabled
if use_request_merging:
# Build proximity runs first (so we never merge far-apart chapters),
# then pack each run under the token budget. This avoids patterns like
# 2+1, 2+1 when REQUEST_MERGE_COUNT=3 but only 2 chapters fit; instead
# we repack into 2+2, 2+2 when possible.
proximity_runs = RequestMerger.create_merge_groups(
chapters_to_translate,
max(1, len(chapters_to_translate)),
)
max_output_tokens = config.get_effective_output_limit()
safety_margin_output = 500
compression_factor = getattr(config, 'COMPRESSION_FACTOR', 1.0) or 1.0
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
available_tokens = max(available_tokens, 1000)
merge_groups = []
for run in proximity_runs:
if len(run) <= 1:
merge_groups.append(run)
continue
i = 0
while i < len(run):
group = [run[i]]
i += 1
# Try to grow the group up to REQUEST_MERGE_COUNT, but stop
# when adding the next chapter would exceed the token budget.
while i < len(run) and len(group) < config.REQUEST_MERGE_COUNT:
candidate = run[i]
merge_input = [
(ch.get('actual_chapter_num', ch['num']), ch["body"], ch)
for (idx, ch) in (group + [candidate])
]
merged_preview = RequestMerger.merge_chapters(merge_input, log_injections=False)
merged_tokens = chapter_splitter.count_tokens(merged_preview)
if merged_tokens <= available_tokens:
group.append(candidate)
i += 1
else:
break
merge_groups.append(group)
print(f"🔗 Created {len(merge_groups)} merge groups from {total_to_process} chapters (after size adjustment)")
units_to_process = merge_groups
is_merged_mode = True
else:
units_to_process = [[ch] for ch in chapters_to_translate] # Wrap each chapter as single-item group
is_merged_mode = False
with concurrent.futures.ThreadPoolExecutor(max_workers=config.BATCH_SIZE) as executor:
if batching_mode == 'aggressive':
import threading
batch_submit_lock = threading.Lock()
active_futures = {}
next_unit_idx = 0
def submit_next_unit():
# Submit the next not-yet-submitted unit from units_to_process to the executor.
# Returns False when next_unit_idx has reached the end of units_to_process
# (nothing left to submit); otherwise records the new future in
# active_futures, advances next_unit_idx and returns True.
# Every call site in the surrounding code invokes this while holding
# batch_submit_lock.
nonlocal next_unit_idx
if next_unit_idx >= len(units_to_process):
return False
unit = units_to_process[next_unit_idx]
if config.USE_ROLLING_SUMMARY:
# Hand the most recently read rolling summary text to the batch processor
# before the unit is submitted.
batch_processor.set_batch_rolling_summary_text(rolling_summary_for_next_batch)
# NOTE(review): 1-microsecond sleep; presumably a cooperative thread yield — confirm intent.
time.sleep(0.000001)
if is_merged_mode:
# Merged mode: the whole unit (a list of (idx, chapter) pairs) goes to one worker call.
fut = executor.submit(batch_processor.process_merged_group, unit, progress_manager)
else:
# Non-merged mode: each unit wraps exactly one (idx, chapter) pair, so only unit[0] is passed.
fut = executor.submit(batch_processor.process_single_chapter, unit[0])
# Map the future back to its unit so the completion loop can pop it by future.
active_futures[fut] = unit
next_unit_idx += 1
return True
# Prime the executor
with batch_submit_lock:
while len(active_futures) < config.BATCH_SIZE and submit_next_unit():
pass
graceful_stop_message_shown = False # Track if we've shown the message
while active_futures or next_unit_idx < len(units_to_process):
# Check for graceful stop before submitting new work
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
# Ensure we have work submitted (but not during graceful stop)
if not active_futures:
with batch_submit_lock:
while len(active_futures) < config.BATCH_SIZE and submit_next_unit():
pass
if not active_futures:
break # No more work to do
elif not graceful_stop_active:
# Auto-refill: submit new work to maintain BATCH_SIZE parallel calls
# But DON'T submit new work if graceful stop is active
with batch_submit_lock:
while len(active_futures) < config.BATCH_SIZE and submit_next_unit():
pass
# Use wait() with FIRST_COMPLETED to properly handle dynamic future sets
done, _ = concurrent.futures.wait(active_futures.keys(), return_when=concurrent.futures.FIRST_COMPLETED)
# Track if we should exit the outer loop after processing done futures
should_exit_outer_loop = False
for future in done:
if check_stop():
# Check if wait_for_chunks is enabled - if so, let current chapters finish
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and wait_for_chunks:
# Only print message once
if not graceful_stop_message_shown:
print("⏳ Graceful stop — waiting for current chapter(s) to finish...")
graceful_stop_message_shown = True
# Process only completed futures, skip cancelled ones
# Clear all remaining futures and exit both loops
active_futures.clear()
should_exit_outer_loop = True
break
else:
# print("❌ Translation stopped") # Redundant with "Translation stopped by user" from exception
executor.shutdown(wait=False, cancel_futures=True)
return
unit = active_futures.pop(future)
completed_in_batch = 0
failed_in_batch = 0
batch_history_map = {}
chapters_in_batch = sum(len(u) for u in [unit])
try:
if is_merged_mode:
results = future.result()
for result in results:
success, chap_num, hist_user, hist_assistant, raw_obj = result
if success:
completed_in_batch += 1
if hist_user and hist_assistant:
for idx, ch in unit:
if ch.get('actual_chapter_num', ch['num']) == chap_num:
batch_history_map[idx] = (hist_user, hist_assistant, raw_obj)
break
else:
failed_in_batch += 1
processed += 1
print(f"✅ Merged group done: {len(results)} chapters")
else:
success, chap_num, hist_user, hist_assistant, raw_obj = future.result()
idx, chapter = unit[0]
if success:
completed_in_batch += 1
if hist_user and hist_assistant:
batch_history_map[idx] = (hist_user, hist_assistant, raw_obj)
print(f"✅ Chapter {chap_num} done")
# Clear any stale watchdog entries for this chapter
try:
import unified_api_client
if hasattr(unified_api_client, '_api_watchdog_clear_chapter'):
unified_api_client._api_watchdog_clear_chapter(chap_num)
except Exception:
pass
else:
failed_in_batch += 1
# Error already printed by worker thread
processed += 1
except Exception as e:
if is_merged_mode:
failed_in_batch += len(unit)
processed += len(unit)
else:
failed_in_batch += 1
processed += 1
print(f"❌ Thread error: {e}")
progress_percent = (processed / total_to_process) * 100
print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)")
# History append immediately for this unit
if config.CONTEXTUAL and getattr(config, 'HIST_LIMIT', 0) > 0:
hist_limit = getattr(config, 'HIST_LIMIT', 0)
sorted_chapters = sorted(unit, key=lambda x: x[0])
for idx, chapter in sorted_chapters:
if idx in batch_history_map:
user_content, assistant_content, raw_obj = batch_history_map[idx]
try:
time.sleep(0.000001)
history_manager.append_to_history(
user_content,
assistant_content,
hist_limit,
reset_on_limit=True,
rolling_window=config.TRANSLATION_HISTORY_ROLLING,
raw_assistant_object=raw_obj
)
except Exception as e:
actual_num_for_log = chapter.get('actual_chapter_num', chapter.get('num'))
print(f"⚠️ Failed to append Chapter {actual_num_for_log} to translation history (batch): {e}")
# Rolling summary update per unit
if config.USE_ROLLING_SUMMARY and summary_translation_processor is not None:
try:
batch_items = sorted(unit, key=lambda x: x[0])
translated_blocks = []
last_actual_num_in_batch = None
for idx, chapter in batch_items:
actual_num = chapter.get('actual_chapter_num', chapter.get('num'))
last_actual_num_in_batch = actual_num
fname_guess = FileUtilities.create_chapter_filename(chapter, actual_num)
candidates = [fname_guess]
if isinstance(fname_guess, str) and fname_guess.endswith('.html'):
candidates.insert(0, fname_guess.replace('.html', '.txt'))
elif isinstance(fname_guess, str) and fname_guess.endswith('.txt'):
candidates.append(fname_guess.replace('.txt', '.html'))
content = ""
for cand in candidates:
fp = os.path.join(out, cand)
if os.path.exists(fp):
with open(fp, 'r', encoding='utf-8') as f:
content = f.read()
if content:
break
if isinstance(content, str) and content:
translated_blocks.append(content)
batch_translations_text = "\n\n---\n\n".join(translated_blocks)
if batch_translations_text:
old_mode = getattr(config, 'ROLLING_SUMMARY_MODE', 'replace')
old_max_entries = getattr(config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0)
try:
config.ROLLING_SUMMARY_MODE = 'replace'
config.ROLLING_SUMMARY_MAX_ENTRIES = int(chapters_in_batch or 0)
with rolling_summary_update_lock:
time.sleep(0.000001)
summary_translation_processor.generate_rolling_summary(
history_manager,
last_actual_num_in_batch,
base_system_content=None,
source_text=batch_translations_text,
previous_summary_text=None,
previous_summary_chapter_num=None,
prefer_translations_only_user=True,
)
summary_file = os.path.join(out, 'rolling_summary.txt')
if os.path.exists(summary_file):
with open(summary_file, 'r', encoding='utf-8') as sf:
rolling_summary_for_next_batch = (sf.read() or "")
else:
rolling_summary_for_next_batch = ""
finally:
config.ROLLING_SUMMARY_MODE = old_mode
config.ROLLING_SUMMARY_MAX_ENTRIES = old_max_entries
else:
rolling_summary_for_next_batch = ""
except Exception as e:
print(f"⚠️ Batch rolling summary update failed: {e}")
rolling_summary_for_next_batch = ""
# Refill slots aggressively (but not if stop requested)
if not check_stop():
with batch_submit_lock:
while len(active_futures) < config.BATCH_SIZE and submit_next_unit():
pass
# Exit outer loop if graceful stop was triggered
if should_exit_outer_loop:
break
# After all futures complete, if stop was requested with wait_for_chunks, exit
if check_stop():
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and wait_for_chunks:
print("\n✅ All current chapter(s) completed. Stopping as requested (wait for chunks).")
return
else:
# direct or conservative: keep legacy batch grouping behaviour
for batch_start in range(0, len(units_to_process), batch_group_size if not is_merged_mode else config.BATCH_SIZE):
if check_stop():
print("❌ Translation stopped during parallel processing")
executor.shutdown(wait=False)
return
effective_batch_size = batch_group_size if not is_merged_mode else config.BATCH_SIZE
batch_end = min(batch_start + effective_batch_size, len(units_to_process))
current_batch_units = units_to_process[batch_start:batch_end]
# Count total chapters in this batch
chapters_in_batch = sum(len(unit) for unit in current_batch_units)
batch_number = (batch_start // effective_batch_size) + 1
if is_merged_mode:
print(f"\n📦 Submitting batch {batch_number}: {len(current_batch_units)} merged groups ({chapters_in_batch} chapters)")
else:
print(f"\n📦 Submitting batch {batch_number}: {chapters_in_batch} chapters")
if config.USE_ROLLING_SUMMARY:
batch_processor.set_batch_rolling_summary_text(rolling_summary_for_next_batch)
time.sleep(0.000001)
if is_merged_mode:
future_to_unit = {
executor.submit(batch_processor.process_merged_group, unit, progress_manager): unit
for unit in current_batch_units
}
else:
future_to_unit = {
executor.submit(batch_processor.process_single_chapter, unit[0]): unit
for unit in current_batch_units
}
completed_in_batch = 0
failed_in_batch = 0
batch_history_map = {}
for future in concurrent.futures.as_completed(future_to_unit):
if check_stop():
# Check if wait_for_chunks is enabled - if so, let current chapters finish
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and wait_for_chunks:
print("⏳ Graceful stop — waiting for current chapter(s) to finish...")
# Don't shutdown - let this batch complete
else:
# print("❌ Translation stopped") # Redundant with "Translation stopped by user" from exception
executor.shutdown(wait=False)
return
unit = future_to_unit[future]
try:
if is_merged_mode:
results = future.result()
for result in results:
success, chap_num, hist_user, hist_assistant, raw_obj = result
if success:
completed_in_batch += 1
if hist_user and hist_assistant:
for idx, ch in unit:
if ch.get('actual_chapter_num', ch['num']) == chap_num:
batch_history_map[idx] = (hist_user, hist_assistant, raw_obj)
break
else:
failed_in_batch += 1
processed += 1
print(f"✅ Merged group done: {len(results)} chapters")
else:
success, chap_num, hist_user, hist_assistant, raw_obj = future.result()
idx, chapter = unit[0]
if success:
completed_in_batch += 1
print(f"✅ Chapter {chap_num} done ({completed_in_batch + failed_in_batch}/{chapters_in_batch} in batch)")
# Clear any stale watchdog entries for this chapter
try:
import unified_api_client
if hasattr(unified_api_client, '_api_watchdog_clear_chapter'):
unified_api_client._api_watchdog_clear_chapter(chap_num)
except Exception:
pass
if hist_user and hist_assistant:
batch_history_map[idx] = (hist_user, hist_assistant, raw_obj)
else:
failed_in_batch += 1
# Error already printed by worker thread
processed += 1
except Exception as e:
if is_merged_mode:
failed_in_batch += len(unit)
processed += len(unit)
else:
failed_in_batch += 1
processed += 1
print(f"❌ Thread error: {e}")
progress_percent = (processed / total_to_process) * 100
print(f"📊 Overall Progress: {processed}/{total_to_process} ({progress_percent:.1f}%)")
# After all futures in this batch complete, append their history entries
if config.CONTEXTUAL and getattr(config, 'HIST_LIMIT', 0) > 0:
hist_limit = getattr(config, 'HIST_LIMIT', 0)
all_chapters_in_batch = []
for unit in current_batch_units:
all_chapters_in_batch.extend(unit)
sorted_chapters = sorted(all_chapters_in_batch, key=lambda x: x[0])
for idx, chapter in sorted_chapters:
if idx in batch_history_map:
user_content, assistant_content, raw_obj = batch_history_map[idx]
try:
time.sleep(0.000001)
history_manager.append_to_history(
user_content,
assistant_content,
hist_limit,
reset_on_limit=True,
rolling_window=config.TRANSLATION_HISTORY_ROLLING,
raw_assistant_object=raw_obj
)
except Exception as e:
actual_num_for_log = chapter.get('actual_chapter_num', chapter.get('num'))
print(f"⚠️ Failed to append Chapter {actual_num_for_log} to translation history (batch): {e}")
# After the batch completes, update rolling_summary.txt ONCE (for the next batch).
if config.USE_ROLLING_SUMMARY and summary_translation_processor is not None:
try:
batch_items = []
for unit in current_batch_units:
batch_items.extend(unit)
batch_items = sorted(batch_items, key=lambda x: x[0])
translated_blocks = []
last_actual_num_in_batch = None
for idx, chapter in batch_items:
try:
actual_num = chapter.get('actual_chapter_num', chapter.get('num'))
last_actual_num_in_batch = actual_num
fname_guess = FileUtilities.create_chapter_filename(chapter, actual_num)
candidates = [fname_guess]
if isinstance(fname_guess, str) and fname_guess.endswith('.html'):
candidates.insert(0, fname_guess.replace('.html', '.txt'))
elif isinstance(fname_guess, str) and fname_guess.endswith('.txt'):
candidates.append(fname_guess.replace('.txt', '.html'))
content = ""
for cand in candidates:
fp = os.path.join(out, cand)
if os.path.exists(fp):
with open(fp, 'r', encoding='utf-8') as f:
content = f.read()
if content:
break
if isinstance(content, str) and content:
translated_blocks.append(content)
except Exception:
continue
batch_translations_text = "\n\n---\n\n".join(translated_blocks)
if batch_translations_text:
old_mode = getattr(config, 'ROLLING_SUMMARY_MODE', 'replace')
old_max_entries = getattr(config, 'ROLLING_SUMMARY_MAX_ENTRIES', 0)
try:
config.ROLLING_SUMMARY_MODE = 'replace'
try:
config.ROLLING_SUMMARY_MAX_ENTRIES = int(chapters_in_batch or 0)
except Exception:
config.ROLLING_SUMMARY_MAX_ENTRIES = 0
with rolling_summary_update_lock:
time.sleep(0.000001)
summary_translation_processor.generate_rolling_summary(
history_manager,
last_actual_num_in_batch,
base_system_content=None,
source_text=batch_translations_text,
previous_summary_text=None,
previous_summary_chapter_num=None,
prefer_translations_only_user=True,
)
summary_file = os.path.join(out, 'rolling_summary.txt')
if os.path.exists(summary_file):
with open(summary_file, 'r', encoding='utf-8') as sf:
rolling_summary_for_next_batch = (sf.read() or "")
else:
rolling_summary_for_next_batch = ""
finally:
config.ROLLING_SUMMARY_MODE = old_mode
config.ROLLING_SUMMARY_MAX_ENTRIES = old_max_entries
else:
rolling_summary_for_next_batch = ""
except Exception as e:
print(f"⚠️ Batch rolling summary update failed: {e}")
rolling_summary_for_next_batch = ""
print(f"\n📦 Batch Summary:")
print(f" ✅ Successful: {completed_in_batch}")
print(f" ❌ Failed: {failed_in_batch}")
# After batch completes, if stop was requested with wait_for_chunks, exit
if check_stop():
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if graceful_stop_active and wait_for_chunks:
print("\n✅ Current batch completed. Stopping as requested (wait for chunks).")
return
if batch_end < total_to_process:
print(f"⏳ Waiting {config.DELAY}s before next batch...")
time.sleep(config.DELAY)
chapters_completed = batch_processor.chapters_completed
chunks_completed = batch_processor.chunks_completed
print(f"\n🎉 Parallel translation complete!")
print(f" Total chapters processed: {processed}")
# Count qa_failed chapters correctly
qa_failed_count = 0
actual_successful = 0
for idx, c in enumerate(chapters):
# Get the chapter's actual number
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
# Check if this chapter was processed and has qa_failed status
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Check if this chapter exists in progress
chapter_info = progress_manager.prog["chapters"].get(content_hash, {})
status = chapter_info.get("status")
if status == "qa_failed":
qa_failed_count += 1
elif status == "completed":
actual_successful += 1
# Correct the displayed counts
print(f" Successful: {actual_successful}")
if qa_failed_count > 0:
print(f"\n⚠️ {qa_failed_count} chapters failed due to content policy violations:")
qa_failed_chapters = []
for idx, c in enumerate(chapters):
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
chapter_info = progress_manager.prog["chapters"].get(content_hash, {})
if chapter_info.get("status") == "qa_failed":
qa_failed_chapters.append(actual_num)
print(f" Failed chapters: {', '.join(map(str, sorted(qa_failed_chapters)))}")
# Stop translation completely after batch mode
print("\n📌 Batch translation completed.")
elif not config.BATCH_TRANSLATION:
translation_processor = TranslationProcessor(config, client, out, log_callback, check_stop, uses_zero_based, is_text_file)
# Only initialize AI Hunter when both the detection mode AND duplicate retry are enabled.
if config.DUPLICATE_DETECTION_MODE == 'ai-hunter' and getattr(config, 'RETRY_DUPLICATE_BODIES', False):
# Build the main config from environment variables and config object
main_config = {
'duplicate_lookback_chapters': config.DUPLICATE_LOOKBACK_CHAPTERS,
'duplicate_detection_mode': config.DUPLICATE_DETECTION_MODE,
}
# Check if AI Hunter config was passed via environment variable
ai_hunter_config_str = os.getenv('AI_HUNTER_CONFIG')
if ai_hunter_config_str:
try:
ai_hunter_config = json.loads(ai_hunter_config_str)
main_config['ai_hunter_config'] = ai_hunter_config
print("🤖 AI Hunter: Loaded configuration from environment")
except json.JSONDecodeError:
print("⚠️ AI Hunter: Failed to parse AI_HUNTER_CONFIG from environment")
# If no AI Hunter config in environment, try to load from file as fallback
if 'ai_hunter_config' not in main_config:
# Try multiple locations for config.json
config_paths = [
os.path.join(os.getcwd(), 'config.json'),
os.path.join(out, '..', 'config.json'),
]
if getattr(sys, 'frozen', False):
config_paths.append(os.path.join(os.path.dirname(sys.executable), 'config.json'))
else:
script_dir = os.path.dirname(os.path.abspath(__file__))
config_paths.extend([
os.path.join(script_dir, 'config.json'),
os.path.join(os.path.dirname(script_dir), 'config.json')
])
for config_path in config_paths:
if os.path.exists(config_path):
try:
with open(config_path, 'r', encoding='utf-8') as f:
file_config = json.load(f)
if 'ai_hunter_config' in file_config:
main_config['ai_hunter_config'] = file_config['ai_hunter_config']
print(f"🤖 AI Hunter: Loaded configuration from {config_path}")
break
except Exception as e:
print(f"⚠️ Failed to load config from {config_path}: {e}")
# Always create and inject the improved AI Hunter when ai-hunter mode is selected
ai_hunter = ImprovedAIHunterDetection(main_config)
# The TranslationProcessor class has a method that checks for duplicates
# We need to replace it with our enhanced AI Hunter
# Create a wrapper to match the expected signature
def enhanced_duplicate_check(self, result, idx, prog, out, actual_num=None):
    """Forward a duplicate check to the AI Hunter detector.

    When ``actual_num`` is not supplied, it is taken from the first entry in
    ``prog["chapters"]`` whose ``chapter_idx`` equals ``idx``. If no entry
    matches, or the matching entry yields ``None``, ``idx + 1`` is used.
    """
    resolved_num = actual_num
    if resolved_num is None:
        tracked_entries = prog.get("chapters", {}).values()
        # First progress entry recorded for this chapter index, if any.
        hit = next(
            (entry for entry in tracked_entries if entry.get("chapter_idx") == idx),
            None,
        )
        if hit is not None:
            resolved_num = hit.get("actual_num", idx + 1)
        # Last resort: derive a 1-based number from the index.
        if resolved_num is None:
            resolved_num = idx + 1
    return ai_hunter.detect_duplicate_ai_hunter_enhanced(result, idx, prog, out, resolved_num)
# Bind the enhanced method to the processor instance
translation_processor.check_duplicate_content = enhanced_duplicate_check.__get__(translation_processor, TranslationProcessor)
print("🤖 AI Hunter: Using enhanced detection with configurable thresholds")
# First pass: set actual chapter numbers respecting the config
for idx, c in enumerate(chapters):
raw_num = FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config)
#print(f"[DEBUG] Extracted raw_num={raw_num} from {c.get('original_basename', 'unknown')}")
# Apply offset if configured
offset = config.CHAPTER_NUMBER_OFFSET if hasattr(config, 'CHAPTER_NUMBER_OFFSET') else 0
raw_num += offset
if config.DISABLE_ZERO_DETECTION:
# Use raw numbers without adjustment
c['actual_chapter_num'] = raw_num
c['raw_chapter_num'] = raw_num
c['zero_adjusted'] = False
else:
# Store raw number
c['raw_chapter_num'] = raw_num
# Apply 0-based adjustment if detected
if uses_zero_based:
c['actual_chapter_num'] = raw_num + 1
c['zero_adjusted'] = True
else:
c['actual_chapter_num'] = raw_num
c['zero_adjusted'] = False
# Request merging preprocessing
merge_groups = {} # Maps parent_idx -> list of child (idx, chapter) tuples
merged_children = set() # Set of idx that are merged into another chapter
# Request merging for EPUB/PDF (non-text) in non-batch mode
if config.REQUEST_MERGING_ENABLED and config.REQUEST_MERGE_COUNT > 1 and (is_pdf_file or not is_text_file):
print(f"\n🔗 REQUEST MERGING ENABLED: Combining up to {config.REQUEST_MERGE_COUNT} chapters per request")
# Collect chapters that need translation
chapters_needing_translation = []
for idx, c in enumerate(chapters):
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Skip special files (chapter 0) if translation is disabled
raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config))
if not translate_special and raw_num == 0:
name = c.get('original_basename') or os.path.basename(c.get('filename', ''))
name_noext = os.path.splitext(name)[0] if name else ''
has_digits_in_name = bool(re.search(r'\d', name_noext))
if not has_digits_in_name:
continue
if start is not None and not (start <= actual_num <= end):
continue
needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, c
)
# Check file exists
if not needs_translation and existing_file:
file_path = os.path.join(out, existing_file)
if not os.path.exists(file_path):
needs_translation = True
# Skip empty/image-only chapters from merging
has_images = c.get('has_images', False)
has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
text_size = c.get('file_size', 0)
is_image_link_only = ContentProcessor.is_only_image_links(c["body"])
is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only))
is_image_only_chapter = (has_images and not has_meaningful_text)
if needs_translation and not is_empty_chapter and not is_image_only_chapter:
chapters_needing_translation.append((idx, c, actual_num, content_hash))
# Create merge groups
groups = RequestMerger.create_merge_groups(
chapters_needing_translation,
config.REQUEST_MERGE_COUNT
)
# Build proximity runs first (so we never merge far-apart chapters),
# then pack each run under the token budget (repacking avoids 2+1,2+1 patterns).
max_output_tokens = config.get_effective_output_limit()
safety_margin_output = 500
compression_factor = getattr(config, 'COMPRESSION_FACTOR', 1.0) or 1.0
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
available_tokens = max(available_tokens, 1000)
proximity_runs = RequestMerger.create_merge_groups(
chapters_needing_translation,
max(1, len(chapters_needing_translation)),
)
groups = []
for run in proximity_runs:
if len(run) <= 1:
groups.append(run)
continue
i = 0
while i < len(run):
group = [run[i]]
i += 1
while i < len(run) and len(group) < config.REQUEST_MERGE_COUNT:
candidate = run[i]
merge_input = [
(g_actual_num, g_chapter["body"], g_chapter)
for (g_idx, g_chapter, g_actual_num, g_content_hash) in (group + [candidate])
]
merged_preview = RequestMerger.merge_chapters(merge_input, log_injections=False)
merged_tokens = chapter_splitter.count_tokens(merged_preview)
if merged_tokens <= available_tokens:
group.append(candidate)
i += 1
else:
break
groups.append(group)
# Check graceful stop before logging merge groups
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
for group in groups:
if len(group) > 1:
parent_idx = group[0][0] # First chapter in group is the parent
parent_actual_num = group[0][2]
merge_groups[parent_idx] = group
# Track children to skip - but DON'T mark as merged yet
# (they'll be marked as merged only after parent completes)
for i, (idx, c, actual_num, content_hash) in enumerate(group):
if i > 0:
merged_children.add(idx)
# Only log merge planning if not in graceful stop
if not graceful_stop_active:
child_nums = [g[2] for g in group[1:]]
print(f" 📎 Chapters {parent_actual_num} + {child_nums} will be merged into one request")
if not graceful_stop_active:
print(f" 📊 Created {len(merge_groups)} merge groups from {len(chapters_needing_translation)} chapters")
# Second pass: process chapters
for idx, c in enumerate(chapters):
chap_num = c["num"]
# Graceful stop check: stop processing new chapters when graceful stop is active
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active or os.environ.get('GRACEFUL_STOP_COMPLETED') == '1':
print("✅ Graceful stop: Stopping new chapter processing...")
break
# Skip if this chapter was merged into another
if idx in merged_children:
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num']
else:
actual_num = c.get('actual_chapter_num', c['num'])
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f"\n⏭️ Skipping {terminology} {actual_num} (merged into parent)")
chapters_completed += 1
continue
# Check if this is a pre-split text chunk with decimal number
if (is_text_file and c.get('is_chunk', False) and isinstance(c['num'], float)):
actual_num = c['num'] # Preserve the decimal for text files only
else:
actual_num = c.get('actual_chapter_num', c['num'])
content_hash = c.get("content_hash") or ContentProcessor.get_content_hash(c["body"])
# Skip special files (chapter 0) if translation is disabled
raw_num = c.get('raw_chapter_num', FileUtilities.extract_actual_chapter_number(c, patterns=None, config=config))
if not translate_special and raw_num == 0:
name = c.get('original_basename') or os.path.basename(c.get('filename', ''))
name_noext = os.path.splitext(name)[0] if name else ''
has_digits_in_name = bool(re.search(r'\d', name_noext))
if not has_digits_in_name:
continue
if start is not None and not (start <= actual_num <= end):
# Skip silently (already summarized in earlier pass)
continue
needs_translation, skip_reason, existing_file = progress_manager.check_chapter_status(
idx, actual_num, content_hash, out, c # Pass the chapter object
)
# Add explicit file check for supposedly completed chapters
if not needs_translation and existing_file:
file_path = os.path.join(out, existing_file)
if not os.path.exists(file_path):
print(f"⚠️ Output file missing for chapter {actual_num}: {existing_file}")
needs_translation = True
skip_reason = None
# Update status to file_missing
progress_manager.update(idx, actual_num, content_hash, None, status="file_missing", chapter_obj=c)
progress_manager.save()
if not needs_translation:
# Track skips for summary (already printed in batch mode section above)
if not hasattr(config, '_sequential_skipped_chapters'):
config._sequential_skipped_chapters = []
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
config._sequential_skipped_chapters.append((actual_num, terminology, skip_reason))
continue
chapter_position = f"{chapters_completed + 1}/{chapters_to_process}"
# Determine if this is a text file
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Determine file reference based on type
if c.get('is_chunk', False):
file_ref = f"Section_{c['num']}"
else:
file_ref = c.get('original_basename', f'{terminology}_{actual_num}')
print(f"\n🔄 Processing #{idx+1}/{total_chapters} (Actual: {terminology} {actual_num}) ({chapter_position} to translate): {c['title']} [File: {file_ref}]")
chunk_context_manager.start_chapter(chap_num, c['title'])
# Initialize merge_info for this chapter (will be populated if this is a parent in a merge group)
merge_info = None
has_images = c.get('has_images', False)
has_meaningful_text = ContentProcessor.is_meaningful_text_content(c["body"])
text_size = c.get('file_size', 0)
is_image_link_only = ContentProcessor.is_only_image_links(c["body"])
is_empty_chapter = (not has_images and (text_size < 1 or is_image_link_only))
is_image_only_chapter = (has_images and not has_meaningful_text)
is_mixed_content = (has_images and has_meaningful_text)
is_text_only = (not has_images and has_meaningful_text)
if is_empty_chapter:
print(f"📄 Empty chapter {actual_num} detected (preserving original content as-is)")
# Create filename for empty chapter
if isinstance(c['num'], float):
fname = FileUtilities.create_chapter_filename(c, c['num'])
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
# Save ORIGINAL markup for empty chapters.
# In enhanced/html2text extraction modes, c['body'] can be blank (it may only contain extracted text).
original_markup = (
c.get("original_html")
or c.get("source_html")
or c.get("raw_html")
or c.get("body")
or ""
)
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(original_markup)
# Update progress tracking
progress_manager.update(idx, actual_num, content_hash, fname, status="completed_empty", chapter_obj=c)
progress_manager.save()
chapters_completed += 1
# CRITICAL: Skip translation!
continue
elif is_image_only_chapter:
print(f"📸 Image-only chapter: {c.get('image_count', 0)} images")
translated_html = c["body"]
image_translations = {}
# Step 1: Process images if image translation is enabled
if image_translator and config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Translating {c.get('image_count', 0)} images...")
image_translator.set_current_chapter(chap_num)
translated_html, image_translations = process_chapter_images(
c["body"],
actual_num,
image_translator,
check_stop
)
if image_translations:
print(f"✅ Translated {len(image_translations)} images")
# Step 2: Check for headers/titles that need translation
from bs4 import BeautifulSoup
soup = BeautifulSoup(c["body"], 'html.parser')
# Look for headers
headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
# If we have headers, we should translate them even in "image-only" chapters
if headers and any(h.get_text(strip=True) for h in headers):
print(f"📝 Found headers to translate in image-only chapter")
# Create a minimal HTML with just the headers for translation
headers_html = ""
for header in headers:
if header.get_text(strip=True):
headers_html += str(header) + "\n"
if headers_html:
print(f"📤 Translating chapter headers...")
# Send just the headers for translation
header_msgs = base_msg + [{"role": "user", "content": headers_html}]
# Use the standard filename
fname = FileUtilities.create_chapter_filename(c, actual_num)
client.set_output_filename(fname)
# Simple API call for headers
header_result, _ = client.send(
header_msgs,
temperature=config.TEMP,
max_tokens=config.MAX_OUTPUT_TOKENS
)
if header_result:
# Clean the result
header_result = re.sub(r"^```(?:html)?\s*\n?", "", header_result, count=1, flags=re.MULTILINE)
header_result = re.sub(r"\n?```\s*$", "", header_result, count=1, flags=re.MULTILINE)
# Parse both the translated headers and the original body
soup_headers = BeautifulSoup(header_result, 'html.parser')
soup_body = BeautifulSoup(translated_html, 'html.parser')
# Replace headers in the body with translated versions
translated_headers = soup_headers.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
original_headers = soup_body.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title'])
# Match and replace headers
for orig, trans in zip(original_headers, translated_headers):
if trans and trans.get_text(strip=True):
orig.string = trans.get_text(strip=True)
translated_html = str(soup_body)
print(f"✅ Headers translated successfully")
status = "completed"
else:
print(f"⚠️ Failed to translate headers")
status = "completed_image_only"
else:
status = "completed_image_only"
else:
print(f"ℹ️ No headers found to translate")
status = "completed_image_only"
# Step 3: Save with correct filename
fname = FileUtilities.create_chapter_filename(c, actual_num)
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(translated_html)
print(f"[Chapter {idx+1}/{total_chapters}] ✅ Saved image-only chapter")
progress_manager.update(idx, actual_num, content_hash, fname, status=status, chapter_obj=c)
progress_manager.save()
chapters_completed += 1
continue
else:
# Set default text to translate
text_to_translate = c["body"]
image_translations = {}
if is_mixed_content and image_translator and config.ENABLE_IMAGE_TRANSLATION:
print(f"🖼️ Processing {c.get('image_count', 0)} images first...")
print(f"[DEBUG] Content before image processing (first 200 chars):")
print(c["body"][:200])
print(f"[DEBUG] Has h1 tags: {'
' in c['body']}")
print(f"[DEBUG] Has h2 tags: {'
' in c['body']}")
image_translator.set_current_chapter(chap_num)
# Store the original body before processing
original_body = c["body"]
# Calculate original chapter tokens before modification
original_chapter_tokens = chapter_splitter.count_tokens(original_body)
# Process images and get body with translations
body_with_images, image_translations = process_chapter_images(
c["body"],
actual_num,
image_translator,
check_stop
)
if image_translations:
print(f"✅ Translated {len(image_translations)} images")
# Store the body with images for later merging
c["body_with_images"] = body_with_images
# For chapters with only images and title, we still need to translate the title
# Extract clean text for translation from ORIGINAL body
from bs4 import BeautifulSoup
soup_clean = BeautifulSoup(original_body, 'html.parser')
# Remove images from the original to get pure text
for img in soup_clean.find_all('img'):
img.decompose()
# Set clean text for translation: serialize the image-free soup with str() when it has a body element; otherwise fall back to the unmodified original body
c["body"] = str(soup_clean) if soup_clean.body else original_body
# If there's no meaningful text content after removing images,
# the text translation will just translate the title, which is correct
print(f" 📝 Clean text for translation: {len(c['body'])} chars")
# Update text_size to reflect actual text to translate
text_size = len(c["body"])
# Recalculate the actual token count for clean text
actual_text_tokens = chapter_splitter.count_tokens(c["body"])
print(f" 📊 Actual text tokens: {actual_text_tokens} (was counting {original_chapter_tokens} with images)")
# IMPORTANT: use the cleaned text for downstream chunking/translation
chapter_body = c["body"]
# If render mode is image and there's essentially no text, skip text translation
render_mode = os.getenv("PDF_RENDER_MODE", "xhtml").lower()
stripped_text_len = len(soup_clean.get_text(strip=True))
if render_mode == "image" and image_translations and stripped_text_len < 20:
print("🖼️ Image-rendered page with no meaningful text — skipping text translation.")
fname = FileUtilities.create_chapter_filename(c, actual_num)
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(body_with_images)
progress_manager.update(idx, actual_num, content_hash, fname, status="completed_image_only", chapter_obj=c)
progress_manager.save()
chapters_completed += 1
continue
else:
print(f"ℹ️ No translatable text found in images")
# Keep original body if no image translations
c["body"] = original_body
print(f"📖 Translating text content ({text_size} characters)")
# Determine output filename for tracking
fname = FileUtilities.create_chapter_filename(c, actual_num)
progress_manager.update(idx, actual_num, content_hash, fname, status="in_progress", chapter_obj=c)
progress_manager.save()
# REQUEST MERGING: If this is a parent chapter, merge content from child chapters
merge_info = None # Will store info for response splitting
if idx in merge_groups:
group = merge_groups[idx]
if len(group) > 1:
print(f"\n🔗 MERGING {len(group)} chapters into single request...")
# Mark all chapters in the group as in_progress
for g_idx, g_chapter, g_actual_num, g_content_hash in group:
if g_idx != idx: # Parent already marked above
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(g_idx, g_actual_num, g_content_hash, g_fname, status="in_progress", chapter_obj=g_chapter)
progress_manager.save()
# Build merged content with separators
chapters_data = []
for g_idx, g_chapter, g_actual_num, g_content_hash in group:
chapters_data.append((g_actual_num, g_chapter["body"], g_chapter))
if g_idx != idx: # Don't print for parent
print(f" → Including chapter {g_actual_num}")
# Merge the content
original_body = c["body"] # Save original for later
c["body"] = RequestMerger.merge_chapters(chapters_data)
# Store merge info for response splitting
merge_info = {
'group': group,
'expected_chapters': [g[2] for g in group], # actual_nums
'original_body': original_body
}
merged_char_count = len(c["body"])
print(f" 📊 Merged content: {merged_char_count:,} characters")
# Apply ignore filtering to the content before chunk splitting
# IMPORTANT: Skip header removal if request merging is active, because
# synthetic merge headers are critical for split-the-merge functionality
batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
use_title_tag = os.getenv('USE_TITLE', '0') == '1' and batch_translate_active
ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
# Don't remove headers if this is a merged request
if merge_info is not None:
ignore_header_tags = False
if (not use_title_tag or ignore_header_tags) and c["body"]:
from bs4 import BeautifulSoup
content_soup = BeautifulSoup(c["body"], 'html.parser')
# Remove title tags if ignored
if not use_title_tag:
for title_tag in content_soup.find_all('title'):
title_tag.decompose()
# Remove header tags if ignored
if ignore_header_tags:
for header_tag in content_soup.find_all(['h1', 'h2', 'h3']):
header_tag.decompose()
c["body"] = str(content_soup) # Update the chapter body
# Check if this chapter is already a chunk from text file splitting
if c.get('is_chunk', False):
# This is already a pre-split chunk, but still check if it needs further splitting
# Calculate based on effective OUTPUT limit only
max_output_tokens = config.get_effective_output_limit()
safety_margin_output = 500
# CJK to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
print(f"📊 Max Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
chapter_tokens = chapter_splitter.count_tokens(c["body"])
# Get filename for content type detection (prefer source_file for PDFs)
chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '')
if chapter_tokens > available_tokens:
# Even pre-split chunks might need further splitting
chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename)
print(f"📄 Section {c['num']} (pre-split from text file) needs further splitting into {len(chunks)} chunks")
else:
chunks = [(c["body"], 1, 1)]
print(f"📄 Section {c['num']} (pre-split from text file)")
else:
# Normal splitting logic for non-text files
# Calculate based on effective OUTPUT limit only
max_output_tokens = config.get_effective_output_limit()
safety_margin_output = 500
# CJK to English typically compresses to 0.7-0.9x
compression_factor = config.COMPRESSION_FACTOR
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
# Ensure minimum
available_tokens = max(available_tokens, 1000)
print(f"📊 Max Chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
# Get filename for content type detection (prefer source_file for PDFs)
chapter_filename = c.get('source_file') or c.get('filename') or c.get('original_basename', '')
chunks = chapter_splitter.split_chapter(c["body"], available_tokens, filename=chapter_filename)
# Use consistent terminology
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f"📄 {terminology} will be processed in {len(chunks)} chunk(s)")
# Recalculate tokens on the actual text to be translated
actual_chapter_tokens = chapter_splitter.count_tokens(c["body"])
if len(chunks) > 1:
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (limit: {available_tokens:,} tokens per chunk)")
else:
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
print(f" ℹ️ {terminology} size: {actual_chapter_tokens:,} tokens (within limit of {available_tokens:,} tokens)")
chapter_key_str = str(idx)
if chapter_key_str not in progress_manager.prog["chapter_chunks"]:
progress_manager.prog["chapter_chunks"][chapter_key_str] = {
"total": len(chunks),
"completed": [],
"chunks": {}
}
progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks)
translated_chunks = []
chunk_abort = False # Flag to abort chapter processing on QA failures
for chunk_idx_enumerate, (chunk_html, chunk_idx, total_chunks) in enumerate(chunks):
# Apply thread delay before processing chunk (including first, when multiple chunks)
if total_chunks > 1:
thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5"))
if thread_delay > 0:
print(f"🧵 Chapter {actual_num}: Delaying {thread_delay}s before processing chunk {chunk_idx}/{total_chunks}")
# Interruptible sleep - check stop flag every 0.1s
# But respect WAIT_FOR_CHUNKS setting during graceful stop
elapsed = 0
check_interval = 0.1
chunk_delay_interrupted = False
while elapsed < thread_delay:
# Read env vars INSIDE loop to catch stop pressed mid-delay
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
if check_stop():
if graceful_stop_active and wait_for_chunks:
pass # Continue processing all chunks
elif graceful_stop_active:
# Graceful stop without wait_for_chunks: break to save partial
print(f"⏳ Graceful stop — saving completed chunks...")
chunk_delay_interrupted = True
break
else:
# No graceful stop: return immediately
print(f"🛑 Chunk delay interrupted")
return
sleep_chunk = min(check_interval, thread_delay - elapsed)
time.sleep(sleep_chunk)
elapsed += sleep_chunk
if chunk_delay_interrupted:
break # Exit the chunk loop to save partial results
chapter_key_str = content_hash
old_key_str = str(idx)
if chapter_key_str not in progress_manager.prog.get("chapter_chunks", {}) and old_key_str in progress_manager.prog.get("chapter_chunks", {}):
progress_manager.prog["chapter_chunks"][chapter_key_str] = progress_manager.prog["chapter_chunks"][old_key_str]
del progress_manager.prog["chapter_chunks"][old_key_str]
#print(f"[PROGRESS] Migrated chunks for chapter {chap_num} to new tracking system")
if chapter_key_str not in progress_manager.prog["chapter_chunks"]:
progress_manager.prog["chapter_chunks"][chapter_key_str] = {
"total": len(chunks),
"completed": [],
"chunks": {}
}
progress_manager.prog["chapter_chunks"][chapter_key_str]["total"] = len(chunks)
# Get chapter status to check for qa_failed
chapter_info = progress_manager.prog["chapters"].get(chapter_key_str, {})
chapter_status = chapter_info.get("status")
if chapter_status == "qa_failed":
# Force retranslation of qa_failed chapters
print(f" [RETRY] Chunk {chunk_idx}/{total_chunks} - retranslating due to QA failure")
# Check stop - but if graceful stop + wait_for_chunks is enabled, skip this check
# to allow all chunks of the current chapter to complete
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
# Check if stop was requested (use stop_callback directly to avoid premature logging)
stop_requested = (stop_callback and stop_callback()) or is_stop_requested()
if stop_requested:
if graceful_stop_active and wait_for_chunks and total_chunks > 1:
# Don't stop yet - let chunks complete
print(f"⏳ Graceful stop — waiting for remaining chunks ({chunk_idx}/{total_chunks}) of chapter {actual_num}...")
elif graceful_stop_active and total_chunks > 1 and len(translated_chunks) > 0:
# Graceful stop without wait_for_chunks, but we have some chunks: save partial
print(f"⏳ Graceful stop — saving {len(translated_chunks)} completed chunk(s), skipping remaining...")
break # Exit chunk loop to save partial results
elif graceful_stop_active and total_chunks == 1:
# Single-chunk chapter with graceful stop: do not abort here - fall through so this one chunk is still processed and saved below
pass
else:
# No graceful stop - actually stop immediately
log_stop_once()
print(f"❌ Translation stopped during chapter {actual_num}, chunk {chunk_idx}")
# Mark any in_progress chapter(s) as failed so the UI reflects the stop
if merge_info is not None:
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="failed",
chapter_obj=g_chapter,
)
progress_manager.save()
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
progress_manager.update(
idx,
actual_num,
content_hash,
fname,
status="failed",
chapter_obj=c,
)
progress_manager.save()
return
current_chunk_number += 1
progress_percent = (current_chunk_number / total_chunks_needed) * 100 if total_chunks_needed > 0 else 0
if chunks_completed > 0:
elapsed_time = time.time() - translation_start_time
avg_time_per_chunk = elapsed_time / chunks_completed
remaining_chunks = total_chunks_needed - current_chunk_number + 1
eta_seconds = remaining_chunks * avg_time_per_chunk
eta_hours = int(eta_seconds // 3600)
eta_minutes = int((eta_seconds % 3600) // 60)
eta_str = f"{eta_hours}h {eta_minutes}m" if eta_hours > 0 else f"{eta_minutes}m"
else:
eta_str = "calculating..."
# For logging, strip data URIs so inline images don't explode char counts
display_len = len(re.sub(r'data:image/[^;]+;base64,[A-Za-z0-9+/=]+', 'data:image;base64,', chunk_html))
if total_chunks > 1:
print(f" 🔄 Translating chunk {chunk_idx}/{total_chunks} for #{idx+1} (Overall: {current_chunk_number}/{total_chunks_needed} - {progress_percent:.1f}% - ETA: {eta_str})")
print(f" ⏳ Chunk size: {display_len:,} characters (~{chapter_splitter.count_tokens(chunk_html):,} tokens)")
else:
# Determine terminology and file reference
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
# Consistent file reference
if c.get('is_chunk', False):
file_ref = f"Section_{c['num']}"
else:
file_ref = c.get('original_basename', f'{terminology}_{actual_num}')
chunk_tokens = chapter_splitter.count_tokens(chunk_html)
print(f" 📄 {terminology} {actual_num} [{display_len:,} chars, {chunk_tokens:,} tokens]")
print(f" ℹ️ This may take 30-60 seconds. Stop will take effect after completion.")
if log_callback:
if hasattr(log_callback, '__self__') and hasattr(log_callback.__self__, 'append_chunk_progress'):
if total_chunks == 1:
# Determine terminology based on source type
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
log_callback.__self__.append_chunk_progress(
1, 1, "text",
f"{terminology} {actual_num}",
overall_current=current_chunk_number,
overall_total=total_chunks_needed,
extra_info=f"{display_len:,} chars"
)
else:
log_callback.__self__.append_chunk_progress(
chunk_idx,
total_chunks,
"text",
f"{terminology} {actual_num}",
overall_current=current_chunk_number,
overall_total=total_chunks_needed
)
else:
# Determine terminology based on source type
is_text_source = is_text_file or c.get('filename', '').endswith('.txt') or c.get('is_chunk', False)
terminology = "Section" if is_text_source else "Chapter"
terminology_lower = "section" if is_text_source else "chapter"
if total_chunks == 1:
log_callback(f"📄 Processing {terminology} {actual_num} ({chapters_completed + 1}/{chapters_to_process}) - {progress_percent:.1f}% complete")
else:
log_callback(f"📄 processing chunk {chunk_idx}/{total_chunks} for {terminology_lower} {actual_num} - {progress_percent:.1f}% complete")
# Get custom chunk prompt template from environment; send as a separate assistant message
chunk_prompt_template = os.getenv("TRANSLATION_CHUNK_PROMPT", "[PART {chunk_idx}/{total_chunks}]")
chunk_prompt_msg = []
if total_chunks > 1:
chunk_prompt_msg = [{
"role": "assistant",
"content": chunk_prompt_template.format(
chunk_idx=chunk_idx,
total_chunks=total_chunks,
chunk_html="" # Provide empty string for backward compatibility
)
}]
user_prompt = chunk_html
if config.CONTEXTUAL:
history = history_manager.load_history()
trimmed = history[-config.HIST_LIMIT*2:]
chunk_context = chunk_context_manager.get_context_messages(limit=2)
include_source = os.getenv("INCLUDE_SOURCE_IN_HISTORY", "0") == "1"
model_name = getattr(config, 'MODEL', '').lower()
is_gemini_3 = ('gemini-3' in model_name) or ('gemini-exp-' in model_name)
memory_msgs = []
if is_gemini_3:
# Pass-through for Gemini 3 (raw objects preserved)
for h in trimmed:
if not isinstance(h, dict):
continue
role = h.get('role', 'user')
raw_obj = h.get('_raw_content_object')
content = h.get('content') or ""
if (not content) and raw_obj:
content = extract_text_from_raw_content(raw_obj)
if role == 'user' and not include_source:
continue
if (not content) and raw_obj is None:
continue
msg = {'role': role}
if content:
msg['content'] = content
if raw_obj is not None:
msg['_raw_content_object'] = raw_obj
memory_msgs.append(msg)
else:
# Prefix+content+footer for non-Gemini models
memory_blocks = []
for h in trimmed:
if not isinstance(h, dict):
continue
role = h.get('role', 'user')
content = h.get('content', '')
if not content:
continue
if role == 'user' and not include_source:
continue
if role == 'user':
prefix = (
"[MEMORY - PREVIOUS SOURCE TEXT]\\n"
"This is prior source content provided for context only.\\n"
"Do NOT translate or repeat this text directly in your response.\\n\\n"
)
else:
prefix = (
"[MEMORY - PREVIOUS TRANSLATION]\\n"
"This is prior translated content provided for context only.\\n"
"Do NOT repeat or re-output this translation.\\n\\n"
)
footer = "\\n\\n[END MEMORY BLOCK]\\n"
memory_blocks.append(prefix + content + footer)
if memory_blocks:
combined_memory = "\\n".join(memory_blocks)
memory_msgs = [{'role': 'assistant', 'content': combined_memory}]
else:
memory_msgs = []
else:
history = [] # Set empty history when not contextual
trimmed = []
chunk_context = []
memory_msgs = []
# Build the current system prompt from the original each time.
# Apply per-chunk glossary compression if enabled
# Use get_system_prompt() with actual merge count to conditionally include split marker instruction
actual_merge_count = len(merge_info['group']) if merge_info else 1
base_prompt = config.get_system_prompt(actual_merge_count=actual_merge_count)
if os.getenv("COMPRESS_GLOSSARY_PROMPT", "0") == "1" and glossary_path and os.path.exists(glossary_path):
# Rebuild system prompt with compressed glossary for THIS SPECIFIC CHUNK
current_system_content = build_system_prompt(base_prompt, glossary_path, source_text=chunk_html)
else:
# Use base prompt with glossary from original_system_prompt but without stale split marker
current_system_content = build_system_prompt(base_prompt, glossary_path, source_text=None)
current_base = [{"role": "system", "content": current_system_content}]
# Inject rolling_summary.txt verbatim as an assistant message.
# IMPORTANT: Do NOT parse, re-header, or otherwise modify rolling_summary.txt here.
summary_msgs_list = []
if config.USE_ROLLING_SUMMARY:
rolling_summary_text = ""
try:
summary_file = os.path.join(out, "rolling_summary.txt")
if os.path.exists(summary_file):
with open(summary_file, "r", encoding="utf-8") as sf:
rolling_summary_text = (sf.read() or "")
except Exception:
rolling_summary_text = ""
# Only inject if the file has content
if isinstance(rolling_summary_text, str) and rolling_summary_text:
summary_content = (
"CONTEXT ONLY - DO NOT INCLUDE IN TRANSLATION:\n"
"[MEMORY] Previous context summary:\n\n"
+ rolling_summary_text + "\n\n"
"[END MEMORY]\n"
"END OF CONTEXT - BEGIN ACTUAL CONTENT TO TRANSLATE:"
)
summary_msgs_list = [{"role": "assistant", "content": summary_content}]
# Build optional assistant prefill message if configured
assistant_prefill_msgs = []
if getattr(config, 'ASSISTANT_PROMPT', '') and config.ASSISTANT_PROMPT.strip():
assistant_prefill_msgs = [{"role": "assistant", "content": config.ASSISTANT_PROMPT.strip()}]
# Build final message list for this chunk
msgs = current_base + summary_msgs_list + chunk_context + memory_msgs + chunk_prompt_msg + assistant_prefill_msgs + [{"role": "user", "content": user_prompt}]
c['__index'] = idx
c['__progress'] = progress_manager.prog
c['history_manager'] = history_manager
# Prepare merge_group_len and merged_chapters if this is a merged request
merge_group_len = len(merge_info['group']) if merge_info else None
merged_chapters = merge_info['expected_chapters'] if merge_info else None
result, finish_reason, raw_obj = translation_processor.translate_with_retry(
msgs, chunk_html, c, chunk_idx, total_chunks, merge_group_len=merge_group_len, merged_chapters=merged_chapters
)
# If this chunk was blocked/prohibited, or came back with an "error" finish reason, stop remaining chunks and mark QA fail
if finish_reason in ("content_filter", "prohibited_content", "error"):
fname = FileUtilities.create_chapter_filename(c, actual_num)
save_prohibited_results = os.getenv('SAVE_PROHIBITED_RESULTS', '0') == '1' or bool(getattr(config, 'save_prohibited_results', False))
if save_prohibited_results:
# Do NOT preserve original; save AI output if any, otherwise empty
try:
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(result if isinstance(result, str) else "")
except Exception:
pass
progress_manager.update(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["PROHIBITED_CONTENT"],
chapter_obj=c
)
progress_manager.save()
print(f"❌ Chunk {chunk_idx}/{total_chunks} hit content filter/prohibited; aborting chapter {actual_num}")
chunk_abort = True
break
# Handle graceful-stop skipped chunks
if finish_reason == "graceful_stop":
fname = FileUtilities.create_chapter_filename(c, actual_num)
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(config, 'save_partial_results', False))
if save_partial_results:
# If we have a truncated partial response, save it and mark TRUNCATED
partial_content = None
try:
tls = translation_processor.client._get_thread_local_client()
partial_content = getattr(tls, '_last_truncated_content', None)
except Exception:
partial_content = getattr(translation_processor.client, '_last_truncated_content', None)
if isinstance(partial_content, str) and partial_content:
try:
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(partial_content)
except Exception:
pass
progress_manager.update(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["TRUNCATED"],
chapter_obj=c
)
progress_manager.save()
print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — saved truncated output")
else:
progress_manager.update(
idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["PARTIAL"],
chapter_obj=c
)
progress_manager.save()
print(f"⚠️ Chapter {actual_num} stopped (graceful stop) — marked QA failed (PARTIAL)")
else:
progress_manager.update(idx, actual_num, content_hash, fname, status="pending")
progress_manager.save()
print(f"⏸️ Chapter {actual_num} skipped (graceful stop)")
chunk_abort = True
break
# Check if result is None or contains failure markers
# Only check for failure markers if response is short (< 50 chars)
# Longer responses are likely legitimate translations even if they contain error keywords
is_failed = result is None or (len(str(result).strip()) < 50 and is_qa_failed_response(result))
if is_failed:
fname = FileUtilities.create_chapter_filename(c, actual_num)
# Check if it's a timeout failure
if result == "[TIMEOUT]" or finish_reason == "timeout":
progress_manager.update(idx, actual_num, content_hash, fname, status="qa_failed", qa_issues_found=["TIMEOUT"], chapter_obj=c)
print(f"❌ Chunk {chunk_idx}/{total_chunks} timed out; aborting chapter {actual_num}")
chunk_abort = True
else:
progress_manager.update(idx, actual_num, content_hash, fname, status="failed")
print(f"❌ Translation failed for chapter {actual_num} - marked as failed, aborting chapter")
chunk_abort = True
progress_manager.save()
break
# ENHANCED TRUNCATION CHECK: Compare input vs output character counts
# Skip this check if base64 images are present (they skew the character count)
has_base64_image = 'data:image' in chunk_html or 'base64,' in chunk_html
# Check if this result came from a fallback key
used_fallback = hasattr(translation_processor.client, '_used_fallback_key') and translation_processor.client._used_fallback_key
# Check if we're already in a nested truncation retry (prevents infinite loops)
already_in_retry = c.get('__in_truncation_retry', False)
char_ratio_retry_count = c.get('__char_ratio_retry_count', 0)
# Char-ratio truncation settings (silent truncation detector)
char_ratio_enabled = os.getenv("CHAR_RATIO_TRUNCATION_ENABLED", "1") == "1"
try:
char_ratio_threshold_pct = float(os.getenv("CHAR_RATIO_TRUNCATION_PERCENT", "50"))
except Exception:
char_ratio_threshold_pct = 50.0
try:
char_ratio_retry_limit = int(os.getenv("CHAR_RATIO_TRUNCATION_ATTEMPTS", "1"))
except Exception:
char_ratio_retry_limit = 1
try:
char_ratio_min_output_chars = int(os.getenv("CHAR_RATIO_MIN_OUTPUT_CHARS", "100"))
except Exception:
char_ratio_min_output_chars = 100
# Sanitize
if char_ratio_threshold_pct < 0:
char_ratio_threshold_pct = 0.0
if char_ratio_threshold_pct > 100:
char_ratio_threshold_pct = 100.0
char_ratio_threshold = char_ratio_threshold_pct / 100.0
if char_ratio_retry_limit < 1:
char_ratio_retry_limit = 1
if char_ratio_min_output_chars < 0:
char_ratio_min_output_chars = 0
# Char-ratio retry loop
while char_ratio_enabled and not has_base64_image and not already_in_retry:
# Check for stop signal before each retry
if os.environ.get('GRACEFUL_STOP') != '1' and check_stop():
print("❌ Char-ratio retry stopped by user")
break
input_char_count = len(chunk_html)
output_char_count = len(result)
char_ratio = output_char_count / input_char_count if input_char_count > 0 else 0
# If output is much shorter than input, likely silently truncated
if char_ratio < char_ratio_threshold and output_char_count > char_ratio_min_output_chars: # Only check if output has substance
if used_fallback:
# For fallback keys, just warn - don't retry (would go back to refusing model)
print(f" ⚠️ Truncated output from fallback key - accepting as-is")
break
else:
# Override finish_reason to trigger retry logic WITHIN translate_with_retry
# This will be caught by the internal retry loop if RETRY_TRUNCATED is enabled
if finish_reason != "length" and finish_reason != "max_tokens" and finish_reason not in ["content_filter", "prohibited_content"]:
retry_truncated_enabled = os.getenv("RETRY_TRUNCATED", "0") == "1"
if not retry_truncated_enabled:
break
# Check if we've hit the retry limit
if char_ratio_retry_count >= char_ratio_retry_limit:
# All retries exhausted - mark as QA_failed with TRUNCATED
print(f" ❌ All char-ratio retries ({char_ratio_retry_limit}) exhausted for Chapter {actual_num} Chunk {chunk_idx}/{total_chunks} - marking as QA_failed")
fname = FileUtilities.create_chapter_filename(c, actual_num)
save_partial_results = os.getenv('SAVE_PARTIAL_RESULTS', '0') == '1' or bool(getattr(config, 'save_partial_results', False))
if save_partial_results:
try:
with open(os.path.join(out, fname), 'w', encoding='utf-8') as f:
f.write(result if isinstance(result, str) else "")
except Exception:
pass
progress_manager.update(idx, actual_num, content_hash, fname,
status="qa_failed",
qa_issues_found=["TRUNCATED"],
chapter_obj=c)
progress_manager.save()
# Set flag to skip further processing of this chapter
chunk_abort = True
break
# Log truncation detection on first attempt
if char_ratio_retry_count == 0:
print(
f" ⚠️ TRUNCATION DETECTED (char comparison) Chapter {actual_num} Chunk {chunk_idx}/{total_chunks}: "
f"Input={input_char_count:,} chars, Output={output_char_count:,} chars ({char_ratio:.1%} ratio, threshold={char_ratio_threshold:.0%}) "
f"- {char_ratio_retry_limit} retry attempt(s) available"
)
char_ratio_retry_count += 1
c['__char_ratio_retry_count'] = char_ratio_retry_count
print(f" 🔄 Character ratio retry attempt {char_ratio_retry_count}/{char_ratio_retry_limit} [Chapter {actual_num} Chunk {chunk_idx}/{total_chunks}]")
# Set flag to prevent nested retries at BOTH levels
c['__in_truncation_retry'] = True
# CRITICAL: Set thread-local flag to prevent unified_api_client from doing its own truncation retries
if hasattr(translation_processor.client, '_get_thread_local_client'):
tls = translation_processor.client._get_thread_local_client()
tls._in_truncation_retry = True
original_max = config.MAX_OUTPUT_TOKENS
target_tokens = config.MAX_RETRY_TOKENS if config.MAX_RETRY_TOKENS > 0 else original_max
config.MAX_OUTPUT_TOKENS = max(original_max, target_tokens)
result_retry, finish_reason_retry, raw_obj_retry = translation_processor.translate_with_retry(
msgs, chunk_html, c, chunk_idx, total_chunks
)
# Clear retry flags and restore original token limit
c.pop('__in_truncation_retry', None)
if hasattr(translation_processor.client, '_get_thread_local_client'):
tls = translation_processor.client._get_thread_local_client()
tls._in_truncation_retry = False
config.MAX_OUTPUT_TOKENS = original_max
# Check if retry improved the output
retry_output_count = len(result_retry) if result_retry else 0
if result_retry and retry_output_count > output_char_count:
print(f" ✅ Char-ratio retry succeeded: {output_char_count:,} → {retry_output_count:,} chars")
result = result_retry
finish_reason = finish_reason_retry
raw_obj = raw_obj_retry
# Don't break - check if this new result is STILL truncated
# Loop will continue and check char_ratio again
else:
print(f" ⚠️ Char-ratio retry did not improve output ({output_char_count} chars)")
# Continue to next retry attempt
else:
# finish_reason is already 'length' - unified_api_client already retried
break
else:
# Not truncated - exit loop
break
# If truncation retries were exhausted, skip further processing
if chunk_abort:
break
if config.REMOVE_AI_ARTIFACTS:
result = ContentProcessor.clean_ai_artifacts(result, True)
if config.EMERGENCY_RESTORE:
result = ContentProcessor.emergency_restore_paragraphs(result, chunk_html)
if config.REMOVE_AI_ARTIFACTS:
lines = result.split('\n')
json_line_count = 0
for i, line in enumerate(lines[:5]):
if line.strip() and any(pattern in line for pattern in [
'"role":', '"content":', '"messages":',
'{"role"', '{"content"', '[{', '}]'
]):
json_line_count = i + 1
else:
break
if json_line_count > 0 and json_line_count < len(lines):
remaining = '\n'.join(lines[json_line_count:])
if remaining.strip() and len(remaining) > 100:
result = remaining
print(f"✂️ Removed {json_line_count} lines of JSON artifacts")
result = re.sub(r'\[PART \d+/\d+\]\s*', '', result, flags=re.IGNORECASE)
translated_chunks.append((result, chunk_idx, total_chunks))
chunk_context_manager.add_chunk(user_prompt, result, chunk_idx, total_chunks)
progress_manager.prog["chapter_chunks"][chapter_key_str]["completed"].append(chunk_idx)
progress_manager.prog["chapter_chunks"][chapter_key_str]["chunks"][str(chunk_idx)] = result
progress_manager.save()
chunks_completed += 1
will_reset = history_manager.will_reset_on_next_append(
config.HIST_LIMIT if config.CONTEXTUAL else 0,
config.TRANSLATION_HISTORY_ROLLING
)
# Check if we captured thought signatures
if raw_obj:
# print("🧠 Captured thought signature for history")
pass
# Add microsecond delay before history append to prevent race conditions
time.sleep(0.000001) # 1 microsecond delay
history = history_manager.append_to_history(
user_prompt,
result,
config.HIST_LIMIT if config.CONTEXTUAL else 0,
reset_on_limit=True,
rolling_window=config.TRANSLATION_HISTORY_ROLLING,
raw_assistant_object=raw_obj
)
if chunk_idx < total_chunks:
# Handle float delays while checking for stop
full_seconds = int(config.DELAY)
fractional_second = config.DELAY - full_seconds
# Check stop signal every second for full seconds
# During graceful stop, skip these checks to complete all chunks
for i in range(full_seconds):
if os.environ.get('GRACEFUL_STOP') != '1' and check_stop():
print("❌ Translation stopped during delay")
# Mark any in_progress chapter(s) as failed so the UI reflects the stop
if merge_info is not None:
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="failed",
chapter_obj=g_chapter,
)
progress_manager.save()
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
progress_manager.update(
idx,
actual_num,
content_hash,
fname,
status="failed",
chapter_obj=c,
)
progress_manager.save()
return
time.sleep(1)
# Handle the fractional part if any
if fractional_second > 0:
if os.environ.get('GRACEFUL_STOP') != '1' and check_stop():
print("❌ Translation stopped during delay")
# Mark any in_progress chapter(s) as failed so the UI reflects the stop
if merge_info is not None:
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="failed",
chapter_obj=g_chapter,
)
progress_manager.save()
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
progress_manager.update(
idx,
actual_num,
content_hash,
fname,
status="failed",
chapter_obj=c,
)
progress_manager.save()
return
time.sleep(fractional_second)
# During graceful stop, skip this check to save the completed API response
if os.environ.get('GRACEFUL_STOP') != '1' and check_stop():
print(f"❌ Translation stopped before saving chapter {actual_num}")
# Mark any in_progress chapter(s) as failed so the UI reflects the stop
if merge_info is not None:
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="failed",
chapter_obj=g_chapter,
)
progress_manager.save()
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
progress_manager.update(
idx,
actual_num,
content_hash,
fname,
status="failed",
chapter_obj=c,
)
progress_manager.save()
return
# Check for partial results (graceful stop during multi-chunk processing)
is_partial_result = False
expected_total = len(chunks) if 'chunks' in dir() else 1
if len(translated_chunks) < expected_total and len(translated_chunks) > 0:
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
if graceful_stop_active:
print(f"⚠️ Chapter {actual_num}: partial translation ({len(translated_chunks)}/{expected_total} chunks) due to graceful stop")
is_partial_result = True
if len(translated_chunks) > 1:
print(f" 📎 Merging {len(translated_chunks)} chunks...")
translated_chunks.sort(key=lambda x: x[1])
merged_result = chapter_splitter.merge_translated_chunks(translated_chunks)
else:
merged_result = translated_chunks[0][0] if translated_chunks else ""
if config.CONTEXTUAL and len(translated_chunks) > 1:
user_summary, assistant_summary = chunk_context_manager.get_summary_for_history()
if user_summary and assistant_summary:
# Add microsecond delay before summary append
time.sleep(0.000001) # 1 microsecond delay
history_manager.append_to_history(
user_summary,
assistant_summary,
config.HIST_LIMIT,
reset_on_limit=False,
rolling_window=config.TRANSLATION_HISTORY_ROLLING
)
print(f" 📝 Added chapter summary to history")
chunk_context_manager.clear()
# For text file chunks, ensure we pass the decimal number
if is_text_file and c.get('is_chunk', False) and isinstance(c.get('num'), float):
fname = FileUtilities.create_chapter_filename(c, c['num']) # Use the decimal num directly
print(f"[DEBUG] Text file chunk - using decimal num {c['num']} -> filename: {fname}")
else:
fname = FileUtilities.create_chapter_filename(c, actual_num)
if is_text_file:
print(f"[DEBUG] Text file - using actual_num {actual_num} -> filename: {fname}")
client.set_output_filename(fname)
cleaned = re.sub(r"^```(?:html)?\s*\n?", "", merged_result, count=1, flags=re.MULTILINE)
cleaned = re.sub(r"\n?```\s*$", "", cleaned, count=1, flags=re.MULTILINE)
cleaned = ContentProcessor.clean_ai_artifacts(cleaned, remove_artifacts=config.REMOVE_AI_ARTIFACTS)
# Post-process: Remove duplicate H1+P pairs from translated OUTPUT if enabled
remove_duplicate_h1_p = os.getenv('REMOVE_DUPLICATE_H1_P', '0') == '1'
if remove_duplicate_h1_p and cleaned:
# First: HTML-based duplicate removal
from bs4 import BeautifulSoup
output_soup = BeautifulSoup(cleaned, 'html.parser')
for h1_tag in output_soup.find_all('h1'):
h1_id = h1_tag.get('id', '')
if h1_id and h1_id.startswith('split-'):
continue
h1_text = h1_tag.get_text(strip=True)
if 'SPLIT MARKER' in h1_text:
continue
# Check next sibling (P after H1)
next_sibling = h1_tag.find_next_sibling()
if next_sibling and next_sibling.name == 'p':
if h1_text == next_sibling.get_text(strip=True):
next_sibling.decompose()
continue
# Check previous sibling (P before H1)
prev_sibling = h1_tag.find_previous_sibling()
if prev_sibling and prev_sibling.name == 'p':
if h1_text == prev_sibling.get_text(strip=True):
prev_sibling.decompose()
cleaned = str(output_soup)
# Second: Markdown-based duplicate removal (for enhanced extraction mode)
# Pattern: "Title Text\n\n# Title Text" - remove the plain text line before markdown header
def remove_markdown_duplicate_headers(text):
    """Drop plain-text lines that merely repeat the markdown header below them.

    A line is removed when it is non-blank, does not itself start with '#',
    and the next non-blank line is a markdown header (1-6 '#' characters
    followed by whitespace) whose text equals the stripped line. Blank lines
    between the two, and the header itself, are kept.

    Args:
        text: The text to scan, with lines separated by '\n'.

    Returns:
        The text with duplicate title lines removed, re-joined with '\n'.
    """
    header_pattern = re.compile(r'^(#{1,6})\s+(.+)$')
    rows = text.split('\n')
    total = len(rows)
    kept = []
    for pos, current in enumerate(rows):
        bare = current.strip()
        # Only non-blank, non-header lines can be duplicates of a header.
        if bare and not bare.startswith('#'):
            # Advance past any run of blank lines to the next line with content.
            ahead = pos + 1
            while ahead < total and not rows[ahead].strip():
                ahead += 1
            if ahead < total:
                found = header_pattern.match(rows[ahead])
                if found is not None and found.group(2).strip() == bare:
                    # Same text as the upcoming header: omit this line only.
                    continue
        kept.append(current)
    return '\n'.join(kept)
cleaned = remove_markdown_duplicate_headers(cleaned)
# If the cleaned translation is empty/whitespace, treat as failure and skip file write
if not cleaned or not str(cleaned).strip():
print(f"❌ Translation empty for chapter {actual_num} — skipping file write")
chapter_key = progress_manager._get_chapter_key(actual_num, FileUtilities.create_chapter_filename(c, actual_num), c, content_hash)
existing = progress_manager.prog.get("chapters", {}).get(chapter_key, {})
# If already qa_failed (e.g., prohibited content), keep that; otherwise mark qa_failed with EMPTY_OUTPUT
new_status = existing.get("status") if existing.get("status") == "qa_failed" else "qa_failed"
qa_issues = existing.get("qa_issues_found") or []
if "EMPTY_OUTPUT" not in qa_issues:
qa_issues = qa_issues + ["EMPTY_OUTPUT"]
progress_manager.update(
idx,
actual_num,
content_hash,
FileUtilities.create_chapter_filename(c, actual_num),
status=new_status,
qa_issues_found=qa_issues,
chapter_obj=c,
)
progress_manager.save()
# Move to next chapter without writing a file
continue
if is_mixed_content and image_translations:
print(f"🔀 Merging {len(image_translations)} image translations with text...")
from bs4 import BeautifulSoup
# Parse the translated text (which has the translated title/header)
soup_translated = BeautifulSoup(cleaned, 'html.parser')
# For each image translation, insert it into the document
for img_path, translation_html in image_translations.items():
if translation_html and '
]*id="split-\d+"[^>]*>.*?\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
with open(os.path.join(out, parent_fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
except Exception:
pass
# Mark ALL chapters in the merge group as qa_failed using
# their own expected filenames so we overwrite existing
# in_progress entries instead of creating composite keys.
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="qa_failed",
chapter_obj=g_chapter,
)
progress_manager.save()
print(f" ⚠️ Merged group marked as qa_failed")
continue
# Check if Split the Merge is enabled
split_the_merge = os.getenv('SPLIT_THE_MERGE', '0') == '1'
disable_fallback = os.getenv('DISABLE_MERGE_FALLBACK', '0') == '1'
split_sections = None
if split_the_merge and len(merge_info['group']) > 1:
# Try to split by invisible markers
split_sections = RequestMerger.split_by_markers(cleaned, len(merge_info['group']))
# If disable fallback is enabled and split failed, mark as qa_failed
if split_the_merge and disable_fallback and (not split_sections or len(split_sections) != len(merge_info['group'])):
print(f" ⚠️ Split failed and fallback disabled - marking merged group as qa_failed")
# Update watchdog: Record this as a "split_failed" event before the request technically finishes
# Using the first request ID from the group if possible, though this runs post-request.
# Since the original request is finished by now, we can't update its watchdog state directly.
# But we can log it clearly.
# Only save file for debugging if it contains meaningful content beyond error markers
cleaned_stripped = cleaned.strip()
is_only_error_marker = cleaned_stripped in [
"[TRANSLATION FAILED]",
"[Content Blocked]",
"[IMAGE TRANSLATION FAILED]",
"[EXTRACTION FAILED]",
"[RATE LIMITED]",
"[]"
] or cleaned_stripped.startswith("[TRANSLATION FAILED - ORIGINAL TEXT PRESERVED]") or cleaned_stripped.startswith("[CONTENT BLOCKED - ORIGINAL TEXT PRESERVED]")
if not is_only_error_marker:
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
try:
cleaned_to_save = cleaned
if split_the_merge:
cleaned_to_save = re.sub(
r'
]*id="split-\d+"[^>]*>.*?
\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
with open(os.path.join(out, parent_fname), 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
except Exception:
pass
# Mark ALL chapters in the merge group as qa_failed
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
g_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
progress_manager.update(
g_idx,
g_actual_num,
g_content_hash,
g_fname,
status="qa_failed",
chapter_obj=g_chapter,
qa_issues_found=["SPLIT_FAILED"],
)
progress_manager.save()
print(f" ⚠️ Merged group ({len(merge_info['group'])} chapters) marked as qa_failed with SPLIT_FAILED")
continue
if split_sections and len(split_sections) == len(merge_info['group']):
# Split successful - save each section as individual file
print(f" ✂️ Splitting merged content into {len(split_sections)} individual files")
saved_files = []
for i, (g_idx, g_chapter, g_actual_num, g_content_hash) in enumerate(merge_info['group']):
section_content = split_sections[i]
# Generate filename for this chapter using content.opf naming
split_fname = FileUtilities.create_chapter_filename(g_chapter, g_actual_num)
# Handle text file mode
if is_text_file:
split_fname = split_fname.replace('.html', '.txt')
from bs4 import BeautifulSoup
soup = BeautifulSoup(section_content, 'html.parser')
section_content = soup.get_text(strip=True)
# Save the section
split_output_path = os.path.join(out, split_fname)
with open(split_output_path, 'w', encoding='utf-8') as f:
f.write(section_content)
# Verify file was written successfully
if os.path.exists(split_output_path):
saved_files.append((g_idx, g_chapter, g_actual_num, g_content_hash, split_fname))
print(f" 💾 Saved Chapter {g_actual_num}: {split_fname} ({len(section_content)} chars)")
else:
print(f" ⚠️ ERROR: Failed to write file {split_fname} - file does not exist after write")
# Mark all chapters as completed or qa_failed (for truncated)
for g_idx, g_chapter, g_actual_num, g_content_hash, split_fname in saved_files:
chapter_status = "qa_failed" if was_truncated else "completed"
qa_issues = ["TRUNCATED"] if was_truncated else None
progress_manager.update(
g_idx, g_actual_num, g_content_hash, split_fname,
status=chapter_status, chapter_obj=g_chapter, qa_issues_found=qa_issues
)
chapters_completed += 1
# Save once after all updates
progress_manager.save()
print(f" ✅ Split the Merge complete: {len(saved_files)} files created")
continue
# Normal merged behavior (split not enabled or header count mismatch)
# Save entire merged response to parent chapter's file
cleaned_to_save = cleaned
if split_the_merge and len(merge_info['group']) > 1:
cleaned_to_save = re.sub(
r'
]*id="split-\d+"[^>]*>.*?
\s*',
'',
cleaned_to_save,
flags=re.IGNORECASE | re.DOTALL,
)
if is_text_file and not is_pdf_file:
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num).replace('.html', '.txt')
from bs4 import BeautifulSoup
soup = BeautifulSoup(cleaned_to_save, 'html.parser')
text_content = soup.get_text(strip=True)
parent_output_path = os.path.join(out, parent_fname)
with open(parent_output_path, 'w', encoding='utf-8') as f:
f.write(text_content)
else:
parent_fname = FileUtilities.create_chapter_filename(parent_chapter, parent_actual_num)
parent_output_path = os.path.join(out, parent_fname)
with open(parent_output_path, 'w', encoding='utf-8') as f:
f.write(cleaned_to_save)
# Verify file was actually written before marking as completed
if not os.path.exists(parent_output_path):
print(f" ⚠️ ERROR: Failed to write merged file {parent_fname} - file does not exist after write")
# Mark all chapters in the group as failed since parent file wasn't written
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group']:
progress_manager.update(g_idx, g_actual_num, g_content_hash, None, status="failed", chapter_obj=g_chapter)
progress_manager.save()
continue
print(f" 💾 Saved merged content to Chapter {parent_actual_num}: {parent_fname} ({len(cleaned_to_save)} chars)")
if was_truncated:
# For truncated merged responses, mark ALL chapters as qa_failed
qa_issues = ["TRUNCATED"]
progress_manager.update(
parent_idx, parent_actual_num, parent_content_hash, parent_fname,
status="qa_failed", chapter_obj=parent_chapter, qa_issues_found=qa_issues
)
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group'][1:]:
progress_manager.update(
g_idx, g_actual_num, g_content_hash, None,
status="qa_failed", chapter_obj=g_chapter, qa_issues_found=qa_issues
)
chapters_completed += len(merge_info['group'])
# Save once after all updates
progress_manager.save()
print(f" ⚠️ Merged group marked as qa_failed due to truncation")
else:
# Normal success path: parent completed, children marked as merged
progress_manager.update(
parent_idx, parent_actual_num, parent_content_hash, parent_fname,
status="completed", chapter_obj=parent_chapter,
merged_chapters=merged_child_nums
)
chapters_completed += 1
# Mark child chapters as merged (point to parent's output file) - atomically after parent
for g_idx, g_chapter, g_actual_num, g_content_hash in merge_info['group'][1:]:
progress_manager.mark_as_merged(g_idx, g_actual_num, g_content_hash, parent_actual_num, g_chapter, parent_output_file=parent_fname)
chapters_completed += 1
# Save once after all updates
progress_manager.save()
print(f" 📊 Saved merged content for {len(merge_info['group'])} chapters")
# Skip normal save since we handled it above and exit this translation run
continue
# CRITICAL: Unescape img tags that were converted to HTML entities (applies to ALL HTML)
# Pattern matches: <img ... /> where the tag ends with /
img_count = len(re.findall(r'<img\s[^>]*?/>', cleaned, flags=re.IGNORECASE))
if img_count > 0:
print(f"🖼️ Unescaping {img_count} img tag(s) from HTML entities (post-processing)")
cleaned = re.sub(
r'<(img\s[^>]*?/)>',
r'<\1>',
cleaned,
flags=re.IGNORECASE
)
if is_text_file and not is_pdf_file:
# For text files (but NOT PDFs), save as plain text instead of HTML
fname_txt = fname.replace('.html', '.txt') # Change extension to .txt
# Extract text from HTML
from bs4 import BeautifulSoup
soup = BeautifulSoup(cleaned, 'html.parser')
text_content = soup.get_text(strip=True)
# Write plain text file
output_path = os.path.join(out, fname_txt)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(text_content)
# Verify file was actually written before marking as completed
if not os.path.exists(output_path):
print(f"⚠️ ERROR: Failed to write file {fname_txt} - file does not exist after write")
# Keep status as in_progress or mark as failed
progress_manager.save() # Save current in_progress state
continue
print(f"💾 Saved text file: {fname_txt} (Chapter {actual_num})")
final_title = c['title'] or make_safe_filename(c['title'], actual_num)
# Don't print individual "Processed" messages - these are redundant with the main progress display
if os.getenv('DEBUG_CHAPTER_SAVES', '0') == '1':
print(f"[Processed {idx+1}/{total_chapters}] ✅ Saved Chapter {actual_num}: {final_title}")
# Determine status based on comprehensive failure detection
qa_issues = None
if is_qa_failed_response(cleaned):
chapter_status = "qa_failed"
failure_reason = get_failure_reason(cleaned)
print(f"⚠️ Chapter {actual_num} marked as qa_failed: {failure_reason}")
elif finish_reason in ["length", "max_tokens"]:
chapter_status = "qa_failed"
qa_issues = ["TRUNCATED"]
print(f"⚠️ Chapter {actual_num} marked as qa_failed: truncated (finish_reason: {finish_reason})")
elif is_partial_result:
chapter_status = "qa_failed"
qa_issues = ["PARTIAL"]
print(f"⚠️ Chapter {actual_num} marked as qa_failed: partial translation (graceful stop)")
else:
chapter_status = "completed"
progress_manager.update(idx, actual_num, content_hash, fname_txt, status=chapter_status, chapter_obj=c, qa_issues_found=qa_issues)
# Clear any stale watchdog entries for this chapter
try:
import unified_api_client
if hasattr(unified_api_client, '_api_watchdog_clear_chapter'):
unified_api_client._api_watchdog_clear_chapter(actual_num)
except Exception:
pass
else:
# For EPUB files, keep original HTML behavior
output_path = os.path.join(out, fname)
with open(output_path, 'w', encoding='utf-8') as f:
f.write(cleaned)
# Verify file was actually written before marking as completed
if not os.path.exists(output_path):
print(f"⚠️ ERROR: Failed to write file {fname} - file does not exist after write")
# Keep status as in_progress or mark as failed
progress_manager.save() # Save current in_progress state
continue
final_title = c['title'] or make_safe_filename(c['title'], actual_num)
# Don't print individual "Processed" messages - these are redundant with the main progress display
if os.getenv('DEBUG_CHAPTER_SAVES', '0') == '1':
print(f"[Processed {idx+1}/{total_chapters}] ✅ Saved Chapter {actual_num}: {final_title}")
# Determine status based on comprehensive failure detection
qa_issues = None
if is_qa_failed_response(cleaned):
chapter_status = "qa_failed"
failure_reason = get_failure_reason(cleaned)
print(f"⚠️ Chapter {actual_num} marked as qa_failed: {failure_reason}")
elif finish_reason in ["length", "max_tokens"]:
chapter_status = "qa_failed"
qa_issues = ["TRUNCATED"]
print(f"⚠️ Chapter {actual_num} marked as qa_failed: truncated (finish_reason: {finish_reason})")
elif is_partial_result:
chapter_status = "qa_failed"
qa_issues = ["PARTIAL"]
print(f"⚠️ Chapter {actual_num} marked as qa_failed: partial translation (graceful stop)")
else:
chapter_status = "completed"
progress_manager.update(idx, actual_num, content_hash, fname, status=chapter_status, chapter_obj=c, qa_issues_found=qa_issues)
# Clear any stale watchdog entries for this chapter
try:
import unified_api_client
if hasattr(unified_api_client, '_api_watchdog_clear_chapter'):
unified_api_client._api_watchdog_clear_chapter(actual_num)
except Exception:
pass
progress_manager.save()
# After completing this chapter, check if we should stop
graceful_stop_active = os.environ.get('GRACEFUL_STOP') == '1'
wait_for_chunks = os.environ.get('WAIT_FOR_CHUNKS') == '1'
stop_requested = (stop_callback and stop_callback()) or is_stop_requested()
# Stop after saving if: partial result OR graceful stop + wait_for_chunks completed
if is_partial_result:
print(f"\n✅ Partial chapter {actual_num} saved. Stopping as requested (graceful stop).")
log_stop_once()
return
if stop_requested and graceful_stop_active and wait_for_chunks:
print(f"\n✅ Chapter {actual_num} completed. Stopping as requested (wait for chunks).")
log_stop_once()
return
# After completing this chapter, produce a rolling summary and store it for the NEXT chapter
if config.USE_ROLLING_SUMMARY:
# Use the original system prompt to build the summary system prompt
base_system_content = original_system_prompt
summary_mode = str(getattr(config, 'ROLLING_SUMMARY_MODE', 'replace') or 'replace').strip().lower()
def _load_previous_rolling_summary_text(*, full_file: bool = False) -> str:
    """Return the stripped contents of rolling_summary.txt, or "" if unavailable.

    The file is looked up inside the `out` directory taken from the enclosing
    scope and is returned verbatim (no parsing) apart from stripping
    surrounding whitespace.

    Args:
        full_file: Accepted as a keyword argument but not read by this
            function.

    Returns:
        The file text, or an empty string when the file does not exist or
        any error occurs while reading it.
    """
    try:
        summary_path = os.path.join(out, "rolling_summary.txt")
        if os.path.exists(summary_path):
            with open(summary_path, "r", encoding="utf-8") as handle:
                return handle.read().strip()
        return ""
    except Exception:
        # Best-effort: a missing or unreadable summary yields no prior context.
        return ""
def _get_last_translated_outputs(n: int) -> str:
    """Build summary input text from the last N entries of completed_list.

    Each usable entry contributes one block made of a header line naming the
    chapter, the stripped text of its output file under `out`, and a footer
    line; blocks are joined by blank lines. `cleaned`, `progress_manager` and
    `out` come from the enclosing scope.

    Args:
        n: How many trailing completed_list entries to include. Falsy or
            non-positive values select the fallback.

    Returns:
        The joined blocks, or `cleaned` when n is not positive,
        completed_list is missing/empty/not a list, no entry yields text,
        or any unexpected error occurs.
    """
    try:
        wanted = int(n or 0)
        # NOTE(review): the original comment says completed_list is saved
        # (sorted) by ProgressManager.save() — not verifiable from this block.
        finished = progress_manager.prog.get("completed_list") or []
        if wanted <= 0 or not (isinstance(finished, list) and finished):
            return cleaned

        sections = []
        for entry in finished[-wanted:]:
            # Per-entry best-effort: a malformed entry or unreadable file is skipped.
            try:
                chapter_number = entry.get("num")
                relative_name = entry.get("file")
                if relative_name:
                    file_path = os.path.join(out, relative_name)
                    if os.path.exists(file_path):
                        with open(file_path, "r", encoding="utf-8") as handle:
                            body = handle.read().strip()
                        if body:
                            sections.append(
                                f"=== Previous Translated Text: Chapter {chapter_number} ===\n"
                                f"{body}\n"
                                f"=== End Previous Translated Text ==="
                            )
            except Exception:
                continue

        if sections:
            return "\n\n".join(sections)
        return cleaned
    except Exception:
        return cleaned
if summary_mode == 'replace':
# In replace mode, update the rolling summary using:
# - assistant: previous rolling summary (from rolling_summary.txt)
# - user: last N translated chapter outputs (configured by ROLLING_SUMMARY_EXCHANGES)
prev_summary = _load_previous_rolling_summary_text()
n = int(getattr(config, 'ROLLING_SUMMARY_EXCHANGES', 5) or 5)
user_text = _get_last_translated_outputs(n)
summary_text = translation_processor.generate_rolling_summary(
history_manager,
actual_num,
base_system_content,
source_text=user_text,
previous_summary_text=prev_summary,
previous_summary_chapter_num=None,
prefer_translations_only_user=True,
)
else:
# append (and any unknown value): summarize ONLY this chapter's translated output.
# Do NOT send the previous rolling summary in append mode.
summary_text = translation_processor.generate_rolling_summary(
history_manager,
actual_num,
base_system_content,
source_text=cleaned,
previous_summary_text=None,
previous_summary_chapter_num=None,
)
if summary_text:
last_summary_block_text = summary_text
last_summary_chapter_num = actual_num
chapters_completed += 1
# Check if PDF should output as PDF or EPUB
pdf_output_format = os.getenv('PDF_OUTPUT_FORMAT', 'pdf').lower()
should_create_pdf = is_text_file or (is_pdf_file and pdf_output_format == 'pdf')
if should_create_pdf:
print("📄 Text file translation complete!")
try:
# Collect all translated chapters with their metadata
translated_chapters = []
for chapter in chapters:
# Look for .txt files for text files, .html for PDFs
fname_base = FileUtilities.create_chapter_filename(chapter, chapter['num'])
if is_pdf_file:
fname_to_check = fname_base # PDFs use .html files
else:
fname_to_check = fname_base.replace('.html', '.txt') # Text files use .txt
if os.path.exists(os.path.join(out, fname_to_check)):
with open(os.path.join(out, fname_to_check), 'r', encoding='utf-8') as f:
content = f.read()
translated_chapters.append({
'num': chapter['num'],
'title': chapter['title'],
'content': content,
'is_chunk': chapter.get('is_chunk', False),
'chunk_info': chapter.get('chunk_info', {}),
'filename': fname_to_check # Store filename for debugging
})
elif os.path.exists(os.path.join(out, fname_base)):
# Fallback to HTML if txt doesn't exist
with open(os.path.join(out, fname_base), 'r', encoding='utf-8') as f:
content = f.read()
# For PDFs, keep HTML content; for text files, extract text
if is_pdf_file:
# Keep the HTML as-is for PDFs
text = content
else:
# Extract text from HTML for text files
from bs4 import BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')
text = soup.get_text(strip=True)
translated_chapters.append({
'num': chapter['num'],
'title': chapter['title'],
'content': text,
'is_chunk': chapter.get('is_chunk', False),
'chunk_info': chapter.get('chunk_info', {}),
'filename': fname_base # Store filename for debugging
})
# Sort chapters by number to ensure correct order
# Handle both integer and float chapter numbers (e.g., 1.0, 1.1, etc.)
translated_chapters.sort(key=lambda x: float(x['num']))
print(f"✅ Translation complete! {len(translated_chapters)} section files created:")
for chapter_data in translated_chapters:
print(f" • Section {chapter_data['num']}: {chapter_data['title']} (from {chapter_data.get('filename', 'unknown')})")
# Create a combined file with proper section structure
if input_path.lower().endswith('.pdf'):
# Check if content is HTML or plain text
is_html_content = any('' in chapter_data.get('content', '') or
'
PDF renderers.
try:
from bs4 import BeautifulSoup
frag = BeautifulSoup(content, 'html.parser')
for tag in frag.find_all(id='page0'):
tag['id'] = f'mupdf-page0-{i + 1}'
content = str(frag)
except Exception:
pass
# Always insert a page break before every combined page after the first.
if i > 0:
html_parts.append('\n')
if chapter_data.get('is_chunk'):
chunk_info = chapter_data.get('chunk_info', {})
original_chapter = chunk_info.get('original_chapter')
chunk_idx = chunk_info.get('chunk_idx', 1)
total_chunks = chunk_info.get('total_chunks', 1)
if original_chapter != current_main_chapter:
current_main_chapter = original_chapter
html_parts.append(content)
if chunk_idx < total_chunks:
html_parts.append('\n')
else:
current_main_chapter = chapter_data['num']
html_parts.append(content)
full_html_body = "".join(html_parts)
# Post-process: merge paragraphs that span across pages
full_html_body = _merge_split_paragraphs(full_html_body)
# Post-process: merge image-only page containers into the previous page
# (reduces wasted whitespace for "image-only" pages)
full_html_body = _merge_image_only_pages(full_html_body)
# Post-process: wrap last text block with a following image to reduce image-only pages
full_html_body = _keep_text_with_following_image(full_html_body)
# Replace/insert a clean Table of Contents built from h1/h2 headers
full_html_body = _generate_and_replace_toc(full_html_body)
# Wrap in full HTML document with CSS
css_path = os.path.join(out, 'styles.css')
css_link = '' if os.path.exists(css_path) else ''
# Extra inline CSS for PDF-derived HTML:
# - h3 is used as body text in our PDF extraction; normalize it to paragraph-like styling
# - reduce margins around images
# - keep-with-image wrapper helps reduce image-only PDF pages
extra_css = """
"""
full_html = f"""
{txt_processor.file_base} - Translated
{css_link}
{extra_css}
{full_html_body}
"""
# Save HTML file for reference
html_path = os.path.join(out, f"{txt_processor.file_base}_translated.html")
with open(html_path, 'w', encoding='utf-8') as f:
f.write(full_html)
print(f" • Created HTML file: {html_path}")
# Convert HTML to PDF
try:
from pdf_extractor import create_pdf_from_html
images_dir = os.path.join(out, 'images')
css_arg = css_path if os.path.exists(css_path) else None
images_arg = images_dir if os.path.exists(images_dir) else None
# Check if images directory exists and has images
has_images = False
if images_arg and os.path.exists(images_arg):
image_files = [f for f in os.listdir(images_arg) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp'))]
has_images = len(image_files) > 0
if has_images:
print(f" • Found {len(image_files)} images to include in PDF")
if create_pdf_from_html(full_html, combined_path, css_path=css_arg, images_dir=images_arg):
print(f" • Created translated PDF file: {combined_path}")
if has_images:
print(f" • PDF includes images from images folder")
else:
print("⚠️ Failed to create PDF from HTML, using HTML file")
combined_path = html_path
except Exception as e:
print(f"⚠️ Error creating PDF from HTML: {e}")
import traceback
traceback.print_exc()
print(f" • Using HTML file instead: {html_path}")
combined_path = html_path
else:
# Plain text content - use text-based PDF creation
combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.pdf")
print(f"📄 Creating PDF from plain text...")
# Build full text content
full_text_parts = []
current_main_chapter = None
# Note: translated_chapters is already sorted at this point
for i, chapter_data in enumerate(translated_chapters):
content = chapter_data['content']
if chapter_data.get('is_chunk'):
chunk_info = chapter_data.get('chunk_info', {})
original_chapter = chunk_info.get('original_chapter')
chunk_idx = chunk_info.get('chunk_idx', 1)
total_chunks = chunk_info.get('total_chunks', 1)
if original_chapter != current_main_chapter:
current_main_chapter = original_chapter
if i > 0:
full_text_parts.append(f"\n\n{'='*50}\n\n")
full_text_parts.append(content)
if chunk_idx < total_chunks:
full_text_parts.append("\n")
else:
current_main_chapter = chapter_data['num']
if i > 0:
full_text_parts.append(f"\n\n{'='*50}\n\n")
full_text_parts.append(content)
full_text = "".join(full_text_parts)
from pdf_extractor import create_pdf_from_text
if create_pdf_from_text(full_text, combined_path):
print(f" • Created translated PDF file: {combined_path}")
else:
print("⚠️ Failed to create PDF, falling back to text output")
combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.txt")
with open(combined_path, 'w', encoding='utf-8') as f:
f.write(full_text)
print(f" • Created fallback text file: {combined_path}")
else:
combined_path = os.path.join(out, f"{txt_processor.file_base}_translated.txt")
with open(combined_path, 'w', encoding='utf-8') as combined:
current_main_chapter = None
# Note: translated_chapters is already sorted at this point
for i, chapter_data in enumerate(translated_chapters):
content = chapter_data['content']
# Check if this is a chunk of a larger chapter
if chapter_data.get('is_chunk'):
chunk_info = chapter_data.get('chunk_info', {})
original_chapter = chunk_info.get('original_chapter')
chunk_idx = chunk_info.get('chunk_idx', 1)
total_chunks = chunk_info.get('total_chunks', 1)
# Only add the chapter header for the first chunk
if original_chapter != current_main_chapter:
current_main_chapter = original_chapter
# Add separator if not first chapter
if i > 0:
combined.write(f"\n\n{'='*50}\n\n")
# Add the chunk content
combined.write(content)
# Add spacing between chunks of the same chapter
if chunk_idx < total_chunks:
combined.write("\n")
else:
# This is a standalone chapter
current_main_chapter = chapter_data['num']
# Add separator if not first chapter
if i > 0:
combined.write(f"\n\n{'='*50}\n\n")
# Add the content
combined.write(content)
print(f" • Combined file with preserved sections: {combined_path}")
total_time = time.time() - translation_start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = int(total_time % 60)
print(f"\n⏱️ Total translation time: {hours}h {minutes}m {seconds}s")
print(f"📊 Chapters completed: {chapters_completed}")
print(f"✅ Text file translation complete!")
if log_callback:
log_callback(f"✅ Text file translation complete! Created {combined_path}")
# Exit here for text files and PDFs - don't fall through to EPUB generation
print("TRANSLATION_COMPLETE_SIGNAL")
return
except Exception as e:
print(f"❌ Error creating combined text file: {e}")
if log_callback:
log_callback(f"❌ Error creating combined text file: {e}")
print("TRANSLATION_COMPLETE_SIGNAL")
return
else:
# Skip EPUB building if graceful stop was triggered
graceful_stop_triggered = os.environ.get('GRACEFUL_STOP') == '1' or os.environ.get('GRACEFUL_STOP_COMPLETED') == '1'
if graceful_stop_triggered:
print("⏳ Graceful stop triggered - skipping EPUB building")
print("TRANSLATION_COMPLETE_SIGNAL")
return
print("🔍 Checking for translated chapters...")
# Respect retain extension toggle: if enabled, don't look for response_ prefix
if should_retain_source_extension():
response_files = [f for f in os.listdir(out) if f.endswith('.html') and not f.startswith('chapter_')]
else:
response_files = [f for f in os.listdir(out) if f.startswith('response_') and f.endswith('.html')]
chapter_files = [f for f in os.listdir(out) if f.startswith('chapter_') and f.endswith('.html')]
if not response_files and chapter_files:
if should_retain_source_extension():
print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
print("ℹ️ Retain-source-extension mode is ON: skipping placeholder creation and using original files for EPUB compilation.")
else:
print(f"⚠️ No translated files found, but {len(chapter_files)} original chapters exist")
print("📝 Creating placeholder response files for EPUB compilation...")
for chapter_file in chapter_files:
response_file = chapter_file.replace('chapter_', 'response_', 1)
src = os.path.join(out, chapter_file)
dst = os.path.join(out, response_file)
try:
with open(src, 'r', encoding='utf-8') as f:
content = f.read()
soup = BeautifulSoup(content, 'html.parser')
notice = soup.new_tag('p')
notice.string = "[Note: This chapter could not be translated - showing original content]"
notice['style'] = "color: red; font-style: italic;"
if soup.body:
soup.body.insert(0, notice)
with open(dst, 'w', encoding='utf-8') as f:
f.write(str(soup))
except Exception as e:
print(f"⚠️ Error processing {chapter_file}: {e}")
try:
shutil.copy2(src, dst)
except:
pass
print(f"✅ Created {len(chapter_files)} placeholder response files")
print("⚠️ Note: The EPUB will contain untranslated content")
print("📘 Building final EPUB…")
try:
from epub_converter import fallback_compile_epub
fallback_compile_epub(out, log_callback=log_callback)
print("✅ All done: your final EPUB is in", out)
total_time = time.time() - translation_start_time
hours = int(total_time // 3600)
minutes = int((total_time % 3600) // 60)
seconds = int(total_time % 60)
print(f"\n📊 Translation Statistics:")
print(f" • Total chunks processed: {chunks_completed}")
print(f" • Total time: {hours}h {minutes}m {seconds}s")
if chunks_completed > 0:
avg_time = total_time / chunks_completed
print(f" • Average time per chunk: {avg_time:.1f} seconds")
stats = progress_manager.get_stats(out)
print(f"\n📊 Progress Tracking Summary:")
print(f" • Total chapters tracked: {stats['total_tracked']}")
print(f" • Successfully completed: {stats['completed']}")
print(f" • Missing files: {stats['missing_files']}")
print(f" • In progress: {stats['in_progress']}")
except Exception as e:
print("❌ EPUB build failed:", e)
print("TRANSLATION_COMPLETE_SIGNAL")
if __name__ == "__main__":
    # Script entry point: this branch runs only when the file is executed
    # directly, never when it is imported as a module.
    # The helper module is imported here rather than at the top of the file,
    # so it is only loaded on direct execution.
    import shutdown_utils

    # Hand `main` to the project's CLI runner instead of calling it directly.
    # NOTE(review): presumably run_cli_main wraps main() with shutdown/cleanup
    # handling - confirm against shutdown_utils.
    shutdown_utils.run_cli_main(main)