# extract_glossary_from_txt.py
import math
import os
from typing import List

from txt_processor import TextFileProcessor
from chapter_splitter import ChapterSplitter
from bs4 import BeautifulSoup

# Fallback output-token limit when the environment value is missing or invalid.
_DEFAULT_MAX_OUTPUT_TOKENS = 65536
# Tokens held back from the output limit before computing the chunk budget.
_OUTPUT_SAFETY_MARGIN = 500
# Lower bound for the per-chunk token budget.
_MIN_CHUNK_BUDGET = 1000
# Fallback compression factor when the environment value is missing or invalid.
_DEFAULT_COMPRESSION_FACTOR = 1.0


def _read_compression_factor() -> float:
    """Read the compression factor from the environment.

    ``GLOSSARY_COMPRESSION_FACTOR`` takes precedence over
    ``COMPRESSION_FACTOR``; when neither is set the factor is 1.0. A value
    that cannot be parsed as a number (or parses to NaN) falls back to 1.0
    with a warning instead of raising, mirroring how the output-token limit
    is handled.
    """
    raw_factor = os.getenv(
        "GLOSSARY_COMPRESSION_FACTOR",
        os.getenv("COMPRESSION_FACTOR", "1.0"),
    )
    try:
        factor = float(raw_factor)
    except ValueError:
        factor = math.nan
    if math.isnan(factor):
        # NaN would propagate through max() and make int() raise further down.
        print(
            f"⚠️ Invalid compression factor {raw_factor!r}; "
            f"using {_DEFAULT_COMPRESSION_FACTOR}"
        )
        return _DEFAULT_COMPRESSION_FACTOR
    return factor


def _read_max_output_tokens() -> int:
    """Read the output-token limit from the environment.

    ``GLOSSARY_MAX_OUTPUT_TOKENS`` takes precedence over
    ``MAX_OUTPUT_TOKENS``. Unparseable or non-positive values fall back to
    the default limit.
    """
    raw_limit = os.getenv(
        "GLOSSARY_MAX_OUTPUT_TOKENS",
        os.getenv("MAX_OUTPUT_TOKENS", str(_DEFAULT_MAX_OUTPUT_TOKENS)),
    )
    try:
        limit = int(str(raw_limit).strip())
    except ValueError:
        return _DEFAULT_MAX_OUTPUT_TOKENS
    return limit if limit > 0 else _DEFAULT_MAX_OUTPUT_TOKENS


def _html_to_text(html: str) -> str:
    """Return the text content of *html* with markup removed."""
    # NOTE(review): strip=True with the default empty separator concatenates
    # adjacent text nodes directly, so words on either side of an element
    # boundary can be joined. Kept as-is to preserve existing output.
    return BeautifulSoup(html, 'html.parser').get_text(strip=True)


def extract_chapters_from_txt(txt_path: str) -> List[str]:
    """Extract chapters from text file for glossary extraction.

    Chapters are obtained from ``TextFileProcessor``. When chapter splitting
    is enabled (``GLOSSARY_ENABLE_CHAPTER_SPLIT`` is ``"1"``, the default) and
    a chapter's token count exceeds the chunk budget, the chapter is split
    with ``ChapterSplitter`` and each chunk is added separately.

    The chunk budget is
    ``(max output tokens - safety margin) / compression factor``, with the
    factor clamped to at least 0.01 and the budget to at least 1000 tokens.

    Args:
        txt_path: Path to the source text file.

    Returns:
        Plain-text chunks, in chapter order. Chapters or chunks whose
        extracted text is empty are omitted.
    """
    # Second argument is the directory containing the input file
    # (presumably used as the processor's output directory — TODO confirm).
    processor = TextFileProcessor(txt_path, os.path.dirname(txt_path))
    chapters = processor.extract_chapters()

    # Initialize chapter splitter
    model_name = os.getenv("MODEL", "gpt-3.5-turbo")
    chapter_splitter = ChapterSplitter(model_name=model_name)

    # Translation-aligned safe chunk budget
    compression_factor = _read_compression_factor()
    effective_output = _read_max_output_tokens()
    available_tokens = int(
        (effective_output - _OUTPUT_SAFETY_MARGIN) / max(compression_factor, 0.01)
    )
    available_tokens = max(available_tokens, _MIN_CHUNK_BUDGET)

    chapter_split_enabled = os.getenv("GLOSSARY_ENABLE_CHAPTER_SPLIT", "1") == "1"
    print(f"📊 Chapter chunk budget: {available_tokens:,} tokens (output limit {effective_output:,}, compression {compression_factor})")

    text_chapters = []
    for idx, chapter in enumerate(chapters):
        # Check if chapter needs splitting
        chapter_tokens = chapter_splitter.count_tokens(chapter['body'])

        if chapter_split_enabled and chapter_tokens > available_tokens:
            print(f"Chapter {idx+1} has {chapter_tokens} tokens, splitting into smaller chunks (budget {available_tokens})...")

            # Use ChapterSplitter to split the HTML content
            # Pass filename for content type detection
            chunks = chapter_splitter.split_chapter(
                chapter['body'], available_tokens, filename=txt_path
            )

            # Extract text from each chunk
            for chunk_html, chunk_idx, total_chunks in chunks:
                text = _html_to_text(chunk_html)
                if text:
                    text_chapters.append(text)
                    print(f" Added chunk {chunk_idx}/{total_chunks} ({chapter_splitter.count_tokens(text)} tokens)")
        else:
            # Chapter is small enough or splitting disabled, extract text as-is
            text = _html_to_text(chapter['body'])
            if text:
                text_chapters.append(text)

    print(f"Total text chunks for glossary extraction: {len(text_chapters)}")
    return text_chapters