Spaces:
Sleeping
Sleeping
"""
backend/services/translator.py
==============================
Translates complex JSON structures (topics, highlights, qa_pairs, transcript)
using OpenAI.
"""
import json
import os
import re
from typing import Any, Dict, List, Optional

from backend.utils.config import settings
from backend.utils.logger import get_logger
# Module-level logger obtained from the project's get_logger helper,
# keyed by this module's name.
logger = get_logger(__name__)

# Prompt template sent to the chat model by Translator.translate_json_array.
# It is filled with str.format, so it has exactly two placeholders:
#   {language} - the target language for the translation
#   {json_str} - the JSON-serialized array whose string values are translated
# The model is asked to keep structure, keys and array length unchanged and to
# reply with bare JSON (no markdown fences).
TRANSLATE_JSON_PROMPT = """You are an expert translator.
Translate the string values in the provided JSON array into {language}.
Keep the exact same JSON structure, keys, and array length. Only translate the text values.
CRITICAL INSTRUCTION: Return ONLY valid JSON, do NOT wrap in markdown fences.
Input JSON:
{json_str}
"""
class Translator:
    """Translate JSON-shaped content with an OpenAI chat model.

    All public methods are best-effort: when no API key is configured, or the
    model call / response parsing fails, the original (untranslated) data is
    returned instead of raising.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Store an optional explicit API key; the client is created lazily.

        Args:
            api_key: OpenAI API key. When falsy, the ``OPENAI_API_KEY``
                environment variable and then ``settings.OPENAI_API_KEY``
                are consulted at call time.
        """
        self.api_key = api_key
        self._openai_client = None  # built on first use by _get_client()

    def _resolve_api_key(self) -> Optional[str]:
        """Return the first non-empty key among instance, environment, settings."""
        return (
            self.api_key
            or os.environ.get("OPENAI_API_KEY")
            or settings.OPENAI_API_KEY
        )

    def _get_client(self):
        """Return the cached OpenAI client, creating it on first call."""
        if self._openai_client is None:
            # Imported lazily so the module can be loaded without the
            # openai package when translation is never requested.
            from openai import OpenAI

            kwargs = {"api_key": self._resolve_api_key()}
            if settings.OPENAI_BASE_URL:
                kwargs["base_url"] = settings.OPENAI_BASE_URL
            self._openai_client = OpenAI(**kwargs)
        return self._openai_client

    @staticmethod
    def _parse_json_response(raw: str) -> Any:
        """Parse the model's reply as JSON, tolerating markdown fences.

        Only a *wrapping* fence (leading ```` ``` ````/```` ```json ```` and
        trailing ```` ``` ````) is removed, so backticks that are part of a
        translated text value are preserved. If the cleaned text is still not
        valid JSON, the span from the first ``[`` to the last ``]`` is tried,
        which covers replies that surround the array with prose.

        Raises:
            json.JSONDecodeError: if no valid JSON can be extracted.
        """
        text = raw.strip()
        text = re.sub(r"^```[A-Za-z0-9_-]*\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("["), text.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start:end + 1])

    @staticmethod
    def _merge_batch(
        batch: List[Dict[str, Any]], translated: Any
    ) -> List[Dict[str, Any]]:
        """Return copies of ``batch`` with translated ``"text"`` values applied.

        Translations are matched to segments by the ``"id"`` each reply item
        carries (falling back to the item's position when the id is missing or
        unusable), so a reordered or shortened reply cannot shift text onto
        the wrong segment. Reply items that are not dicts, or whose ``"text"``
        is not a string, are ignored and the segment keeps its original text.
        """
        texts_by_id: Dict[int, str] = {}
        if isinstance(translated, list):
            for position, item in enumerate(translated):
                if not isinstance(item, dict):
                    continue
                text = item.get("text")
                if not isinstance(text, str):
                    continue
                item_id = item.get("id", position)
                if isinstance(item_id, str) and item_id.strip().isdecimal():
                    item_id = int(item_id)
                if isinstance(item_id, bool) or not isinstance(item_id, int):
                    item_id = position
                # First occurrence wins if the model repeats an id.
                texts_by_id.setdefault(item_id, text)

        merged = []
        for index, seg in enumerate(batch):
            new_seg = seg.copy()
            if "text" in seg and index in texts_by_id:
                new_seg["text"] = texts_by_id[index]
            merged.append(new_seg)
        return merged

    def translate_json_array(
        self, data: List[Dict[str, Any]], language: str
    ) -> List[Dict[str, Any]]:
        """Translate the string values of a JSON array of dicts via the LLM.

        The whole array is sent in a single request.

        Args:
            data: List of JSON-serializable dicts.
            language: Target language, inserted into the prompt.

        Returns:
            The translated list parsed from the model's reply. ``[]`` for
            empty input. The original ``data`` when no API key is available,
            the request fails, or the reply is not a JSON array.
        """
        if not data:
            return []

        if not self._resolve_api_key():
            logger.warning("No OpenAI API key for translation. Returning original data.")
            return data

        try:
            client = self._get_client()
            prompt = TRANSLATE_JSON_PROMPT.format(
                language=language,
                json_str=json.dumps(data, ensure_ascii=False),
            )
            response = client.chat.completions.create(
                model=settings.OPENAI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
            )
            content = response.choices[0].message.content
            if not content:
                raise ValueError("empty response from model")
            translated_data = self._parse_json_response(content)
            if not isinstance(translated_data, list):
                raise ValueError(
                    f"expected a JSON array, got {type(translated_data).__name__}"
                )
        except Exception as e:
            # Deliberate best-effort boundary: any failure (network, API,
            # parsing) falls back to the untranslated input.
            logger.error(f"Failed to translate JSON array: {e}")
            return data

        if len(translated_data) != len(data):
            logger.warning(
                f"Translated array has {len(translated_data)} items, "
                f"expected {len(data)}."
            )
        return translated_data

    def translate_transcript(
        self,
        segments: List[Dict[str, Any]],
        language: str,
        batch_size: int = 30,
    ) -> List[Dict[str, Any]]:
        """Translate transcript segments in batches to avoid token limits.

        Only each segment's ``"text"`` is sent to the model (with a per-batch
        id); every other key is carried over unchanged.

        Args:
            segments: Transcript segments; each is a dict, normally with a
                ``"text"`` key. Segments without one are passed through.
            language: Target language.
            batch_size: Number of segments per request (default 30).

        Returns:
            A list with exactly one entry per input segment, in input order.
            Successfully translated segments are shallow copies with a new
            ``"text"``; a batch whose translation raises is returned as the
            original segment objects.

        Raises:
            ValueError: if ``batch_size`` is less than 1.
        """
        if not segments:
            return []
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        translated_segments: List[Dict[str, Any]] = []
        for start in range(0, len(segments), batch_size):
            batch = segments[start:start + batch_size]
            # Send only id + text to keep the request small.
            simplified = [
                {"id": j, "text": seg.get("text", "")}
                for j, seg in enumerate(batch)
            ]
            try:
                translated_batch = self.translate_json_array(simplified, language)
            except Exception as e:
                logger.error(f"Failed to translate transcript batch: {e}")
                translated_segments.extend(batch)
                continue
            # Merging cannot raise part-way, so a batch is never emitted twice.
            translated_segments.extend(self._merge_batch(batch, translated_batch))
        return translated_segments