"""
backend/services/translator.py
==============================
Translates complex JSON structures (topics, highlights, qa_pairs, transcript)
using OpenAI.
"""
import os
import json
from typing import Dict, List, Any
from backend.utils.config import settings
from backend.utils.logger import get_logger
# Module-level logger obtained from the project's get_logger helper, keyed by
# this module's name.
logger = get_logger(__name__)
# Prompt template for a translation request. It is filled in with str.format,
# which substitutes the {language} and {json_str} placeholders below; any
# literal brace added to this text would therefore have to be doubled.
TRANSLATE_JSON_PROMPT = """You are an expert translator.
Translate the string values in the provided JSON array into {language}.
Keep the exact same JSON structure, keys, and array length. Only translate the text values.
CRITICAL INSTRUCTION: Return ONLY valid JSON, do NOT wrap in markdown fences.
Input JSON:
{json_str}
"""
class Translator:
    """Translate JSON-shaped content into another language with an OpenAI chat model.

    The public methods are best-effort: when no API key is configured, or the
    request or the parsing of its reply fails, the untranslated input is
    returned instead of an exception being raised.
    """

    def __init__(self, api_key: str = None):
        """Remember an optional explicit API key; the client is built lazily.

        Args:
            api_key: OpenAI API key. When omitted (``None``), the
                ``OPENAI_API_KEY`` environment variable and then
                ``settings.OPENAI_API_KEY`` are consulted at call time.
        """
        self.api_key = api_key
        self._openai_client = None

    def _resolve_api_key(self):
        """Return the first configured API key, or a falsy value if none is set.

        Lookup order: explicit constructor argument, ``OPENAI_API_KEY``
        environment variable, ``settings.OPENAI_API_KEY``.
        """
        return (
            self.api_key
            or os.environ.get("OPENAI_API_KEY")
            or settings.OPENAI_API_KEY
        )

    def _get_client(self):
        """Create the OpenAI client on first use and reuse it afterwards."""
        if self._openai_client is None:
            # Imported lazily so the module can be loaded without the SDK.
            from openai import OpenAI

            kwargs = {"api_key": self._resolve_api_key()}
            if settings.OPENAI_BASE_URL:
                kwargs["base_url"] = settings.OPENAI_BASE_URL
            self._openai_client = OpenAI(**kwargs)
        return self._openai_client

    @staticmethod
    def _strip_code_fences(raw: str) -> str:
        """Remove a Markdown code fence wrapped around *raw*, if there is one.

        Only an opening fence at the very start (optionally followed by a
        language tag such as ``json``) and a closing fence at the very end are
        removed. Backticks inside the payload are left alone, so translated
        text that itself contains a fence is not corrupted.
        """
        import re

        text = raw.strip()
        text = re.sub(r"^```[A-Za-z]*\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
        return text.strip()

    @staticmethod
    def _merge_batch(batch, translated_batch):
        """Return copies of *batch* with translated text applied where available.

        Entries of *translated_batch* are matched to segments through their
        ``"id"`` (the index within the batch that was sent to the model),
        falling back to list position when the id is missing or unusable.
        Malformed entries (non-dicts, non-string text) are skipped, so the
        affected segment simply keeps its original text.
        """
        texts = {}
        if isinstance(translated_batch, list):
            for pos, item in enumerate(translated_batch):
                if not isinstance(item, dict):
                    continue
                text = item.get("text")
                if not isinstance(text, str):
                    continue
                ident = item.get("id", pos)
                # The model may echo the id back as a numeric string.
                if isinstance(ident, str) and ident.strip().isdigit():
                    ident = int(ident.strip())
                if isinstance(ident, bool) or not isinstance(ident, int):
                    ident = pos
                texts[ident] = text

        merged = []
        for j, seg in enumerate(batch):
            new_seg = seg.copy()
            if j in texts:
                new_seg["text"] = texts[j]
            merged.append(new_seg)
        return merged

    def translate_json_array(self, data: List[Dict[str, Any]], language: str) -> List[Dict[str, Any]]:
        """Translate the string values of a JSON array of dicts via the LLM.

        Args:
            data: List of JSON-serialisable dicts to translate.
            language: Target language, inserted verbatim into the prompt.

        Returns:
            The list parsed from the model's reply. ``[]`` when *data* is
            empty. *data* itself, unchanged, when no API key is configured,
            when the request fails, or when the reply is not a JSON array.
        """
        if not data:
            return []

        if not self._resolve_api_key():
            logger.warning("No OpenAI API key for translation. Returning original data.")
            return data

        # The whole array is sent in one request; topics, qa_pairs and
        # highlights are small. Transcripts are batched by translate_transcript.
        try:
            client = self._get_client()
            prompt = TRANSLATE_JSON_PROMPT.format(
                language=language,
                json_str=json.dumps(data, ensure_ascii=False),
            )
            response = client.chat.completions.create(
                model=settings.OPENAI_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
            )
            # ``content`` can be None (e.g. a refusal); treat that as empty.
            raw = response.choices[0].message.content or ""
            translated_data = json.loads(self._strip_code_fences(raw))
            if not isinstance(translated_data, list):
                raise ValueError(
                    f"expected a JSON array, got {type(translated_data).__name__}"
                )
            return translated_data
        except Exception as e:
            # Deliberate best-effort boundary: never fail the caller because
            # translation was unavailable.
            logger.error(f"Failed to translate JSON array: {e}")
            return data

    def translate_transcript(
        self,
        segments: List[Dict[str, Any]],
        language: str,
        batch_size: int = 30,
    ) -> List[Dict[str, Any]]:
        """Translate transcript segments in batches to avoid token limits.

        Only each segment's ``"text"`` is sent to the model; all other keys
        are carried over untouched. The input segments are never mutated.

        Args:
            segments: Transcript segments; each must have a ``"text"`` key.
            language: Target language.
            batch_size: Number of segments sent per request.

        Returns:
            A new list with one (shallow-copied) segment per input segment, in
            the original order. Segments whose translation is missing or
            malformed keep their original text.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
            KeyError: If a segment has no ``"text"`` key.
        """
        if not segments:
            return []
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        translated_segments = []
        for start in range(0, len(segments), batch_size):
            batch = segments[start:start + batch_size]
            # Send a simplified structure to reduce tokens.
            simplified = [{"id": j, "text": seg["text"]} for j, seg in enumerate(batch)]
            try:
                translated_batch = self.translate_json_array(simplified, language)
            except Exception as e:
                logger.error(f"Failed to translate transcript batch: {e}")
                translated_batch = []
            # Merging cannot fail half-way, so every segment is emitted
            # exactly once even when the reply is partly malformed.
            translated_segments.extend(self._merge_batch(batch, translated_batch))
        return translated_segments
|