""" backend/services/topic_extractor.py ====================================== Extracts key topics from a video transcript and summarizes each one in a structured, easy-to-understand format using LLM. """ import os import re from typing import Dict, List from backend.utils.config import settings from backend.utils.logger import get_logger logger = get_logger(__name__) # ── Prompt ──────────────────────────────────────────────────────────────────── TOPIC_EXTRACTION_PROMPT = """You are an expert educator creating structured study notes from a video transcript. Analyze the transcript below and identify the KEY TOPICS discussed. CRITICAL INSTRUCTION: You MUST generate the output in the following language: {language} For each topic, provide: - A clear topic title - A simple 1-2 sentence summary - 3-5 key points that explain the concept in simple, easy-to-understand language Format your output STRICTLY as a JSON array: [ {{ "topic": "Topic Title Here", "summary": "One or two sentence overview of this topic.", "key_points": [ "First key point explained simply", "Second key point explained simply", "Third key point explained simply" ] }} ] Transcript: \"\"\" {text} \"\"\" JSON Array (return ONLY valid JSON, no markdown fences):""" class TopicExtractor: """ Extracts structured topic summaries from video transcripts. Uses OpenAI when available; falls back to a basic heuristic. """ def __init__(self, api_key: str = None): self.api_key = api_key pass # No cached client — fresh one per call # ── Public API ──────────────────────────────────────────── def extract(self, chunks: List[Dict], language: str = "English") -> List[Dict]: """ Extract structured topic summaries from all transcript chunks. Args: chunks: List of chunk dicts from TextChunker. language: The language to generate topics in. Returns: List of topic dicts with keys: topic, summary, key_points. """ combined_text = " ".join([c["text"] for c in chunks]) text_to_process = combined_text[:15000] has_key = bool(self.api_key or os.environ.get("OPENAI_API_KEY") or settings.OPENAI_API_KEY) logger.info("Extracting structured topics...") if settings.LLM_PROVIDER == "openai" and has_key: topics = self._extract_with_llm(text_to_process, language) else: logger.warning("OpenAI not configured. Using fallback topic extraction.") topics = self._fallback_topics(text_to_process) logger.info(f"Extracted {len(topics)} topics total") return topics # ── Private ─────────────────────────────────────────────── def _extract_with_llm(self, text: str, language: str) -> List[Dict]: """Use OpenAI to extract structured topics.""" import json as _json try: from openai import OpenAI kwargs = {"api_key": self.api_key or os.environ.get("OPENAI_API_KEY") or settings.OPENAI_API_KEY} if settings.OPENAI_BASE_URL: kwargs["base_url"] = settings.OPENAI_BASE_URL client = OpenAI(**kwargs) prompt = TOPIC_EXTRACTION_PROMPT.format(text=text[:12000], language=language) response = client.chat.completions.create( model=settings.OPENAI_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0.3, max_tokens=1500, ) raw = response.choices[0].message.content if not raw: return self._fallback_topics(text) raw = raw.strip() # Strip any accidental markdown fences raw = re.sub(r"```(?:json)?", "", raw).strip().rstrip("```").strip() topics = _json.loads(raw) return topics if isinstance(topics, list) else [] except Exception as e: logger.warning(f"LLM topic extraction failed: {e}. Using fallback.") return self._fallback_topics(text) def _fallback_topics(self, text: str) -> List[Dict]: """Generate actual topics using simple extractive NLP fallback.""" from backend.utils.helper import extract_sentences, extract_top_words, get_key_sentences sentences = extract_sentences(text) if not sentences: return [ { "topic": "Video Content Overview", "summary": "No text content available for summary.", "key_points": [] } ] # Split text into up to 3 main sections to create 3 topics num_sections = min(3, max(1, len(sentences) // 10)) section_size = len(sentences) // num_sections topics = [] for i in range(num_sections): start = i * section_size end = (i + 1) * section_size if i < num_sections - 1 else len(sentences) section_sentences = sentences[start:end] section_text = " ".join(section_sentences) top_words = extract_top_words(section_text, 3) if top_words: title = f"Focus on {', '.join(top_words)}" else: title = f"Section {i + 1} Analysis" summary_sentences = get_key_sentences(section_text, 2) summary = " ".join(summary_sentences) key_points = get_key_sentences(section_text, 3) # Make sure we don't repeat the exact summary key_points = [kp for kp in key_points if kp not in summary_sentences] if not key_points: key_points = section_sentences[:3] topics.append({ "topic": title, "summary": summary, "key_points": key_points }) return topics