"""
backend/services/note_generator.py
=====================================
Assembles all pipeline outputs into a final, structured note document.
Produces Markdown and JSON formats.
"""
import os
from datetime import datetime
from typing import Dict, List, Optional
from backend.utils.config import settings
from backend.utils.helper import ensure_dir, save_json, save_text, format_duration
from backend.utils.logger import get_logger
logger = get_logger(__name__)
class NoteGenerator:
    """
    Final stage of the pipeline.
    Combines transcript, summaries, timestamps, and action items
    into a polished, structured note document.
    """

    def __init__(self):
        # Root directory under which every output file of this stage is written.
        self.output_dir = settings.OUTPUT_DIR

    # ── Public API ────────────────────────────────────────────
    def generate(
        self,
        job_id: str,
        filename: str,
        transcript: Dict,
        summarized_chunks: List[Dict],
        final_notes: str,
        quiz: List[Dict],
        topics: Optional[List[Dict]] = None,
        qa_pairs: Optional[List[Dict]] = None,
        action_items: Optional[List[Dict]] = None,
        highlights: Optional[List[Dict]] = None,
        chapters: Optional[List[Dict]] = None,
        duration: Optional[float] = None,
    ) -> Dict:
        """
        Assemble all pipeline outputs into the final note document.

        Writes ``<job_id>_notes.md`` and ``<job_id>_notes.json`` into
        ``<output_dir>/final_notes`` and ``<job_id>_quiz.json``,
        ``<job_id>_topics.json`` and ``<job_id>_qa.json`` directly into
        ``<output_dir>``.

        Args:
            job_id: Identifier used as the prefix of every output file name.
            filename: Name of the source file, shown in the Markdown title.
            transcript: Dict read for the ``text``, ``language`` and
                ``segments`` keys.
            summarized_chunks: Chunk dicts providing ``chunk_id``,
                ``start_ts``, ``end_ts`` and optionally ``summary``.
            final_notes: Summary text embedded verbatim in both outputs.
            quiz: Question dicts (``question``, ``options``,
                ``correct_index``).
            topics: Topic dicts (``topic``, ``summary``, ``key_points``).
            qa_pairs: Dicts with ``question`` and ``answer``.
            action_items: Dicts with ``task`` and optional ``assignee`` /
                ``deadline``.
            highlights: Stored in the JSON document only; not rendered in
                the Markdown.
            chapters: Dicts with ``timestamp``, ``title`` and ``highlights``.
            duration: Media length passed to ``format_duration``; a falsy
                value is rendered as "Unknown".

        Returns:
            Dict containing:
            - markdown_path: path to .md file
            - json_path: path to .json file
            - markdown: Markdown string
            - data: Full structured JSON data
        """
        logger.info(f"Generating final notes for job: {job_id}")

        # Normalise the optional collections once so both builders and the
        # side-car files see the same (never-None) values.
        topics = topics or []
        qa_pairs = qa_pairs or []
        action_items = action_items or []
        highlights = highlights or []
        chapters = chapters or []

        markdown = self._build_markdown(
            filename=filename,
            transcript=transcript,
            final_notes=final_notes,
            quiz=quiz,
            topics=topics,
            qa_pairs=qa_pairs,
            action_items=action_items,
            chapters=chapters,
            duration=duration,
        )
        data = self._build_json(
            job_id=job_id,
            filename=filename,
            transcript=transcript,
            summarized_chunks=summarized_chunks,
            final_notes=final_notes,
            quiz=quiz,
            topics=topics,
            qa_pairs=qa_pairs,
            action_items=action_items,
            highlights=highlights,
            chapters=chapters,
            duration=duration,
        )

        # Save outputs. The Markdown and JSON notes share one directory.
        notes_dir = os.path.join(self.output_dir, "final_notes")
        ensure_dir(notes_dir)
        md_path = os.path.join(notes_dir, f"{job_id}_notes.md")
        json_path = os.path.join(notes_dir, f"{job_id}_notes.json")
        qz_path = os.path.join(self.output_dir, f"{job_id}_quiz.json")
        ai_path = os.path.join(self.output_dir, f"{job_id}_topics.json")
        qa_path = os.path.join(self.output_dir, f"{job_id}_qa.json")

        # NOTE(review): this directory is created but nothing below writes
        # into it (the Q&A file goes to output_dir itself). Kept as-is so the
        # on-disk layout other stages may rely on does not change — confirm
        # whether qa_path was meant to live under "qa_pairs".
        ensure_dir(os.path.join(self.output_dir, "qa_pairs"))

        save_text(markdown, md_path)
        save_json(data, json_path)
        save_json(quiz, qz_path)
        save_json(topics, ai_path)
        save_json(qa_pairs, qa_path)
        logger.info(f"Notes saved: {md_path}")

        return {
            "markdown_path": md_path,
            "json_path": json_path,
            "markdown": markdown,
            "data": data,
        }

    # ── Private: Markdown builder ─────────────────────────────
    @staticmethod
    def _correct_option(question: Dict) -> str:
        """Return the option text selected by ``correct_index``, or 'Unknown'.

        'Unknown' is returned when there are no options or when the index is
        not an integer inside ``0 <= index < len(options)``; negative indices
        are rejected rather than wrapping around to the end of the list.
        """
        options = question.get("options") or []
        index = question.get("correct_index", 0)
        if isinstance(index, int) and 0 <= index < len(options):
            return options[index]
        return "Unknown"

    def _build_markdown(
        self,
        filename: str,
        transcript: Dict,
        final_notes: str,
        quiz: List[Dict],
        topics: List[Dict],
        qa_pairs: List[Dict],
        action_items: List[Dict],
        chapters: List[Dict],
        duration: Optional[float],
    ) -> str:
        """Render the note document as a single Markdown string.

        Sections are emitted in this order: header, structured summary,
        quiz, topic summaries, Q&A, action items, chapters, full transcript.
        The Q&A, action-item and chapter sections are omitted entirely when
        their input is empty; quiz and topics print a placeholder instead.
        """
        from datetime import timezone

        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
        dur_str = format_duration(duration) if duration else "Unknown"
        # `or` guards against keys that are present but explicitly None.
        text = transcript.get("text") or ""
        lang = (transcript.get("language") or "en").upper()
        word_count = len(text.split())

        lines = [
            f"# 📝 Video Notes: {filename}",
            "",
            f"> **Generated:** {now} | **Duration:** {dur_str} "
            f"| **Language:** {lang} | **Words:** {word_count:,}",
            "",
            "---",
            "",
            "## 📋 Structured Summary",
            "",
            final_notes,
            "",
            "---",
            "",
        ]

        # Interactive Quiz
        lines += ["## 🎯 Interactive Quiz", ""]
        if quiz:
            for number, q in enumerate(quiz, start=1):
                lines.append(f"### Q{number}: {q.get('question', '')}")
                lines.extend(f"- {opt}" for opt in (q.get("options") or []))
                lines.append("")
                # Put the answer in a details block so it's hidden by default
                # in MD. The blank lines inside the block let Markdown
                # renderers format the bold answer text.
                lines += [
                    "<details><summary>Reveal Answer</summary>",
                    "",
                    f"**Correct Answer:** {self._correct_option(q)}",
                    "",
                    "</details>",
                    "",
                ]
        else:
            lines += ["_No quiz generated._", ""]

        # Topic Summaries
        lines += ["## 📚 Topic Summaries", ""]
        if topics:
            for item in topics:
                lines.append(f"### 🔹 {item.get('topic', 'Topic')}")
                lines.append("")
                lines.append(f"> {item.get('summary', '')}")
                lines.append("")
                lines.extend(f"- {kp}" for kp in item.get("key_points", []))
                lines.append("")
        else:
            lines += ["_No topics extracted._", ""]
        lines += ["---", ""]

        # Q&A section
        if qa_pairs:
            lines += ["## ❓ Generated Q&A", ""]
            for qa in qa_pairs:
                lines.append(f"**Q: {qa.get('question', '')}**")
                lines.append(f"A: {qa.get('answer', '')}")
                lines.append("")
            lines += ["---", ""]

        # Action Items
        if action_items:
            lines += ["## ✅ Action Items", ""]
            for action in action_items:
                lines.append(f"- [ ] **{action.get('task', 'Task')}**")
                meta = []
                if action.get("assignee"):
                    meta.append(f"Assignee: {action['assignee']}")
                if action.get("deadline"):
                    meta.append(f"Due: {action['deadline']}")
                if meta:
                    lines.append(f" > _{' | '.join(meta)}_")
            lines += ["", "---", ""]

        # Chapters
        if chapters:
            lines += ["## 🕒 Chapters", ""]
            for chapter in chapters:
                lines.append(
                    f"### {chapter.get('timestamp', '')} - {chapter.get('title', 'Chapter')}"
                )
                for hl in chapter.get("highlights", []):
                    lines.append(f"- **{hl.get('timestamp', '')}**: {hl.get('title', '')}")
                lines.append("")
            lines += ["---", ""]

        # Full Transcript (collapsible)
        lines += [
            "## 📜 Full Transcript",
            "",
            "<details>",
            "<summary>Click to expand full transcript</summary>",
            "",
            "```",
            text,
            "```",
            "",
            "</details>",
            "",
            "---",
            "_Generated by Deep-Dive Video Note Taker — AI-Powered Video Analysis_",
        ]
        return "\n".join(lines)

    # ── Private: JSON builder ─────────────────────────────────
    def _build_json(self, **kwargs) -> Dict:
        """Build the structured (JSON-serialisable) note document.

        Required keyword arguments: ``job_id``, ``filename``, ``transcript``,
        ``final_notes`` and ``summarized_chunks`` (each chunk must provide
        ``chunk_id``, ``start_ts`` and ``end_ts``). ``duration`` defaults to
        ``None``; ``quiz``, ``topics``, ``qa_pairs``, ``action_items``,
        ``highlights`` and ``chapters`` default to empty lists.

        Raises:
            KeyError: if a required keyword argument or chunk key is missing.
        """
        from datetime import timezone

        transcript = kwargs["transcript"]
        # Take the time from the timezone-aware clock (datetime.utcnow() is
        # deprecated) but drop tzinfo so the serialized value keeps the same
        # naive-UTC ISO format as before.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        return {
            "job_id": kwargs["job_id"],
            "filename": kwargs["filename"],
            "generated_at": generated_at,
            "duration_seconds": kwargs.get("duration"),
            "language": transcript.get("language") or "en",
            "word_count": len((transcript.get("text") or "").split()),
            "final_notes": kwargs["final_notes"],
            "quiz": kwargs.get("quiz", []),
            "topics": kwargs.get("topics", []),
            "qa_pairs": kwargs.get("qa_pairs", []),
            "action_items": kwargs.get("action_items", []),
            "highlights": kwargs.get("highlights", []),
            "chapters": kwargs.get("chapters", []),
            "chunk_summaries": [
                {
                    "chunk_id": c["chunk_id"],
                    "start_ts": c["start_ts"],
                    "end_ts": c["end_ts"],
                    "summary": c.get("summary", ""),
                }
                for c in kwargs["summarized_chunks"]
            ],
            "transcript_segments": transcript.get("segments", []),
        }