Video-Note-Taker / tests /test_summary.py
rajiv-ramteke's picture
initial commit
6b64d63
Raw
History Blame
4.62 kB
"""
tests/test_summary.py
=======================
Unit tests for the Summarizer and TextChunker services.
"""
import pytest
from unittest.mock import patch, MagicMock
from backend.services.text_chunker import TextChunker
from backend.services.summarizer import Summarizer
# ── TextChunker Tests ─────────────────────────────────────────────────────────
def _format_ts(seconds):
    """Render a second offset as a zero-padded ``HH:MM:SS`` timestamp."""
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


# Synthetic transcript: 30 back-to-back 10-second segments of 100 words each,
# so the top-level "text" (3000 words) matches the concatenated segment text.
# The *_ts strings are derived from the numeric start/end offsets so that both
# representations of a segment boundary describe the same instant.
SAMPLE_TRANSCRIPT = {
    "text": "word " * 3000,
    "language": "en",
    "segments": [
        {
            "id": i,
            "start": i * 10.0,
            "end": (i + 1) * 10.0,
            "start_ts": _format_ts(i * 10.0),
            "end_ts": _format_ts((i + 1) * 10.0),
            "text": "word " * 100,
            "words": [],
        }
        for i in range(30)
    ],
}
class TestTextChunker:
    """Checks for TextChunker's transcript chunking and plain-text splitting."""

    def test_chunks_produced(self):
        """A populated transcript must yield one or more chunks."""
        produced = TextChunker(max_chunk_size=500, overlap=50).chunk_transcript(
            SAMPLE_TRANSCRIPT
        )
        assert len(produced) > 0

    def test_chunk_has_required_keys(self):
        """Every chunk dict must carry the full set of metadata keys."""
        required = {"chunk_id", "text", "start", "end", "start_ts", "end_ts", "segments"}
        splitter = TextChunker(max_chunk_size=500, overlap=50)
        for piece in splitter.chunk_transcript(SAMPLE_TRANSCRIPT):
            assert required.issubset(piece.keys()), f"Missing keys: {required - piece.keys()}"

    def test_chunk_ids_are_sequential(self):
        """chunk_id values must count up from 0 with no gaps."""
        splitter = TextChunker(max_chunk_size=500, overlap=50)
        pieces = splitter.chunk_transcript(SAMPLE_TRANSCRIPT)
        expected_ids = list(range(len(pieces)))
        assert [piece["chunk_id"] for piece in pieces] == expected_ids

    def test_empty_transcript_returns_empty_list(self):
        """A transcript with no text and no segments yields no chunks."""
        blank_transcript = {"text": "", "segments": []}
        assert TextChunker().chunk_transcript(blank_transcript) == []

    def test_plain_text_chunking(self):
        """chunk_text() must break a plain string into several non-empty strings."""
        source = " ".join(f"word{n}" for n in range(50))
        parts = TextChunker(max_chunk_size=10, overlap=2).chunk_text(source)
        assert len(parts) > 1
        for part in parts:
            assert isinstance(part, str)
            assert len(part) > 0

    def test_chunk_text_word_count_limit(self):
        """No chunk returned by chunk_text() may hold more than max_chunk_size words."""
        word_limit = 10
        source = " ".join(f"w{n}" for n in range(100))
        parts = TextChunker(max_chunk_size=word_limit, overlap=2).chunk_text(source)
        for part in parts:
            assert len(part.split()) <= word_limit
# ── Summarizer Tests ──────────────────────────────────────────────────────────
class TestSummarizer:
    """Tests for Summarizer.summarize_chunks with the LLM backends mocked out.

    Settings overrides are applied with ``patch.object`` so the original
    values are restored when each test finishes; assigning to
    ``cfg.settings`` directly would leak the provider choice into whichever
    test runs next.
    """

    @patch("backend.services.summarizer.Summarizer._summarize_openai")
    def test_summarize_chunks_calls_backend(self, mock_openai):
        """summarize_chunks should call the LLM backend for each chunk."""
        import backend.utils.config as cfg

        expected_summary = "• Point 1\n• Point 2"
        mock_openai.return_value = expected_summary
        chunks = [
            {"chunk_id": 0, "text": "Sample text A", "start_ts": "00:00:00", "end_ts": "00:01:00"},
            {"chunk_id": 1, "text": "Sample text B", "start_ts": "00:01:00", "end_ts": "00:02:00"},
        ]

        # Construct and run inside the patch scope so the overrides are visible
        # whether Summarizer reads the settings at init time or at call time.
        with patch.object(cfg.settings, "LLM_PROVIDER", "openai"), \
                patch.object(cfg.settings, "OPENAI_API_KEY", "sk-test"):
            result = Summarizer().summarize_chunks(chunks)

        assert len(result) == 2
        for r in result:
            assert "summary" in r
            assert r["summary"] == expected_summary
        # NOTE(review): assumes exactly one backend call per chunk, mirroring
        # the assert_called_once() check in test_hf_fallback.
        assert mock_openai.call_count == len(chunks)

    @patch("backend.services.summarizer.Summarizer._summarize_huggingface")
    def test_hf_fallback(self, mock_hf):
        """summarize_chunks should use HuggingFace when LLM_PROVIDER=huggingface."""
        import backend.utils.config as cfg

        mock_hf.return_value = "HF summary text"
        chunks = [{"chunk_id": 0, "text": "Test", "start_ts": "00:00:00", "end_ts": "00:00:30"}]

        with patch.object(cfg.settings, "LLM_PROVIDER", "huggingface"):
            result = Summarizer().summarize_chunks(chunks)

        assert result[0]["summary"] == "HF summary text"
        mock_hf.assert_called_once()