Spaces:
Sleeping
Sleeping
| """ | |
| tests/test_rag.py | |
| ================== | |
| Unit tests for the RAG pipeline (FAISS indexing + semantic search). | |
| """ | |
| import pytest | |
| import numpy as np | |
| from unittest.mock import patch, MagicMock | |
| from backend.services.rag_pipeline import RAGPipeline | |
| # ── Fixtures ────────────────────────────────────────────────────────────────── | |
# Five consecutive one-minute transcript chunks used as the indexing fixture.
# chunk_id doubles as the minute offset, so timestamps are derived from it.
SAMPLE_CHUNKS = [
    {
        "chunk_id": minute,
        "text": sentence,
        "start_ts": f"00:{minute:02d}:00",
        "end_ts": f"00:{minute + 1:02d}:00",
        "start": 60.0 * minute,
        "end": 60.0 * (minute + 1),
    }
    for minute, sentence in enumerate(
        (
            "Machine learning is a subset of artificial intelligence.",
            "Deep learning uses neural networks with many layers.",
            "Natural language processing deals with text and speech.",
            "FAISS is a library for efficient similarity search.",
            "Whisper is an automatic speech recognition model by OpenAI.",
        )
    )
]
def _make_mock_embedder(dim: int = 64):
    """Return a mock SentenceTransformer whose ``encode`` yields random unit vectors.

    Vectors are drawn from a private ``RandomState`` seeded with 42, so each
    mock produces a reproducible stream of embeddings without reseeding
    NumPy's global generator (which would leak into unrelated tests).

    Note that the embeddings are not keyed on the input text: every
    ``encode`` call draws fresh rows from the stream.

    Args:
        dim: Width of every embedding vector.

    Returns:
        A ``MagicMock`` whose ``encode(texts, **kwargs)`` returns a float32
        array of shape ``(len(texts), dim)`` with L2-normalised rows.
    """
    mock = MagicMock()
    # Private generator: same stream the legacy global seed(42) would give,
    # but isolated from (and invisible to) any other user of np.random.
    rng = np.random.RandomState(42)

    def encode(texts, **kwargs):
        vecs = rng.rand(len(texts), dim).astype("float32")
        # Normalise each row to unit length.
        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
        return vecs / norms

    mock.encode.side_effect = encode
    return mock
| # ── Tests ───────────────────────────────────────────────────────────────────── | |
class TestRAGPipeline:
    """Exercise RAGPipeline indexing, querying, persistence and context building.

    NOTE(review): most tests take a ``mock_get`` argument, but nothing visible
    in this file supplies it (no ``@patch`` decorator, no fixture). Presumably
    it is the patched embedder loader -- confirm how it is injected.
    """

    @staticmethod
    def _indexed_pipeline(mock_get):
        """Point ``mock_get`` at a mock embedder and index the sample chunks."""
        mock_get.return_value = _make_mock_embedder()
        pipeline = RAGPipeline()
        pipeline.index_chunks(SAMPLE_CHUNKS)
        return pipeline

    def test_index_builds_successfully(self, mock_get):
        """Indexing creates an index holding one vector per chunk."""
        pipeline = self._indexed_pipeline(mock_get)
        assert pipeline._index is not None
        assert pipeline._index.ntotal == len(SAMPLE_CHUNKS)

    def test_query_returns_results(self, mock_get):
        """Each hit carries both a ``text`` and a ``score`` entry."""
        hits = self._indexed_pipeline(mock_get).query("speech recognition", top_k=3)
        assert len(hits) <= 3
        for key in ("text", "score"):
            assert all(key in hit for hit in hits)

    def test_query_results_have_scores(self, mock_get):
        """Scores are plain Python floats."""
        hits = self._indexed_pipeline(mock_get).query("neural network", top_k=2)
        assert all(isinstance(hit["score"], float) for hit in hits)

    def test_top_k_respected(self, mock_get):
        """A query never returns more than ``top_k`` hits."""
        hits = self._indexed_pipeline(mock_get).query("anything", top_k=2)
        assert len(hits) <= 2

    def test_query_on_empty_index_returns_empty(self):
        """Querying before anything is indexed yields an empty list."""
        assert RAGPipeline().query("test", top_k=3) == []

    def test_save_and_load_index(self, mock_get, tmp_path):
        """A saved index can be reloaded by a fresh pipeline with all vectors."""
        index_file = tmp_path / "test.index"
        self._indexed_pipeline(mock_get).save_index(str(index_file))
        assert index_file.exists()

        restored = RAGPipeline()
        assert restored.load_index(str(index_file))
        assert restored._index.ntotal == len(SAMPLE_CHUNKS)

    def test_get_context_returns_string(self, mock_get):
        """The summary context is a non-empty string."""
        context = self._indexed_pipeline(mock_get).get_context_for_summary(
            "speech recognition", top_k=2
        )
        assert isinstance(context, str)
        assert len(context) > 0