Spaces:
Sleeping
Sleeping
| """ | |
| Lógica de las 4 tools de RAG sobre ESL, ISLP, FES, PDSH y R4DS (v2). | |
| Diferencia con v1: la base ChromaDB se obtiene de un dataset publicado en | |
| HF Hub vía `snapshot_download`. La primera invocación tarda lo que tarde | |
| la descarga (~40 MB); las siguientes son cache hit. | |
| Variables de entorno: | |
| - RAG_CHROMA_DIR Si está set y apunta a una carpeta existente, se usa | |
| en lugar del dataset (útil para dev local con índice | |
| recién regenerado por `ingest.py`). | |
| - RAG_CHROMA_DATASET Repo del dataset HF a descargar. | |
| Default: gusdelact/rag-esl-islp-chromadb | |
| - RAG_CHROMA_REVISION Revision (branch/tag/commit) del dataset. | |
| Default: main | |
| - RAG_CHROMA_CACHE_DIR Directorio cache para el snapshot_download. | |
| Default: ~/.cache/rag-books-mcp/chroma_db (o /data/chroma_db | |
| si existe /data, como en HF Spaces con persistent storage). | |
| - HF_TOKEN Solo si el dataset es privado. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Optional | |
| import chromadb | |
| from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction | |
# --- Configuration ---
# Model name handed to the sentence-transformers embedding function.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Fallbacks used when RAG_CHROMA_DATASET / RAG_CHROMA_REVISION are not set.
DEFAULT_DATASET = "gusdelact/rag-esl-islp-chromadb"
DEFAULT_REVISION = "main"
def _resolve_cache_dir() -> Path:
    """Pick the directory where the dataset snapshot is cached.

    Priority:
        1. ``RAG_CHROMA_CACHE_DIR`` when it is set to a non-empty value.
           A leading ``~`` is expanded to the user's home directory.
        2. ``/data/chroma_db`` when ``/data`` exists as a directory
           (HF Spaces with persistent storage, per the module notes).
        3. ``~/.cache/rag-books-mcp/chroma_db``.

    Returns:
        Path: The selected directory. This function does not create it.
    """
    explicit = os.environ.get("RAG_CHROMA_CACHE_DIR")
    if explicit:
        # Environment values are not shell-expanded, so "~/foo" would
        # otherwise become a literal "~" directory under the working dir.
        return Path(explicit).expanduser()
    if Path("/data").is_dir():
        return Path("/data/chroma_db")
    return Path.home() / ".cache" / "rag-books-mcp" / "chroma_db"
# Per-process lazy singletons; each starts empty and is filled on first use.
_client: Optional[chromadb.ClientAPI] = None  # persistent ChromaDB client
_embedding_fn = None  # embedding function instance
_chroma_path_resolved: Optional[str] = None  # resolved ChromaDB directory
def _resolve_chroma_path() -> str:
    """Resolve the directory to open as the persistent ChromaDB store.

    The result is memoised in the module-level ``_chroma_path_resolved`` so
    the resolution (and any download) happens at most once per process.

    Resolution order:
        1. ``RAG_CHROMA_DIR`` when it points to an existing directory
           (a leading ``~`` is expanded).
        2. Otherwise, download ``RAG_CHROMA_DATASET@RAG_CHROMA_REVISION``
           from the HF Hub with ``snapshot_download`` and use the snapshot
           path it returns.

    Returns:
        str: Filesystem path of the ChromaDB directory.

    Raises:
        Exception: Whatever ``snapshot_download`` or the cache-directory
            creation raises; nothing is caught here.
    """
    global _chroma_path_resolved
    if _chroma_path_resolved is not None:
        return _chroma_path_resolved

    override = os.environ.get("RAG_CHROMA_DIR")
    if override:
        local_dir = Path(override).expanduser()
        if local_dir.is_dir():
            resolved = str(local_dir)
            print(
                f"[rag-books-mcp v2] Using local RAG_CHROMA_DIR: {resolved}",
                file=sys.stderr,
            )
            _chroma_path_resolved = resolved
            return resolved
        # A mistyped override used to fall through silently; say so, because
        # the download below would otherwise look like the intended path.
        print(
            f"[rag-books-mcp v2] RAG_CHROMA_DIR={override!r} is not an existing "
            "directory; falling back to the HF dataset.",
            file=sys.stderr,
        )

    # `or` (not a .get default) so that an empty-string variable also falls
    # back to the default instead of being sent to the Hub as-is.
    repo_id = os.environ.get("RAG_CHROMA_DATASET") or DEFAULT_DATASET
    revision = os.environ.get("RAG_CHROMA_REVISION") or DEFAULT_REVISION
    cache_dir = _resolve_cache_dir()
    cache_dir.mkdir(parents=True, exist_ok=True)

    print(
        f"[rag-books-mcp v2] Downloading ChromaDB from HF dataset "
        f"{repo_id}@{revision} into {cache_dir} ...",
        file=sys.stderr,
    )
    # Lazy import: skipped entirely when RAG_CHROMA_DIR supplies the index.
    from huggingface_hub import snapshot_download

    snapshot = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        revision=revision,
        cache_dir=str(cache_dir),
        # Only needed for private datasets; an empty value is treated as unset.
        token=os.environ.get("HF_TOKEN") or None,
    )
    print(f"[rag-books-mcp v2] ChromaDB ready at {snapshot}", file=sys.stderr)
    _chroma_path_resolved = snapshot
    return snapshot
def get_client() -> chromadb.ClientAPI:
    """Return the process-wide persistent ChromaDB client.

    The client is created on the first call, from the path returned by
    ``_resolve_chroma_path()``, and reused afterwards.
    """
    global _client
    if _client is not None:
        return _client
    db_path = _resolve_chroma_path()
    _client = chromadb.PersistentClient(path=db_path)
    return _client
def get_embedding_fn():
    """Return the shared embedding function, building it on first call.

    The instance is a ``SentenceTransformerEmbeddingFunction`` created with
    ``model_name=EMBEDDING_MODEL`` and cached in ``_embedding_fn``.
    """
    global _embedding_fn
    if _embedding_fn is not None:
        return _embedding_fn
    _embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    return _embedding_fn
def get_collection(name: str):
    """Fetch a ChromaDB collection by name, bound to the shared embedding function.

    Args:
        name (str): Collection name to look up on the shared client.

    Returns:
        The collection object returned by the client's ``get_collection``.
    """
    client = get_client()
    embedder = get_embedding_fn()
    return client.get_collection(name=name, embedding_function=embedder)
| # --- Tools (idénticas en comportamiento a v1) --- | |
# Book option -> (label shown in results, ChromaDB collection name).
# Insertion order is the order in which collections are queried.
_SEARCH_BOOKS = {
    "esl": ("ESL", "esl_chapters"),
    "islp": ("ISLP", "islp_chapters"),
    "fes": ("FES", "fes_chapters"),
    "pdsh": ("PDSH", "pdsh_chapters"),
    "r4ds": ("R4DS", "r4ds_chapters"),
}
# Group options: "both" is kept for backwards compatibility (ESL + ISLP).
_SEARCH_GROUPS = {
    "both": ("esl", "islp"),
    "all": tuple(_SEARCH_BOOKS),
}


def search_theory(
    query: str,
    book: str = "all",
    top_k: int = 5,
) -> str:
    """
    Search the ESL, ISLP, FES, PDSH and R4DS books for relevant passages using semantic search.

    Args:
        query (str): Natural-language query (e.g. "bias-variance tradeoff",
            "L1 vs L2 regularization", "random forest out-of-bag error",
            "exploratory data analysis iterative cycle").
        book (str): Book to search. Options: "esl", "islp", "fes", "pdsh", "r4ds",
            "both" (ESL+ISLP, backwards compatible) or "all" (all five, default).
            Matching ignores case and surrounding whitespace.
        top_k (int): Number of results to return (default: 5, clamped to 1..10).

    Returns:
        str: Markdown with the matching passages and their metadata (book,
            chapter, section, similarity), or an error/empty message.

    Note:
        R4DS is written in R (tidyverse). Its EDA, data-transformation and
        tabular-manipulation principles are language agnostic and map directly
        to pandas/Python; read the R code as pseudocode. In v2, R4DS is only
        available with a local `RAG_CHROMA_DIR`; it is not published in the HF
        dataset because of its CC BY-NC-ND 3.0 US license.
    """
    top_k = min(max(int(top_k), 1), 10)

    book_key = str(book).strip().lower()
    if book_key in _SEARCH_GROUPS:
        wanted = _SEARCH_GROUPS[book_key]
    elif book_key in _SEARCH_BOOKS:
        wanted = (book_key,)
    else:
        # An unknown option used to be reported as "no collections found",
        # which pointed at the dataset instead of at the argument.
        valid = ", ".join([*_SEARCH_BOOKS, *_SEARCH_GROUPS])
        return f"❌ Libro '{book}' no reconocido. Opciones: {valid}"

    # Opening is best-effort per book: a missing collection is skipped,
    # but the reason is logged instead of being dropped silently.
    collections_to_search = []
    for key in wanted:
        book_label, collection_name = _SEARCH_BOOKS[key]
        try:
            collections_to_search.append((book_label, get_collection(collection_name)))
        except Exception as exc:
            print(
                f"[rag-books-mcp v2] search_theory: collection "
                f"'{collection_name}' unavailable: {exc}",
                file=sys.stderr,
            )

    if not collections_to_search:
        return (
            "❌ No se encontraron colecciones. Verifica que el dataset HF "
            "esté disponible o ejecuta la ingesta local."
        )

    results = []
    for book_label, collection in collections_to_search:
        res = collection.query(query_texts=[query], n_results=top_k)
        if not (res["documents"] and res["documents"][0]):
            continue
        for doc, meta, dist in zip(
            res["documents"][0], res["metadatas"][0], res["distances"][0]
        ):
            meta = meta or {}  # entries stored without metadata come back as None
            results.append({
                "book": book_label,
                "chapter": meta.get("chapter", ""),
                "section": meta.get("section", ""),
                # NOTE(review): treats the distance as a cosine-style distance;
                # confirm the metric the collections were built with.
                "similarity": 1 - dist,
                "content": doc,
            })

    # Merge the per-book hits and keep only the global top_k.
    results.sort(key=lambda x: x["similarity"], reverse=True)
    results = results[:top_k]

    if not results:
        return f"No se encontraron resultados para: '{query}'"

    output_parts = [f"## Resultados para: \"{query}\"\n"]
    for i, r in enumerate(results, 1):
        output_parts.append(
            f"### [{i}] {r['book']} — {r['chapter']} § {r['section']}\n"
            f"**Similitud:** {r['similarity']:.3f}\n\n"
            f"{r['content'][:1500]}\n\n---\n"
        )
    return "\n".join(output_parts)
def _section_semantic_fallback(collection, book_label, chapter, section, max_chunks) -> str:
    """Render the closest chunks for "<chapter> <section>" via semantic search."""
    search_query = f"{chapter} {section}".strip()
    results = collection.query(query_texts=[search_query], n_results=max_chunks)
    if results["documents"] and results["documents"][0]:
        output_parts = [f"## {book_label} — {chapter}\n"]
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            meta = meta or {}  # entries stored without metadata come back as None
            output_parts.append(
                f"### § {meta.get('section', 'N/A')}\n\n{doc}\n\n---\n"
            )
        return "\n".join(output_parts)
    return f"No se encontró el capítulo '{chapter}' en {book_label}"


def get_section(
    book: str,
    chapter: str,
    section: str = "",
    max_chunks: int = 5,
) -> str:
    """
    Retrieve a specific section of a book by reference.

    A metadata-filtered lookup is tried first. When that lookup raises or
    returns nothing, a semantic search on "<chapter> <section>" is used instead.

    Args:
        book (str): Book to query. Options: "esl", "islp", "fes", "pdsh" or "r4ds".
            Matching ignores case and surrounding whitespace.
        chapter (str): Chapter name (partial match intended).
        section (str): Section name inside the chapter (optional).
        max_chunks (int): Maximum number of chunks to return (default: 5, minimum 1).

    Returns:
        str: Markdown with the section content and metadata, or an error message.
    """
    # Values below 1 are not meaningful as a result limit; clamp them.
    max_chunks = max(int(max_chunks), 1)
    # Collections are named "<book>_chapters" in lower case, so "ESL" or
    # " esl " must resolve to the same collection as "esl".
    book_key = str(book).strip().lower()
    book_label = book_key.upper()
    collection_name = f"{book_key}_chapters"

    try:
        collection = get_collection(collection_name)
    except Exception as exc:
        print(
            f"[rag-books-mcp v2] get_section: collection '{collection_name}' "
            f"unavailable: {exc}",
            file=sys.stderr,
        )
        return f"❌ Colección '{collection_name}' no encontrada. Opciones: esl, islp, fes, pdsh, r4ds"

    filters = [{"chapter": {"$contains": chapter}}]
    if section:
        filters.append({"section": {"$contains": section}})
    where = filters[0] if len(filters) == 1 else {"$and": filters}

    try:
        results = collection.get(where=where, limit=max_chunks)
    except Exception as exc:
        # NOTE(review): `$contains` may not be accepted as a metadata operator
        # by every ChromaDB version; when the filter is rejected we land here.
        print(
            f"[rag-books-mcp v2] get_section: metadata filter failed ({exc}); "
            "using semantic search instead.",
            file=sys.stderr,
        )
        return _section_semantic_fallback(collection, book_label, chapter, section, max_chunks)

    if not results["documents"]:
        return _section_semantic_fallback(collection, book_label, chapter, section, max_chunks)

    header = f"## {book_label} — {chapter}"
    if section:
        header += f" § {section}"
    output_parts = [header + "\n"]
    for doc, meta in zip(results["documents"], results["metadatas"]):
        meta = meta or {}  # entries stored without metadata come back as None
        sec_title = meta.get("section", "")
        chunk_idx = meta.get("chunk_index", 0)
        total = meta.get("total_chunks_in_section", 1)
        output_parts.append(
            f"### § {sec_title} (parte {chunk_idx + 1}/{total})\n\n{doc}\n\n---\n"
        )
    return "\n".join(output_parts)
# One row per book, in the order sections and references are rendered:
# (label, collection name, section heading, suffix for the reference line).
_FOUNDATION_SOURCES = (
    ("ISLP", "islp_chapters", "\n## 📘 ISLP (Explicación Intuitiva)\n", ""),
    ("ESL", "esl_chapters", "\n## 📗 ESL (Tratamiento Riguroso)\n", ""),
    ("FES", "fes_chapters", "\n## 📙 FES (Feature Engineering Práctico)\n", ""),
    ("PDSH", "pdsh_chapters", "\n## 📓 PDSH (Código Práctico Python)\n", ""),
    (
        "R4DS",
        "r4ds_chapters",
        "\n## 📕 R4DS (EDA & Data Wrangling — ejemplos en R, principios universales)\n"
        "> ⚠️ El código está en R con tidyverse. Léelo como pseudocódigo: el flujo, "
        "las heurísticas y la filosofía iterativa de EDA se traducen directamente a "
        "pandas/Python.\n",
        " _(R / tidyverse — principios transferibles a pandas)_",
    ),
)


def _foundation_hits(collection_name, topic, top_k) -> list:
    """Query one book collection for `topic`; return the hits collected (best-effort)."""
    hits = []
    try:
        collection = get_collection(collection_name)
        res = collection.query(query_texts=[topic], n_results=top_k)
        if res["documents"] and res["documents"][0]:
            for doc, meta, dist in zip(
                res["documents"][0], res["metadatas"][0], res["distances"][0]
            ):
                meta = meta or {}  # entries stored without metadata come back as None
                hits.append({
                    "content": doc,
                    "chapter": meta.get("chapter", ""),
                    "section": meta.get("section", ""),
                    # NOTE(review): treats the distance as a cosine-style
                    # distance; confirm the collections' metric.
                    "similarity": 1 - dist,
                })
    except Exception as exc:
        # A missing or failing book must not block the others, but the
        # reason is logged instead of being dropped silently.
        print(
            f"[rag-books-mcp v2] cite_foundation: '{collection_name}' skipped: {exc}",
            file=sys.stderr,
        )
    return hits


def cite_foundation(
    topic: str,
    detail_level: str = "medium",
) -> str:
    """
    Return the theoretical foundation for a topic, citing the books (ESL + ISLP + FES + PDSH + R4DS).

    Args:
        topic (str): Topic to ground (e.g. "ridge regression", "bagging",
            "feature engineering", "missing data imputation",
            "exploratory data analysis").
        detail_level (str): "brief", "medium" or "deep"; each book is queried
            for up to 2, 4 or 8 passages respectively. Unknown values behave
            like "medium". Matching ignores case and surrounding whitespace.

    Returns:
        str: Markdown organised from intuitive (ISLP) to rigorous (ESL),
            followed by feature-engineering practice (FES), practical Python
            code (PDSH) and the iterative EDA / data-wrangling workflow
            (R4DS, examples in R), then a reference list; or an error message
            when no book returns anything.
    """
    top_k_map = {"brief": 2, "medium": 4, "deep": 8}
    top_k = top_k_map.get(str(detail_level).strip().lower(), 4)

    hits_by_label = {
        label: _foundation_hits(collection_name, topic, top_k)
        for label, collection_name, _heading, _suffix in _FOUNDATION_SOURCES
    }

    if not any(hits_by_label.values()):
        return (
            f"❌ No se encontró fundamentación para '{topic}'. "
            "Verifica que la ingesta se haya ejecutado correctamente."
        )

    output_parts = [
        f"# Fundamentación Teórica: {topic}\n",
        f"**Nivel de detalle:** {detail_level}\n",
    ]

    for label, _name, heading, _suffix in _FOUNDATION_SOURCES:
        hits = hits_by_label[label]
        if not hits:
            continue
        output_parts.append(heading)
        for i, r in enumerate(hits, 1):
            output_parts.append(
                f"### [{i}] Cap. {r['chapter']} § {r['section']} "
                f"(sim: {r['similarity']:.3f})\n\n"
                f"{r['content'][:1200]}\n\n---\n"
            )

    output_parts.append("\n## 📚 Referencias\n")
    for label, _name, _heading, suffix in _FOUNDATION_SOURCES:
        hits = hits_by_label[label]
        if not hits:
            continue
        # dict.fromkeys de-duplicates while keeping relevance order; a set
        # made the chapter order differ from one run to the next.
        chapters = dict.fromkeys(r["chapter"] for r in hits)
        output_parts.append(f"- **{label}:** {', '.join(chapters)}{suffix}\n")

    return "\n".join(output_parts)
def list_available_topics() -> str:
    """
    List the chapters and topics indexed in the knowledge base.

    For each book collection, every chunk's metadata is read and grouped by
    chapter. Up to 8 section names are shown per chapter, followed by the
    collection's chunk count. A book whose collection cannot be read is
    reported with an error line instead of aborting the whole listing.

    Returns:
        str: Markdown list of chapters per book with their main sections.
    """
    output_parts = ["# 📚 Contenido Disponible en la Base de Conocimiento\n"]

    for book_key, collection_name in [
        ("ESL", "esl_chapters"),
        ("ISLP", "islp_chapters"),
        ("FES", "fes_chapters"),
        ("PDSH", "pdsh_chapters"),
        ("R4DS", "r4ds_chapters"),
    ]:
        try:
            collection = get_collection(collection_name)
            all_data = collection.get(include=["metadatas"])

            if not all_data["metadatas"]:
                output_parts.append(f"\n## {book_key}: Sin datos\n")
                continue

            # chapter -> set of section names found in that chapter
            chapters = {}
            for meta in all_data["metadatas"]:
                # A chunk stored without metadata comes back as None; without
                # this guard one such chunk made the whole book show as an error.
                meta = meta or {}
                chapter = meta.get("chapter", "Unknown")
                section = meta.get("section", "")
                sections_seen = chapters.setdefault(chapter, set())
                if section:
                    sections_seen.add(section)

            output_parts.append(f"\n## 📗 {book_key}\n")
            for chapter in sorted(chapters):
                sections = sorted(chapters[chapter])
                output_parts.append(f"\n### {chapter}\n")
                # Only the first 8 sections are listed; the rest are summarised.
                for sec in sections[:8]:
                    output_parts.append(f" - {sec}\n")
                if len(sections) > 8:
                    output_parts.append(f" - ... y {len(sections) - 8} secciones más\n")

            total = collection.count()
            output_parts.append(f"\n**Total chunks indexados:** {total}\n")
        except Exception as e:
            output_parts.append(f"\n## {book_key}: ❌ Error ({e})\n")

    return "\n".join(output_parts)