Spaces:

gusdelact
/

rag-books-mcp-v2

Sleeping

App Files Files Community

gusdelact committed on May 18

Commit

55d52e5

verified ·

1 Parent(s): 9fd48e5

Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

rag_books_mcp/app.py +34 -15
rag_books_mcp/ingest.py +17 -0
rag_books_mcp/tools.py +64 -12

rag_books_mcp/app.py CHANGED Viewed

@@ -33,14 +33,22 @@ def _build_search_tab() -> gr.Interface:
                 value="bias-variance tradeoff",
                 placeholder="Consulta en lenguaje natural",
             ),
-            gr.Radio(choices=["both", "esl", "islp"], value="both", label="book"),
             gr.Slider(minimum=1, maximum=10, step=1, value=5, label="top_k"),
         ],
         outputs=gr.Markdown(label="Resultados"),
         title="🔎 search_theory",
         description=(
-            "Búsqueda semántica en ESL e ISLP. Devuelve los fragmentos más "
-            "relevantes ordenados por similitud."
         ),
         api_name="search_theory",
     )
@@ -50,7 +58,7 @@ def _build_get_section_tab() -> gr.Interface:
     return gr.Interface(
         fn=get_section,
         inputs=[
-            gr.Radio(choices=["esl", "islp"], value="islp", label="book"),
             gr.Textbox(
                 label="chapter",
                 value="8 Tree-Based Methods",
@@ -66,8 +74,8 @@ def _build_get_section_tab() -> gr.Interface:
         outputs=gr.Markdown(label="Sección"),
         title="📑 get_section",
         description=(
-            "Recupera una sección específica de ESL o ISLP. Si no se encuentra "
-            "por metadata, hace fallback a búsqueda semántica."
         ),
         api_name="get_section",
     )
@@ -80,7 +88,7 @@ def _build_cite_tab() -> gr.Interface:
             gr.Textbox(
                 label="topic",
                 value="ridge regression",
-                placeholder="Tema a fundamentar",
             ),
             gr.Radio(
                 choices=["brief", "medium", "deep"],
@@ -91,8 +99,9 @@ def _build_cite_tab() -> gr.Interface:
         outputs=gr.Markdown(label="Fundamentación"),
         title="📚 cite_foundation",
         description=(
-            "Fundamentación teórica que cita ambos libros: ISLP (intuitivo) y "
-            "ESL (riguroso)."
         ),
         api_name="cite_foundation",
     )
@@ -111,16 +120,26 @@ def _build_list_topics_tab() -> gr.Interface:
 def build_demo() -> gr.Blocks:
     """Construye la UI tabulada del MCP Server v2."""
-    with gr.Blocks(title="rag-books-mcp v2 · ESL + ISLP") as demo:
         gr.Markdown(
             """
-            # 📖 RAG Books MCP v2 — ESL + ISLP
-            Servidor MCP que expone búsqueda semántica sobre dos libros de
-            referencia de Statistical Learning:
             - **ESL** — *The Elements of Statistical Learning* (Hastie, Tibshirani, Friedman)
             - **ISLP** — *An Introduction to Statistical Learning with Python* (James, Witten, Hastie, Tibshirani)
             **v2 vs v1:** la base ChromaDB se carga desde el dataset HF
             `gusdelact/rag-esl-islp-chromadb` en lugar de empaquetarla con el
@@ -129,9 +148,9 @@ def build_demo() -> gr.Blocks:
             **Endpoint MCP:** `/gradio_api/mcp/` (streamable HTTP).
             **Embeddings:** `sentence-transformers/all-MiniLM-L6-v2` (local, sin API key).
-            **Vector store:** ChromaDB con 1977 chunks (1093 ESL + 884 ISLP).
-            La primera tool call descarga el dataset (~40 MB). Las siguientes
             son cache hit.
             """
         )

                 value="bias-variance tradeoff",
                 placeholder="Consulta en lenguaje natural",
             ),
+            gr.Radio(
+                choices=["all", "both", "esl", "islp", "fes", "pdsh"],
+                value="all",
+                label="book",
+                info=(
+                    "R4DS no se ofrece en este Space por su licencia CC BY-NC-ND. "
+                    "Disponible solo en la variante local con RAG_CHROMA_DIR."
+                ),
+            ),
             gr.Slider(minimum=1, maximum=10, step=1, value=5, label="top_k"),
         ],
         outputs=gr.Markdown(label="Resultados"),
         title="🔎 search_theory",
         description=(
+            "Búsqueda semántica en ESL, ISLP, FES y PDSH. Devuelve los fragmentos "
+            "más relevantes ordenados por similitud."
         ),
         api_name="search_theory",
     )
     return gr.Interface(
         fn=get_section,
         inputs=[
+            gr.Radio(choices=["esl", "islp", "fes", "pdsh"], value="islp", label="book"),
             gr.Textbox(
                 label="chapter",
                 value="8 Tree-Based Methods",
         outputs=gr.Markdown(label="Sección"),
         title="📑 get_section",
         description=(
+            "Recupera una sección específica de ESL, ISLP, FES o PDSH. Si no se "
+            "encuentra por metadata, hace fallback a búsqueda semántica."
         ),
         api_name="get_section",
     )
             gr.Textbox(
                 label="topic",
                 value="ridge regression",
+                placeholder="Tema a fundamentar (ej: 'bagging', 'feature selection')",
             ),
             gr.Radio(
                 choices=["brief", "medium", "deep"],
         outputs=gr.Markdown(label="Fundamentación"),
         title="📚 cite_foundation",
         description=(
+            "Fundamentación teórica que cita los libros publicados: "
+            "ISLP (intuitivo), ESL (riguroso), FES (feature engineering) y "
+            "PDSH (código Python)."
         ),
         api_name="cite_foundation",
     )
 def build_demo() -> gr.Blocks:
     """Construye la UI tabulada del MCP Server v2."""
+    with gr.Blocks(title="rag-books-mcp v2 · ESL + ISLP + FES + PDSH") as demo:
         gr.Markdown(
             """
+            # 📖 RAG Books MCP v2 — ESL + ISLP + FES + PDSH
+            Servidor MCP que expone búsqueda semántica sobre cuatro libros de
+            referencia de Statistical Learning y Data Science:
             - **ESL** — *The Elements of Statistical Learning* (Hastie, Tibshirani, Friedman)
             - **ISLP** — *An Introduction to Statistical Learning with Python* (James, Witten, Hastie, Tibshirani)
+            - **FES** — *Feature Engineering and Selection* (Kuhn, Johnson)
+            - **PDSH** — *Python Data Science Handbook* (VanderPlas)
+            > ℹ️ **Sobre R4DS** — *R for Data Science, 2nd Ed.* (Wickham et al.)
+            > está indexado en la **variante local** del server pero **NO**
+            > en este Space. La razón es su licencia CC BY-NC-ND 3.0 US
+            > (NoDerivatives), incompatible con redistribución pública en
+            > formato vectorial. Para usarlo, corre el server localmente con
+            > `RAG_CHROMA_DIR` apuntando a tu propio `chroma_db/` con R4DS
+            > indexado. Detalles en el [DATA_CARD del dataset](https://huggingface.co/datasets/gusdelact/rag-esl-islp-chromadb).
             **v2 vs v1:** la base ChromaDB se carga desde el dataset HF
             `gusdelact/rag-esl-islp-chromadb` en lugar de empaquetarla con el
             **Endpoint MCP:** `/gradio_api/mcp/` (streamable HTTP).
             **Embeddings:** `sentence-transformers/all-MiniLM-L6-v2` (local, sin API key).
+            **Vector store:** ChromaDB con 3005 chunks (1093 ESL + 884 ISLP + 465 FES + 563 PDSH).
+            La primera tool call descarga el dataset (~95 MB). Las siguientes
             son cache hit.
             """
         )

rag_books_mcp/ingest.py CHANGED Viewed

@@ -43,6 +43,21 @@ BOOKS_CONFIG = {
         "collection": "pdsh_chapters",
         "full_name": "Python Data Science Handbook (VanderPlas)",
     },
 }
@@ -173,6 +188,8 @@ def ingest_book(books_dir: Path, book_key: str, client: chromadb.ClientAPI, embe
     total_chunks = 0
     md_files = sorted(chapters_dir.glob("*.md"))
     print(f"\n  📚 {config['full_name']}")
     print(f"     Archivos encontrados: {len(md_files)}")

         "collection": "pdsh_chapters",
         "full_name": "Python Data Science Handbook (VanderPlas)",
     },
+    "r4ds": {
+        "dir_name": "capitulos_r4ds",
+        "collection": "r4ds_chapters",
+        "full_name": (
+            "R for Data Science, 2nd Edition "
+            "(Wickham, Çetinkaya-Rundel, Grolemund) — examples in R; "
+            "principles transfer to pandas/Python"
+        ),
+        # ⚠️ R4DS está bajo licencia CC BY-NC-ND 3.0 US (NoDerivatives).
+        # `local_only=True` indica que esta colección NO debe redistribuirse
+        # como dataset HF público. `publish_chroma_dataset.py` la elimina del
+        # snapshot antes de subir al Hub. Para uso local (RAG_CHROMA_DIR) el
+        # comportamiento es transparente.
+        "local_only": True,
+    },
 }
     total_chunks = 0
     md_files = sorted(chapters_dir.glob("*.md"))
+    # Excluir archivos README*: el resto de los *.md se ingiere como capítulos
+    md_files = [f for f in md_files if not f.name.upper().startswith("README")]
     print(f"\n  📚 {config['full_name']}")
     print(f"     Archivos encontrados: {len(md_files)}")

rag_books_mcp/tools.py CHANGED Viewed

@@ -132,22 +132,30 @@ def search_theory(
     top_k: int = 5,
 ) -> str:
     """
-    Busca fragmentos relevantes en los libros ESL, ISLP, FES y PDSH usando búsqueda semántica.
     Args:
         query (str): Consulta en lenguaje natural (ej: "bias-variance tradeoff",
-                     "regularización L1 vs L2", "random forest out-of-bag error").
-        book (str): Libro donde buscar. Opciones: "esl", "islp", "fes", "pdsh",
-                    "both" (ESL+ISLP, retro-compat) o "all" (los 4, default).
         top_k (int): Número de resultados a devolver (default: 5, máximo: 10).
     Returns:
         str: Fragmentos relevantes con metadata (libro, capítulo, sección, similitud).
     """
     top_k = min(max(int(top_k), 1), 10)
     # "both" se mantiene para retro-compatibilidad (ESL + ISLP)
-    # "all" incluye FES también
     collections_to_search = []
     if book in ("esl", "both", "all"):
         try:
@@ -169,6 +177,11 @@ def search_theory(
             collections_to_search.append(("PDSH", get_collection("pdsh_chapters")))
         except Exception:
             pass
     if not collections_to_search:
         return (
@@ -219,7 +232,7 @@ def get_section(
     Recupera una sección específica de un libro por referencia exacta.
     Args:
-        book (str): Libro a consultar. Opciones: "esl", "islp", "fes" o "pdsh".
         chapter (str): Nombre del capítulo (búsqueda parcial soportada).
         section (str): Nombre de la sección dentro del capítulo (opcional).
         max_chunks (int): Máximo de chunks a devolver (default: 5).
@@ -232,7 +245,7 @@ def get_section(
     try:
         collection = get_collection(collection_name)
     except Exception:
-        return f"❌ Colección '{collection_name}' no encontrada. Opciones: esl, islp, fes, pdsh"
     try:
         if section:
@@ -292,17 +305,19 @@ def cite_foundation(
     detail_level: str = "medium",
 ) -> str:
     """
-    Devuelve la fundamentación teórica para un tema citando los libros (ESL + ISLP + FES + PDSH).
     Args:
         topic (str): Tema a fundamentar (ej: "ridge regression", "bagging",
-                     "feature engineering", "missing data imputation").
         detail_level (str): "brief" (1-2), "medium" (3-4) o "deep" (6-8).
     Returns:
         str: Fundamentación teórica con citas, organizada de intuitivo (ISLP)
-             a riguroso (ESL), más prácticas de feature engineering (FES)
-             y código práctico (PDSH).
     """
     top_k_map = {"brief": 2, "medium": 4, "deep": 8}
     top_k = top_k_map.get(detail_level, 4)
@@ -375,7 +390,24 @@ def cite_foundation(
     except Exception:
         pass
-    if not islp_results and not esl_results and not fes_results and not pdsh_results:
         return (
             f"❌ No se encontró fundamentación para '{topic}'. "
             "Verifica que la ingesta se haya ejecutado correctamente."
@@ -422,6 +454,20 @@ def cite_foundation(
                 f"{r['content'][:1200]}\n\n---\n"
             )
     output_parts.append("\n## 📚 Referencias\n")
     if islp_results:
         chapters = set(r["chapter"] for r in islp_results)
@@ -435,6 +481,11 @@ def cite_foundation(
     if pdsh_results:
         chapters = set(r["chapter"] for r in pdsh_results)
         output_parts.append(f"- **PDSH:** {', '.join(chapters)}\n")
     return "\n".join(output_parts)
@@ -453,6 +504,7 @@ def list_available_topics() -> str:
         ("ISLP", "islp_chapters"),
         ("FES", "fes_chapters"),
         ("PDSH", "pdsh_chapters"),
     ]:
         try:
             collection = get_collection(collection_name)

     top_k: int = 5,
 ) -> str:
     """
+    Busca fragmentos relevantes en los libros ESL, ISLP, FES, PDSH y R4DS usando búsqueda semántica.
     Args:
         query (str): Consulta en lenguaje natural (ej: "bias-variance tradeoff",
+                     "regularización L1 vs L2", "random forest out-of-bag error",
+                     "exploratory data analysis iterative cycle").
+        book (str): Libro donde buscar. Opciones: "esl", "islp", "fes", "pdsh", "r4ds",
+                    "both" (ESL+ISLP, retro-compat) o "all" (los 5, default).
         top_k (int): Número de resultados a devolver (default: 5, máximo: 10).
     Returns:
         str: Fragmentos relevantes con metadata (libro, capítulo, sección, similitud).
+    Nota:
+        R4DS está escrito en R (tidyverse). Sus principios de EDA, transformación
+        de datos y manipulación tabular son agnósticos del lenguaje y se traducen
+        directamente a pandas/Python; el código en R debe leerse como pseudocódigo.
+        En v2, R4DS solo está disponible cuando se usa con `RAG_CHROMA_DIR` local;
+        no se publica en el dataset HF por su licencia CC BY-NC-ND 3.0 US.
     """
     top_k = min(max(int(top_k), 1), 10)
     # "both" se mantiene para retro-compatibilidad (ESL + ISLP)
+    # "all" incluye FES, PDSH y R4DS también
     collections_to_search = []
     if book in ("esl", "both", "all"):
         try:
             collections_to_search.append(("PDSH", get_collection("pdsh_chapters")))
         except Exception:
             pass
+    if book in ("r4ds", "all"):
+        try:
+            collections_to_search.append(("R4DS", get_collection("r4ds_chapters")))
+        except Exception:
+            pass
     if not collections_to_search:
         return (
     Recupera una sección específica de un libro por referencia exacta.
     Args:
+        book (str): Libro a consultar. Opciones: "esl", "islp", "fes", "pdsh" o "r4ds".
         chapter (str): Nombre del capítulo (búsqueda parcial soportada).
         section (str): Nombre de la sección dentro del capítulo (opcional).
         max_chunks (int): Máximo de chunks a devolver (default: 5).
     try:
         collection = get_collection(collection_name)
     except Exception:
+        return f"❌ Colección '{collection_name}' no encontrada. Opciones: esl, islp, fes, pdsh, r4ds"
     try:
         if section:
     detail_level: str = "medium",
 ) -> str:
     """
+    Devuelve la fundamentación teórica para un tema citando los libros (ESL + ISLP + FES + PDSH + R4DS).
     Args:
         topic (str): Tema a fundamentar (ej: "ridge regression", "bagging",
+                     "feature engineering", "missing data imputation",
+                     "exploratory data analysis").
         detail_level (str): "brief" (1-2), "medium" (3-4) o "deep" (6-8).
     Returns:
         str: Fundamentación teórica con citas, organizada de intuitivo (ISLP)
+             a riguroso (ESL), más prácticas de feature engineering (FES),
+             código práctico Python (PDSH) y workflow iterativo de EDA / data
+             wrangling (R4DS, ejemplos en R).
     """
     top_k_map = {"brief": 2, "medium": 4, "deep": 8}
     top_k = top_k_map.get(detail_level, 4)
     except Exception:
         pass
+    r4ds_results = []
+    try:
+        r4ds_col = get_collection("r4ds_chapters")
+        res = r4ds_col.query(query_texts=[topic], n_results=top_k)
+        if res["documents"] and res["documents"][0]:
+            for doc, meta, dist in zip(
+                res["documents"][0], res["metadatas"][0], res["distances"][0]
+            ):
+                r4ds_results.append({
+                    "content": doc,
+                    "chapter": meta.get("chapter", ""),
+                    "section": meta.get("section", ""),
+                    "similarity": 1 - dist,
+                })
+    except Exception:
+        pass
+    if not islp_results and not esl_results and not fes_results and not pdsh_results and not r4ds_results:
         return (
             f"❌ No se encontró fundamentación para '{topic}'. "
             "Verifica que la ingesta se haya ejecutado correctamente."
                 f"{r['content'][:1200]}\n\n---\n"
             )
+    if r4ds_results:
+        output_parts.append(
+            "\n## 📕 R4DS (EDA & Data Wrangling — ejemplos en R, principios universales)\n"
+            "> ⚠️ El código está en R con tidyverse. Léelo como pseudocódigo: el flujo, "
+            "las heurísticas y la filosofía iterativa de EDA se traducen directamente a "
+            "pandas/Python.\n"
+        )
+        for i, r in enumerate(r4ds_results, 1):
+            output_parts.append(
+                f"### [{i}] Cap. {r['chapter']} § {r['section']} "
+                f"(sim: {r['similarity']:.3f})\n\n"
+                f"{r['content'][:1200]}\n\n---\n"
+            )
     output_parts.append("\n## 📚 Referencias\n")
     if islp_results:
         chapters = set(r["chapter"] for r in islp_results)
     if pdsh_results:
         chapters = set(r["chapter"] for r in pdsh_results)
         output_parts.append(f"- **PDSH:** {', '.join(chapters)}\n")
+    if r4ds_results:
+        chapters = set(r["chapter"] for r in r4ds_results)
+        output_parts.append(
+            f"- **R4DS:** {', '.join(chapters)} _(R / tidyverse — principios transferibles a pandas)_\n"
+        )
     return "\n".join(output_parts)
         ("ISLP", "islp_chapters"),
         ("FES", "fes_chapters"),
         ("PDSH", "pdsh_chapters"),
+        ("R4DS", "r4ds_chapters"),
     ]:
         try:
             collection = get_collection(collection_name)