Spaces:

gusdelact
/

rag-books-mcp-v2

Sleeping

App Files Files Community

gusdelact committed on May 18

Commit

2e3520f

verified ·

1 Parent(s): 7689f43

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

rag_books_mcp/__init__.py +10 -0
rag_books_mcp/app.py +158 -0
rag_books_mcp/ingest.py +244 -0
rag_books_mcp/server.py +75 -0
rag_books_mcp/tools.py +412 -0

rag_books_mcp/__init__.py ADDED Viewed

	@@ -0,0 +1,10 @@

+# RAG Books MCP Server v2
+"""MCP Server v2 que expone herramientas RAG sobre ESL e ISLP.
+Diferencia clave con v1: la base vectorial ChromaDB no se empaqueta junto al
+código. Se publica como dataset en HF Hub (`gusdelact/rag-esl-islp-chromadb`
+por default) y este server hace `snapshot_download` la primera vez que se
+necesita. Ver `tools.py` para los detalles.
+"""
+__version__ = "2.0.0"

rag_books_mcp/app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+"""
+Gradio app v2 que expone las 4 tools como MCP Server (streamable HTTP).
+Diferencia con v1: la base ChromaDB se descarga del dataset HF Hub al primer
+uso. La carga del modelo de embeddings + descarga del snapshot se hace lazy
+en la primera tool call, no al arrancar el Space.
+Local:
+    uv run python -m rag_books_mcp.app
+HF Spaces:
+    Ver `deploy_to_hf_space.py`.
+"""
+from __future__ import annotations
+import gradio as gr
+from rag_books_mcp.tools import (
+    cite_foundation,
+    get_section,
+    list_available_topics,
+    search_theory,
+)
def _build_search_tab() -> gr.Interface:
    """Build the Gradio interface that exposes the ``search_theory`` tool."""
    # Input widgets, created in the positional order of search_theory's
    # parameters (query, book, top_k).
    query_box = gr.Textbox(
        label="query",
        value="bias-variance tradeoff",
        placeholder="Consulta en lenguaje natural",
    )
    book_choice = gr.Radio(choices=["both", "esl", "islp"], value="both", label="book")
    top_k_slider = gr.Slider(minimum=1, maximum=10, step=1, value=5, label="top_k")
    results_view = gr.Markdown(label="Resultados")
    summary = (
        "Búsqueda semántica en ESL e ISLP. Devuelve los fragmentos más "
        "relevantes ordenados por similitud."
    )
    return gr.Interface(
        fn=search_theory,
        inputs=[query_box, book_choice, top_k_slider],
        outputs=results_view,
        title="🔎 search_theory",
        description=summary,
        api_name="search_theory",
    )
def _build_get_section_tab() -> gr.Interface:
    """Build the Gradio interface that exposes the ``get_section`` tool."""
    # Input widgets, created in the positional order of get_section's
    # parameters (book, chapter, section, max_chunks).
    book_choice = gr.Radio(choices=["esl", "islp"], value="islp", label="book")
    chapter_box = gr.Textbox(
        label="chapter",
        value="8 Tree-Based Methods",
        placeholder="Nombre del capítulo (búsqueda parcial soportada)",
    )
    section_box = gr.Textbox(
        label="section",
        value="",
        placeholder="(Opcional) Nombre de la sección",
    )
    max_chunks_slider = gr.Slider(minimum=1, maximum=15, step=1, value=5, label="max_chunks")
    section_view = gr.Markdown(label="Sección")
    summary = (
        "Recupera una sección específica de ESL o ISLP. Si no se encuentra "
        "por metadata, hace fallback a búsqueda semántica."
    )
    return gr.Interface(
        fn=get_section,
        inputs=[book_choice, chapter_box, section_box, max_chunks_slider],
        outputs=section_view,
        title="📑 get_section",
        description=summary,
        api_name="get_section",
    )
def _build_cite_tab() -> gr.Interface:
    """Build the Gradio interface that exposes the ``cite_foundation`` tool."""
    # Input widgets, created in the positional order of cite_foundation's
    # parameters (topic, detail_level).
    topic_box = gr.Textbox(
        label="topic",
        value="ridge regression",
        placeholder="Tema a fundamentar",
    )
    detail_choice = gr.Radio(
        choices=["brief", "medium", "deep"],
        value="medium",
        label="detail_level",
    )
    foundation_view = gr.Markdown(label="Fundamentación")
    summary = (
        "Fundamentación teórica que cita ambos libros: ISLP (intuitivo) y "
        "ESL (riguroso)."
    )
    return gr.Interface(
        fn=cite_foundation,
        inputs=[topic_box, detail_choice],
        outputs=foundation_view,
        title="📚 cite_foundation",
        description=summary,
        api_name="cite_foundation",
    )
def _build_list_topics_tab() -> gr.Interface:
    """Build the Gradio interface that exposes ``list_available_topics``.

    The tool takes no arguments, so the interface has no input widgets.
    """
    index_view = gr.Markdown(label="Contenido indexado")
    return gr.Interface(
        fn=list_available_topics,
        inputs=[],
        outputs=index_view,
        title="🗂️ list_available_topics",
        description="Lista los capítulos y secciones indexados en ChromaDB.",
        api_name="list_available_topics",
    )
def build_demo() -> gr.Blocks:
    """Assemble the tabbed UI of the MCP Server v2 and return it (not launched)."""
    # Introductory text shown above the tabs. The literal is user-facing and
    # kept exactly as authored, including its indentation.
    intro_markdown = """
            # 📖 RAG Books MCP v2 — ESL + ISLP
            Servidor MCP que expone búsqueda semántica sobre dos libros de
            referencia de Statistical Learning:
            - **ESL** — *The Elements of Statistical Learning* (Hastie, Tibshirani, Friedman)
            - **ISLP** — *An Introduction to Statistical Learning with Python* (James, Witten, Hastie, Tibshirani)
            **v2 vs v1:** la base ChromaDB se carga desde el dataset HF
            `gusdelact/rag-esl-islp-chromadb` en lugar de empaquetarla con el
            código. Permite versionar el índice independientemente y reusarlo
            desde otros clientes.
            **Endpoint MCP:** `/gradio_api/mcp/` (streamable HTTP).
            **Embeddings:** `sentence-transformers/all-MiniLM-L6-v2` (local, sin API key).
            **Vector store:** ChromaDB con 1977 chunks (1093 ESL + 884 ISLP).
            La primera tool call descarga el dataset (~40 MB). Las siguientes
            son cache hit.
            """
    with gr.Blocks(title="rag-books-mcp v2 · ESL + ISLP") as demo:
        gr.Markdown(intro_markdown)
        # One (tab name, interface) pair per tool, in display order. The
        # interfaces are built inside the Blocks context, as before.
        tabs = [
            ("search_theory", _build_search_tab()),
            ("cite_foundation", _build_cite_tab()),
            ("get_section", _build_get_section_tab()),
            ("list_available_topics", _build_list_topics_tab()),
        ]
        gr.TabbedInterface(
            interface_list=[interface for _, interface in tabs],
            tab_names=[name for name, _ in tabs],
        )
    return demo
def main() -> None:
    """Build the demo and launch it with the MCP server enabled, bound to 0.0.0.0."""
    build_demo().launch(mcp_server=True, server_name="0.0.0.0")


if __name__ == "__main__":
    main()

rag_books_mcp/ingest.py ADDED Viewed

	@@ -0,0 +1,244 @@

+"""
+Script de ingesta: vectoriza los capítulos de ESL e ISLP en ChromaDB.
+Uso:
+    python -m rag_books_mcp.ingest --books-dir ../ebook
+Esto crea/actualiza la base vectorial en ./chroma_db/
+"""
+import os
+import re
+import argparse
+from pathlib import Path
+import chromadb
+from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
# --- Configuration ---
# Sentence-transformers model name handed to the Chroma embedding function.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# Chunk size in approximate tokens (characters / 4).
CHUNK_SIZE = 600
# Overlap between consecutive chunks, in the same approximate-token unit.
CHUNK_OVERLAP = 100
# Default ChromaDB location: a `chroma_db` folder two levels above this file.
CHROMA_DIR = Path(__file__).parent.parent / "chroma_db"

# Per-book settings, keyed by the short book id:
#   dir_name   - folder (under the books directory) holding the chapter files
#   collection - name of the ChromaDB collection the book is stored in
#   full_name  - human-readable citation stored in every chunk's metadata
BOOKS_CONFIG = {
    "esl": {
        "dir_name": "capitulos_TheElementsOfStatisticalLearning",
        "collection": "esl_chapters",
        "full_name": "The Elements of Statistical Learning (Hastie, Tibshirani, Friedman)",
    },
    "islp": {
        "dir_name": "capitulos_islp",
        "collection": "islp_chapters",
        "full_name": "An Introduction to Statistical Learning with Python (James, Witten, Hastie, Tibshirani)",
    },
}
def extract_chapter_info(filename: str) -> dict:
    """Split a chapter filename into its ordering prefix and a readable title.

    The expected shape is ``04_3_Linear_Methods_for_Regression.md``: the part
    of the stem before the first underscore is the file order, and the rest,
    with underscores turned into spaces, is the chapter title. A stem without
    any underscore is used as both the order and the title.

    Returns:
        A dict with the keys ``file_order`` and ``chapter_title``.
    """
    stem = Path(filename).stem
    order, separator, remainder = stem.partition("_")
    title = remainder.replace("_", " ") if separator else stem
    return {"file_order": order, "chapter_title": title}
# Markdown ATX headers of level 1-3. The separator after the `#` run is
# restricted to horizontal whitespace (`[^\S\r\n]+` instead of `\s+`) so that
# a bare "##" line cannot reach across the newline and capture the following
# body line as its title.
_HEADER_PATTERN = re.compile(r"^(#{1,3})[^\S\r\n]+(.+)$", re.MULTILINE)


def split_by_sections(text: str, chapter_title: str) -> list[dict]:
    """Split chapter text into sections delimited by markdown headers.

    Headers of level 1 to 3 (``#``, ``##``, ``###``) start a new section whose
    content runs until the next such header or the end of the text.

    Args:
        text: Cleaned chapter text.
        chapter_title: Title used for text that is not under any header.

    Returns:
        A list of dicts with the keys ``title``, ``level`` and ``content``.
        Text without any header yields a single level-1 section. Text before
        the first header is kept only when longer than 50 characters, and a
        header's section only when its content is longer than 30 characters.
    """
    matches = list(_HEADER_PATTERN.finditer(text))
    if not matches:
        # No headers at all: the whole text is one section.
        return [{"title": chapter_title, "level": 1, "content": text.strip()}]

    sections = []
    preamble = text[: matches[0].start()].strip()
    if len(preamble) > 50:
        sections.append({"title": chapter_title, "level": 1, "content": preamble})

    # Each section ends where the next header starts; the last one ends at EOF.
    section_ends = [match.start() for match in matches[1:]] + [len(text)]
    for match, end in zip(matches, section_ends):
        content = text[match.end():end].strip()
        if len(content) > 30:
            sections.append({
                "title": match.group(2).strip(),
                "level": len(match.group(1)),
                "content": content,
            })
    return sections
def _find_cut_point(text: str, lower: int, upper: int) -> int:
    """Return the index in ``[lower, upper)`` at which a chunk should end.

    Preference order: the last paragraph break (blank line), then the last
    sentence end (". "), then the last single newline. ``upper`` is returned
    when none of them occurs in the window.
    """
    cut = text.rfind("\n\n", lower, upper)
    if cut != -1:
        return cut
    cut = text.rfind(". ", lower, upper)
    if cut != -1:
        return cut + 1  # keep the period with the sentence it ends
    cut = text.rfind("\n", lower, upper)
    return cut if cut != -1 else upper


def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split ``text`` into overlapping chunks, cutting at natural boundaries.

    Sizes are expressed in approximate tokens and converted to characters at
    4 characters per token. Each chunk ends at a paragraph break, sentence end
    or newline found in the second half of its window when one exists, and the
    next chunk starts ``overlap`` tokens before that cut.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk size in approximate tokens.
        overlap: Overlap between consecutive chunks in approximate tokens.

    Returns:
        The stripped, non-empty chunks in order. Text that fits in one chunk
        is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of tokens")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    char_size = chunk_size * 4
    char_overlap = overlap * 4
    text_len = len(text)
    if text_len <= char_size:
        return [text]

    chunks = []
    start = 0
    while start < text_len:
        end = min(start + char_size, text_len)
        if end < text_len:
            end = _find_cut_point(text, start + char_size // 2, end)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        if end >= text_len:
            # The text is exhausted. Stepping back by the overlap here would
            # only emit a tail already contained in the chunk just produced.
            break
        # Step back by the overlap, but always move forward: when the overlap
        # is at least as large as the part just consumed, drop it for this step.
        next_start = end - char_overlap
        start = next_start if next_start > start else end
    return chunks
def clean_text(text: str) -> str:
    """Strip PDF-extraction artefacts from a chapter's raw text.

    The substitutions run in this order: page markers, lines holding only
    digits, runs of four or more newlines (collapsed to three), and copyright
    lines. The result is stripped of surrounding whitespace.
    """
    substitutions = (
        (r"---\s*Página\s*\d+\s*---", "", 0),  # page markers
        (r"^\d+\s*$", "", re.MULTILINE),  # stray page numbers on their own line
        (r"\n{4,}", "\n\n\n", 0),  # excess blank lines
        (r"©.*?(?:\n|$)", "", 0),  # copyright notices
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
def ingest_book(books_dir: Path, book_key: str, client: chromadb.ClientAPI, embedding_fn) -> int:
    """Rebuild one book's ChromaDB collection from its chapter markdown files.

    Every ``*.md`` file in the book's chapter folder is cleaned, split into
    sections and chunked; each chunk is stored with metadata describing where
    it came from.

    Args:
        books_dir: Root folder containing one chapter folder per book.
        book_key: Key into ``BOOKS_CONFIG`` ("esl" or "islp").
        client: ChromaDB client that owns the collection.
        embedding_fn: Embedding function attached to the collection.

    Returns:
        The number of chunks stored, or 0 when the chapter folder is missing.
    """
    config = BOOKS_CONFIG[book_key]
    chapters_dir = books_dir / config["dir_name"]
    if not chapters_dir.exists():
        print(f"  ⚠️  Directorio no encontrado: {chapters_dir}")
        return 0

    # Start from an empty collection. The delete fails when the collection
    # does not exist yet (first run); that is expected. The exception class
    # for that case is not visible from this file, so the catch stays broad.
    try:
        client.delete_collection(config["collection"])
    except Exception:
        pass
    collection = client.get_or_create_collection(
        name=config["collection"],
        embedding_function=embedding_fn,
        metadata={"hnsw:space": "cosine"},
    )

    md_files = sorted(chapters_dir.glob("*.md"))
    print(f"\n  📚 {config['full_name']}")
    print(f"     Archivos encontrados: {len(md_files)}")

    total_chunks = 0
    # IDs are built from a truncated, sanitized section title, so two sections
    # of one chapter can produce the same ID. Track what was issued and
    # disambiguate, otherwise the second chunk would not be stored separately.
    used_ids: set[str] = set()
    for md_file in md_files:
        chapter_info = extract_chapter_info(md_file.name)
        text = clean_text(md_file.read_text(encoding="utf-8"))
        if len(text) < 100:
            continue  # too short to hold real content

        sections = split_by_sections(text, chapter_info["chapter_title"])
        for section in sections:
            chunks = chunk_text(section["content"])
            ids, documents, metadatas = [], [], []
            for i, chunk in enumerate(chunks):
                raw_id = f"{book_key}_{chapter_info['file_order']}_{section['title'][:30]}_{i}"
                base_id = re.sub(r"[^a-zA-Z0-9_-]", "_", raw_id)
                chunk_id = base_id
                duplicate = 1
                while chunk_id in used_ids:
                    duplicate += 1
                    chunk_id = f"{base_id}_dup{duplicate}"
                used_ids.add(chunk_id)

                ids.append(chunk_id)
                documents.append(chunk)
                metadatas.append({
                    "book": book_key,
                    "book_full_name": config["full_name"],
                    "chapter": chapter_info["chapter_title"],
                    "section": section["title"],
                    "section_level": section["level"],
                    "chunk_index": i,
                    "total_chunks_in_section": len(chunks),
                    "file": md_file.name,
                })
            if ids:
                # One add per section instead of one per chunk.
                collection.add(ids=ids, documents=documents, metadatas=metadatas)
                total_chunks += len(ids)
        print(f"     ✓ {md_file.name} → {len(sections)} secciones")
    print(f"     Total chunks: {total_chunks}")
    return total_chunks
def main():
    """Parse the command-line options and ingest every configured book into ChromaDB."""
    cli = argparse.ArgumentParser(description="Ingesta de libros ESL/ISLP en ChromaDB")
    cli.add_argument(
        "--books-dir",
        type=Path,
        default=Path(__file__).parent.parent.parent / "ebook",
        help="Directorio raíz con las carpetas de capítulos",
    )
    cli.add_argument(
        "--chroma-dir",
        type=Path,
        default=CHROMA_DIR,
        help="Directorio para la base de datos ChromaDB",
    )
    options = cli.parse_args()

    print("🔧 Inicializando embedding model...")
    embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)

    print(f"🗄️  ChromaDB persistente en: {options.chroma_dir}")
    client = chromadb.PersistentClient(path=str(options.chroma_dir))

    print("\n📖 Iniciando ingesta de libros...")
    # Books are ingested one after another, in BOOKS_CONFIG order.
    total = sum(
        ingest_book(options.books_dir, book_key, client, embedding_fn)
        for book_key in BOOKS_CONFIG
    )
    print(f"\n✅ Ingesta completada. Total de chunks vectorizados: {total}")
    print(f"   Base de datos en: {options.chroma_dir}")


if __name__ == "__main__":
    main()

rag_books_mcp/server.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+MCP Server v2 (transporte stdio) — RAG sobre ESL e ISLP.
+Diferencia con v1: la base ChromaDB se obtiene de un dataset HF Hub
+(ver `rag_books_mcp.tools` para la resolución de la ruta).
+"""
+from mcp.server.fastmcp import FastMCP
+from rag_books_mcp.tools import (
+    cite_foundation as _cite_foundation,
+    get_section as _get_section,
+    list_available_topics as _list_available_topics,
+    search_theory as _search_theory,
+)
# Server-level instructions passed to FastMCP (kept verbatim).
_SERVER_INSTRUCTIONS = (
    "RAG sobre los libros ESL e ISLP. v2: base vectorial ChromaDB cargada "
    "desde un dataset publicado en HF Hub (separación código/datos)."
)
# The single FastMCP application; the @mcp.tool() functions in this module
# register themselves on it.
mcp = FastMCP("rag-books-mcp-v2", instructions=_SERVER_INSTRUCTIONS)
# NOTE(review): the docstring is kept verbatim (in Spanish) because the
# function is registered through @mcp.tool(); it is presumably surfaced to
# MCP clients as the tool description - confirm before rewording it.
@mcp.tool()
def search_theory(query: str, book: str = "both", top_k: int = 5) -> str:
    """Busca fragmentos relevantes en ESL/ISLP usando búsqueda semántica.
    Args:
        query: Consulta en lenguaje natural (ej: "bias-variance tradeoff").
        book: "esl", "islp" o "both" (default: "both").
        top_k: Número de resultados (1-10, default: 5).
    """
    # Thin stdio-facing wrapper: the implementation lives in rag_books_mcp.tools.
    answer = _search_theory(query=query, book=book, top_k=top_k)
    return answer
# NOTE(review): the docstring is kept verbatim (in Spanish) because the
# function is registered through @mcp.tool(); it is presumably surfaced to
# MCP clients as the tool description - confirm before rewording it.
@mcp.tool()
def get_section(book: str, chapter: str, section: str = "", max_chunks: int = 5) -> str:
    """Recupera una sección específica de ESL o ISLP por referencia exacta.
    Args:
        book: "esl" o "islp".
        chapter: Nombre del capítulo (búsqueda parcial soportada).
        section: Nombre de la sección dentro del capítulo (opcional).
        max_chunks: Máximo de chunks a devolver (default: 5).
    """
    # Thin stdio-facing wrapper: the implementation lives in rag_books_mcp.tools.
    answer = _get_section(
        book=book,
        chapter=chapter,
        section=section,
        max_chunks=max_chunks,
    )
    return answer
# NOTE(review): the docstring is kept verbatim (in Spanish) because the
# function is registered through @mcp.tool(); it is presumably surfaced to
# MCP clients as the tool description - confirm before rewording it.
@mcp.tool()
def cite_foundation(topic: str, detail_level: str = "medium") -> str:
    """Fundamentación teórica de un tema citando ambos libros (ESL + ISLP).
    Args:
        topic: Tema a fundamentar (ej: "ridge regression", "bagging").
        detail_level: "brief", "medium" (default) o "deep".
    """
    # Thin stdio-facing wrapper: the implementation lives in rag_books_mcp.tools.
    answer = _cite_foundation(topic=topic, detail_level=detail_level)
    return answer
# NOTE(review): the docstring is kept verbatim (in Spanish) because the
# function is registered through @mcp.tool(); it is presumably surfaced to
# MCP clients as the tool description - confirm before rewording it.
@mcp.tool()
def list_available_topics() -> str:
    """Lista los capítulos y temas indexados en la base de conocimiento."""
    # Thin stdio-facing wrapper: the implementation lives in rag_books_mcp.tools.
    listing = _list_available_topics()
    return listing
def main():
    """Entry point: run the MCP server over the stdio transport."""
    transport = "stdio"
    mcp.run(transport=transport)


if __name__ == "__main__":
    main()

rag_books_mcp/tools.py ADDED Viewed

	@@ -0,0 +1,412 @@

+"""
+Lógica de las 4 tools de RAG sobre ESL e ISLP (v2).
+Diferencia con v1: la base ChromaDB se obtiene de un dataset publicado en
+HF Hub vía `snapshot_download`. La primera invocación tarda lo que tarde
+la descarga (~40 MB); las siguientes son cache hit.
+Variables de entorno:
+- RAG_CHROMA_DIR        Si está set y apunta a una carpeta existente, se usa
+                        en lugar del dataset (útil para dev local con índice
+                        recién regenerado por `ingest.py`).
+- RAG_CHROMA_DATASET    Repo del dataset HF a descargar.
+                        Default: gusdelact/rag-esl-islp-chromadb
+- RAG_CHROMA_REVISION   Revision (branch/tag/commit) del dataset.
+                        Default: main
+- RAG_CHROMA_CACHE_DIR  Directorio cache para el snapshot_download.
+                        Default: ~/.cache/rag-books-mcp/chroma_db (o /data/chroma_db
+                        si existe /data, como en HF Spaces con persistent storage).
+- HF_TOKEN              Solo si el dataset es privado.
+"""
+from __future__ import annotations
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+import chromadb
+from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
# --- Configuration ---
# Sentence-transformers model name handed to the Chroma embedding function.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# HF Hub dataset repo downloaded when RAG_CHROMA_DATASET is not set.
DEFAULT_DATASET = "gusdelact/rag-esl-islp-chromadb"
# Dataset revision used when RAG_CHROMA_REVISION is not set.
DEFAULT_REVISION = "main"
def _resolve_cache_dir() -> Path:
    """Decide where the dataset snapshot is stored.

    Priority:
      1. ``RAG_CHROMA_CACHE_DIR`` when set (a leading ``~`` is expanded).
      2. ``/data/chroma_db`` when ``/data`` exists (HF Spaces with persistent
         storage).
      3. ``~/.cache/rag-books-mcp/chroma_db``.
    """
    explicit = os.environ.get("RAG_CHROMA_CACHE_DIR")
    if explicit:
        # Environment values do not go through shell expansion, so "~/x" would
        # otherwise create a literal "~" folder under the working directory.
        return Path(explicit).expanduser()
    if Path("/data").is_dir():
        return Path("/data/chroma_db")
    return Path.home() / ".cache" / "rag-books-mcp" / "chroma_db"
# Per-process singletons, filled in lazily on first use.
# ChromaDB client, created by get_client().
_client: Optional[chromadb.ClientAPI] = None
# Embedding function, created by get_embedding_fn().
_embedding_fn = None
# Path to the ChromaDB folder, cached by _resolve_chroma_path().
_chroma_path_resolved: Optional[str] = None
def _resolve_chroma_path() -> str:
    """Resolve the folder to open as the persistent ChromaDB store.

    When ``RAG_CHROMA_DIR`` points to an existing folder it is used as is
    (after expanding a leading ``~``). Otherwise the dataset
    ``RAG_CHROMA_DATASET@RAG_CHROMA_REVISION`` is downloaded from HF Hub and
    the snapshot path is returned. The result is cached for the lifetime of
    the process.

    Returns:
        The path to the ChromaDB folder.
    """
    global _chroma_path_resolved
    if _chroma_path_resolved is not None:
        return _chroma_path_resolved

    override = os.environ.get("RAG_CHROMA_DIR")
    if override:
        local_dir = os.path.expanduser(override)
        if Path(local_dir).is_dir():
            print(f"[rag-books-mcp v2] Using local RAG_CHROMA_DIR: {local_dir}", file=sys.stderr)
            _chroma_path_resolved = local_dir
            return local_dir
        # Say so explicitly: a typo in the variable would otherwise trigger a
        # download with no hint that the override was ignored.
        print(
            f"[rag-books-mcp v2] RAG_CHROMA_DIR={override!r} is not an existing "
            "directory; falling back to the HF dataset.",
            file=sys.stderr,
        )

    repo_id = os.environ.get("RAG_CHROMA_DATASET", DEFAULT_DATASET)
    revision = os.environ.get("RAG_CHROMA_REVISION", DEFAULT_REVISION)
    cache_dir = _resolve_cache_dir()
    cache_dir.mkdir(parents=True, exist_ok=True)
    print(
        f"[rag-books-mcp v2] Downloading ChromaDB from HF dataset "
        f"{repo_id}@{revision} into {cache_dir} ...",
        file=sys.stderr,
    )
    # Imported lazily so the local-override path never loads huggingface_hub.
    from huggingface_hub import snapshot_download

    snapshot = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        revision=revision,
        cache_dir=str(cache_dir),
        token=os.environ.get("HF_TOKEN"),  # only needed for a private dataset
    )
    print(f"[rag-books-mcp v2] ChromaDB ready at {snapshot}", file=sys.stderr)
    _chroma_path_resolved = snapshot
    return snapshot
def get_client() -> chromadb.ClientAPI:
    """Return the process-wide persistent ChromaDB client, creating it on first use."""
    global _client
    if _client is not None:
        return _client
    _client = chromadb.PersistentClient(path=_resolve_chroma_path())
    return _client
def get_embedding_fn():
    """Return the process-wide embedding function for ``EMBEDDING_MODEL``, creating it on first use."""
    global _embedding_fn
    if _embedding_fn is not None:
        return _embedding_fn
    _embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    return _embedding_fn
def get_collection(name: str):
    """Fetch the ChromaDB collection ``name``, bound to the shared embedding function."""
    client = get_client()
    embedder = get_embedding_fn()
    return client.get_collection(name=name, embedding_function=embedder)
+# --- Tools (idénticas en comportamiento a v1) ---
def search_theory(
    query: str,
    book: str = "both",
    top_k: int = 5,
) -> str:
    """
    Busca fragmentos relevantes en los libros ESL e ISLP usando búsqueda semántica.
    Args:
        query (str): Consulta en lenguaje natural (ej: "bias-variance tradeoff",
                     "regularización L1 vs L2", "random forest out-of-bag error").
        book (str): Libro donde buscar. Opciones: "esl", "islp", "both" (default: "both").
        top_k (int): Número de resultados a devolver (default: 5, máximo: 10).
    Returns:
        str: Fragmentos relevantes con metadata (libro, capítulo, sección, similitud).
    """
    # NOTE(review): the docstring above is kept verbatim (in Spanish). This
    # function is handed to Gradio with the MCP server enabled, so the text is
    # presumably surfaced to clients as the tool description - confirm before
    # rewording it.

    # Reject bad input up front so the caller gets a precise message instead
    # of the generic "no collections" one.
    book = str(book).strip().lower()
    if book not in ("esl", "islp", "both"):
        return f"❌ Libro no válido: '{book}'. Opciones: esl, islp, both"
    if not query or not query.strip():
        return "❌ La consulta está vacía. Escribe un tema o pregunta."
    top_k = min(max(int(top_k), 1), 10)

    wanted = [
        (label, name)
        for key, label, name in (
            ("esl", "ESL", "esl_chapters"),
            ("islp", "ISLP", "islp_chapters"),
        )
        if book in (key, "both")
    ]
    collections_to_search = []
    for label, name in wanted:
        try:
            collections_to_search.append((label, get_collection(name)))
        except Exception as exc:
            # Best effort: one missing book must not break the other, but the
            # cause is reported on stderr instead of being dropped.
            print(
                f"[rag-books-mcp v2] Could not open collection {name}: {exc}",
                file=sys.stderr,
            )
    if not collections_to_search:
        return (
            "❌ No se encontraron colecciones. Verifica que el dataset HF "
            "esté disponible o ejecuta la ingesta local."
        )

    results = []
    for book_label, collection in collections_to_search:
        res = collection.query(query_texts=[query], n_results=top_k)
        if not (res["documents"] and res["documents"][0]):
            continue
        for doc, meta, dist in zip(
            res["documents"][0], res["metadatas"][0], res["distances"][0]
        ):
            results.append({
                "book": book_label,
                "chapter": meta.get("chapter", ""),
                "section": meta.get("section", ""),
                # The collections are created with cosine space by ingest.py,
                # so 1 - distance is used as the similarity.
                "similarity": 1 - dist,
                "content": doc,
            })

    # Merge both books and keep the globally best top_k hits.
    results.sort(key=lambda hit: hit["similarity"], reverse=True)
    results = results[:top_k]
    if not results:
        return f"No se encontraron resultados para: '{query}'"

    output_parts = [f"## Resultados para: \"{query}\"\n"]
    for i, r in enumerate(results, 1):
        output_parts.append(
            f"### [{i}] {r['book']} — {r['chapter']} § {r['section']}\n"
            f"**Similitud:** {r['similarity']:.3f}\n\n"
            f"{r['content'][:1500]}\n\n---\n"
        )
    return "\n".join(output_parts)
def _find_section_chunks(collection, chapter: str, section: str, limit: int) -> list:
    """Return up to ``limit`` (document, metadata) pairs matched by metadata.

    Matching is a case-insensitive substring test on the ``chapter`` and, when
    given, ``section`` metadata fields. It is done in Python because the
    partial match this tool promises is not expressed through the metadata
    ``where`` filter.
    """
    chapter_key = chapter.strip().casefold()
    section_key = section.strip().casefold()
    listing = collection.get(include=["metadatas"])
    wanted_ids = []
    for chunk_id, meta in zip(listing["ids"], listing["metadatas"] or []):
        meta = meta or {}
        if chapter_key not in str(meta.get("chapter", "")).casefold():
            continue
        if section_key and section_key not in str(meta.get("section", "")).casefold():
            continue
        wanted_ids.append(chunk_id)
        if len(wanted_ids) >= limit:
            break
    if not wanted_ids:
        return []

    fetched = collection.get(ids=wanted_ids, include=["documents", "metadatas"])
    by_id = {
        chunk_id: (doc, meta or {})
        for chunk_id, doc, meta in zip(
            fetched["ids"], fetched["documents"], fetched["metadatas"]
        )
    }
    # Present the chunks in the order they were matched, whatever order the
    # second lookup returned them in.
    return [by_id[chunk_id] for chunk_id in wanted_ids if chunk_id in by_id]


def _semantic_section_fallback(collection, book: str, chapter: str, section: str, limit: int) -> str:
    """Answer a section request with semantic search over "<chapter> <section>"."""
    search_query = f"{chapter} {section}".strip()
    results = collection.query(query_texts=[search_query], n_results=limit)
    if results["documents"] and results["documents"][0]:
        output_parts = [f"## {book.upper()} — {chapter}\n"]
        for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
            output_parts.append(
                f"### § {meta.get('section', 'N/A')}\n\n{doc}\n\n---\n"
            )
        return "\n".join(output_parts)
    return f"No se encontró el capítulo '{chapter}' en {book.upper()}"


def get_section(
    book: str,
    chapter: str,
    section: str = "",
    max_chunks: int = 5,
) -> str:
    """
    Recupera una sección específica de un libro por referencia exacta.
    Args:
        book (str): Libro a consultar. Opciones: "esl" o "islp".
        chapter (str): Nombre del capítulo (búsqueda parcial soportada).
        section (str): Nombre de la sección dentro del capítulo (opcional).
        max_chunks (int): Máximo de chunks a devolver (default: 5).
    Returns:
        str: Contenido de la sección con metadata.
    """
    # NOTE(review): the docstring above is kept verbatim (in Spanish). This
    # function is handed to Gradio with the MCP server enabled, so the text is
    # presumably surfaced to clients as the tool description - confirm before
    # rewording it.
    max_chunks = max(int(max_chunks), 1)  # a limit of 0 or less cannot be queried
    book = str(book).strip().lower()
    chapter = chapter or ""
    section = section or ""

    collection_name = f"{book}_chapters"
    not_found = f"❌ Colección '{collection_name}' no encontrada. Opciones: esl, islp"
    if book not in ("esl", "islp"):
        return not_found
    try:
        collection = get_collection(collection_name)
    except Exception as exc:
        print(
            f"[rag-books-mcp v2] Could not open collection {collection_name}: {exc}",
            file=sys.stderr,
        )
        return not_found

    try:
        matches = _find_section_chunks(collection, chapter, section, max_chunks)
    except Exception as exc:
        # Best effort: a failed metadata lookup degrades to semantic search.
        print(
            f"[rag-books-mcp v2] Metadata lookup failed, using semantic search: {exc}",
            file=sys.stderr,
        )
        matches = []
    if not matches:
        return _semantic_section_fallback(collection, book, chapter, section, max_chunks)

    heading = f"## {book.upper()} — {chapter}"
    if section:
        heading += f" § {section}"
    output_parts = [heading + "\n"]
    for doc, meta in matches:
        sec_title = meta.get("section", "")
        chunk_idx = meta.get("chunk_index", 0)
        total = meta.get("total_chunks_in_section", 1)
        output_parts.append(
            f"### § {sec_title} (parte {chunk_idx + 1}/{total})\n\n{doc}\n\n---\n"
        )
    return "\n".join(output_parts)
def _query_book_passages(collection_name: str, topic: str, top_k: int) -> list:
    """Return the ``top_k`` passages of one collection closest to ``topic``.

    Best effort: any failure (collection missing, query error) is reported on
    stderr and yields an empty list, so one book can still answer alone.
    """
    passages = []
    try:
        res = get_collection(collection_name).query(query_texts=[topic], n_results=top_k)
        if res["documents"] and res["documents"][0]:
            for doc, meta, dist in zip(
                res["documents"][0], res["metadatas"][0], res["distances"][0]
            ):
                passages.append({
                    "content": doc,
                    "chapter": meta.get("chapter", ""),
                    "section": meta.get("section", ""),
                    "similarity": 1 - dist,
                })
    except Exception as exc:
        print(
            f"[rag-books-mcp v2] Query on {collection_name} failed: {exc}",
            file=sys.stderr,
        )
    return passages


def _unique_chapters(passages: list) -> list:
    """Return the chapters cited by ``passages``, de-duplicated in first-seen order."""
    return list(dict.fromkeys(p["chapter"] for p in passages))


def cite_foundation(
    topic: str,
    detail_level: str = "medium",
) -> str:
    """
    Devuelve la fundamentación teórica para un tema citando ambos libros (ESL + ISLP).
    Args:
        topic (str): Tema a fundamentar (ej: "ridge regression", "bagging").
        detail_level (str): "brief" (1-2), "medium" (3-4) o "deep" (6-8).
    Returns:
        str: Fundamentación teórica con citas, organizada de intuitivo (ISLP)
             a riguroso (ESL).
    """
    # NOTE(review): the docstring above is kept verbatim (in Spanish). This
    # function is handed to Gradio with the MCP server enabled, so the text is
    # presumably surfaced to clients as the tool description - confirm before
    # rewording it.
    top_k_map = {"brief": 2, "medium": 4, "deep": 8}
    detail_level = str(detail_level).strip().lower()
    if detail_level not in top_k_map:
        # Unknown levels behave as "medium"; report the level actually used.
        detail_level = "medium"
    top_k = top_k_map[detail_level]

    islp_results = _query_book_passages("islp_chapters", topic, top_k)
    esl_results = _query_book_passages("esl_chapters", topic, top_k)
    if not islp_results and not esl_results:
        return (
            f"❌ No se encontró fundamentación para '{topic}'. "
            "Verifica que la ingesta se haya ejecutado correctamente."
        )

    output_parts = [
        f"# Fundamentación Teórica: {topic}\n",
        f"**Nivel de detalle:** {detail_level}\n",
    ]
    # ISLP first (intuitive), then ESL (rigorous).
    for heading, passages in (
        ("\n## 📘 ISLP (Explicación Intuitiva)\n", islp_results),
        ("\n## 📗 ESL (Tratamiento Riguroso)\n", esl_results),
    ):
        if not passages:
            continue
        output_parts.append(heading)
        for i, r in enumerate(passages, 1):
            output_parts.append(
                f"### [{i}] Cap. {r['chapter']} § {r['section']} "
                f"(sim: {r['similarity']:.3f})\n\n"
                f"{r['content'][:1200]}\n\n---\n"
            )

    output_parts.append("\n## 📚 Referencias\n")
    # First-seen order keeps the reference list identical from run to run; a
    # set would be joined in an order that changes between processes.
    if islp_results:
        output_parts.append(f"- **ISLP:** {', '.join(_unique_chapters(islp_results))}\n")
    if esl_results:
        output_parts.append(f"- **ESL:** {', '.join(_unique_chapters(esl_results))}\n")
    return "\n".join(output_parts)
def list_available_topics() -> str:
    """
    Lista los capítulos y temas indexados en la base de conocimiento.
    Returns:
        str: Lista organizada de capítulos por libro con sus secciones principales.
    """
    # NOTE(review): the docstring above is kept verbatim (in Spanish). This
    # function is handed to Gradio with the MCP server enabled, so the text is
    # presumably surfaced to clients as the tool description - confirm before
    # rewording it.
    lines = ["# 📚 Contenido Disponible en la Base de Conocimiento\n"]
    catalog = (("ESL", "esl_chapters"), ("ISLP", "islp_chapters"))
    for book_key, collection_name in catalog:
        try:
            collection = get_collection(collection_name)
            metadatas = collection.get(include=["metadatas"])["metadatas"]
            if not metadatas:
                lines.append(f"\n## {book_key}: Sin datos\n")
                continue

            # chapter title -> set of the section titles seen in its chunks
            sections_by_chapter = {}
            for meta in metadatas:
                bucket = sections_by_chapter.setdefault(meta.get("chapter", "Unknown"), set())
                section = meta.get("section", "")
                if section:
                    bucket.add(section)

            lines.append(f"\n## 📗 {book_key}\n")
            for chapter in sorted(sections_by_chapter):
                ordered = sorted(sections_by_chapter[chapter])
                lines.append(f"\n### {chapter}\n")
                # Show at most eight sections per chapter, then a remainder count.
                lines.extend(f"  - {sec}\n" for sec in ordered[:8])
                hidden = len(ordered) - 8
                if hidden > 0:
                    lines.append(f"  - ... y {hidden} secciones más\n")
            lines.append(f"\n**Total chunks indexados:** {collection.count()}\n")
        except Exception as e:
            # A failing book is reported inline so the other one is still listed.
            lines.append(f"\n## {book_key}: ❌ Error ({e})\n")
    return "\n".join(lines)