Spaces:

gusdelact
/

rag-books-mcp

Paused

App Files Files Community

rag-books-mcp / rag_books_mcp /ingest.py

gusdelact

Upload folder using huggingface_hub

4639111 verified about 2 months ago

Raw

History Blame Contribute Delete

9.34 kB

	"""
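	Ingestion script: vectorizes the chapters of every book listed in BOOKS_CONFIG
	(ESL, ISLP, FES, PDSH and R4DS) into ChromaDB.

	Usage:
	python -m rag_books_mcp.ingest --books-dir ../ebook

	This creates/updates the vector database in the chroma_db/ directory (override with --chroma-dir).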
	"""

	import os
	import re
	import argparse
	from pathlib import Path

	import chromadb
	from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction


	# --- Configuration ---
	# Model name handed to SentenceTransformerEmbeddingFunction in main().
	EMBEDDING_MODEL = "all-MiniLM-L6-v2"
	# Chunk size and overlap are expressed in approximate tokens; chunk_text()
	# multiplies both by 4 to obtain character counts.
	CHUNK_SIZE = 600 # approximate tokens (characters / 4)
	CHUNK_OVERLAP = 100
	# Default location of the persistent ChromaDB store (default of --chroma-dir):
	# a "chroma_db" folder next to the directory that contains this file.
	CHROMA_DIR = Path(__file__).parent.parent / "chroma_db"

	# Per-book settings, keyed by a short book identifier.
	#   dir_name   - subfolder of --books-dir that holds the chapter .md files
	#   collection - name of the ChromaDB collection the book is stored in
	#   full_name  - human-readable title, printed and stored in chunk metadata
	BOOKS_CONFIG = {
	"esl": {
	"dir_name": "capitulos_TheElementsOfStatisticalLearning",
	"collection": "esl_chapters",
	"full_name": "The Elements of Statistical Learning (Hastie, Tibshirani, Friedman)",
	},
	"islp": {
	"dir_name": "capitulos_islp",
	"collection": "islp_chapters",
	"full_name": "An Introduction to Statistical Learning with Python (James, Witten, Hastie, Tibshirani)",
	},
	"fes": {
	"dir_name": "capitulos_fes",
	"collection": "fes_chapters",
	"full_name": "Feature Engineering and Selection (Kuhn, Johnson)",
	},
	"pdsh": {
	"dir_name": "capitulos_pdsh",
	"collection": "pdsh_chapters",
	"full_name": "Python Data Science Handbook (VanderPlas)",
	},
	"r4ds": {
	"dir_name": "capitulos_r4ds",
	"collection": "r4ds_chapters",
	"full_name": (
	"R for Data Science, 2nd Edition "
	"(Wickham, Çetinkaya-Rundel, Grolemund) — examples in R; "
	"principles transfer to pandas/Python"
	),
	# Originally flagged local_only=True because of the CC BY-NC-ND 3.0 US
	# license. Disabled for explicit academic use; the decision is
	# documented in DATA_CARD.md and the authors have a quick takedown
	# mechanism should they object.
	# NOTE(review): this key is not read by any function visible in this
	# file - confirm where (or whether) it is consumed.
	"local_only": False,
	},
	}


	def extract_chapter_info(filename: str) -> dict:
	    """Split a chapter filename into its ordering prefix and a readable title.

	    The file stem is divided at its first underscore. For example
	    ``04_3_Linear_Methods_for_Regression.md`` yields file_order ``"04"`` and
	    chapter_title ``"3 Linear Methods for Regression"``. A stem without any
	    underscore is returned unchanged in both fields.

	    Args:
	        filename: File name (or path) of the chapter markdown file.

	    Returns:
	        A dict with the keys ``"file_order"`` and ``"chapter_title"``.
	    """
	    stem = Path(filename).stem
	    prefix, underscore, remainder = stem.partition("_")
	    if underscore:
	        # Remaining underscores are word separators in the title.
	        readable_title = remainder.replace("_", " ")
	    else:
	        readable_title = stem
	    return {"file_order": prefix, "chapter_title": readable_title}


	def split_by_sections(text: str, chapter_title: str) -> list[dict]:
	    """Split markdown text into sections at level 1-3 headers (#, ##, ###).

	    Lines that start with ``#`` inside a fenced code block (``` or ~~~) are
	    code comments, not headers, and are ignored. A header must have its title
	    on the same line: a lone ``#`` line is not a header.

	    Args:
	        text: Markdown text of one chapter.
	        chapter_title: Title used for text that does not sit under a header
	            (the whole text when there are no headers, or the preamble before
	            the first header).

	    Returns:
	        A list of dicts with the keys ``"title"``, ``"level"`` (1-3) and
	        ``"content"``. A preamble of 50 characters or fewer and a section body
	        of 30 characters or fewer are dropped.
	    """
	    # [ \t]+ rather than \s+ so the separator cannot span a line break and
	    # pull the following text line in as the header title.
	    header_pattern = re.compile(r"^(#{1,3})[ \t]+(.+)$", re.MULTILINE)
	    fence_pattern = re.compile(r"^[ \t]*(```|~~~)", re.MULTILINE)

	    # Collect (start, end) spans of fenced code blocks. A fence is closed by
	    # the next marker of the same kind. An unterminated fence is ignored so
	    # that a stray marker cannot hide every header that follows it.
	    fenced_spans = []
	    open_fence = None
	    for fence in fence_pattern.finditer(text):
	        if open_fence is None:
	            open_fence = fence
	        elif fence.group(1) == open_fence.group(1):
	            fenced_spans.append((open_fence.start(), fence.end()))
	            open_fence = None

	    matches = [
	        m
	        for m in header_pattern.finditer(text)
	        if not any(lo <= m.start() < hi for lo, hi in fenced_spans)
	    ]

	    if not matches:
	        # No headers: the whole text is a single section.
	        return [{"title": chapter_title, "level": 1, "content": text.strip()}]

	    sections = []

	    # Text before the first header is kept only when it is long enough to be
	    # meaningful.
	    pre_text = text[: matches[0].start()].strip()
	    if len(pre_text) > 50:
	        sections.append({"title": chapter_title, "level": 1, "content": pre_text})

	    for i, match in enumerate(matches):
	        # A section body runs up to the next header (or the end of the text).
	        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
	        content = text[match.end():end].strip()
	        if len(content) > 30:
	            sections.append(
	                {
	                    "title": match.group(2).strip(),
	                    "level": len(match.group(1)),
	                    "content": content,
	                }
	            )

	    return sections


	def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
	    """Split text into overlapping chunks, cutting at natural boundaries.

	    Sizes are given in approximate tokens and converted to characters with
	    the rule of thumb 1 token = 4 characters. Each cut is searched for in the
	    second half of the window, preferring a paragraph break, then the end of
	    a sentence, then a line break; if none is found the window is cut at its
	    full size.

	    Args:
	        text: Text to split.
	        chunk_size: Maximum chunk size in approximate tokens (must be > 0).
	        overlap: Approximate tokens shared by consecutive chunks (must be >= 0).

	    Returns:
	        The chunks in order. Text that fits in a single chunk is returned
	        unchanged as a one-element list; otherwise chunks are stripped and
	        empty ones are skipped.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be positive")
	    if overlap < 0:
	        raise ValueError("overlap must not be negative")

	    # Convert approximate tokens to characters (1 token ~ 4 chars).
	    char_size = chunk_size * 4
	    char_overlap = overlap * 4

	    if len(text) <= char_size:
	        return [text]

	    chunks = []
	    text_len = len(text)
	    start = 0

	    while start < text_len:
	        end = start + char_size

	        if end >= text_len:
	            # This window reaches the end of the text: emit it and stop.
	            # Continuing would only re-emit the overlap tail, which is
	            # already contained in this chunk.
	            tail = text[start:].strip()
	            if tail:
	                chunks.append(tail)
	            break

	        # Look for a good cut point in the second half of the window.
	        search_from = start + char_size // 2
	        cut_point = text.rfind("\n\n", search_from, end)  # paragraph break
	        if cut_point == -1:
	            cut_point = text.rfind(". ", search_from, end)  # sentence end
	            if cut_point != -1:
	                cut_point += 1  # keep the period in this chunk
	        if cut_point == -1:
	            cut_point = text.rfind("\n", search_from, end)  # line break
	        if cut_point != -1:
	            end = cut_point

	        chunk = text[start:end].strip()
	        if chunk:
	            chunks.append(chunk)

	        # Step back by the overlap, but always move forward: when the overlap
	        # is so large that it would not advance past the current start, the
	        # next chunk begins at the cut instead (prevents an endless loop).
	        next_start = end - char_overlap
	        start = next_start if next_start > start else end

	    return chunks


	def clean_text(text: str) -> str:
	    """Remove artifacts left by PDF extraction / HTML scraping.

	    Applied in order:
	      1. a YAML front-matter block at the very start (``---`` ... ``---``),
	      2. page markers of the form ``--- Página 12 ---``,
	      3. lines that contain only digits (stray page numbers),
	      4. runs of four or more newlines, collapsed to three,
	      5. copyright notices, from ``©`` to the end of that line.

	    Args:
	        text: Raw chapter text.

	    Returns:
	        The cleaned text with leading and trailing whitespace stripped.
	    """
	    # YAML front matter at the start (---\nkey: value\n---). The body is
	    # matched lazily across lines (DOTALL) up to the first closing '---'.
	    text = re.sub(r"^---\s*\n.*?\n---\s*\n", "", text, count=1, flags=re.DOTALL)
	    # Page markers; whitespace around the word and the number is optional.
	    text = re.sub(r"---\s*Página\s*\d+\s*---", "", text)
	    # Lines that hold only a number (loose page numbers).
	    text = re.sub(r"^\d+\s*$", "", text, flags=re.MULTILINE)
	    # Collapse long runs of blank lines.
	    text = re.sub(r"\n{4,}", "\n\n\n", text)
	    # Copyright notices: from the © sign to the end of the line (or of the
	    # text when the notice is on the last line).
	    text = re.sub(r"©.*?(?:\n|$)", "", text)
	    return text.strip()


	def ingest_book(books_dir: Path, book_key: str, client: chromadb.ClientAPI, embedding_fn) -> int:
	    """Rebuild one book's ChromaDB collection from its chapter markdown files.

	    The book's collection is deleted and recreated, then every ``*.md`` file
	    in the book's chapter directory (except files whose name starts with
	    "README") is cleaned, split into sections, chunked and stored together
	    with per-chunk metadata.

	    Args:
	        books_dir: Root directory that contains one subfolder per book.
	        book_key: Key of the book in ``BOOKS_CONFIG``.
	        client: ChromaDB client used to delete/create the collection.
	        embedding_fn: Embedding function attached to the new collection.

	    Returns:
	        The number of chunks stored, or 0 when the chapter directory does not
	        exist.
	    """
	    config = BOOKS_CONFIG[book_key]
	    chapters_dir = books_dir / config["dir_name"]

	    if not chapters_dir.exists():
	        print(f" ⚠️ Directorio no encontrado: {chapters_dir}")
	        return 0

	    # Start from an empty collection: drop any previous one, then recreate it.
	    try:
	        client.delete_collection(config["collection"])
	    except Exception:
	        # Best effort: the collection may simply not exist yet (first run).
	        # The specific exception chromadb raises for that is not pinned down
	        # here, so the broad catch is kept on purpose.
	        pass

	    collection = client.get_or_create_collection(
	        name=config["collection"],
	        embedding_function=embedding_fn,
	        metadata={"hnsw:space": "cosine"},
	    )

	    # Skip READMEs and similar auxiliary notes; everything else is a chapter.
	    md_files = sorted(
	        f for f in chapters_dir.glob("*.md") if not f.name.upper().startswith("README")
	    )

	    print(f"\n 📚 {config['full_name']}")
	    print(f" Archivos encontrados: {len(md_files)}")

	    total_chunks = 0
	    # IDs already used in this collection. Sanitising and truncating section
	    # titles can make two sections produce the same ID (e.g. repeated
	    # "Exercises" headers), and IDs must be unique within a collection.
	    seen_ids = set()
	    # Upper bound on records sent per add() call.
	    add_batch_size = 256

	    for md_file in md_files:
	        chapter_info = extract_chapter_info(md_file.name)
	        raw_text = md_file.read_text(encoding="utf-8")
	        text = clean_text(raw_text)

	        # Files with almost no text after cleaning are not worth indexing.
	        if len(text) < 100:
	            continue

	        sections = split_by_sections(text, chapter_info["chapter_title"])

	        # Collect the whole file first so the chunks can be added (and
	        # embedded) in batches instead of one call per chunk.
	        ids = []
	        documents = []
	        metadatas = []

	        for section in sections:
	            # Long sections are subdivided into overlapping chunks.
	            chunks = chunk_text(section["content"])

	            for i, chunk in enumerate(chunks):
	                chunk_id = f"{book_key}_{chapter_info['file_order']}_{section['title'][:30]}_{i}"
	                # Sanitise the ID to a conservative character set.
	                base_id = re.sub(r"[^a-zA-Z0-9_-]", "_", chunk_id)

	                # Keep the historical ID when it is free; only colliding IDs
	                # receive a numeric suffix.
	                chunk_id = base_id
	                duplicate = 1
	                while chunk_id in seen_ids:
	                    chunk_id = f"{base_id}_dup{duplicate}"
	                    duplicate += 1
	                seen_ids.add(chunk_id)

	                ids.append(chunk_id)
	                documents.append(chunk)
	                metadatas.append(
	                    {
	                        "book": book_key,
	                        "book_full_name": config["full_name"],
	                        "chapter": chapter_info["chapter_title"],
	                        "section": section["title"],
	                        "section_level": section["level"],
	                        "chunk_index": i,
	                        "total_chunks_in_section": len(chunks),
	                        "file": md_file.name,
	                    }
	                )

	        # An empty file produces no iterations, so add() is never called with
	        # empty lists.
	        for lo in range(0, len(ids), add_batch_size):
	            hi = lo + add_batch_size
	            collection.add(
	                ids=ids[lo:hi],
	                documents=documents[lo:hi],
	                metadatas=metadatas[lo:hi],
	            )
	        total_chunks += len(ids)

	        print(f" ✓ {md_file.name} → {len(sections)} secciones")

	    print(f" Total chunks: {total_chunks}")
	    return total_chunks


	def main():
	    """Command-line entry point: ingest every configured book into ChromaDB.

	    Parses ``--books-dir`` (root folder with one chapter directory per book)
	    and ``--chroma-dir`` (location of the persistent database), creates the
	    embedding function and the persistent client, runs ``ingest_book`` for
	    each key of ``BOOKS_CONFIG`` and prints the total number of chunks.
	    """
	    default_books_dir = Path(__file__).parent.parent.parent / "ebook"

	    parser = argparse.ArgumentParser(description="Ingesta de libros ESL/ISLP en ChromaDB")
	    # Both options are paths and differ only in default and help text.
	    cli_options = (
	        ("--books-dir", default_books_dir, "Directorio raíz con las carpetas de capítulos"),
	        ("--chroma-dir", CHROMA_DIR, "Directorio para la base de datos ChromaDB"),
	    )
	    for flag, default_path, help_text in cli_options:
	        parser.add_argument(flag, type=Path, default=default_path, help=help_text)
	    args = parser.parse_args()

	    print("🔧 Inicializando embedding model...")
	    embedding_fn = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)

	    print(f"🗄️ ChromaDB persistente en: {args.chroma_dir}")
	    client = chromadb.PersistentClient(path=str(args.chroma_dir))

	    print("\n📖 Iniciando ingesta de libros...")
	    # Books are processed in BOOKS_CONFIG order; each call returns its chunk count.
	    total = sum(
	        ingest_book(args.books_dir, book_key, client, embedding_fn)
	        for book_key in BOOKS_CONFIG
	    )

	    print(f"\n✅ Ingesta completada. Total de chunks vectorizados: {total}")
	    print(f" Base de datos en: {args.chroma_dir}")


	if __name__ == "__main__":
	    main()