Spaces:
Sleeping
Sleeping
Adzacam committed on
Commit ·
0b30b46
1
Parent(s): f659377
Deploy: Conexión de pipeline BETO con esquema estrella de Supabase
Browse files- Dockerfile +40 -0
- database.py +61 -0
- main.py +89 -0
- ner_engine.py +101 -0
- requirements.txt +12 -0
Dockerfile
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
# OPTIMIZED DOCKERFILE FOR A FASTAPI + TRANSFORMERS (BETO) API ON CPU-ONLY HOSTS
# =============================================================================

FROM python:3.12-slim

# 1. Install the compilers needed to build psycopg2 and rapidfuzz
USER root
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# 2. Create the user required by Hugging Face (UID 1000)
RUN useradd -m -u 1000 user
ENV PATH="/home/user/.local/bin:$PATH" \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/tmp/huggingface_cache

WORKDIR /app

# 3. Copy and install the dependencies with the correct ownership
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# 4. Download the BETO weights and store them in the cache at build time
RUN python -c "from transformers import pipeline; pipeline('ner', model='mrm8488/bert-spanish-ner-cased', tokenizer='mrm8488/bert-spanish-ner-cased', aggregation_strategy='simple')"

# 5. Copy the rest of the code, assigning ownership to the runtime user
COPY --chown=user . /app

USER user

# 6. Listen on port 7860, the port required by Hugging Face
EXPOSE 7860

# The FastAPI instance `app` is defined in main.py, so the import string must
# be "main:app"; "app:app" would make uvicorn look for a module named app.
# NOTE(review): assumes no separate app.py exists in the repository - confirm.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
database.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from sqlalchemy import create_engine
|
| 4 |
+
from sqlalchemy.orm import declarative_base, sessionmaker
|
| 5 |
+
|
| 6 |
+
# Load environment variables from .env file
|
| 7 |
+
# Pull variables from a .env file into the process environment.
load_dotenv()

# Use the configured connection string; when it is unset or empty, fall back
# to a local PostgreSQL instance.
DATABASE_URL = os.getenv("DATABASE_URL") or (
    "postgresql://postgres:postgres@localhost:5432/postgres"
)

# For Supabase, pool_pre_ping=True is highly recommended to handle stale
# connections.
engine = create_engine(DATABASE_URL, pool_recycle=3600, pool_pre_ping=True)

# Session factory bound to the engine above; sessions neither autoflush nor
# autocommit, so callers flush/commit explicitly.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False)
|
| 23 |
+
|
| 24 |
+
from sqlalchemy import Column, Integer, String, Numeric, Boolean, DateTime, ForeignKey
|
| 25 |
+
import datetime
|
| 26 |
+
|
| 27 |
+
Base = declarative_base()
|
| 28 |
+
|
| 29 |
+
class DimEstudiante(Base):
    """ORM mapping for the ``dim_estudiante`` table (student dimension)."""

    __tablename__ = "dim_estudiante"

    # Primary key of the student row.
    id_estudiante = Column(Integer, index=True, primary_key=True)
    # Full name, mandatory, at most 200 characters.
    nombre_completo = Column(String(200), nullable=False)
    # Optional student code, at most 50 characters.
    codigo_estudiante = Column(String(50))
|
| 34 |
+
|
| 35 |
+
def _utc_now_naive():
    """Return the current UTC time as a naive ``datetime``.

    Drop-in replacement for ``datetime.datetime.utcnow``, which is deprecated
    as of Python 3.12. The timezone is stripped so the value stored in the
    ``DateTime`` column keeps the same naive-UTC form as before.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


class FactRendimientoAcademico(Base):
    """ORM mapping for the ``fact_rendimiento_academico`` fact table.

    Each row links a student to the remaining dimension keys and stores the
    academic measures plus the NER confidence for the ingested record.
    """

    __tablename__ = "fact_rendimiento_academico"

    # Primary key of the fact row.
    id_hecho_aca = Column(Integer, primary_key=True, index=True)

    # Dimension keys. Only the student key is declared as a foreign key here.
    id_estudiante = Column(Integer, ForeignKey("dim_estudiante.id_estudiante"))
    id_docente = Column(Integer)
    id_modulo = Column(Integer)
    id_tiempo = Column(Integer)
    id_documento = Column(Integer)
    id_usuario_carga = Column(Integer)

    # Measures: up to 5 digits with 2 decimals.
    nota_final = Column(Numeric(5, 2))
    asistencia_pct = Column(Numeric(5, 2))
    incumplimiento_actividades_pct = Column(Numeric(5, 2))

    # Model confidence: up to 5 digits with 4 decimals.
    nivel_confianza_ia = Column(Numeric(5, 4))
    # Whether the row must be reviewed; defaults to False.
    requiere_revision = Column(Boolean, default=False)
    # Insertion timestamp (naive UTC), filled in on the Python side.
    created_at = Column(DateTime, default=_utc_now_naive)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def get_db():
    """Yield a fresh database session and always close it afterwards.

    Written as a generator so it can be used as a dependency: the code that
    receives the session runs while this function is suspended at ``yield``,
    and the ``finally`` clause closes the session once that code finishes,
    whether or not it raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
main.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List
|
| 4 |
+
from fastapi import FastAPI, Depends, HTTPException, status
|
| 5 |
+
from pydantic import BaseModel
|
| 6 |
+
from sqlalchemy.orm import Session
|
| 7 |
+
from database import get_db, DimEstudiante, FactRendimientoAcademico, engine
|
| 8 |
+
from ner_engine import ner_engine
|
| 9 |
+
|
| 10 |
+
# Root logging at INFO level, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the route decorators below register on it.
app = FastAPI(
    version="1.0.0",
    title="GiraGroup BI Backend Cloud",
    description="API para Tecnologías Emergentes II con BETO y Supabase",
)
|
| 18 |
+
|
| 19 |
+
class ProcessSheetPayload(BaseModel):
    """Request body for the tabular ingestion endpoint.

    Carries one spreadsheet record: the raw cell text, the numeric measures,
    and the dimension identifiers stored alongside them.
    """

    # Raw cell text; it is run through the NER engine and, once stripped,
    # used as the student's full name.
    texto_celda: str

    # Measures persisted on the fact row.
    # NOTE(review): presumably on a 0-100 scale, since the endpoint compares
    # them against 70.0 / 30.0 - confirm with the data owner.
    nota_detectada: float
    asistencia: float
    incumplimiento_tareas: float

    # Dimension keys copied verbatim onto the fact row.
    id_docente: int
    id_modulo: int
    id_tiempo: int
    id_documento: int
    id_usuario: int
|
| 29 |
+
|
| 30 |
+
@app.get("/")
def read_root():
    """Health-check endpoint: report service identity and NER engine state."""
    # True once the engine has gone through initialization or holds a pipeline.
    ner_ready = ner_engine._initialized or ner_engine.pipeline is not None
    response = {
        "status": "healthy",
        "service": "GiraGroup BI Backend API Cloud",
        "ner_initialized": ner_ready,
    }
    return response
|
| 37 |
+
|
| 38 |
+
@app.post("/api/v1/ingesta/tabular", status_code=status.HTTP_201_CREATED)
def procesar_registro_tabular(payload: ProcessSheetPayload, db: Session = Depends(get_db)):
    """Ingest one tabular record: resolve the student and store a fact row.

    The cell text is run through the NER engine to obtain a confidence score,
    the student is looked up by exact name (and created when missing), and a
    ``FactRendimientoAcademico`` row is written. Student creation and fact
    insertion share a single transaction, so a failure leaves no orphan
    student behind.

    Args:
        payload: The record to ingest.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        A dict with the processing status, the student id, the rounded model
        confidence, the human-review flag and the list of triggered alerts.

    Raises:
        HTTPException: 422 when ``texto_celda`` is blank; 500 when reading
            from or writing to the database fails.
    """
    nombre_resuelto = payload.texto_celda.strip()
    if not nombre_resuelto:
        # A blank cell would otherwise create a student with an empty name.
        raise HTTPException(
            status_code=422,
            detail="El campo texto_celda no puede estar vacío.",
        )

    entidades = ner_engine.extract_entities(payload.texto_celda)
    # Mean entity score.
    # NOTE(review): when no entity is detected the confidence is 1.0, so such
    # rows are never flagged for review - confirm this is intended.
    confianza_ia = (
        sum(e["score"] for e in entidades) / len(entidades) if entidades else 1.0
    )
    forzar_revision = confianza_ia < 0.60

    alertas_disparadas = []
    if payload.nota_detectada <= 70.0:
        alertas_disparadas.append("RIESGO_ACADEMICO_CRITICO")
    if payload.asistencia < 70.0 or payload.incumplimiento_tareas > 30.0:
        alertas_disparadas.append("RIESGO_DESERCION_ALTA")

    try:
        estudiante = (
            db.query(DimEstudiante)
            .filter(DimEstudiante.nombre_completo == nombre_resuelto)
            .first()
        )
        if estudiante is None:
            estudiante = DimEstudiante(nombre_completo=nombre_resuelto)
            db.add(estudiante)
            # The session does not autoflush: flush explicitly to obtain the
            # generated primary key without committing yet.
            db.flush()

        # Read the id before commit so the response needs no extra reload.
        id_estudiante = estudiante.id_estudiante

        nuevo_hecho = FactRendimientoAcademico(
            id_estudiante=id_estudiante,
            id_docente=payload.id_docente,
            id_modulo=payload.id_modulo,
            id_tiempo=payload.id_tiempo,
            id_documento=payload.id_documento,
            id_usuario_carga=payload.id_usuario,
            nota_final=payload.nota_detectada,
            asistencia_pct=payload.asistencia,
            incumplimiento_actividades_pct=payload.incumplimiento_tareas,
            nivel_confianza_ia=confianza_ia,
            requiere_revision=forzar_revision,
        )
        db.add(nuevo_hecho)
        db.commit()
    except Exception as err:
        # Undo both the student and the fact insert, then surface a 500.
        db.rollback()
        logger.exception("Fallo en persistencia: %s", err)
        raise HTTPException(
            status_code=500, detail="Error al escribir en Supabase."
        ) from err

    return {
        "status": "processed",
        "id_estudiante_asignado": id_estudiante,
        "confianza_modelo_beto": round(confianza_ia, 4),
        "requiere_auditoria_humana": forzar_revision,
        "alertas_estrategicas": alertas_disparadas,
    }
|
ner_engine.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
+
# Configure logging
|
| 6 |
+
logging.basicConfig(level=logging.INFO)
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
class NEREngine:
    """Spanish named-entity extractor with a regex fallback.

    The Hugging Face ``transformers`` NER pipeline is loaded lazily on first
    use. If loading or running it fails, a simple rule-based extractor is used
    instead so that callers always receive a list of entity dicts.
    """

    # A capitalized word of at least two characters. ``\w`` is Unicode-aware
    # for ``str`` patterns, so accented Spanish words ("María", "Bogotá",
    # "Ángel") are matched whole instead of being skipped.
    _CAPITALIZED_WORD = re.compile(r"\b[A-ZÁÉÍÓÚÜÑ]\w+\b")

    # Known words for the fallback extractor, mapped to their entity group.
    # Accented and unaccented spellings are both listed.
    _MOCK_GROUPS = {
        **dict.fromkeys(
            ("Madrid", "Barcelona", "Bogota", "Bogotá", "Santiago", "Lima",
             "Mexico", "México"),
            "LOC",
        ),
        **dict.fromkeys(
            ("GiraGroup", "Supabase", "Google", "Microsoft", "SQLAlchemy"),
            "ORG",
        ),
        **dict.fromkeys(
            ("Juan", "Maria", "María", "Carlos", "Jose", "José", "Pedro", "Ana"),
            "PER",
        ),
    }

    def __init__(self, model_name: str = "mrm8488/bert-spanish-ner-cased"):
        """Store the model name; the model itself is loaded on first use."""
        self.model_name = model_name
        self.pipeline = None
        self._initialized = False

    def initialize(self):
        """Load the Hugging Face pipeline once; later calls are no-ops.

        ``transformers`` is imported here rather than at module import time so
        that a missing or broken installation does not prevent startup. Any
        load error is logged and the engine is marked initialized with no
        pipeline, which routes extraction to the regex fallback.
        """
        if self._initialized:
            return

        logger.info("Initializing NER model: %s...", self.model_name)
        try:
            from transformers import pipeline

            self.pipeline = pipeline(
                "ner",
                model=self.model_name,
                tokenizer=self.model_name,
                aggregation_strategy="simple",
            )
            logger.info("NER model loaded successfully.")
        except Exception as e:
            logger.error("Error loading NER model: %s", e)
            logger.warning("Falling back to a mock regex entity extractor for development/testing.")
            self.pipeline = None
        # Marked initialized on failure too, so loading is not retried.
        self._initialized = True

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities from Spanish text.

        Args:
            text: The text to analyse.

        Returns:
            A list of dicts with the keys ``word``, ``entity_group``,
            ``score``, ``start`` and ``end``. Empty for blank input.
        """
        # Answer blank input before touching the model, so an empty request
        # never triggers the model load.
        if not text or not text.strip():
            return []

        self.initialize()

        if self.pipeline is None:
            return self._mock_extraction(text)

        try:
            return [
                {
                    "word": ent.get("word"),
                    "entity_group": ent.get("entity_group"),
                    "score": float(ent.get("score")),
                    "start": int(ent.get("start")),
                    "end": int(ent.get("end")),
                }
                for ent in self.pipeline(text)
            ]
        except Exception as e:
            # Best effort: a pipeline failure degrades to the regex fallback.
            logger.error("Error executing NER pipeline: %s", e)
            return self._mock_extraction(text)

    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
        """Rule-based fallback used when the model pipeline is unavailable.

        Every capitalized word becomes an entity. Words found in the small
        built-in lookup table are labelled LOC, ORG or PER; all others are
        labelled MISC. The score is always 1.0.
        """
        entities = []
        for match in self._CAPITALIZED_WORD.finditer(text):
            word = match.group()
            start, end = match.span()
            entities.append({
                "word": word,
                "entity_group": self._MOCK_GROUPS.get(word, "MISC"),
                "score": 1.0,
                "start": start,
                "end": end,
            })
        return entities
|
| 99 |
+
|
| 100 |
+
# Global instance of the engine
|
| 101 |
+
ner_engine = NEREngine()
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Extra package index that serves the CPU-only PyTorch wheels. This is a
# file-wide pip option, so it must sit on its own line rather than after a
# requirement specifier.
--extra-index-url https://download.pytorch.org/whl/cpu

fastapi>=0.100.0
uvicorn[standard]>=0.22.0
sqlalchemy>=2.0.0
psycopg2-binary>=2.9.0
python-dotenv>=1.0.0
transformers>=4.30.0
torch>=2.0.0
pandas>=2.2.1
openpyxl==3.1.2
gspread==6.1.0
rapidfuzz==3.6.1
pydantic>=2.0.0
|