Adzacam committed on
Commit
0b30b46
·
1 Parent(s): f659377

Deploy: Conexión de pipeline BETO con esquema estrella de Supabase

Browse files
Files changed (5) hide show
  1. Dockerfile +40 -0
  2. database.py +61 -0
  3. main.py +89 -0
  4. ner_engine.py +101 -0
  5. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# =============================================================================
# OPTIMIZED DOCKERFILE FOR A FASTAPI + TRANSFORMERS (BETO) API ON CPU HOSTS
# =============================================================================

FROM python:3.12-slim

# 1. Install the compilers needed to build psycopg2 and rapidfuzz
USER root
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# 2. Create the user required by Hugging Face (UID 1000)
RUN useradd -m -u 1000 user
ENV PATH="/home/user/.local/bin:$PATH" \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/tmp/huggingface_cache

WORKDIR /app

# 3. Copy and install the dependencies with the right ownership
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# 4. Download the BETO weights into the cache at build time.
#    This step runs as root, so the cache is handed over to the runtime user
#    afterwards; otherwise the process started under `USER user` cannot write
#    to $HF_HOME.
RUN python -c "from transformers import pipeline; pipeline('ner', model='mrm8488/bert-spanish-ner-cased', tokenizer='mrm8488/bert-spanish-ner-cased', aggregation_strategy='simple')" \
    && chown -R user "$HF_HOME"

# 5. Copy the rest of the code, owned by the runtime user
COPY --chown=user . /app

USER user

# 6. Listen on port 7860, as required by Hugging Face
EXPOSE 7860

# The FastAPI instance `app` is defined in main.py, so the target is main:app.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
database.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from sqlalchemy import create_engine
4
+ from sqlalchemy.orm import declarative_base, sessionmaker
5
+
6
import logging

# Load environment variables from a .env file, if one is present.
load_dotenv()

DATABASE_URL = os.getenv("DATABASE_URL")

if not DATABASE_URL:
    # Keep the local-Postgres fallback, but make it visible: silently using
    # default credentials hides a missing DATABASE_URL in a deployment.
    logging.getLogger(__name__).warning(
        "DATABASE_URL is not set; falling back to the local default PostgreSQL instance."
    )
    DATABASE_URL = "postgresql://postgres:postgres@localhost:5432/postgres"
elif DATABASE_URL.startswith("postgres://"):
    # SQLAlchemy 1.4+ only accepts the "postgresql://" scheme; some providers
    # still hand out the legacy "postgres://" form.
    DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql://", 1)

# For Supabase, pool_pre_ping=True is recommended to handle stale connections;
# pool_recycle replaces pooled connections older than one hour.
engine = create_engine(
    DATABASE_URL,
    pool_pre_ping=True,
    pool_recycle=3600,
)

# Session factory bound to the engine; commits and flushes are explicit.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

from sqlalchemy import Column, Integer, String, Numeric, Boolean, DateTime, ForeignKey
import datetime

# Declarative base shared by every ORM model in this module.
Base = declarative_base()
28
+
29
class DimEstudiante(Base):
    """Student dimension, mapped to the ``dim_estudiante`` table."""

    __tablename__ = "dim_estudiante"

    # Indexed integer primary key.
    id_estudiante = Column(Integer, index=True, primary_key=True)
    # Student's full name; required, at most 200 characters.
    nombre_completo = Column(String(200), nullable=False)
    # Optional student code, at most 50 characters.
    codigo_estudiante = Column(String(50))
34
+
35
def _utc_now_naive():
    """Return the current UTC time as a naive ``datetime``.

    Same value ``datetime.datetime.utcnow()`` produced, without calling that
    function, which is deprecated since Python 3.12. The result stays naive
    because ``created_at`` is declared as a plain ``DateTime`` column.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


class FactRendimientoAcademico(Base):
    """Academic-performance fact row, mapped to ``fact_rendimiento_academico``.

    Only ``id_estudiante`` declares a foreign key here (to
    ``dim_estudiante.id_estudiante``); the other ``id_*`` columns are plain
    integers in this model.
    """

    __tablename__ = "fact_rendimiento_academico"

    # Indexed integer primary key.
    id_hecho_aca = Column(Integer, primary_key=True, index=True)

    # Dimension keys.
    id_estudiante = Column(Integer, ForeignKey("dim_estudiante.id_estudiante"))
    id_docente = Column(Integer)
    id_modulo = Column(Integer)
    id_tiempo = Column(Integer)
    id_documento = Column(Integer)
    id_usuario_carga = Column(Integer)

    # Measures; NUMERIC(5,2) holds up to three integer digits and two decimals.
    nota_final = Column(Numeric(5, 2))
    asistencia_pct = Column(Numeric(5, 2))
    incumplimiento_actividades_pct = Column(Numeric(5, 2))

    # Model confidence stored with four decimals, plus the manual-review flag.
    nivel_confianza_ia = Column(Numeric(5, 4))
    requiere_revision = Column(Boolean, default=False)

    # Insert timestamp (naive UTC), filled in on the Python side.
    created_at = Column(DateTime, default=_utc_now_naive)
50
+
51
+
52
def get_db():
    """
    Dependency-injection helper that yields one database session.
    The session is closed once the consumer is finished with it, whether or
    not an exception was raised in the meantime.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
main.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ from typing import List
4
+ from fastapi import FastAPI, Depends, HTTPException, status
5
+ from pydantic import BaseModel
6
+ from sqlalchemy.orm import Session
7
+ from database import get_db, DimEstudiante, FactRendimientoAcademico, engine
8
+ from ner_engine import ner_engine
9
+
10
# Configure root logging at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application object; the metadata below is shown in the generated docs.
app = FastAPI(
    version="1.0.0",
    title="GiraGroup BI Backend Cloud",
    description="API para Tecnologías Emergentes II con BETO y Supabase",
)
18
+
19
# Request body for the tabular ingest endpoint: one spreadsheet row.
class ProcessSheetPayload(BaseModel):
    # Raw cell text; the endpoint runs NER on it and uses it as the student name.
    texto_celda: str

    # Measures stored on the fact row. The endpoint compares them against
    # 70.0 / 30.0 thresholds -- presumably 0-100 scales; confirm with callers.
    nota_detectada: float
    asistencia: float
    incumplimiento_tareas: float

    # Dimension keys copied as-is onto the fact row.
    id_docente: int
    id_modulo: int
    id_tiempo: int
    id_documento: int
    id_usuario: int
29
+
30
# Health endpoint: static service info plus whether the NER engine has been
# initialised (or already holds a pipeline).
@app.get("/")
def read_root():
    ner_ready = ner_engine._initialized or ner_engine.pipeline is not None
    return {
        "status": "healthy",
        "service": "GiraGroup BI Backend API Cloud",
        "ner_initialized": ner_ready,
    }
37
+
38
@app.post("/api/v1/ingesta/tabular", status_code=status.HTTP_201_CREATED)
def procesar_registro_tabular(payload: ProcessSheetPayload, db: Session = Depends(get_db)):
    """Ingest one spreadsheet row and store it as an academic-performance fact.

    Steps:
      1. Run NER on ``texto_celda`` and average the entity scores into a
         confidence value; below 0.60 the row is flagged for human review.
      2. Look up the student by exact name, creating it when missing.
      3. Derive the strategic alerts from grade, attendance and missed tasks.
      4. Insert the fact row. Student creation and fact insertion share a
         single transaction, so a failure leaves no orphan student behind.

    Raises:
        HTTPException: 422 when ``texto_celda`` is empty or only whitespace;
            500 when the database write fails (the transaction is rolled back).
    """
    nombre_resuelto = payload.texto_celda.strip()
    if not nombre_resuelto:
        # An empty cell would otherwise create a student with an empty name.
        raise HTTPException(status_code=422, detail="El campo texto_celda no puede estar vacío.")

    entidades = ner_engine.extract_entities(payload.texto_celda)
    if entidades:
        confianza_ia = sum(e["score"] for e in entidades) / len(entidades)
    else:
        # Nothing recognised means there is no evidence to be confident about;
        # score it 0.0 so the row is routed to human review instead of being
        # reported as fully trusted.
        confianza_ia = 0.0
    forzar_revision = confianza_ia < 0.60

    alertas_disparadas = []
    if payload.nota_detectada <= 70.0:
        alertas_disparadas.append("RIESGO_ACADEMICO_CRITICO")
    if payload.asistencia < 70.0 or payload.incumplimiento_tareas > 30.0:
        alertas_disparadas.append("RIESGO_DESERCION_ALTA")

    try:
        estudiante = (
            db.query(DimEstudiante)
            .filter(DimEstudiante.nombre_completo == nombre_resuelto)
            .first()
        )
        if estudiante is None:
            estudiante = DimEstudiante(nombre_completo=nombre_resuelto)
            db.add(estudiante)
            # flush() sends the INSERT so the primary key is assigned, but
            # keeps the transaction open until the fact row is written too.
            db.flush()
        # Read the key before commit so the response needs no extra reload.
        id_estudiante = estudiante.id_estudiante

        db.add(
            FactRendimientoAcademico(
                id_estudiante=id_estudiante,
                id_docente=payload.id_docente,
                id_modulo=payload.id_modulo,
                id_tiempo=payload.id_tiempo,
                id_documento=payload.id_documento,
                id_usuario_carga=payload.id_usuario,
                nota_final=payload.nota_detectada,
                asistencia_pct=payload.asistencia,
                incumplimiento_actividades_pct=payload.incumplimiento_tareas,
                nivel_confianza_ia=confianza_ia,
                requiere_revision=forzar_revision,
            )
        )
        db.commit()
    except Exception as err:
        db.rollback()
        logger.exception("Fallo en persistencia: %s", err)
        raise HTTPException(status_code=500, detail="Error al escribir en Supabase.") from err

    return {
        "status": "processed",
        "id_estudiante_asignado": id_estudiante,
        "confianza_modelo_beto": round(confianza_ia, 4),
        "requiere_auditoria_humana": forzar_revision,
        "alertas_estrategicas": alertas_disparadas,
    }
ner_engine.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from typing import List, Dict, Any
4
+
5
# Module logger, with root logging configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
8
+
9
class NEREngine:
    """Spanish named-entity extractor backed by a Hugging Face ``ner`` pipeline.

    The model is loaded lazily on first use. If loading or inference fails,
    the engine falls back to a small regex/lookup extractor meant for
    development and testing.

    NOTE(review): the fallback reports every entity with score 1.0, so callers
    that average scores cannot tell a fallback result from a confident model
    prediction.
    """

    # Capitalised words of two or more characters. Spanish accented capitals
    # are listed explicitly; ``\w`` is Unicode-aware, so accented lowercase
    # letters stay inside the word instead of cutting it short.
    _MOCK_WORD_RE = re.compile(r"\b[A-ZÁÉÍÓÚÜÑ]\w+\b")

    # Accent-free word -> entity group used by the fallback extractor.
    _MOCK_ENTITY_GROUPS = {
        "Madrid": "LOC",
        "Barcelona": "LOC",
        "Bogota": "LOC",
        "Santiago": "LOC",
        "Lima": "LOC",
        "Mexico": "LOC",
        "GiraGroup": "ORG",
        "Supabase": "ORG",
        "Google": "ORG",
        "Microsoft": "ORG",
        "SQLAlchemy": "ORG",
        "Juan": "PER",
        "Maria": "PER",
        "Carlos": "PER",
        "Jose": "PER",
        "Pedro": "PER",
        "Ana": "PER",
    }

    def __init__(self, model_name: str = "mrm8488/bert-spanish-ner-cased"):
        self.model_name = model_name
        self.pipeline = None
        self._initialized = False

    def initialize(self):
        """
        Lazily initialize the Hugging Face pipeline.

        Importing ``transformers`` here rather than at module level prevents a
        startup failure when transformers/torch is missing and speeds up unit
        tests. On any loading error the pipeline stays ``None`` and the engine
        is still marked initialized, so the load is not retried on every call.
        """
        if self._initialized:
            return

        logger.info("Initializing NER model: %s...", self.model_name)
        try:
            from transformers import pipeline
            self.pipeline = pipeline(
                "ner",
                model=self.model_name,
                tokenizer=self.model_name,
                aggregation_strategy="simple",
            )
            logger.info("NER model loaded successfully.")
        except Exception as e:
            logger.error("Error loading NER model: %s", e)
            logger.warning("Falling back to a mock regex entity extractor for development/testing.")
            self.pipeline = None
        self._initialized = True

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """
        Extract named entities from Spanish text.

        Returns a list of dictionaries with word, entity_group, score, start,
        and end. Empty or whitespace-only text returns ``[]`` without loading
        the model. If the pipeline is unavailable or raises, the regex
        fallback is used instead.
        """
        # Checked first so that empty input never triggers the model load.
        if not text or not text.strip():
            return []

        self.initialize()

        if not self.pipeline:
            return self._mock_extraction(text)

        try:
            return [
                {
                    "word": ent.get("word"),
                    "entity_group": ent.get("entity_group"),
                    "score": float(ent.get("score")),
                    "start": int(ent.get("start")),
                    "end": int(ent.get("end")),
                }
                for ent in self.pipeline(text)
            ]
        except Exception as e:
            logger.error("Error executing NER pipeline: %s", e)
            return self._mock_extraction(text)

    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
        """
        Basic regex-based entity extractor fallback for development/testing
        environments when PyTorch or Transformers are not fully installed.

        Every capitalised word becomes an entity. Words are looked up in a
        small table after stripping accents (so "María" is classified like
        "Maria"); anything not in the table is tagged ``MISC``. ``word``,
        ``start`` and ``end`` always refer to the text as given.
        """
        import unicodedata

        entities = []
        for match in self._MOCK_WORD_RE.finditer(text):
            word = match.group()
            # Decompose accented letters and drop the combining marks.
            folded = "".join(
                ch
                for ch in unicodedata.normalize("NFD", word)
                if not unicodedata.combining(ch)
            )
            entities.append({
                "word": word,
                "entity_group": self._MOCK_ENTITY_GROUPS.get(folded, "MISC"),
                "score": 1.0,
                "start": match.start(),
                "end": match.end(),
            })
        return entities


# Global instance of the engine
ner_engine = NEREngine()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi>=0.100.0
uvicorn[standard]>=0.22.0
sqlalchemy>=2.0.0
psycopg2-binary>=2.9.0
python-dotenv>=1.0.0
transformers>=4.30.0
# Index options are global options in a requirements file and must sit on
# their own line; appended to a requirement they are not applied, and torch
# would be resolved from PyPI instead of the CPU-only wheel index.
--extra-index-url https://download.pytorch.org/whl/cpu
torch>=2.0.0
pandas>=2.2.1
openpyxl==3.1.2
gspread==6.1.0
rapidfuzz==3.6.1
pydantic>=2.0.0