giragroup-bi-backend / ner_engine.py
Adzacam
Fix: Cambiar a modelo dccuchile BERT base libre de restricciones 401
452cdbb
Raw
History Blame
3.61 kB
import logging
import re
from typing import List, Dict, Any
# Module-level logger for this file. Importing the module also requests a
# basic root logging setup at INFO level (logging.basicConfig leaves an
# already-configured root logger untouched).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class NEREngine:
    """Named-entity recognizer for Spanish text with a rule-based fallback.

    Wraps a Hugging Face ``transformers`` NER pipeline that is loaded lazily
    on first use. When the pipeline cannot be loaded, or raises while
    processing a text, a small regex/lookup extractor is used instead so the
    engine keeps returning results in development and test environments.

    NOTE(review): the default checkpoint appears to be a base language model
    rather than one fine-tuned for NER -- confirm that the labels produced by
    its token-classification head are meaningful before relying on them.
    """

    # Candidate words for the fallback extractor: a letter followed by at
    # least one more word character. ``\w`` is Unicode-aware for ``str``
    # patterns, so accented words such as "María" are matched as a whole
    # instead of being rejected at the first non-ASCII letter.
    _WORD_PATTERN = re.compile(r"\b[^\W\d_]\w+")

    # Known words -> entity group used by the fallback extractor. Both the
    # plain and the accented Spanish spellings are listed.
    _MOCK_LEXICON: Dict[str, str] = {
        **dict.fromkeys(
            (
                "Madrid", "Barcelona", "Bogota", "Bogotá",
                "Santiago", "Lima", "Mexico", "México",
            ),
            "LOC",
        ),
        **dict.fromkeys(
            ("GiraGroup", "Supabase", "Google", "Microsoft", "SQLAlchemy"),
            "ORG",
        ),
        **dict.fromkeys(
            (
                "Juan", "Maria", "María", "Carlos",
                "Jose", "José", "Pedro", "Ana",
            ),
            "PER",
        ),
    }

    def __init__(self, model_name: str = "dccuchile/bert-base-spanish-wwm-cased"):
        """Store the model identifier; nothing is loaded until first use.

        Args:
            model_name: Model identifier passed to the ``transformers``
                pipeline as both model and tokenizer.
        """
        self.model_name = model_name
        self.pipeline = None  # set by initialize(); stays None in fallback mode
        self._initialized = False

    def initialize(self) -> None:
        """Lazily build the Hugging Face NER pipeline.

        ``transformers`` is imported here rather than at module level so that
        importing this module does not fail when transformers/torch is
        missing, and so unit tests start quickly.

        Any exception raised while importing or constructing the pipeline is
        logged and the engine switches to the regex fallback
        (``self.pipeline`` stays ``None``). The engine is marked initialized
        in both cases, so loading is attempted only once per instance.
        """
        if self._initialized:
            return

        logger.info("Initializing NER model: %s...", self.model_name)
        try:
            from transformers import pipeline

            self.pipeline = pipeline(
                "ner",
                model=self.model_name,
                tokenizer=self.model_name,
                aggregation_strategy="simple",
            )
        except Exception as e:
            # Deliberate best-effort: a missing dependency or a failed model
            # load must not crash the caller.
            logger.error("Error loading NER model: %s", e)
            logger.warning(
                "Falling back to a mock regex entity extractor for development/testing."
            )
            self.pipeline = None
        else:
            logger.info("NER model loaded successfully.")
        self._initialized = True

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities from Spanish text.

        Args:
            text: Text to analyse.

        Returns:
            A list of dictionaries with the keys ``word``, ``entity_group``,
            ``score`` (float), ``start`` and ``end`` (int). Empty or
            whitespace-only input yields an empty list.

        If no pipeline is available, or the pipeline call or the conversion
        of one of its records raises (for example a missing ``score``,
        ``start`` or ``end``), the error is logged and the result of the
        regex fallback is returned instead.
        """
        self.initialize()

        if not text or not text.strip():
            return []

        if not self.pipeline:
            return self._mock_extraction(text)

        try:
            # Convert to plain Python types so the result is easy to
            # serialize regardless of what the pipeline returns.
            return [
                {
                    "word": ent.get("word"),
                    "entity_group": ent.get("entity_group"),
                    "score": float(ent.get("score")),
                    "start": int(ent.get("start")),
                    "end": int(ent.get("end")),
                }
                for ent in self.pipeline(text)
            ]
        except Exception as e:
            logger.error("Error executing NER pipeline: %s", e)
            return self._mock_extraction(text)

    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
        """Rule-based fallback extractor for development/testing.

        Every word of two or more characters that starts with an uppercase
        letter is reported as an entity. Words found in the built-in lookup
        table are labelled ``LOC``, ``ORG`` or ``PER``; all others are
        labelled ``MISC``. The score is always ``1.0`` and ``start``/``end``
        are character offsets into ``text``.

        Args:
            text: Text to scan.

        Returns:
            Entity dictionaries in order of appearance, in the same shape as
            :meth:`extract_entities`.
        """
        return [
            {
                "word": match.group(),
                "entity_group": self._MOCK_LEXICON.get(match.group(), "MISC"),
                "score": 1.0,
                "start": match.start(),
                "end": match.end(),
            }
            for match in self._WORD_PATTERN.finditer(text)
            # The pattern accepts any leading letter; keep capitalized words.
            if match.group()[0].isupper()
        ]
# Shared module-level engine instance. Constructing it only stores the model
# name; the model itself is loaded on the first initialize() /
# extract_entities() call.
ner_engine = NEREngine()