import logging
import re
from typing import Any, Dict, List

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class NEREngine:
    """Named-entity extractor for Spanish text.

    Wraps a Hugging Face ``"ner"`` pipeline that is loaded on first use. If
    the pipeline cannot be loaded, or raises while processing a text, the
    engine falls back to a small regex/gazetteer extractor so that
    development and test environments keep working.
    """

    # Capitalised word of at least two characters. The initial may be any
    # Spanish uppercase letter and the remainder any Unicode word character,
    # so accented names such as "Bogotá" or "Ángel" match as a whole word
    # (an ASCII-only class silently drops them).
    _MOCK_WORD_PATTERN = re.compile(r"\b[A-ZÁÉÍÓÚÜÑ]\w+\b")

    # Gazetteer used by the fallback extractor: exact word -> entity group.
    # Both accented and unaccented spellings are listed because the lookup
    # is an exact, case-sensitive string match.
    _MOCK_ENTITY_GROUPS: Dict[str, str] = {
        **dict.fromkeys(
            [
                "Madrid", "Barcelona", "Bogota", "Bogotá", "Santiago",
                "Lima", "Mexico", "México",
            ],
            "LOC",
        ),
        **dict.fromkeys(
            ["GiraGroup", "Supabase", "Google", "Microsoft", "SQLAlchemy"],
            "ORG",
        ),
        **dict.fromkeys(
            ["Juan", "Maria", "María", "Carlos", "Jose", "José", "Pedro", "Ana"],
            "PER",
        ),
    }

    def __init__(self, model_name: str = "dccuchile/bert-base-spanish-wwm-cased"):
        """Store the model name; the model itself is loaded lazily.

        Args:
            model_name: Hugging Face model identifier, used for both the
                model and the tokenizer.
        """
        # NOTE(review): the default checkpoint looks like a base language
        # model rather than one fine-tuned for NER - confirm that it yields
        # meaningful entity labels before relying on it.
        self.model_name = model_name
        self.pipeline = None
        self._initialized = False

    def initialize(self):
        """Lazily initialize the Hugging Face pipeline.

        ``transformers`` is imported here rather than at module level so that
        importing this module does not fail when transformers/torch is
        missing, and so that unit tests start quickly. Any failure while
        loading is logged and leaves ``self.pipeline`` as ``None``, which
        makes ``extract_entities`` use the regex fallback. Calling this more
        than once is a no-op.
        """
        if self._initialized:
            return

        logger.info("Initializing NER model: %s...", self.model_name)
        try:
            from transformers import pipeline

            self.pipeline = pipeline(
                "ner",
                model=self.model_name,
                tokenizer=self.model_name,
                aggregation_strategy="simple",
            )
            self._initialized = True
            logger.info("NER model loaded successfully.")
        except Exception as e:
            # Deliberate best-effort: a missing or broken model must not
            # prevent the application from starting.
            logger.error("Error loading NER model: %s", e)
            logger.warning(
                "Falling back to a mock regex entity extractor for development/testing."
            )
            self.pipeline = None
            # Mark as initialized so the failing load is not retried on
            # every call.
            self._initialized = True

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities from Spanish text.

        Args:
            text: The text to analyse.

        Returns:
            A list of dictionaries with the keys ``word``, ``entity_group``,
            ``score`` (float), ``start`` and ``end`` (int character offsets).
            Empty or whitespace-only input yields an empty list. If the
            pipeline is unavailable or raises, the result comes from the
            regex fallback instead.
        """
        self.initialize()

        if not text or not text.strip():
            return []

        if self.pipeline is None:
            return self._mock_extraction(text)

        try:
            # Coerce to builtin float/int so the result does not depend on
            # whatever numeric types the pipeline returns.
            return [
                {
                    "word": ent.get("word"),
                    "entity_group": ent.get("entity_group"),
                    "score": float(ent.get("score")),
                    "start": int(ent.get("start")),
                    "end": int(ent.get("end")),
                }
                for ent in self.pipeline(text)
            ]
        except Exception as e:
            # Best-effort: on any pipeline or normalisation error, degrade
            # to the regex extractor rather than failing the request.
            logger.error("Error executing NER pipeline: %s", e)
            return self._mock_extraction(text)

    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
        """Regex-based fallback extractor for development/testing.

        Used when PyTorch or Transformers are not fully installed. Every
        capitalised word of two or more characters is reported as an entity;
        words found in the built-in gazetteer are labelled ``LOC``, ``ORG``
        or ``PER``, and everything else is labelled ``MISC``.

        Args:
            text: The text to scan.

        Returns:
            Entities in order of appearance, each with a fixed ``score`` of
            1.0 and ``start``/``end`` character offsets into ``text``.
        """
        entities = []
        for match in self._MOCK_WORD_PATTERN.finditer(text):
            word = match.group()
            start, end = match.span()
            entities.append({
                "word": word,
                "entity_group": self._MOCK_ENTITY_GROUPS.get(word, "MISC"),
                "score": 1.0,
                "start": start,
                "end": end,
            })
        return entities


# Global instance of the engine
ner_engine = NEREngine()