Spaces:
Sleeping
Sleeping
| import logging | |
| import re | |
| from typing import List, Dict, Any | |
| # Configure root logging at INFO when this module is imported, then create the module-level logger. | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
class NEREngine:
    """Named-entity recognizer for Spanish text with a rule-based fallback.

    The Hugging Face ``ner`` pipeline is loaded lazily on first use. When it
    cannot be loaded, or when inference fails, extraction falls back to a
    small regex matcher so development and test environments still get
    results.

    Attributes:
        model_name: Hugging Face model identifier, used for both the model
            and the tokenizer.
        pipeline: The loaded pipeline, or ``None`` before initialization and
            whenever loading failed.
    """

    # NOTE(review): the default checkpoint looks like a base BERT model rather
    # than an NER fine-tune -- confirm it ships a trained token-classification
    # head before trusting the labels it predicts.

    # Gazetteers for the fallback matcher, stored without accents; candidate
    # words are accent-folded before lookup so accented spellings also hit.
    _MOCK_LOCATIONS = frozenset(
        {"Madrid", "Barcelona", "Bogota", "Santiago", "Lima", "Mexico"}
    )
    _MOCK_ORGANIZATIONS = frozenset(
        {"GiraGroup", "Supabase", "Google", "Microsoft", "SQLAlchemy"}
    )
    _MOCK_PERSONS = frozenset(
        {"Juan", "Maria", "Carlos", "Jose", "Pedro", "Ana"}
    )

    # A letter followed by at least one more word character. `[^\W\d_]` is the
    # Unicode-aware spelling of "any letter"; an ASCII-only class would drop
    # accented Spanish words entirely instead of matching them.
    _MOCK_WORD_RE = re.compile(r"\b[^\W\d_]\w+\b")

    def __init__(self, model_name: str = "dccuchile/bert-base-spanish-wwm-cased"):
        """Store the model identifier; no model is loaded here.

        Args:
            model_name: Hugging Face model identifier to load on first use.
        """
        self.model_name = model_name
        self.pipeline = None
        self._initialized = False

    def initialize(self) -> None:
        """Load the Hugging Face pipeline on first call; later calls do nothing.

        Loading is deferred so that importing this module does not fail when
        transformers/torch is missing, and so unit tests start quickly. Any
        loading error is logged and leaves ``pipeline`` as ``None``, which
        makes ``extract_entities`` use the regex fallback.
        """
        if self._initialized:
            return

        logger.info("Initializing NER model: %s...", self.model_name)
        try:
            # Imported here rather than at module level so a missing
            # dependency surfaces as a handled error, not an import failure.
            from transformers import pipeline

            self.pipeline = pipeline(
                "ner",
                model=self.model_name,
                tokenizer=self.model_name,
                aggregation_strategy="simple",
            )
        except Exception as exc:
            # Deliberately broad: any failure here switches to the fallback.
            logger.error("Error loading NER model: %s", exc)
            logger.warning(
                "Falling back to a mock regex entity extractor for development/testing."
            )
            self.pipeline = None
        else:
            logger.info("NER model loaded successfully.")

        # Set on failure as well, so a broken install is not retried per call.
        self._initialized = True

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """Extract named entities from Spanish text.

        Args:
            text: Input text. Empty or whitespace-only input yields ``[]``.

        Returns:
            A list of dictionaries with the keys ``word``, ``entity_group``,
            ``score`` (float), ``start`` and ``end`` (int character offsets).
            Results come from the model pipeline when it is available;
            otherwise, or if the pipeline call or the conversion of its
            output raises, they come from the regex fallback.
        """
        self.initialize()

        if not text or not text.strip():
            return []

        if not self.pipeline:
            return self._mock_extraction(text)

        try:
            # Normalise to plain Python types; a missing score/start/end
            # raises here and is handled like any other pipeline failure.
            return [
                {
                    "word": ent.get("word"),
                    "entity_group": ent.get("entity_group"),
                    "score": float(ent.get("score")),
                    "start": int(ent.get("start")),
                    "end": int(ent.get("end")),
                }
                for ent in self.pipeline(text)
            ]
        except Exception as exc:
            logger.error("Error executing NER pipeline: %s", exc)
            return self._mock_extraction(text)

    @staticmethod
    def _fold_accents(word: str) -> str:
        """Return *word* with combining accent marks removed (lookup use only)."""
        # Function-scope import: the module's import block does not have it.
        import unicodedata

        decomposed = unicodedata.normalize("NFD", word)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
        """Regex-based fallback extractor for development/testing environments.

        Every word of two or more characters that starts with an uppercase
        letter is reported as an entity. It is classified as LOC, ORG or PER
        when its accent-folded form is in the matching gazetteer, and as MISC
        otherwise. Accented letters are part of a word, so accented Spanish
        names are matched and classified like their unaccented spellings.

        Args:
            text: Text to scan.

        Returns:
            Entity dictionaries in order of appearance, with ``score`` fixed
            at 1.0 and ``start``/``end`` as offsets into ``text``. ``word`` is
            the text exactly as it appears, accents included.
        """
        entities = []
        for match in self._MOCK_WORD_RE.finditer(text):
            word = match.group()
            # `re` cannot express "uppercase letter" for non-ASCII text, so
            # the capitalisation check happens here instead of in the pattern.
            if not word[0].isupper():
                continue

            key = self._fold_accents(word)
            if key in self._MOCK_LOCATIONS:
                entity_group = "LOC"
            elif key in self._MOCK_ORGANIZATIONS:
                entity_group = "ORG"
            elif key in self._MOCK_PERSONS:
                entity_group = "PER"
            else:
                entity_group = "MISC"

            start, end = match.span()
            entities.append(
                {
                    "word": word,
                    "entity_group": entity_group,
                    "score": 1.0,
                    "start": start,
                    "end": end,
                }
            )
        return entities
| # Shared module-level engine instance; constructing it only stores settings, the model is loaded on first use. | |
| ner_engine = NEREngine() | |