Spaces:

DazaC
/

giragroup-bi-backend

Sleeping

Adzacam

Fix: Cambiar a modelo dccuchile BERT base libre de restricciones 401

452cdbb 24 days ago

3.61 kB

	import logging
	import re
	from typing import List, Dict, Any

	# Logger for this module, named after the module itself.
	logger = logging.getLogger(__name__)
	# Root logging defaults to INFO; basicConfig() does nothing when the root
	# logger already has handlers configured.
	logging.basicConfig(level=logging.INFO)

	class NEREngine:
	    """Named-entity extractor for Spanish text.

	    Wraps a Hugging Face ``transformers`` "ner" pipeline that is loaded
	    lazily on first use. When the pipeline cannot be loaded, or raises while
	    processing a text, the engine falls back to a small regex/lookup-table
	    extractor intended for development and testing.

	    NOTE(review): the default checkpoint looks like a base BERT model;
	    confirm it ships a token-classification head fine-tuned for NER.
	    """

	    # Candidate tokens for the fallback extractor: whole words of at least
	    # two characters. ``\w`` and ``\b`` are Unicode-aware for ``str``
	    # patterns, so accented words ("Bogotá", "Ángel") stay in one piece.
	    _MOCK_TOKEN_RE = re.compile(r"\b\w{2,}\b")

	    # Accent-free spelling -> entity group used by the fallback extractor.
	    # Lookups are case-sensitive; anything not listed is reported as "MISC".
	    _MOCK_ENTITY_GROUPS: Dict[str, str] = {
	        "Madrid": "LOC",
	        "Barcelona": "LOC",
	        "Bogota": "LOC",
	        "Santiago": "LOC",
	        "Lima": "LOC",
	        "Mexico": "LOC",
	        "GiraGroup": "ORG",
	        "Supabase": "ORG",
	        "Google": "ORG",
	        "Microsoft": "ORG",
	        "SQLAlchemy": "ORG",
	        "Juan": "PER",
	        "Maria": "PER",
	        "Carlos": "PER",
	        "Jose": "PER",
	        "Pedro": "PER",
	        "Ana": "PER",
	    }

	    def __init__(self, model_name: str = "dccuchile/bert-base-spanish-wwm-cased"):
	        """Store the model name; nothing is loaded until ``initialize()``.

	        Args:
	            model_name: Hugging Face model identifier, used for both the
	                model and the tokenizer.
	        """
	        self.model_name = model_name
	        self.pipeline: Any = None
	        self._initialized = False

	    def initialize(self):
	        """Load the Hugging Face pipeline on first call; later calls are no-ops.

	        ``transformers`` is imported here rather than at module level so that
	        importing this module does not fail when transformers/torch is
	        missing, and so unit tests start quickly. Any failure while importing
	        or building the pipeline is logged and leaves ``self.pipeline`` as
	        ``None``, which makes ``extract_entities`` use the regex fallback.
	        """
	        if self._initialized:
	            return

	        logger.info("Initializing NER model: %s...", self.model_name)
	        try:
	            from transformers import pipeline

	            self.pipeline = pipeline(
	                "ner",
	                model=self.model_name,
	                tokenizer=self.model_name,
	                aggregation_strategy="simple",
	            )
	            logger.info("NER model loaded successfully.")
	        except Exception as e:
	            # Deliberately broad: a missing package, a failed download or a
	            # bad checkpoint must all degrade to the fallback, not crash.
	            logger.error("Error loading NER model: %s", e)
	            logger.warning(
	                "Falling back to a mock regex entity extractor for development/testing."
	            )
	            self.pipeline = None

	        # Mark as initialized on both paths so a failed load is not retried
	        # on every call.
	        self._initialized = True

	    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
	        """Extract named entities from Spanish text.

	        Args:
	            text: Input text. Empty or whitespace-only text yields ``[]``.

	        Returns:
	            A list of dicts with the keys ``word``, ``entity_group``,
	            ``score`` (float), ``start`` and ``end`` (int offsets). Results
	            come from the model pipeline when it is available, otherwise
	            (or if the pipeline raises) from the regex fallback.
	        """
	        self.initialize()

	        if not text or not text.strip():
	            return []

	        if not self.pipeline:
	            return self._mock_extraction(text)

	        try:
	            return [
	                {
	                    "word": ent.get("word"),
	                    "entity_group": ent.get("entity_group"),
	                    "score": float(ent.get("score")),
	                    "start": int(ent.get("start")),
	                    "end": int(ent.get("end")),
	                }
	                for ent in self.pipeline(text)
	            ]
	        except Exception as e:
	            # Best effort: an inference or conversion failure degrades to the
	            # fallback extractor instead of propagating to the caller.
	            logger.error("Error executing NER pipeline: %s", e)
	            return self._mock_extraction(text)

	    @staticmethod
	    def _fold_accents(word: str) -> str:
	        """Return ``word`` with diacritics removed (e.g. "Bogotá" -> "Bogota")."""
	        # Function-scope import, like the lazy ``transformers`` import above.
	        import unicodedata

	        decomposed = unicodedata.normalize("NFD", word)
	        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

	    def _mock_extraction(self, text: str) -> List[Dict[str, Any]]:
	        """Regex-based fallback extractor for development/testing environments.

	        Every word of two or more characters that starts with an uppercase
	        letter is reported as an entity with score ``1.0``. Its group comes
	        from a small lookup table keyed by the accent-free spelling, and
	        defaults to ``"MISC"``.

	        Accented words are matched as whole tokens, so "María" and "Bogotá"
	        are found and classified like "Maria" and "Bogota". The returned
	        ``word`` and offsets always refer to the original text.
	        """
	        entities = []
	        for match in self._MOCK_TOKEN_RE.finditer(text):
	            word = match.group()
	            # Only capitalized words are treated as entity candidates.
	            if not word[0].isupper():
	                continue

	            start, end = match.span()
	            entities.append({
	                "word": word,
	                "entity_group": self._MOCK_ENTITY_GROUPS.get(
	                    self._fold_accents(word), "MISC"
	                ),
	                "score": 1.0,
	                "start": start,
	                "end": end,
	            })
	        return entities

	# Global instance of the engine, shared by importers of this module.
	# Constructing it only stores the model name; the model itself is loaded
	# lazily by initialize(), which extract_entities() calls on first use.
	ner_engine = NEREngine()