Spaces:

DazaC
/

giragroup-bi-backend

Sleeping

Adzacam

feat: implement Jaro-Winkler fuzzy matching for student linking and add data quality diagnostic endpoint

8d50119 23 days ago

2.68 kB

	import re
	import unicodedata

	def normalize_text(text: str) -> str:
	    """
	    Reduce a string to a canonical form for comparison.

	    The result is lowercase, has combining marks (accents) stripped, keeps
	    only ASCII letters, digits and whitespace, and has every run of
	    whitespace collapsed to a single space with the ends trimmed.
	    A falsy input (``None``, ``""``) yields ``""``.
	    """
	    if not text:
	        return ""

	    # NFD splits accented letters into base letter + combining mark;
	    # dropping category 'Mn' code points then removes the marks only.
	    decomposed = unicodedata.normalize('NFD', text)
	    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
	    lowered = ''.join(base_chars).lower()

	    # Anything that is not a-z, 0-9 or whitespace is deleted (not replaced),
	    # so "perez-garcia" becomes "perezgarcia".
	    cleaned = re.sub(r'[^a-z0-9\s]', '', lowered)

	    return re.sub(r'\s+', ' ', cleaned).strip()

	def jaro_winkler_similarity(s1: str, s2: str) -> float:
	    """
	    Pure Python implementation of Jaro-Winkler similarity.

	    Both inputs are passed through ``normalize_text`` before comparison, so
	    the score reflects the normalized forms, not the raw strings.

	    Returns a float between 0.0 and 1.0:
	      * 0.0 if either normalized string is empty (including when both are),
	      * 1.0 if the normalized strings are identical and non-empty,
	      * otherwise the Jaro score boosted by the length of the common prefix
	        (at most 4 characters, scaling factor 0.1).
	    """
	    s1 = normalize_text(s1)
	    s2 = normalize_text(s2)

	    # Emptiness is tested BEFORE equality on purpose: two inputs that both
	    # normalize to "" (blank values, punctuation only, or text with no
	    # a-z/0-9 characters at all) carry no comparable content and must not
	    # be reported as a perfect match.
	    if not s1 or not s2:
	        return 0.0
	    if s1 == s2:
	        return 1.0

	    len1, len2 = len(s1), len(s2)

	    # Characters count as matching only within this window of each other.
	    # Clamped at 0 so the window is never negative for very short strings.
	    match_distance = max(max(len1, len2) // 2 - 1, 0)

	    s1_matches = [False] * len1
	    s2_matches = [False] * len2

	    matches = 0
	    for i, ch in enumerate(s1):
	        start = max(0, i - match_distance)
	        end = min(i + match_distance + 1, len2)
	        for j in range(start, end):
	            # Each character of s2 may be paired at most once.
	            if not s2_matches[j] and s2[j] == ch:
	                s1_matches[i] = True
	                s2_matches[j] = True
	                matches += 1
	                break

	    if matches == 0:
	        return 0.0

	    # Walk the matched characters of both strings in order; every position
	    # where they disagree is half a transposition.
	    matched1 = [c for c, hit in zip(s1, s1_matches) if hit]
	    matched2 = [c for c, hit in zip(s2, s2_matches) if hit]
	    t = sum(a != b for a, b in zip(matched1, matched2)) / 2.0

	    jaro = (matches / len1 + matches / len2 + (matches - t) / matches) / 3.0

	    # Winkler modification: reward a shared prefix of up to 4 characters.
	    prefix = 0
	    for a, b in zip(s1[:4], s2[:4]):
	        if a != b:
	            break
	        prefix += 1

	    # Standard Winkler weight is 0.1
	    return jaro + prefix * 0.1 * (1.0 - jaro)

	def find_best_match(target: str, candidates: list, threshold: float = 0.80):
	    """
	    Find the candidate whose name is most similar to ``target``.

	    Each candidate may be either
	      * an object exposing a ``nombre_completo`` attribute, or
	      * a dict carrying the name under ``'nombre_completo'`` or ``'name'``.
	    Candidates whose name is missing, empty or not a string are skipped.

	    Similarity is computed with ``jaro_winkler_similarity``. When several
	    candidates share the top score, the first one encountered wins.

	    Returns a tuple ``(best_candidate, best_score)`` if ``best_score`` is at
	    least ``threshold``; otherwise ``(None, best_score)``, where
	    ``best_score`` is the highest score seen (0.0 if nothing was scored).
	    """
	    best_score = 0.0
	    best_candidate = None

	    for candidate in candidates:
	        # getattr() never sees dict keys, so dicts need their own lookup.
	        if isinstance(candidate, dict):
	            name = candidate.get('nombre_completo') or candidate.get('name')
	        else:
	            name = getattr(candidate, 'nombre_completo', None)

	        # Skip unusable names instead of letting a stray non-string value
	        # (e.g. a NaN placeholder) abort the whole scan.
	        if not isinstance(name, str) or not name:
	            continue

	        score = jaro_winkler_similarity(target, name)
	        if score > best_score:
	            best_score = score
	            best_candidate = candidate

	    if best_score >= threshold:
	        return best_candidate, best_score
	    return None, best_score