# giragroup-bi-backend / similarity.py
# Adzacam
# feat: implement Jaro-Winkler fuzzy matching for student linking and add data quality diagnostic endpoint
# 8d50119
# Raw
# History Blame
# 2.68 kB
import re
import unicodedata
def normalize_text(text: str) -> str:
    """Return a canonical, comparison-friendly form of *text*.

    The result is lowercase, has combining marks (accents) stripped, keeps
    only ASCII letters, digits and whitespace, and has every whitespace run
    collapsed to a single space with the ends trimmed. Falsy input (``None``
    or an empty string) yields ``""``.
    """
    if not text:
        return ""

    # Decompose so each accent becomes a separate combining mark ('Mn'),
    # then keep only the characters that are not such marks.
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    lowered = ''.join(base_chars).lower()

    # Drop everything except a-z, 0-9 and whitespace.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)

    # Squeeze whitespace runs down to one space and trim the ends.
    return re.sub(r'\s+', ' ', alnum_only).strip()
def jaro_winkler_similarity(s1: str, s2: str) -> float:
    """
    Pure Python implementation of Jaro-Winkler similarity.

    Both inputs are run through ``normalize_text`` first, so the score is
    computed on the normalized forms rather than on the raw strings.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        A float between 0.0 and 1.0. The result is 0.0 when either input
        normalizes to an empty string (including when both do), and 1.0 when
        the two non-empty normalized strings are equal.
    """
    s1 = normalize_text(s1)
    s2 = normalize_text(s2)

    # Emptiness is checked BEFORE equality: two unrelated inputs that both
    # normalize to "" (None, punctuation-only, characters stripped by the
    # normalizer) must not be reported as a perfect match.
    if not s1 or not s2:
        return 0.0
    if s1 == s2:
        return 1.0

    len1, len2 = len(s1), len(s2)

    # Characters count as matching only if they are at most this many
    # positions apart. Clamped so very short strings never get a negative
    # window.
    match_distance = max(max(len1, len2) // 2 - 1, 0)

    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0

    for i, ch in enumerate(s1):
        start = max(0, i - match_distance)
        end = min(i + match_distance + 1, len2)
        for j in range(start, end):
            # Each character of s2 may be paired with at most one of s1.
            if not s2_matches[j] and s2[j] == ch:
                s1_matches[i] = True
                s2_matches[j] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Walk the matched characters of both strings in order; every position
    # where they disagree is half of a transposition.
    matched1 = [ch for ch, hit in zip(s1, s1_matches) if hit]
    matched2 = [ch for ch, hit in zip(s2, s2_matches) if hit]
    t = sum(a != b for a, b in zip(matched1, matched2)) / 2.0

    jaro = (matches / len1 + matches / len2 + (matches - t) / matches) / 3.0

    # Winkler modification: reward a shared prefix of up to 4 characters.
    prefix = 0
    for a, b in zip(s1[:4], s2[:4]):
        if a != b:
            break
        prefix += 1

    # Standard Winkler weight is 0.1
    return jaro + prefix * 0.1 * (1.0 - jaro)
def find_best_match(target: str, candidates: list, threshold: float = 0.80):
    """
    Find the candidate whose name is most similar to 'target'.

    Each candidate may be either an object exposing a 'nombre_completo'
    attribute, or a dict carrying the name under 'nombre_completo' or 'name'.
    Candidates without a non-empty string name are skipped.

    Args:
        target: The name to look for.
        candidates: Iterable of candidate objects or dicts.
        threshold: Minimum Jaro-Winkler score required to accept a match.

    Returns:
        (best_candidate, best_score) when best_score >= threshold.
        Otherwise (None, best_score), where best_score is the score of the
        closest rejected candidate (0.0 if nothing could be scored).
        On ties the earliest candidate wins.
    """
    best_score = 0.0
    best_candidate = None

    # An empty target cannot be meaningfully linked to anyone.
    if not target:
        return None, best_score

    for candidate in candidates:
        if isinstance(candidate, dict):
            name = candidate.get('nombre_completo') or candidate.get('name')
        else:
            name = getattr(candidate, 'nombre_completo', None)

        # Skip missing, empty or non-text names instead of failing inside
        # the normalizer.
        if not name or not isinstance(name, str):
            continue

        score = jaro_winkler_similarity(target, name)
        if score > best_score:
            best_score = score
            best_candidate = candidate
            # 1.0 is the maximum score and later candidates only replace
            # the best on a strictly greater score, so stop here.
            if best_score >= 1.0:
                break

    if best_score >= threshold:
        return best_candidate, best_score
    return None, best_score