Spaces:
Sleeping
Sleeping
Adzacam
feat: implement Jaro-Winkler fuzzy matching for student linking and add data quality diagnostic endpoint
8d50119 | import re | |
| import unicodedata | |
def normalize_text(text: str) -> str:
    """
    Reduce *text* to a canonical form for comparison.

    The result is lowercase, has combining accent marks stripped, contains
    only the characters a-z, 0-9 and single spaces, and has no leading or
    trailing whitespace. Any falsy input (``None`` or ``""``) yields ``""``.

    Characters outside a-z/0-9/whitespace are deleted, not replaced by a
    space, so ``"Muñoz-Díaz"`` becomes ``"munozdiaz"``.
    """
    if not text:
        return ""

    # NFD splits an accented letter into base letter + combining mark;
    # dropping category 'Mn' (nonspacing marks) leaves just the base letter.
    decomposed = unicodedata.normalize('NFD', text)
    without_marks = ''.join(
        ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
    )

    # Lowercase first so the character class below only needs a-z.
    alnum_only = re.sub(r'[^a-z0-9\s]', '', without_marks.lower())

    # Any run of whitespace becomes one space; edges are trimmed.
    return re.sub(r'\s+', ' ', alnum_only).strip()
def jaro_winkler_similarity(s1: str, s2: str) -> float:
    """
    Pure Python implementation of Jaro-Winkler similarity.

    Both inputs are passed through ``normalize_text`` first, so the score is
    computed on the lowercase, accent-free, alphanumeric form of each string.

    Returns a float between 0.0 and 1.0:
      * 0.0 if either normalized string is empty (including when both are),
        or if the strings share no matching characters;
      * 1.0 if the normalized strings are identical and non-empty;
      * otherwise the Jaro score boosted by the length of the common prefix
        (at most 4 characters, weight 0.1 per character).
    """
    s1 = normalize_text(s1)
    s2 = normalize_text(s2)
    len1, len2 = len(s1), len(s2)

    # Emptiness must be checked BEFORE equality. Otherwise two inputs that
    # both normalize to "" (None, "-", "...", or text with no a-z/0-9
    # characters at all) would compare as a perfect 1.0 match even though
    # there is nothing to compare.
    if len1 == 0 or len2 == 0:
        return 0.0
    if s1 == s2:
        return 1.0

    # Characters only count as matching when equal and at most this many
    # positions apart. Clamped at 0 so the window is never negative for
    # one-character strings.
    match_distance = max(0, max(len1, len2) // 2 - 1)

    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0
    for i, ch in enumerate(s1):
        start = max(0, i - match_distance)
        end = min(i + match_distance + 1, len2)
        for j in range(start, end):
            # Each character of s2 may be matched at most once.
            if not s2_matches[j] and ch == s2[j]:
                s1_matches[i] = True
                s2_matches[j] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Transpositions: line up the matched characters of each string in
    # their original order and count the positions where they differ; the
    # Jaro formula uses half of that count.
    matched1 = [c for c, hit in zip(s1, s1_matches) if hit]
    matched2 = [c for c, hit in zip(s2, s2_matches) if hit]
    t = sum(a != b for a, b in zip(matched1, matched2)) / 2.0

    jaro = (matches / len1 + matches / len2 + (matches - t) / matches) / 3.0

    # Winkler modification: reward a shared prefix of up to 4 characters.
    prefix = 0
    for c1, c2 in zip(s1[:4], s2[:4]):
        if c1 != c2:
            break
        prefix += 1

    # Standard Winkler weight is 0.1
    return jaro + prefix * 0.1 * (1.0 - jaro)
def find_best_match(target: str, candidates: list, threshold: float = 0.80):
    """
    Find the candidate whose name is most similar to 'target'.

    Each candidate's name is read from its ``nombre_completo`` attribute.
    Candidates where that attribute is missing or falsy are skipped; plain
    dicts are therefore skipped too, because dict keys are not consulted.
    Similarity is computed with ``jaro_winkler_similarity``. On ties the
    earliest candidate wins.

    Args:
        target: The name to look up.
        candidates: Objects exposing a ``nombre_completo`` attribute.
        threshold: Minimum score for a candidate to be accepted.

    Returns:
        ``(best_candidate, best_score)`` if the best score reaches
        ``threshold``; otherwise ``(None, best_score)``, where
        ``best_score`` is the highest score seen (0.0 if nothing was
        scored). An empty target always yields ``(None, 0.0)``.
    """
    # Mirror the empty-name guard applied to candidates below: a target
    # that is empty -- or becomes empty once normalized, e.g. "-" -- has
    # nothing to compare and must never be linked to a candidate.
    if not target or not normalize_text(target):
        return None, 0.0

    best_score = 0.0
    best_candidate = None
    for candidate in candidates:
        # Assuming candidate is an object with 'nombre_completo' attribute
        name = getattr(candidate, 'nombre_completo', None)
        if not name:
            continue
        score = jaro_winkler_similarity(target, name)
        if score > best_score:
            best_score = score
            best_candidate = candidate
            # A perfect score cannot be beaten under the strict '>' above,
            # so the rest of the list need not be scored.
            if best_score >= 1.0:
                break

    if best_score >= threshold:
        return best_candidate, best_score
    return None, best_score