"""Fuzzy name matching helpers based on Jaro-Winkler similarity."""

import re
import unicodedata

# Compiled once at import time; both patterns run on every normalization.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Standard Winkler parameters: scaling factor and maximum rewarded prefix.
_WINKLER_PREFIX_WEIGHT = 0.1
_WINKLER_MAX_PREFIX = 4


def normalize_text(text: str) -> str:
    """Return *text* reduced to a canonical form suitable for comparison.

    The text is stripped of combining accent marks, lower-cased, stripped of
    every character that is not ``a-z``, ``0-9`` or whitespace, and has runs
    of whitespace collapsed to a single space with the ends trimmed.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""

    # Decompose accented characters and drop the combining marks ("Mn").
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    text = text.lower()

    # Remove non-alphanumeric characters (whitespace is kept for now).
    text = _NON_ALNUM_RE.sub('', text)

    # Collapse whitespace runs and trim the ends.
    return _WHITESPACE_RE.sub(' ', text).strip()


def _jaro_winkler_normalized(s1: str, s2: str) -> float:
    """Return the Jaro-Winkler similarity of two already-normalized strings."""
    if s1 == s2:
        return 1.0

    len1, len2 = len(s1), len(s2)
    if len1 == 0 or len2 == 0:
        return 0.0

    # Characters only count as matching within this distance of each other.
    # Clamped at 0 so two 1-character strings do not get a negative window.
    match_distance = max(max(len1, len2) // 2 - 1, 0)

    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0

    for i, char in enumerate(s1):
        start = max(0, i - match_distance)
        end = min(i + match_distance + 1, len2)
        for j in range(start, end):
            if s2_matches[j] or char != s2[j]:
                continue
            s1_matches[i] = True
            s2_matches[j] = True
            matches += 1
            break

    if matches == 0:
        return 0.0

    # Matched characters that appear in a different order in the two strings
    # are transpositions; each swapped pair is counted twice, hence the / 2.
    matched1 = [c for c, hit in zip(s1, s1_matches) if hit]
    matched2 = [c for c, hit in zip(s2, s2_matches) if hit]
    transpositions = sum(a != b for a, b in zip(matched1, matched2)) / 2.0

    jaro = (
        matches / len1
        + matches / len2
        + (matches - transpositions) / matches
    ) / 3.0

    # Winkler modification: reward a shared prefix of up to 4 characters.
    prefix = 0
    for a, b in zip(s1[:_WINKLER_MAX_PREFIX], s2[:_WINKLER_MAX_PREFIX]):
        if a != b:
            break
        prefix += 1

    return jaro + prefix * _WINKLER_PREFIX_WEIGHT * (1.0 - jaro)


def jaro_winkler_similarity(s1: str, s2: str) -> float:
    """Return the Jaro-Winkler similarity of *s1* and *s2*.

    Pure Python implementation. Both inputs are passed through
    ``normalize_text`` first, so the comparison ignores case, accents,
    punctuation and repeated whitespace.

    Returns a float between 0.0 and 1.0. Inputs that normalize to the same
    string (including two empty strings) score 1.0; if exactly one of them
    normalizes to the empty string the score is 0.0.
    """
    return _jaro_winkler_normalized(normalize_text(s1), normalize_text(s2))


def _score_candidates(target: str, candidates: list):
    """Yield ``(candidate, name, score)`` for each candidate with a usable name."""
    # The target is the same for every candidate, so normalize it only once.
    normalized_target = normalize_text(target)
    if not normalized_target:
        # A blank target cannot meaningfully match anything.
        return

    for candidate in candidates:
        # Assuming candidate is an object with 'nombre_completo' attribute
        name = getattr(candidate, 'nombre_completo', None)
        if not name:
            continue
        normalized_name = normalize_text(name)
        if not normalized_name:
            # Names made only of punctuation/symbols carry no information.
            continue
        yield candidate, name, _jaro_winkler_normalized(
            normalized_target, normalized_name
        )


def find_best_match(target: str, candidates: list, threshold: float = 0.80):
    """Find the candidate whose name is most similar to *target*.

    Args:
        target: The name to look for.
        candidates: Objects exposing a ``nombre_completo`` attribute.
            Candidates without that attribute, or whose name is empty or
            normalizes to an empty string, are skipped.
        threshold: Minimum similarity for the best candidate to be accepted.

    Returns:
        ``(best_candidate, best_score)`` when the best score reaches
        *threshold*; otherwise ``(None, best_score)``, where ``best_score``
        is the highest score seen (0.0 if nothing could be scored). On ties
        the earliest candidate wins.
    """
    best_score = 0.0
    best_candidate = None

    for candidate, _name, score in _score_candidates(target, candidates):
        if score > best_score:
            best_score = score
            best_candidate = candidate

    if best_score >= threshold:
        return best_candidate, best_score
    return None, best_score


def get_top_matches(
    target: str,
    candidates: list,
    top_k: int = 5,
    min_score: float = 0.4,
):
    """Return up to *top_k* candidates most similar to *target*.

    Args:
        target: The name to look for.
        candidates: Objects exposing ``nombre_completo`` and
            ``id_estudiante`` attributes. Candidates without a usable
            ``nombre_completo`` are skipped.
        top_k: Maximum number of results; values <= 0 yield an empty list.
        min_score: Only candidates scoring strictly above this are included.

    Returns:
        A list of ``{"id", "nombre", "score"}`` dicts sorted by descending
        score; candidates with equal scores keep their input order.
    """
    if top_k <= 0:
        return []

    results = [
        {"id": candidate.id_estudiante, "nombre": name, "score": score}
        for candidate, name, score in _score_candidates(target, candidates)
        if score > min_score  # Only include decent candidates
    ]
    results.sort(key=lambda item: item["score"], reverse=True)
    return results[:top_k]