# Spaces: DazaC / giragroup-bi-backend (Sleeping)
# File size: 3,183 Bytes

import re
import unicodedata

def normalize_text(text: str) -> str:
    """
    Reduce a string to a canonical form suitable for fuzzy comparison.

    The result is lowercase, has combining accent marks removed, contains
    only the characters a-z, 0-9 and single spaces, and has no leading or
    trailing whitespace. Falsy input (None, "") yields "".
    """
    if not text:
        return ""

    # NFD splits accented letters into base letter + combining mark ('Mn'),
    # so dropping the marks leaves the bare letter behind.
    decomposed = unicodedata.normalize('NFD', text)
    unaccented = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    # Anything that is not a lowercase letter, digit or whitespace is deleted
    # outright (not replaced by a space).
    cleaned = re.sub(r'[^a-z0-9\s]', '', unaccented.lower())

    # Runs of whitespace become one space; outer whitespace is trimmed.
    return re.sub(r'\s+', ' ', cleaned).strip()

def jaro_winkler_similarity(s1: str, s2: str) -> float:
    """
    Score how alike two strings are using the Jaro-Winkler metric.

    Both inputs are passed through normalize_text before being compared.
    Returns 1.0 when the normalized strings are equal, 0.0 when exactly one
    of them is empty or they share no matching characters, and otherwise the
    Jaro score boosted for a common prefix of up to four characters.
    """
    a = normalize_text(s1)
    b = normalize_text(s2)

    if a == b:
        return 1.0
    if not a or not b:
        return 0.0

    size_a, size_b = len(a), len(b)

    # Two equal characters only count as a match when their positions are
    # at most this far apart.
    window = max(size_a, size_b) // 2 - 1

    hit_a = [False] * size_a
    hit_b = [False] * size_b

    # Greedily pair each character of `a` with the first unused equal
    # character of `b` inside the window.
    for pos_a, ch in enumerate(a):
        lo = max(0, pos_a - window)
        hi = min(pos_a + window + 1, size_b)
        for pos_b in range(lo, hi):
            if not hit_b[pos_b] and b[pos_b] == ch:
                hit_a[pos_a] = hit_b[pos_b] = True
                break

    common = sum(hit_a)
    if common == 0:
        return 0.0

    # Matched characters, in order of appearance in each string; positions
    # where the two sequences disagree are half-transpositions.
    matched_a = [ch for ch, hit in zip(a, hit_a) if hit]
    matched_b = [ch for ch, hit in zip(b, hit_b) if hit]
    transpositions = sum(x != y for x, y in zip(matched_a, matched_b)) / 2.0

    jaro = (common / size_a + common / size_b + (common - transpositions) / common) / 3.0

    # Winkler modification: reward a shared prefix, capped at 4 characters.
    shared_prefix = 0
    for x, y in zip(a[:4], b[:4]):
        if x != y:
            break
        shared_prefix += 1

    # Standard Winkler weight is 0.1
    return jaro + shared_prefix * 0.1 * (1.0 - jaro)

def find_best_match(target: str, candidates: list, threshold: float = 0.80):
    """
    Pick the candidate whose name is most similar to 'target'.

    Each candidate is expected to be an object exposing a 'nombre_completo'
    attribute; candidates without it (or with a falsy value) are skipped.
    Similarity is computed with jaro_winkler_similarity, and on ties the
    earliest candidate wins.

    Returns (best_candidate, best_score) when the best score reaches
    'threshold', otherwise (None, best_score) so the caller can still see
    how close the nearest miss was.
    """
    winner = None
    top_score = 0.0

    for entry in candidates:
        label = getattr(entry, 'nombre_completo', None)
        if label:
            similarity = jaro_winkler_similarity(target, label)
            # Strict comparison keeps the first of several equal scores.
            if similarity > top_score:
                top_score, winner = similarity, entry

    return (winner if top_score >= threshold else None), top_score

def get_top_matches(target: str, candidates: list, top_k: int = 5):
    """
    Rank candidates by name similarity to 'target' and return the best few.

    Candidates are read through their 'nombre_completo' attribute; those
    without it (or with a falsy value) are skipped. Only candidates scoring
    above 0.4 are kept. Each result is a dict with keys "id" (the candidate's
    'id_estudiante'), "nombre" and "score", ordered by descending score and
    truncated to the first 'top_k' entries.
    """
    ranked = []
    for entry in candidates:
        label = getattr(entry, 'nombre_completo', None)
        if label:
            similarity = jaro_winkler_similarity(target, label)
            if similarity > 0.4:  # Only include decent candidates
                ranked.append({"id": entry.id_estudiante, "nombre": label, "score": similarity})

    # sorted() is stable, so equally scored candidates keep their input order.
    return sorted(ranked, key=lambda item: item["score"], reverse=True)[:top_k]