# advanced_duplicate_detection.py """ Advanced duplicate detection for glossary entries. Uses multiple algorithms and takes the best match. """ def get_similarity_score(name1, name2, threshold=0.90): """ Calculate similarity using multiple algorithms and return the best score. Args: name1: First name to compare name2: Second name to compare threshold: Minimum similarity threshold (0.0-1.0) Returns: float: Best similarity score from all algorithms (0.0-1.0) """ if not name1 or not name2: return 0.0 # Quick exact match check if name1.lower() == name2.lower(): return 1.0 scores = [] # Try RapidFuzz first (fastest) try: from rapidfuzz import fuzz # Basic ratio ratio = fuzz.ratio(name1.lower(), name2.lower()) / 100.0 scores.append(ratio) # Token sort (handles word order) token_sort = fuzz.token_sort_ratio(name1.lower(), name2.lower()) / 100.0 scores.append(token_sort) # Partial ratio (substring matching) partial = fuzz.partial_ratio(name1.lower(), name2.lower()) / 100.0 scores.append(partial) except ImportError: pass # Try TheFuzz/FuzzyWuzzy (more sophisticated) try: from thefuzz import fuzz as tfuzz # Token set ratio (best for name variations) token_set = tfuzz.token_set_ratio(name1, name2) / 100.0 scores.append(token_set) except ImportError: pass # Try Jellyfish (phonetic matching for names) try: import jellyfish # Jaro-Winkler (designed for names, prioritizes prefix matches) jaro = jellyfish.jaro_winkler_similarity(name1, name2) scores.append(jaro) except ImportError: pass # Try TextDistance (additional algorithms) try: import textdistance # Jaro-Winkler from textdistance jw = textdistance.jaro_winkler.normalized_similarity(name1, name2) scores.append(jw) # DamerauLevenshtein (handles transpositions) dl = textdistance.damerau_levenshtein.normalized_similarity(name1, name2) scores.append(dl) except ImportError: pass # Fallback to difflib if no libraries available if not scores: from difflib import SequenceMatcher ratio = SequenceMatcher(None, name1.lower(), name2.lower()).ratio() scores.append(ratio) # Return the maximum score from all algorithms best_score = max(scores) if scores else 0.0 return best_score def find_duplicates_advanced(entries, threshold=0.90, debug=False): """ Find duplicates using advanced multi-algorithm approach. Args: entries: List of dict entries with 'raw_name' field threshold: Similarity threshold (0.0-1.0) debug: Print debug information Returns: tuple: (deduplicated_entries, removed_count, duplicate_pairs) """ from remove_honorifics import remove_honorifics # Your existing function seen_entries = [] # List of (cleaned_name, original_entry) deduplicated = [] duplicate_pairs = [] # Track what was merged removed_count = 0 if debug: print(f"[AdvancedDedup] Processing {len(entries)} entries with threshold {threshold:.2f}") for idx, entry in enumerate(entries): raw_name = entry.get('raw_name', '') if not raw_name: continue # Clean the name (remove honorifics) cleaned_name = remove_honorifics(raw_name) # Check against all seen entries is_duplicate = False best_match_score = 0.0 matched_with = None for seen_clean, seen_entry in seen_entries: # Get similarity score using multiple algorithms score = get_similarity_score(cleaned_name, seen_clean, threshold) if score >= threshold: is_duplicate = True if score > best_match_score: best_match_score = score matched_with = seen_entry.get('raw_name', '') break if is_duplicate: removed_count += 1 duplicate_pairs.append({ 'duplicate': raw_name, 'original': matched_with, 'score': best_match_score }) if debug and removed_count <= 10: print(f"[AdvancedDedup] Duplicate: '{raw_name}' matches '{matched_with}' (score: {best_match_score:.3f})") else: seen_entries.append((cleaned_name, entry)) deduplicated.append(entry) if debug: print(f"[AdvancedDedup] Removed {removed_count} duplicates") print(f"[AdvancedDedup] Kept {len(deduplicated)} unique entries") return deduplicated, removed_count, duplicate_pairs def get_available_algorithms(): """Check which algorithms are available""" available = [] try: import rapidfuzz available.append("RapidFuzz (Basic + Token)") except ImportError: pass try: import thefuzz available.append("TheFuzz (Token Set)") except ImportError: pass try: import jellyfish available.append("Jellyfish (Jaro-Winkler)") except ImportError: pass try: import textdistance available.append("TextDistance (Multiple)") except ImportError: pass if not available: available.append("difflib (Fallback)") return available if __name__ == "__main__": from shutdown_utils import run_cli_main def _main(): # Test the similarity scoring print("Available algorithms:") for algo in get_available_algorithms(): print(f" ✓ {algo}") print("\nTest cases:") test_pairs = [ ("김상현", "김상현님"), ("Kim Sang-hyun", "Kim Sanghyun"), ("김상현", "김상혁"), ("Park Ji-sung", "Ji-sung Park"), ("田中太郎", "田中太郎さん"), ] for name1, name2 in test_pairs: score = get_similarity_score(name1, name2) status = "✓ MATCH" if score >= 0.90 else "✗ DIFFERENT" print(f"{status} '{name1}' vs '{name2}': {score:.3f}") return 0 run_cli_main(_main)