# duplicate_detection_config.py
"""
Configuration helper for duplicate detection algorithms.
Maps user selection to actual algorithm settings.

Performance note:
    This module is used in tight loops during glossary deduplication.
    Optional dependencies are imported once at module import time to avoid
    repeated ImportError exceptions.
"""

import math
import os
from difflib import SequenceMatcher as _SequenceMatcher

# Optional dependencies (imported once to avoid per-comparison ImportError overhead)
try:
    from rapidfuzz import fuzz as _rf_fuzz  # type: ignore
    _HAS_RAPIDFUZZ = True
except (ImportError, OSError):
    _rf_fuzz = None
    _HAS_RAPIDFUZZ = False

try:
    import jellyfish as _jellyfish  # type: ignore
    _HAS_JELLYFISH = True
except (ImportError, OSError):
    _jellyfish = None
    _HAS_JELLYFISH = False


def _env_float(name, default):
    """Read environment variable *name* as a finite float.

    Returns *default* when the variable is unset, empty/whitespace-only,
    not parseable as a float, or parses to NaN/infinity.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = float(raw)
    except ValueError:
        return default
    return value if math.isfinite(value) else default


def get_duplicate_detection_config():
    """
    Get the duplicate detection configuration based on environment variables.

    Reads:
        GLOSSARY_DUPLICATE_ALGORITHM: preset name ('auto', 'strict',
            'balanced', 'aggressive', 'basic'); case-insensitive, surrounding
            whitespace ignored. Unknown or missing values fall back to 'auto'.
        GLOSSARY_FUZZY_THRESHOLD: base similarity threshold (default 0.90).
        GLOSSARY_PARTIAL_RATIO_WEIGHT: multiplier applied to the partial-ratio
            score (default 0.45), clamped to [0.0, 1.0].

    Empty, unparsable, or non-finite numeric values fall back to their
    defaults instead of raising.

    Returns:
        dict: A fresh dict with 'algorithms', 'threshold', 'description',
        'adjust_threshold' and 'partial_ratio_weight'. 'algorithms' only
        lists algorithms usable in this environment and is never empty.
    """
    # Get selected algorithm from environment (set by GUI)
    selected = os.getenv('GLOSSARY_DUPLICATE_ALGORITHM', 'auto').strip().lower()

    # Get base threshold from slider (can be overridden by preset)
    base_threshold = _env_float('GLOSSARY_FUZZY_THRESHOLD', 0.90)
    partial_weight = _env_float('GLOSSARY_PARTIAL_RATIO_WEIGHT', 0.45)
    partial_weight = max(0.0, min(1.0, partial_weight))

    configs = {
        'auto': {
            'algorithms': ['basic', 'token_sort', 'partial', 'jaro_winkler'],
            'threshold': base_threshold,
            'description': 'Auto - Uses all algorithms, best score wins',
            'adjust_threshold': False,  # Don't override user threshold
        },
        'strict': {
            'algorithms': ['basic'],
            'threshold': max(base_threshold, 0.95),  # Enforce minimum 95%
            'description': 'Strict - High precision, minimal merging',
            'adjust_threshold': True,  # Override to at least 95%
        },
        'balanced': {
            'algorithms': ['token_sort', 'partial'],
            'threshold': base_threshold,
            'description': 'Balanced - Token + Partial matching',
            'adjust_threshold': False,
        },
        'aggressive': {
            'algorithms': ['basic', 'token_sort', 'partial', 'jaro_winkler'],
            'threshold': min(base_threshold, 0.80),  # Lower to at most 80%
            'description': 'Aggressive - Maximum duplicate detection',
            'adjust_threshold': True,  # Override to at most 80%
        },
        'basic': {
            'algorithms': ['basic'],
            'threshold': base_threshold,
            'description': 'Basic - Simple Levenshtein distance',
            'adjust_threshold': False,
        },
    }

    config = configs.get(selected, configs['auto'])

    # Filter out algorithms that aren't available in this environment.
    # This avoids misleading config and prevents slow per-comparison
    # ImportError handling.
    unavailable = set()
    if partial_weight <= 0:
        # A zero weight would always contribute a score of 0, so skip it.
        unavailable.add('partial')
    if not _HAS_RAPIDFUZZ:
        unavailable.update(('token_sort', 'partial'))
    if not _HAS_JELLYFISH:
        unavailable.add('jaro_winkler')

    algorithms = [a for a in config.get('algorithms', []) if a not in unavailable]
    if not algorithms:
        # 'basic' always works thanks to the difflib fallback.
        algorithms = ['basic']

    # Return a copy so callers can safely mutate without affecting defaults.
    resolved = dict(config)
    resolved['algorithms'] = algorithms
    resolved['partial_ratio_weight'] = partial_weight
    return resolved


def calculate_similarity_with_config(name1, name2, config=None):
    """
    Calculate similarity between two names using configured algorithms.

    Comparison is case-insensitive for every algorithm: both inputs are
    converted with str() and lowercased once before scoring.

    Args:
        name1: First name
        name2: Second name
        config: Configuration dict (from get_duplicate_detection_config()).
            When None, the configuration is resolved from the environment.
            A missing 'partial_ratio_weight' key is treated as 1.0.

    Returns:
        float: Best similarity score (0.0-1.0) across the enabled algorithms;
        0.0 if either name is falsy, 1.0 on a case-insensitive exact match.
    """
    if not name1 or not name2:
        return 0.0

    # Normalize once (hot path)
    n1_lower = str(name1).lower()
    n2_lower = str(name2).lower()

    # Quick exact match
    if n1_lower == n2_lower:
        return 1.0

    # Get config if not provided
    if config is None:
        config = get_duplicate_detection_config()

    algorithms = config.get('algorithms', [])
    partial_weight = max(0.0, min(1.0, config.get('partial_ratio_weight', 1.0)))

    best = 0.0

    # Basic ratio (Levenshtein-like)
    if 'basic' in algorithms:
        if _HAS_RAPIDFUZZ:
            basic_score = _rf_fuzz.ratio(n1_lower, n2_lower) / 100.0
        else:
            basic_score = _SequenceMatcher(None, n1_lower, n2_lower).ratio()
        best = max(best, basic_score)

    # Token sort (word order insensitive) + partial ratio (substring matching)
    if _HAS_RAPIDFUZZ:
        if 'token_sort' in algorithms:
            best = max(best, _rf_fuzz.token_sort_ratio(n1_lower, n2_lower) / 100.0)

        if 'partial' in algorithms and partial_weight > 0:
            partial_score = _rf_fuzz.partial_ratio(n1_lower, n2_lower) / 100.0
            best = max(best, partial_score * partial_weight)

    # Jaro-Winkler (designed for names). Uses the lowercased strings so the
    # score is case-insensitive like every other algorithm above.
    if 'jaro_winkler' in algorithms and _HAS_JELLYFISH:
        best = max(best, _jellyfish.jaro_winkler_similarity(n1_lower, n2_lower))

    return best


def get_algorithm_display_info():
    """Get information about which algorithms are actually available.

    Returns:
        list: Display names - "RapidFuzz" or "difflib (fallback)", followed
        by "Jaro-Winkler" when jellyfish is importable.
    """
    available = ["RapidFuzz" if _HAS_RAPIDFUZZ else "difflib (fallback)"]
    if _HAS_JELLYFISH:
        available.append("Jaro-Winkler")
    return available


if __name__ == "__main__":
    from shutdown_utils import run_cli_main

    def _main():
        # Test the configuration
        print("Testing duplicate detection configuration...\n")

        # Test all modes
        modes = ['auto', 'strict', 'balanced', 'aggressive', 'basic']
        for mode in modes:
            os.environ['GLOSSARY_DUPLICATE_ALGORITHM'] = mode
            os.environ['GLOSSARY_FUZZY_THRESHOLD'] = '0.90'

            config = get_duplicate_detection_config()
            print(f"{mode.upper()}:")
            print(f" Description: {config['description']}")
            print(f" Algorithms: {', '.join(config['algorithms'])}")
            print(f" Threshold: {config['threshold']:.2f}")
            print()

        # Test similarity calculation
        print("\nTesting similarity calculations (AUTO mode):")
        os.environ['GLOSSARY_DUPLICATE_ALGORITHM'] = 'auto'

        test_pairs = [
            ("Kim Sang-hyun", "Kim Sanghyun"),
            ("Park Ji-sung", "Ji-sung Park"),
            ("김상현", "김상현님"),
            ("Catherine", "Katherine"),
        ]

        for name1, name2 in test_pairs:
            score = calculate_similarity_with_config(name1, name2)
            status = "✓ MATCH" if score >= 0.90 else "✗ DIFFERENT"
            print(f"{status} '{name1}' vs '{name2}': {score:.3f}")

        print(f"\nAvailable algorithms: {', '.join(get_algorithm_display_info())}")
        return 0

    run_cli_main(_main)