Spaces:

Artyomorax
/

CogniaHub

Sleeping

App Files Community

Artyomorax committed on Nov 18, 2025

Commit

e0b6f48

verified ·

1 Parent(s): e1e3707

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -1

app.py CHANGED Viewed

@@ -40,14 +40,17 @@ logger.info("✅ Model loaded successfully")
 # ==================== Utility functions ====================
 def syllables_to_phonetic(syllables):
     return "".join(syllables).replace("-", "").replace(" ", "").lower()
 def phonetic_similarity(s1, s2):
     s1_clean = s1.replace(" ", "").replace("-", "").lower()
     s2_clean = s2.replace(" ", "").replace("-", "").lower()
     return SequenceMatcher(None, s1_clean, s2_clean).ratio()
 def syllable_similarity(syll1, syll2):
     s1_phonetic = syllables_to_phonetic(syll1)
     s2_phonetic = syllables_to_phonetic(syll2)
     phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
@@ -55,6 +58,7 @@ def syllable_similarity(syll1, syll2):
     return (phonetic_score * 0.8) + (count_match * 0.2)
 def find_closest_word_in_dictionary(transcription):
     trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
     best_word = None
     best_syllables = []
@@ -70,6 +74,7 @@ def find_closest_word_in_dictionary(transcription):
     return best_word, best_syllables, best_score
 def find_best_match(transcription, target_word):
     trans_clean = transcription.replace("-", " ").lower().strip()
     target_clean = target_word.lower().strip()
     target_sylls = word_syllables.get(target_clean, [])
@@ -77,10 +82,12 @@ def find_best_match(transcription, target_word):
         logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
         return trans_clean, trans_clean.split(), 0.5
     if target_clean in trans_clean:
         return target_clean, target_sylls, 1.0
     trans_nospace = trans_clean.replace(" ", "")
     target_nospace = target_clean.replace(" ", "")
     if target_nospace in trans_nospace:
         return target_clean, target_sylls, 0.95
     words = trans_clean.split()
     best_match = ""
@@ -94,6 +101,7 @@ def find_best_match(transcription, target_word):
                 best_score = score
                 best_match = word
                 best_syllables = word_sylls
     for i in range(len(words)):
         for j in range(i + 1, min(i + 4, len(words) + 1)):
             combo = "".join(words[i:j])
@@ -105,21 +113,28 @@ def find_best_match(transcription, target_word):
                     best_score = score
                     best_match = combo_display
                     best_syllables = combo_sylls
             else:
                 score = phonetic_similarity(combo, target_clean)
                 if score > best_score:
                     best_score = score
                     best_match = combo_display
                     best_syllables = combo_display.split()
     if best_score < 0.6:
         dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
         if dict_score > best_score:
             best_score = dict_score
             best_match = dict_word
             best_syllables = dict_sylls
     return best_match, best_syllables, best_score
 def get_embedding(syllables):
     token_ids = []
     for s in syllables:
         ids = processor.tokenizer(s, add_special_tokens=False).input_ids
@@ -131,6 +146,7 @@ def get_embedding(syllables):
     return embeds.mean(dim=0)
 def calculate_pronunciation_score(cosine_sim, match_score):
     combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
     if match_score < 0.5:
         combined_score *= 0.7
@@ -145,6 +161,11 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
     request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
     start_time = datetime.now()
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         content = await audio_file.read()
         file_size = len(content)
@@ -154,11 +175,14 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
     try:
         # Load audio
         audio, sr = librosa.load(tmp_file_path, sr=SR)
         # ===== Noise reduction =====
-        noise_sample = audio[:int(0.3*SR)]  # first 0.3s as noise profile
         audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
         # Process through Wav2Vec2
         input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
@@ -166,6 +190,7 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
             logits = model(input_values).logits
         pred_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
         # Match target word
         target_word_clean = target_word.lower().strip()
@@ -217,14 +242,17 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
             }
         }
         return result
     finally:
         if os.path.exists(tmp_file_path):
             os.remove(tmp_file_path)
 @app.get("/")
 def home():
     return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
 @app.get("/health")

 # ==================== Utility functions ====================
 def syllables_to_phonetic(syllables):
+    """Convert syllables list to phonetic string for comparison"""
     return "".join(syllables).replace("-", "").replace(" ", "").lower()
 def phonetic_similarity(s1, s2):
+    """Calculate phonetic similarity between two strings"""
     s1_clean = s1.replace(" ", "").replace("-", "").lower()
     s2_clean = s2.replace(" ", "").replace("-", "").lower()
     return SequenceMatcher(None, s1_clean, s2_clean).ratio()
 def syllable_similarity(syll1, syll2):
+    """Compare two syllable lists and return similarity score"""
     s1_phonetic = syllables_to_phonetic(syll1)
     s2_phonetic = syllables_to_phonetic(syll2)
     phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
     return (phonetic_score * 0.8) + (count_match * 0.2)
 def find_closest_word_in_dictionary(transcription):
+    """Search dictionary for closest matching word"""
     trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
     best_word = None
     best_syllables = []
     return best_word, best_syllables, best_score
 def find_best_match(transcription, target_word):
+    """Find best matching segment in transcription corresponding to target word"""
     trans_clean = transcription.replace("-", " ").lower().strip()
     target_clean = target_word.lower().strip()
     target_sylls = word_syllables.get(target_clean, [])
         logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
         return trans_clean, trans_clean.split(), 0.5
     if target_clean in trans_clean:
+        logger.info(f"✓ Direct match found: '{target_clean}'")
         return target_clean, target_sylls, 1.0
     trans_nospace = trans_clean.replace(" ", "")
     target_nospace = target_clean.replace(" ", "")
     if target_nospace in trans_nospace:
+        logger.info(f"✓ No-space match found: '{target_nospace}'")
         return target_clean, target_sylls, 0.95
     words = trans_clean.split()
     best_match = ""
                 best_score = score
                 best_match = word
                 best_syllables = word_sylls
+                logger.info(f"  Single word match: '{word}' (score: {score:.2f})")
     for i in range(len(words)):
         for j in range(i + 1, min(i + 4, len(words) + 1)):
             combo = "".join(words[i:j])
                     best_score = score
                     best_match = combo_display
                     best_syllables = combo_sylls
+                    logger.info(f"  Combo match: '{combo}' (score: {score:.2f})")
             else:
                 score = phonetic_similarity(combo, target_clean)
                 if score > best_score:
                     best_score = score
                     best_match = combo_display
                     best_syllables = combo_display.split()
+                    logger.info(f"  Phonetic match: '{combo}' (score: {score:.2f})")
     if best_score < 0.6:
+        logger.info("  Searching entire dictionary for closest match...")
         dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
         if dict_score > best_score:
             best_score = dict_score
             best_match = dict_word
             best_syllables = dict_sylls
+            logger.info(f"  Dictionary match used: '{dict_word}' (score: {dict_score:.2f})")
+    logger.info(f"🎯 Best match for '{target_clean}': '{best_match}' (score: {best_score:.2f})")
+    logger.info(f"   Matched syllables: {best_syllables}")
     return best_match, best_syllables, best_score
 def get_embedding(syllables):
+    """Generate embeddings from syllables"""
     token_ids = []
     for s in syllables:
         ids = processor.tokenizer(s, add_special_tokens=False).input_ids
     return embeds.mean(dim=0)
 def calculate_pronunciation_score(cosine_sim, match_score):
+    """Combine embedding similarity and transcription match into final score"""
     combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
     if match_score < 0.5:
         combined_score *= 0.7
     request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
     start_time = datetime.now()
+    logger.info(f"\n{'='*80}")
+    logger.info(f"📥 NEW REQUEST [{request_id}]")
+    logger.info(f"📁 File: {audio_file.filename} ({audio_file.content_type})")
+    logger.info(f"🎯 Target word: '{target_word}'")
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         content = await audio_file.read()
         file_size = len(content)
     try:
         # Load audio
+        logger.info(f"🎵 Loading audio from: {tmp_file_path}")
         audio, sr = librosa.load(tmp_file_path, sr=SR)
+        logger.info(f"🎵 Audio shape: {audio.shape}, duration: {len(audio)/SR:.2f}s")
         # ===== Noise reduction =====
+        noise_sample = audio[:int(0.3*SR)]
         audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
+        logger.info(f"🔊 Noise reduction applied")
         # Process through Wav2Vec2
         input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
             logits = model(input_values).logits
         pred_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
+        logger.info(f"📝 RAW TRANSCRIPTION: '{transcription}'")
         # Match target word
         target_word_clean = target_word.lower().strip()
             }
         }
+        logger.info(f"📤 RESPONSE: {result}")
         return result
     finally:
         if os.path.exists(tmp_file_path):
             os.remove(tmp_file_path)
+            logger.debug(f"🗑️ Cleaned up temporary file: {tmp_file_path}")
 @app.get("/")
 def home():
+    logger.info("📍 Health check endpoint called")
     return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
 @app.get("/health")