Spaces:

Artyomorax
/

Cognia

Sleeping

Artyomorax committed on Nov 14, 2025

Commit

1c38d45

verified ·

1 Parent(s): 948c612

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -287,15 +287,12 @@ word_syllables = {
     "dula": ["du", "la"]
 }
-import torch.nn.functional as F
-def transcribe(audio_file, expected_word=None):
-    if not audio_file or not expected_word:
-        return ["No audio or expected word provided.", "0.00"]
-    target_word = expected_word.lower().strip()
     input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
     with torch.no_grad():
         logits = model(input_values).logits
@@ -322,24 +319,23 @@ def transcribe(audio_file, expected_word=None):
     ref_emb = get_embedding(target_sylls)
     if pred_emb is None or ref_emb is None:
-        cosine_sim = 0.0
-    else:
-        cosine_sim = F.cosine_similarity(pred_emb.unsqueeze(0), ref_emb.unsqueeze(0)).item()
-    return [transcription, str(round(cosine_sim, 2))]
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
-        gr.Textbox(label="Expected Word (GIF Title)")  # <-- textbox for app to send
     ],
     outputs=[
         gr.Textbox(label="🗣️ Transcription", interactive=False),
         gr.Textbox(label="📈 Cosine Similarity", interactive=False)
     ],
     title="Cognia Wav2Vec2 Pronunciation Practice",
-    description="Upload or record audio, and get similarity score vs the expected word."
 )
 iface.launch()

     "dula": ["du", "la"]
 }
+def transcribe(audio_file, target_word):
+    if not audio_file or not target_word:
+        return "No audio or word selected.", "0.00"
+    # Load and process audio directly
     input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
     with torch.no_grad():
         logits = model(input_values).logits
     ref_emb = get_embedding(target_sylls)
     if pred_emb is None or ref_emb is None:
+        return transcription, "0.00"
+    score = F.cosine_similarity(pred_emb, ref_emb, dim=0).item()
+    return transcription, str(round(score, 4))
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
+        gr.Dropdown(sorted(word_syllables.keys()), label="Select Target Word")
     ],
     outputs=[
         gr.Textbox(label="🗣️ Transcription", interactive=False),
         gr.Textbox(label="📈 Cosine Similarity", interactive=False)
     ],
     title="Cognia Wav2Vec2 Pronunciation Practice",
+    description="Upload or record audio, and get similarity score vs the target Filipino word."
 )
 iface.launch()