Artyomorax committed on
Commit
1c38d45
·
verified ·
1 Parent(s): 948c612

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -14
app.py CHANGED
@@ -287,15 +287,12 @@ word_syllables = {
287
  "dula": ["du", "la"]
288
  }
289
 
290
- import torch.nn.functional as F
291
-
292
- def transcribe(audio_file, expected_word=None):
293
- if not audio_file or not expected_word:
294
- return ["No audio or expected word provided.", "0.00"]
295
 
296
- target_word = expected_word.lower().strip()
297
  input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
298
-
299
  with torch.no_grad():
300
  logits = model(input_values).logits
301
 
@@ -322,24 +319,23 @@ def transcribe(audio_file, expected_word=None):
322
  ref_emb = get_embedding(target_sylls)
323
 
324
  if pred_emb is None or ref_emb is None:
325
- cosine_sim = 0.0
326
- else:
327
- cosine_sim = F.cosine_similarity(pred_emb.unsqueeze(0), ref_emb.unsqueeze(0)).item()
 
328
 
329
- return [transcription, str(round(cosine_sim, 2))]
330
-
331
  iface = gr.Interface(
332
  fn=transcribe,
333
  inputs=[
334
  gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
335
- gr.Textbox(label="Expected Word (GIF Title)") # <-- textbox for app to send
336
  ],
337
  outputs=[
338
  gr.Textbox(label="🗣️ Transcription", interactive=False),
339
  gr.Textbox(label="📈 Cosine Similarity", interactive=False)
340
  ],
341
  title="Cognia Wav2Vec2 Pronunciation Practice",
342
- description="Upload or record audio, and get similarity score vs the expected word."
343
  )
344
 
345
  iface.launch()
 
287
  "dula": ["du", "la"]
288
  }
289
 
290
+ def transcribe(audio_file, target_word):
291
+ if not audio_file or not target_word:
292
+ return "No audio or word selected.", "0.00"
 
 
293
 
294
+ # Load and process audio directly
295
  input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
 
296
  with torch.no_grad():
297
  logits = model(input_values).logits
298
 
 
319
  ref_emb = get_embedding(target_sylls)
320
 
321
  if pred_emb is None or ref_emb is None:
322
+ return transcription, "0.00"
323
+
324
+ score = F.cosine_similarity(pred_emb, ref_emb, dim=0).item()
325
+ return transcription, str(round(score, 4))
326
 
 
 
327
  iface = gr.Interface(
328
  fn=transcribe,
329
  inputs=[
330
  gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
331
+ gr.Dropdown(sorted(word_syllables.keys()), label="Select Target Word")
332
  ],
333
  outputs=[
334
  gr.Textbox(label="🗣️ Transcription", interactive=False),
335
  gr.Textbox(label="📈 Cosine Similarity", interactive=False)
336
  ],
337
  title="Cognia Wav2Vec2 Pronunciation Practice",
338
+ description="Upload or record audio, and get similarity score vs the target Filipino word."
339
  )
340
 
341
  iface.launch()