Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -287,15 +287,12 @@ word_syllables = {
|
|
| 287 |
"dula": ["du", "la"]
|
| 288 |
}
|
| 289 |
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
if not audio_file or not expected_word:
|
| 294 |
-
return ["No audio or expected word provided.", "0.00"]
|
| 295 |
|
| 296 |
-
|
| 297 |
input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
|
| 298 |
-
|
| 299 |
with torch.no_grad():
|
| 300 |
logits = model(input_values).logits
|
| 301 |
|
|
@@ -322,24 +319,23 @@ def transcribe(audio_file, expected_word=None):
|
|
| 322 |
ref_emb = get_embedding(target_sylls)
|
| 323 |
|
| 324 |
if pred_emb is None or ref_emb is None:
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
| 328 |
|
| 329 |
-
return [transcription, str(round(cosine_sim, 2))]
|
| 330 |
-
|
| 331 |
iface = gr.Interface(
|
| 332 |
fn=transcribe,
|
| 333 |
inputs=[
|
| 334 |
gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
|
| 335 |
-
gr.
|
| 336 |
],
|
| 337 |
outputs=[
|
| 338 |
gr.Textbox(label="🗣️ Transcription", interactive=False),
|
| 339 |
gr.Textbox(label="📈 Cosine Similarity", interactive=False)
|
| 340 |
],
|
| 341 |
title="Cognia Wav2Vec2 Pronunciation Practice",
|
| 342 |
-
description="Upload or record audio, and get similarity score vs the
|
| 343 |
)
|
| 344 |
|
| 345 |
iface.launch()
|
|
|
|
| 287 |
"dula": ["du", "la"]
|
| 288 |
}
|
| 289 |
|
| 290 |
+
def transcribe(audio_file, target_word):
|
| 291 |
+
if not audio_file or not target_word:
|
| 292 |
+
return "No audio or word selected.", "0.00"
|
|
|
|
|
|
|
| 293 |
|
| 294 |
+
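# Pass the audio straight to the processor (no explicit load/resample step is visible here).
# NOTE(review): the UI supplies `audio_file` via gr.Audio(type="filepath"), so this looks like a
# path string rather than waveform samples at SR — confirm the processor receives decoded audio.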
|
| 295 |
input_values = processor(audio_file, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
|
|
|
|
| 296 |
with torch.no_grad():
|
| 297 |
logits = model(input_values).logits
|
| 298 |
|
|
|
|
| 319 |
ref_emb = get_embedding(target_sylls)
|
| 320 |
|
| 321 |
if pred_emb is None or ref_emb is None:
|
| 322 |
+
return transcription, "0.00"
|
| 323 |
+
|
| 324 |
+
score = F.cosine_similarity(pred_emb, ref_emb, dim=0).item()
|
| 325 |
+
return transcription, str(round(score, 4))
|
| 326 |
|
|
|
|
|
|
|
| 327 |
# Gradio UI wiring: `transcribe` is the callback; the two components under `inputs`
# line up with its (audio_file, target_word) parameters, and the two textboxes under
# `outputs` line up with the (transcription, score string) pair it returns.
iface = gr.Interface(
|
| 328 |
fn=transcribe,
|
| 329 |
inputs=[
|
| 330 |
# type="filepath": the recording/upload is handed to the callback as a file path.
gr.Audio(type="filepath", label="🎤 Record or Upload Audio"),
|
| 331 |
+
# Choices are the keys of `word_syllables`, in sorted order.
gr.Dropdown(sorted(word_syllables.keys()), label="Select Target Word")
|
| 332 |
],
|
| 333 |
outputs=[
|
| 334 |
# Both result fields are display-only (interactive=False).
gr.Textbox(label="🗣️ Transcription", interactive=False),
|
| 335 |
gr.Textbox(label="📈 Cosine Similarity", interactive=False)
|
| 336 |
],
|
| 337 |
title="Cognia Wav2Vec2 Pronunciation Practice",
|
| 338 |
+
description="Upload or record audio, and get similarity score vs the target Filipino word."
|
| 339 |
)
|
| 340 |
|
| 341 |
# NOTE(review): appears to run at module level with no `if __name__ == "__main__":` guard,
# so importing this file would also start the app — confirm that is intended.
iface.launch()
|