Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -40,14 +40,17 @@ logger.info("✅ Model loaded successfully")
|
|
| 40 |
# ==================== Utility functions ====================
|
| 41 |
|
| 42 |
def syllables_to_phonetic(syllables):
|
|
|
|
| 43 |
return "".join(syllables).replace("-", "").replace(" ", "").lower()
|
| 44 |
|
| 45 |
def phonetic_similarity(s1, s2):
|
|
|
|
| 46 |
s1_clean = s1.replace(" ", "").replace("-", "").lower()
|
| 47 |
s2_clean = s2.replace(" ", "").replace("-", "").lower()
|
| 48 |
return SequenceMatcher(None, s1_clean, s2_clean).ratio()
|
| 49 |
|
| 50 |
def syllable_similarity(syll1, syll2):
|
|
|
|
| 51 |
s1_phonetic = syllables_to_phonetic(syll1)
|
| 52 |
s2_phonetic = syllables_to_phonetic(syll2)
|
| 53 |
phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
|
|
@@ -55,6 +58,7 @@ def syllable_similarity(syll1, syll2):
|
|
| 55 |
return (phonetic_score * 0.8) + (count_match * 0.2)
|
| 56 |
|
| 57 |
def find_closest_word_in_dictionary(transcription):
|
|
|
|
| 58 |
trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
|
| 59 |
best_word = None
|
| 60 |
best_syllables = []
|
|
@@ -70,6 +74,7 @@ def find_closest_word_in_dictionary(transcription):
|
|
| 70 |
return best_word, best_syllables, best_score
|
| 71 |
|
| 72 |
def find_best_match(transcription, target_word):
|
|
|
|
| 73 |
trans_clean = transcription.replace("-", " ").lower().strip()
|
| 74 |
target_clean = target_word.lower().strip()
|
| 75 |
target_sylls = word_syllables.get(target_clean, [])
|
|
@@ -77,10 +82,12 @@ def find_best_match(transcription, target_word):
|
|
| 77 |
logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
|
| 78 |
return trans_clean, trans_clean.split(), 0.5
|
| 79 |
if target_clean in trans_clean:
|
|
|
|
| 80 |
return target_clean, target_sylls, 1.0
|
| 81 |
trans_nospace = trans_clean.replace(" ", "")
|
| 82 |
target_nospace = target_clean.replace(" ", "")
|
| 83 |
if target_nospace in trans_nospace:
|
|
|
|
| 84 |
return target_clean, target_sylls, 0.95
|
| 85 |
words = trans_clean.split()
|
| 86 |
best_match = ""
|
|
@@ -94,6 +101,7 @@ def find_best_match(transcription, target_word):
|
|
| 94 |
best_score = score
|
| 95 |
best_match = word
|
| 96 |
best_syllables = word_sylls
|
|
|
|
| 97 |
for i in range(len(words)):
|
| 98 |
for j in range(i + 1, min(i + 4, len(words) + 1)):
|
| 99 |
combo = "".join(words[i:j])
|
|
@@ -105,21 +113,28 @@ def find_best_match(transcription, target_word):
|
|
| 105 |
best_score = score
|
| 106 |
best_match = combo_display
|
| 107 |
best_syllables = combo_sylls
|
|
|
|
| 108 |
else:
|
| 109 |
score = phonetic_similarity(combo, target_clean)
|
| 110 |
if score > best_score:
|
| 111 |
best_score = score
|
| 112 |
best_match = combo_display
|
| 113 |
best_syllables = combo_display.split()
|
|
|
|
| 114 |
if best_score < 0.6:
|
|
|
|
| 115 |
dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
|
| 116 |
if dict_score > best_score:
|
| 117 |
best_score = dict_score
|
| 118 |
best_match = dict_word
|
| 119 |
best_syllables = dict_sylls
|
|
|
|
|
|
|
|
|
|
| 120 |
return best_match, best_syllables, best_score
|
| 121 |
|
| 122 |
def get_embedding(syllables):
|
|
|
|
| 123 |
token_ids = []
|
| 124 |
for s in syllables:
|
| 125 |
ids = processor.tokenizer(s, add_special_tokens=False).input_ids
|
|
@@ -131,6 +146,7 @@ def get_embedding(syllables):
|
|
| 131 |
return embeds.mean(dim=0)
|
| 132 |
|
| 133 |
def calculate_pronunciation_score(cosine_sim, match_score):
|
|
|
|
| 134 |
combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
|
| 135 |
if match_score < 0.5:
|
| 136 |
combined_score *= 0.7
|
|
@@ -145,6 +161,11 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
|
|
| 145 |
request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
| 146 |
start_time = datetime.now()
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 149 |
content = await audio_file.read()
|
| 150 |
file_size = len(content)
|
|
@@ -154,11 +175,14 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
|
|
| 154 |
|
| 155 |
try:
|
| 156 |
# Load audio
|
|
|
|
| 157 |
audio, sr = librosa.load(tmp_file_path, sr=SR)
|
|
|
|
| 158 |
|
| 159 |
# ===== Noise reduction =====
|
| 160 |
-
noise_sample = audio[:int(0.3*SR)]
|
| 161 |
audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
|
|
|
|
| 162 |
|
| 163 |
# Process through Wav2Vec2
|
| 164 |
input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
|
|
@@ -166,6 +190,7 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
|
|
| 166 |
logits = model(input_values).logits
|
| 167 |
pred_ids = torch.argmax(logits, dim=-1)
|
| 168 |
transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
|
|
|
|
| 169 |
|
| 170 |
# Match target word
|
| 171 |
target_word_clean = target_word.lower().strip()
|
|
@@ -217,14 +242,17 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
|
|
| 217 |
}
|
| 218 |
}
|
| 219 |
|
|
|
|
| 220 |
return result
|
| 221 |
|
| 222 |
finally:
|
| 223 |
if os.path.exists(tmp_file_path):
|
| 224 |
os.remove(tmp_file_path)
|
|
|
|
| 225 |
|
| 226 |
@app.get("/")
|
| 227 |
def home():
|
|
|
|
| 228 |
return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
|
| 229 |
|
| 230 |
@app.get("/health")
|
|
|
|
| 40 |
# ==================== Utility functions ====================
|
| 41 |
|
| 42 |
def syllables_to_phonetic(syllables):
|
| 43 |
+
"""Convert syllables list to phonetic string for comparison"""
|
| 44 |
return "".join(syllables).replace("-", "").replace(" ", "").lower()
|
| 45 |
|
| 46 |
def phonetic_similarity(s1, s2):
|
| 47 |
+
"""Calculate phonetic similarity between two strings"""
|
| 48 |
s1_clean = s1.replace(" ", "").replace("-", "").lower()
|
| 49 |
s2_clean = s2.replace(" ", "").replace("-", "").lower()
|
| 50 |
return SequenceMatcher(None, s1_clean, s2_clean).ratio()
|
| 51 |
|
| 52 |
def syllable_similarity(syll1, syll2):
|
| 53 |
+
"""Compare two syllable lists and return similarity score"""
|
| 54 |
s1_phonetic = syllables_to_phonetic(syll1)
|
| 55 |
s2_phonetic = syllables_to_phonetic(syll2)
|
| 56 |
phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
|
|
|
|
| 58 |
return (phonetic_score * 0.8) + (count_match * 0.2)
|
| 59 |
|
| 60 |
def find_closest_word_in_dictionary(transcription):
|
| 61 |
+
"""Search dictionary for closest matching word"""
|
| 62 |
trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
|
| 63 |
best_word = None
|
| 64 |
best_syllables = []
|
|
|
|
| 74 |
return best_word, best_syllables, best_score
|
| 75 |
|
| 76 |
def find_best_match(transcription, target_word):
|
| 77 |
+
"""Find best matching segment in transcription corresponding to target word"""
|
| 78 |
trans_clean = transcription.replace("-", " ").lower().strip()
|
| 79 |
target_clean = target_word.lower().strip()
|
| 80 |
target_sylls = word_syllables.get(target_clean, [])
|
|
|
|
| 82 |
logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
|
| 83 |
return trans_clean, trans_clean.split(), 0.5
|
| 84 |
if target_clean in trans_clean:
|
| 85 |
+
logger.info(f"β Direct match found: '{target_clean}'")
|
| 86 |
return target_clean, target_sylls, 1.0
|
| 87 |
trans_nospace = trans_clean.replace(" ", "")
|
| 88 |
target_nospace = target_clean.replace(" ", "")
|
| 89 |
if target_nospace in trans_nospace:
|
| 90 |
+
logger.info(f"β No-space match found: '{target_nospace}'")
|
| 91 |
return target_clean, target_sylls, 0.95
|
| 92 |
words = trans_clean.split()
|
| 93 |
best_match = ""
|
|
|
|
| 101 |
best_score = score
|
| 102 |
best_match = word
|
| 103 |
best_syllables = word_sylls
|
| 104 |
+
logger.info(f" Single word match: '{word}' (score: {score:.2f})")
|
| 105 |
for i in range(len(words)):
|
| 106 |
for j in range(i + 1, min(i + 4, len(words) + 1)):
|
| 107 |
combo = "".join(words[i:j])
|
|
|
|
| 113 |
best_score = score
|
| 114 |
best_match = combo_display
|
| 115 |
best_syllables = combo_sylls
|
| 116 |
+
logger.info(f" Combo match: '{combo}' (score: {score:.2f})")
|
| 117 |
else:
|
| 118 |
score = phonetic_similarity(combo, target_clean)
|
| 119 |
if score > best_score:
|
| 120 |
best_score = score
|
| 121 |
best_match = combo_display
|
| 122 |
best_syllables = combo_display.split()
|
| 123 |
+
logger.info(f" Phonetic match: '{combo}' (score: {score:.2f})")
|
| 124 |
if best_score < 0.6:
|
| 125 |
+
logger.info(" Searching entire dictionary for closest match...")
|
| 126 |
dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
|
| 127 |
if dict_score > best_score:
|
| 128 |
best_score = dict_score
|
| 129 |
best_match = dict_word
|
| 130 |
best_syllables = dict_sylls
|
| 131 |
+
logger.info(f" Dictionary match used: '{dict_word}' (score: {dict_score:.2f})")
|
| 132 |
+
logger.info(f"🎯 Best match for '{target_clean}': '{best_match}' (score: {best_score:.2f})")
|
| 133 |
+
logger.info(f" Matched syllables: {best_syllables}")
|
| 134 |
return best_match, best_syllables, best_score
|
| 135 |
|
| 136 |
def get_embedding(syllables):
|
| 137 |
+
"""Generate embeddings from syllables"""
|
| 138 |
token_ids = []
|
| 139 |
for s in syllables:
|
| 140 |
ids = processor.tokenizer(s, add_special_tokens=False).input_ids
|
|
|
|
| 146 |
return embeds.mean(dim=0)
|
| 147 |
|
| 148 |
def calculate_pronunciation_score(cosine_sim, match_score):
|
| 149 |
+
"""Combine embedding similarity and transcription match into final score"""
|
| 150 |
combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
|
| 151 |
if match_score < 0.5:
|
| 152 |
combined_score *= 0.7
|
|
|
|
| 161 |
request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
| 162 |
start_time = datetime.now()
|
| 163 |
|
| 164 |
+
logger.info(f"\n{'='*80}")
|
| 165 |
+
logger.info(f"📥 NEW REQUEST [{request_id}]")
|
| 166 |
+
logger.info(f"π File: {audio_file.filename} ({audio_file.content_type})")
|
| 167 |
+
logger.info(f"🎯 Target word: '{target_word}'")
|
| 168 |
+
|
| 169 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
|
| 170 |
content = await audio_file.read()
|
| 171 |
file_size = len(content)
|
|
|
|
| 175 |
|
| 176 |
try:
|
| 177 |
# Load audio
|
| 178 |
+
logger.info(f"🎵 Loading audio from: {tmp_file_path}")
|
| 179 |
audio, sr = librosa.load(tmp_file_path, sr=SR)
|
| 180 |
+
logger.info(f"🎵 Audio shape: {audio.shape}, duration: {len(audio)/SR:.2f}s")
|
| 181 |
|
| 182 |
# ===== Noise reduction =====
|
| 183 |
+
noise_sample = audio[:int(0.3*SR)]
|
| 184 |
audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
|
| 185 |
+
logger.info(f"π Noise reduction applied")
|
| 186 |
|
| 187 |
# Process through Wav2Vec2
|
| 188 |
input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
|
|
|
|
| 190 |
logits = model(input_values).logits
|
| 191 |
pred_ids = torch.argmax(logits, dim=-1)
|
| 192 |
transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
|
| 193 |
+
logger.info(f"π RAW TRANSCRIPTION: '{transcription}'")
|
| 194 |
|
| 195 |
# Match target word
|
| 196 |
target_word_clean = target_word.lower().strip()
|
|
|
|
| 242 |
}
|
| 243 |
}
|
| 244 |
|
| 245 |
+
logger.info(f"📤 RESPONSE: {result}")
|
| 246 |
return result
|
| 247 |
|
| 248 |
finally:
|
| 249 |
if os.path.exists(tmp_file_path):
|
| 250 |
os.remove(tmp_file_path)
|
| 251 |
+
logger.debug(f"🗑️ Cleaned up temporary file: {tmp_file_path}")
|
| 252 |
|
| 253 |
@app.get("/")
|
| 254 |
def home():
|
| 255 |
+
logger.info("π Health check endpoint called")
|
| 256 |
return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
|
| 257 |
|
| 258 |
@app.get("/health")
|