Artyomorax committed on
Commit
e0b6f48
·
verified ·
1 Parent(s): e1e3707

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -1
app.py CHANGED
@@ -40,14 +40,17 @@ logger.info("✅ Model loaded successfully")
40
  # ==================== Utility functions ====================
41
 
42
  def syllables_to_phonetic(syllables):
 
43
  return "".join(syllables).replace("-", "").replace(" ", "").lower()
44
 
45
  def phonetic_similarity(s1, s2):
 
46
  s1_clean = s1.replace(" ", "").replace("-", "").lower()
47
  s2_clean = s2.replace(" ", "").replace("-", "").lower()
48
  return SequenceMatcher(None, s1_clean, s2_clean).ratio()
49
 
50
  def syllable_similarity(syll1, syll2):
 
51
  s1_phonetic = syllables_to_phonetic(syll1)
52
  s2_phonetic = syllables_to_phonetic(syll2)
53
  phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
@@ -55,6 +58,7 @@ def syllable_similarity(syll1, syll2):
55
  return (phonetic_score * 0.8) + (count_match * 0.2)
56
 
57
  def find_closest_word_in_dictionary(transcription):
 
58
  trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
59
  best_word = None
60
  best_syllables = []
@@ -70,6 +74,7 @@ def find_closest_word_in_dictionary(transcription):
70
  return best_word, best_syllables, best_score
71
 
72
  def find_best_match(transcription, target_word):
 
73
  trans_clean = transcription.replace("-", " ").lower().strip()
74
  target_clean = target_word.lower().strip()
75
  target_sylls = word_syllables.get(target_clean, [])
@@ -77,10 +82,12 @@ def find_best_match(transcription, target_word):
77
  logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
78
  return trans_clean, trans_clean.split(), 0.5
79
  if target_clean in trans_clean:
 
80
  return target_clean, target_sylls, 1.0
81
  trans_nospace = trans_clean.replace(" ", "")
82
  target_nospace = target_clean.replace(" ", "")
83
  if target_nospace in trans_nospace:
 
84
  return target_clean, target_sylls, 0.95
85
  words = trans_clean.split()
86
  best_match = ""
@@ -94,6 +101,7 @@ def find_best_match(transcription, target_word):
94
  best_score = score
95
  best_match = word
96
  best_syllables = word_sylls
 
97
  for i in range(len(words)):
98
  for j in range(i + 1, min(i + 4, len(words) + 1)):
99
  combo = "".join(words[i:j])
@@ -105,21 +113,28 @@ def find_best_match(transcription, target_word):
105
  best_score = score
106
  best_match = combo_display
107
  best_syllables = combo_sylls
 
108
  else:
109
  score = phonetic_similarity(combo, target_clean)
110
  if score > best_score:
111
  best_score = score
112
  best_match = combo_display
113
  best_syllables = combo_display.split()
 
114
  if best_score < 0.6:
 
115
  dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
116
  if dict_score > best_score:
117
  best_score = dict_score
118
  best_match = dict_word
119
  best_syllables = dict_sylls
 
 
 
120
  return best_match, best_syllables, best_score
121
 
122
  def get_embedding(syllables):
 
123
  token_ids = []
124
  for s in syllables:
125
  ids = processor.tokenizer(s, add_special_tokens=False).input_ids
@@ -131,6 +146,7 @@ def get_embedding(syllables):
131
  return embeds.mean(dim=0)
132
 
133
  def calculate_pronunciation_score(cosine_sim, match_score):
 
134
  combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
135
  if match_score < 0.5:
136
  combined_score *= 0.7
@@ -145,6 +161,11 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
145
  request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
146
  start_time = datetime.now()
147
 
 
 
 
 
 
148
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
149
  content = await audio_file.read()
150
  file_size = len(content)
@@ -154,11 +175,14 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
154
 
155
  try:
156
  # Load audio
 
157
  audio, sr = librosa.load(tmp_file_path, sr=SR)
 
158
 
159
  # ===== Noise reduction =====
160
- noise_sample = audio[:int(0.3*SR)] # first 0.3s as noise profile
161
  audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
 
162
 
163
  # Process through Wav2Vec2
164
  input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
@@ -166,6 +190,7 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
166
  logits = model(input_values).logits
167
  pred_ids = torch.argmax(logits, dim=-1)
168
  transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
 
169
 
170
  # Match target word
171
  target_word_clean = target_word.lower().strip()
@@ -217,14 +242,17 @@ async def analyze_audio(audio_file: UploadFile, target_word: str = Form(...)):
217
  }
218
  }
219
 
 
220
  return result
221
 
222
  finally:
223
  if os.path.exists(tmp_file_path):
224
  os.remove(tmp_file_path)
 
225
 
226
  @app.get("/")
227
  def home():
 
228
  return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
229
 
230
  @app.get("/health")
 
40
  # ==================== Utility functions ====================
41
 
42
  def syllables_to_phonetic(syllables):
43
+ """Convert syllables list to phonetic string for comparison"""
44
  return "".join(syllables).replace("-", "").replace(" ", "").lower()
45
 
46
  def phonetic_similarity(s1, s2):
47
+ """Calculate phonetic similarity between two strings"""
48
  s1_clean = s1.replace(" ", "").replace("-", "").lower()
49
  s2_clean = s2.replace(" ", "").replace("-", "").lower()
50
  return SequenceMatcher(None, s1_clean, s2_clean).ratio()
51
 
52
  def syllable_similarity(syll1, syll2):
53
+ """Compare two syllable lists and return similarity score"""
54
  s1_phonetic = syllables_to_phonetic(syll1)
55
  s2_phonetic = syllables_to_phonetic(syll2)
56
  phonetic_score = SequenceMatcher(None, s1_phonetic, s2_phonetic).ratio()
 
58
  return (phonetic_score * 0.8) + (count_match * 0.2)
59
 
60
  def find_closest_word_in_dictionary(transcription):
61
+ """Search dictionary for closest matching word"""
62
  trans_phonetic = transcription.replace(" ", "").replace("-", "").lower()
63
  best_word = None
64
  best_syllables = []
 
74
  return best_word, best_syllables, best_score
75
 
76
  def find_best_match(transcription, target_word):
77
+ """Find best matching segment in transcription corresponding to target word"""
78
  trans_clean = transcription.replace("-", " ").lower().strip()
79
  target_clean = target_word.lower().strip()
80
  target_sylls = word_syllables.get(target_clean, [])
 
82
  logger.warning(f"⚠️ Target word '{target_clean}' not found in dictionary")
83
  return trans_clean, trans_clean.split(), 0.5
84
  if target_clean in trans_clean:
85
+ logger.info(f"✓ Direct match found: '{target_clean}'")
86
  return target_clean, target_sylls, 1.0
87
  trans_nospace = trans_clean.replace(" ", "")
88
  target_nospace = target_clean.replace(" ", "")
89
  if target_nospace in trans_nospace:
90
+ logger.info(f"✓ No-space match found: '{target_nospace}'")
91
  return target_clean, target_sylls, 0.95
92
  words = trans_clean.split()
93
  best_match = ""
 
101
  best_score = score
102
  best_match = word
103
  best_syllables = word_sylls
104
+ logger.info(f" Single word match: '{word}' (score: {score:.2f})")
105
  for i in range(len(words)):
106
  for j in range(i + 1, min(i + 4, len(words) + 1)):
107
  combo = "".join(words[i:j])
 
113
  best_score = score
114
  best_match = combo_display
115
  best_syllables = combo_sylls
116
+ logger.info(f" Combo match: '{combo}' (score: {score:.2f})")
117
  else:
118
  score = phonetic_similarity(combo, target_clean)
119
  if score > best_score:
120
  best_score = score
121
  best_match = combo_display
122
  best_syllables = combo_display.split()
123
+ logger.info(f" Phonetic match: '{combo}' (score: {score:.2f})")
124
  if best_score < 0.6:
125
+ logger.info(" Searching entire dictionary for closest match...")
126
  dict_word, dict_sylls, dict_score = find_closest_word_in_dictionary(trans_clean)
127
  if dict_score > best_score:
128
  best_score = dict_score
129
  best_match = dict_word
130
  best_syllables = dict_sylls
131
+ logger.info(f" Dictionary match used: '{dict_word}' (score: {dict_score:.2f})")
132
+ logger.info(f"🎯 Best match for '{target_clean}': '{best_match}' (score: {best_score:.2f})")
133
+ logger.info(f" Matched syllables: {best_syllables}")
134
  return best_match, best_syllables, best_score
135
 
136
  def get_embedding(syllables):
137
+ """Generate embeddings from syllables"""
138
  token_ids = []
139
  for s in syllables:
140
  ids = processor.tokenizer(s, add_special_tokens=False).input_ids
 
146
  return embeds.mean(dim=0)
147
 
148
  def calculate_pronunciation_score(cosine_sim, match_score):
149
+ """Combine embedding similarity and transcription match into final score"""
150
  combined_score = (cosine_sim * 0.7) + (match_score * 0.3)
151
  if match_score < 0.5:
152
  combined_score *= 0.7
 
161
  request_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
162
  start_time = datetime.now()
163
 
164
+ logger.info(f"\n{'='*80}")
165
+ logger.info(f"📥 NEW REQUEST [{request_id}]")
166
+ logger.info(f"πŸ“ File: {audio_file.filename} ({audio_file.content_type})")
167
+ logger.info(f"🎯 Target word: '{target_word}'")
168
+
169
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
170
  content = await audio_file.read()
171
  file_size = len(content)
 
175
 
176
  try:
177
  # Load audio
178
+ logger.info(f"🎵 Loading audio from: {tmp_file_path}")
179
  audio, sr = librosa.load(tmp_file_path, sr=SR)
180
+ logger.info(f"🎵 Audio shape: {audio.shape}, duration: {len(audio)/SR:.2f}s")
181
 
182
  # ===== Noise reduction =====
183
+ noise_sample = audio[:int(0.3*SR)]
184
  audio_denoised = nr.reduce_noise(y=audio, y_noise=noise_sample, sr=SR)
185
+ logger.info(f"🔊 Noise reduction applied")
186
 
187
  # Process through Wav2Vec2
188
  input_values = processor(audio_denoised, sampling_rate=SR, return_tensors="pt").input_values.to(DEVICE)
 
190
  logits = model(input_values).logits
191
  pred_ids = torch.argmax(logits, dim=-1)
192
  transcription = processor.batch_decode(pred_ids, group_tokens=True)[0].lower().strip()
193
+ logger.info(f"πŸ“ RAW TRANSCRIPTION: '{transcription}'")
194
 
195
  # Match target word
196
  target_word_clean = target_word.lower().strip()
 
242
  }
243
  }
244
 
245
+ logger.info(f"📤 RESPONSE: {result}")
246
  return result
247
 
248
  finally:
249
  if os.path.exists(tmp_file_path):
250
  os.remove(tmp_file_path)
251
+ logger.debug(f"🗑️ Cleaned up temporary file: {tmp_file_path}")
252
 
253
  @app.get("/")
254
  def home():
255
+ logger.info("πŸ“ Health check endpoint called")
256
  return {"message": "Cognia Wav2Vec2 Speech API running.", "device": DEVICE}
257
 
258
  @app.get("/health")