tasal9 committed on
Commit
e40895a
·
0 Parent(s):

Add initial implementation of Afghan Pashto Voice Processing and update .gitignore

Browse files
Files changed (3) hide show
  1. .gitignore +4 -0
  2. app.py +662 -0
  3. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+
3
+ #Ignore insiders AI rules
4
+ .github/instructions/codacy.instructions.md
app.py ADDED
@@ -0,0 +1,662 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ COMPLETE Afghan Pashto Voice & Speech Processing Space
3
+ Pure Afghan Pashto - له اصل پښتو سره
4
+ Author: Afghan Voice Technology Initiative
5
+ Version: 2.0 - Lightweight Complete Demo
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ from typing import Any, Dict, List, Tuple
12
+
13
+ import gradio as gr
14
+ import numpy as np
15
+
16
+ try:
17
+ import torch
18
+ except Exception: # pragma: no cover - optional dependency
19
+ torch = None
20
+
21
+
22
+ AFGHAN_PASHTO_DIALECTS: Dict[str, Dict[str, Any]] = {
23
+ "کندهاري (Kandahari)": {
24
+ "code": "ps-kan",
25
+ "region": "کندهار، زابل، ارزگان",
26
+ "characteristics": ["Hard ږ (g)", "ښ as خ", "Emphatic consonants", "Traditional poetry"],
27
+ "traditional_name": "کندهاري غه",
28
+ "voice_models": {"male": "kan_male_v2.pth", "female": "kan_female_v2.pth", "elder": "kan_elder_v2.pth"},
29
+ "pronunciation_guide": "ږ = hard 'g', ښ = 'kh', Retroflex sounds preserved",
30
+ },
31
+ "پکتياوي (Paktiawal)": {
32
+ "code": "ps-pak",
33
+ "region": "پکتيا، پکتيکا، خوست",
34
+ "characteristics": ["Retroflex ڼ", "Nasal vowels", "Tribal vocabulary", "Mountain accent"],
35
+ "traditional_name": "پکتياوي خښه",
36
+ "voice_models": {"male": "pak_male_v2.pth", "female": "pak_female_v2.pth", "elder": "pak_elder_v2.pth"},
37
+ "pronunciation_guide": "ڼ = retroflex 'n', Nasalized vowels, Tribal words",
38
+ },
39
+ "پېښوري (Peshawri)": {
40
+ "code": "ps-pes",
41
+ "region": "پېښور، مردان، سوات",
42
+ "characteristics": ["ښ as ش", "Soft ږ (zh)", "Urban vocabulary", "Trade language"],
43
+ "traditional_name": "پېښوري ژبه",
44
+ "voice_models": {"male": "pes_male_v2.pth", "female": "pes_female_v2.pth", "elder": "pes_elder_v2.pth"},
45
+ "pronunciation_guide": "ښ = 'sh', ږ = soft 'zh', Urban expressions",
46
+ },
47
+ "مزارۍ (Mazari)": {
48
+ "code": "ps-maz",
49
+ "region": "مزار شريف، بلخ، جوزجان",
50
+ "characteristics": ["Uzbek influence", "Northern vowels", "Turkic loanwords", "Plains accent"],
51
+ "traditional_name": "مزارۍ غږ",
52
+ "voice_models": {"male": "maz_male_v2.pth", "female": "maz_female_v2.pth", "elder": "maz_elder_v2.pth"},
53
+ "pronunciation_guide": "Uzbek-influenced vowels, Turkic words, Northern tone",
54
+ },
55
+ "هراتۍ (Herati)": {
56
+ "code": "ps-her",
57
+ "region": "هرات، فراه، نيمروز",
58
+ "characteristics": ["Persian influence", "Western vowels", "Herati accent", "Cultural sophistication"],
59
+ "traditional_name": "هراتۍ لهجه",
60
+ "voice_models": {"male": "her_male_v2.pth", "female": "her_female_v2.pth", "elder": "her_elder_v2.pth"},
61
+ "pronunciation_guide": "Persian-influenced sounds, Western vowels, Cultural words",
62
+ },
63
+ "ننګرهاري (Nangarhari)": {
64
+ "code": "ps-nan",
65
+ "region": "جلال اباد، ننګرهار، کنړ",
66
+ "characteristics": ["Eastern dialect", "Khattak influence", "Jalalabad accent", "Border influences"],
67
+ "traditional_name": "ننګرهاري وړاندې",
68
+ "voice_models": {"male": "nan_male_v2.pth", "female": "nan_female_v2.pth", "elder": "nan_elder_v2.pth"},
69
+ "pronunciation_guide": "Eastern sounds, Khattak influence, Border variations",
70
+ },
71
+ }
72
+
73
+
74
+ CULTURAL_CONTEXTS: Dict[str, Dict[str, Any]] = {
75
+ "ملي (National)": {
76
+ "description": "National songs, anthems, patriotic poetry",
77
+ "examples": ["ملي سرود", "وطن شعرونه", "غازي قومي"],
78
+ "voice_style": "proud, formal, clear",
79
+ "suffix": "د ملي غرور سره",
80
+ },
81
+ "قومي (Tribal)": {
82
+ "description": "Tribal traditions, ethnic heritage, clan stories",
83
+ "examples": ["قومي کیسې", "نسب او شجره", "قبیلوي ویاړونه"],
84
+ "voice_style": "traditional, elder-like, respectful",
85
+ "suffix": "د قومي وياړ سره",
86
+ },
87
+ "مذهبي (Religious)": {
88
+ "description": "Religious content, spiritual guidance, Islamic teachings",
89
+ "examples": ["دیني دروس", "اخلاقي کیسې", "روحاني مواعظ"],
90
+ "voice_style": "soft, respectful, spiritual",
91
+ "suffix": "د مذهبي احترام سره",
92
+ },
93
+ "فرهنګي (Cultural)": {
94
+ "description": "Cultural education, traditional values, customs",
95
+ "examples": ["فرهنګي ارزښتونه", "دودونه او دستورونه", "کلتني کیسې"],
96
+ "voice_style": "educational, warm, cultural",
97
+ "suffix": "د فرهنګي ارزښتونو سره",
98
+ },
99
+ "تاريخي (Historical)": {
100
+ "description": "Historical narratives, ancient stories, past events",
101
+ "examples": ["تاريخي کیسې", "پخوانۍ پیښې", "قدیم افسانې"],
102
+ "voice_style": "storyteller, dramatic, engaging",
103
+ "suffix": "د تاريخي روايت په انداز",
104
+ },
105
+ "سنګي (Musical)": {
106
+ "description": "Traditional music, folk songs, cultural rhythms",
107
+ "examples": ["سنګي ملودۍ", "فولکلوري سندرې", "کلاسیکي موسیقي"],
108
+ "voice_style": "melodic, rhythmic, artistic",
109
+ "suffix": "د دوديزې نغمې په رنګ",
110
+ },
111
+ "پېغلوي (Folk Tales)": {
112
+ "description": "Folk tales, traditional stories, cultural narratives",
113
+ "examples": ["پېغلوي کیسې", "افسانوي کیسې", "کلتني حکیات"],
114
+ "voice_style": "storyteller, engaging, traditional",
115
+ "suffix": "د ولسي کيسې له خوند سره",
116
+ },
117
+ }
118
+
119
+
120
+ COMPLETE_PHONEMES: Dict[str, Dict[str, Dict[str, Any]]] = {
121
+ "پښتني حروف": {
122
+ "ښ": {"symbol": "ښ", "ipa": "/ʂ/", "description": "Voiceless retroflex fricative", "dialects": {"کندهاري": "خ", "پېښوري": "ش"}},
123
+ "ږ": {"symbol": "ږ", "ipa": "/ʐ/", "description": "Voiced retroflex fricative", "dialects": {"کندهاري": "گ", "پېښوري": "ژ"}},
124
+ "ڼ": {"symbol": "ڼ", "ipa": "/ɳ/", "description": "Retroflex nasal", "dialects": {"پکتياوي": "ڼ", "کندهاري": "ن"}},
125
+ "ړ": {"symbol": "ړ", "ipa": "/ɽ/", "description": "Retroflex flap", "dialects": {"ټول": "ړ"}},
126
+ "ټ": {"symbol": "ټ", "ipa": "/ʈ/", "description": "Voiceless retroflex stop", "dialects": {"ټول": "ټ"}},
127
+ "ډ": {"symbol": "ډ", "ipa": "/ɖ/", "description": "Voiced retroflex stop", "dialects": {"ټول": "ډ"}},
128
+ },
129
+ "عربي حروف": {
130
+ "ص": {"symbol": "ص", "ipa": "/sˤ/", "description": "Emphatic voiceless alveolar fricative", "dialects": {}},
131
+ "ض": {"symbol": "ض", "ipa": "/dˤ/", "description": "Emphatic voiced alveolar stop", "dialects": {}},
132
+ "ط": {"symbol": "ط", "ipa": "/tˤ/", "description": "Emphatic voiceless alveolar stop", "dialects": {}},
133
+ "ظ": {"symbol": "ظ", "ipa": "/zˤ/", "description": "Emphatic voiced alveolar fricative", "dialects": {}},
134
+ },
135
+ "ويي": {
136
+ "ا": {"symbol": "ا", "ipa": "/a/", "description": "Open front vowel", "dialects": {}},
137
+ "ې": {"symbol": "ې", "ipa": "/e/", "description": "Close-mid front vowel", "dialects": {}},
138
+ "ۍ": {"symbol": "ۍ", "ipa": "/ei/", "description": "Diphthong", "dialects": {}},
139
+ "و": {"symbol": "و", "ipa": "/o/", "description": "Close-mid back rounded vowel", "dialects": {}},
140
+ "ۀ": {"symbol": "ۀ", "ipa": "/ə/", "description": "Schwa", "dialects": {}},
141
+ },
142
+ }
143
+
144
+
145
+ VOICE_TYPE_MODEL_MAP = {
146
+ "مشر (Elder Male)": "elder",
147
+ "ځوان (Young Male)": "male",
148
+ "ښځينه (Female)": "female",
149
+ "وړکتي (Child)": "child",
150
+ }
151
+
152
+ EMOTION_MAP = {
153
+ "طبيعي (Natural)": "neutral",
154
+ "خوشحال (Joyful)": "joyful",
155
+ "غميز (Sorrowful)": "sorrowful",
156
+ "جګ افتخار (Proud)": "proud",
157
+ }
158
+
159
+
160
+ class AudioProcessor:
161
+ def preprocess_audio(self, audio_input: Tuple[int, np.ndarray] | np.ndarray | None) -> np.ndarray:
162
+ if audio_input is None:
163
+ raise ValueError("No audio input was provided.")
164
+
165
+ if isinstance(audio_input, tuple):
166
+ _, waveform = audio_input
167
+ else:
168
+ waveform = audio_input
169
+
170
+ waveform = np.asarray(waveform, dtype=np.float32).squeeze()
171
+ if waveform.ndim > 1:
172
+ waveform = waveform.mean(axis=1)
173
+
174
+ peak = float(np.max(np.abs(waveform))) if waveform.size else 0.0
175
+ if peak > 0:
176
+ waveform = waveform / peak
177
+ return waveform
178
+
179
+ def analyze_audio(self, waveform: np.ndarray, sample_rate: int) -> Dict[str, Any]:
180
+ if waveform.size == 0:
181
+ return {"duration_seconds": 0.0, "energy": 0.0, "pitch_band": "unknown"}
182
+
183
+ energy = float(np.mean(np.abs(waveform)))
184
+ zero_crossing = float(np.mean(np.abs(np.diff(np.signbit(waveform))))) if waveform.size > 1 else 0.0
185
+ pitch_band = "high" if zero_crossing > 0.12 else "mid" if zero_crossing > 0.05 else "low"
186
+ return {
187
+ "duration_seconds": round(waveform.size / max(sample_rate, 1), 2),
188
+ "energy": round(energy, 4),
189
+ "pitch_band": pitch_band,
190
+ }
191
+
192
+
193
+ class CulturalContextProcessor:
194
+ tribal_terms = ["احمدزي", "محسود", "خټک", "یوسفزي", "دواني", "ننګيال"]
195
+ cultural_terms = ["پښتونولي", "مېلمستيا", "ننګ", "غيرت", "توره", "نګاه"]
196
+ traditional_expressions = ["ښه راغلاست", "په خير", "الله دې مل شه", "ستړی مه شې"]
197
+ honorifics = ["صاحب", "ملا", "خان", "استاد"]
198
+
199
+ def apply_cultural_context(self, text: str, context: str) -> str:
200
+ context_info = CULTURAL_CONTEXTS.get(context)
201
+ if not context_info:
202
+ return text
203
+ return f"{text} ({context_info['suffix']})"
204
+
205
+ def analyze_text(self, text: str) -> Dict[str, List[str]]:
206
+ return {
207
+ "tribal_references": [term for term in self.tribal_terms if term in text],
208
+ "cultural_concepts": [term for term in self.cultural_terms if term in text],
209
+ "traditional_expressions": [term for term in self.traditional_expressions if term in text],
210
+ "honorifics": [term for term in self.honorifics if term in text],
211
+ }
212
+
213
+
214
+ class CompleteAfghanPashtoProcessor:
215
+ def __init__(self) -> None:
216
+ self.device = "cuda" if torch is not None and torch.cuda.is_available() else "cpu"
217
+ self.models: Dict[str, Any] = {}
218
+ self.audio_processor = AudioProcessor()
219
+ self.cultural_processor = CulturalContextProcessor()
220
+ self.dialect_rules = self.load_dialect_rules()
221
+ self.load_all_models()
222
+
223
+ def load_all_models(self) -> None:
224
+ self.models = {
225
+ "tts": {"base": self.load_tts_model(), "dialects": self.load_dialectal_tts_models()},
226
+ "asr": {"base": self.load_asr_model(), "dialectal": self.load_dialectal_asr_models()},
227
+ "voice_clone": self.load_voice_cloning_model(),
228
+ }
229
+
230
+ def load_tts_model(self) -> Dict[str, str]:
231
+ return {"model": "base_tts", "status": "placeholder"}
232
+
233
+ def load_dialectal_tts_models(self) -> Dict[str, Dict[str, str]]:
234
+ return {dialect: info["voice_models"] for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}
235
+
236
+ def load_asr_model(self) -> Dict[str, str]:
237
+ return {"model": "base_asr", "status": "placeholder"}
238
+
239
+ def load_dialectal_asr_models(self) -> Dict[str, str]:
240
+ return {dialect: f"{info['code']}_asr" for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}
241
+
242
+ def load_voice_cloning_model(self) -> Dict[str, str]:
243
+ return {"model": "voice_clone", "status": "placeholder"}
244
+
245
+ def load_dialect_rules(self) -> Dict[str, Dict[str, Dict[str, Any]]]:
246
+ return {
247
+ "pronunciation": {
248
+ "کندهاري (Kandahari)": {"ښ": "خ", "ږ": "گ", "emphatic_consonants": True},
249
+ "پکتياوي (Paktiawal)": {"ڼ": "ڼ", "nasal_vowels": True, "tribal_pronunciation": True},
250
+ "پېښوري (Peshawri)": {"ښ": "ش", "ږ": "ژ", "urban_influence": True},
251
+ "هراتۍ (Herati)": {"ښ": "خ", "ږ": "گ", "western_vowels": True},
252
+ },
253
+ "vocabulary": {
254
+ "کندهاري (Kandahari)": {"traditional_words": ["غه", "خه", "ګه"], "poetic_expressions": True},
255
+ "پکتياوي (Paktiawal)": {"tribal_words": ["خېل", "قوم", "نګهبان"], "mountain_vocabulary": True},
256
+ "هراتۍ (Herati)": {"persian_loanwords": True, "cultural_terms": ["فرهنګ", "تمدن", "ادب"]},
257
+ },
258
+ "grammar": {
259
+ "ننګرهاري (Nangarhari)": {"eastern_constructions": True, "border_influences": True},
260
+ "مزارۍ (Mazari)": {"uzbek_influence": True, "northern_constructions": True},
261
+ },
262
+ }
263
+
264
+ def process_authentic_tts(
265
+ self,
266
+ text: str,
267
+ dialect: str,
268
+ voice_type: str,
269
+ context: str,
270
+ emotion: str,
271
+ speed: float,
272
+ ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
273
+ dialectal_text = self.apply_comprehensive_dialect_rules(text, dialect)
274
+ contextualized_text = self.cultural_processor.apply_cultural_context(dialectal_text, context)
275
+ emotional_text = self.apply_emotional_coloring(contextualized_text, emotion)
276
+ audio, sample_rate = self.generate_synthetic_speech(emotional_text, dialect, voice_type, speed, emotion)
277
+ return audio, sample_rate, {
278
+ "dialectal_text": dialectal_text,
279
+ "contextualized_text": contextualized_text,
280
+ "emotional_text": emotional_text,
281
+ "model": self.resolve_voice_model(dialect, voice_type),
282
+ "device": self.device,
283
+ }
284
+
285
+ def process_authentic_asr(self, audio_input: Tuple[int, np.ndarray] | None, dialect: str) -> Dict[str, Any]:
286
+ if audio_input is None:
287
+ raise ValueError("Please record or upload Pashto speech first.")
288
+
289
+ sample_rate, waveform = audio_input
290
+ processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
291
+ result = self.basic_speech_recognition(processed_audio, dialect)
292
+ corrected_text = self.apply_dialectal_corrections(result["text"], dialect)
293
+ cultural_info = self.extract_comprehensive_cultural_markers(corrected_text)
294
+ audio_stats = self.audio_processor.analyze_audio(processed_audio, sample_rate)
295
+ return {
296
+ "text": corrected_text,
297
+ "confidence": result.get("confidence", 0.85),
298
+ "dialect": dialect,
299
+ "audio_stats": audio_stats,
300
+ "cultural_markers": cultural_info,
301
+ "pronunciation_notes": self.get_pronunciation_notes(corrected_text, dialect),
302
+ }
303
+
304
+ def process_voice_cloning(
305
+ self,
306
+ reference_audio: Tuple[int, np.ndarray] | None,
307
+ text: str,
308
+ dialect: str,
309
+ voice_characteristics: Dict[str, str],
310
+ ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
311
+ if reference_audio is None:
312
+ raise ValueError("Reference audio is required for voice cloning.")
313
+
314
+ sample_rate, waveform = reference_audio
315
+ processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
316
+ features = self.extract_authentic_voice_features(processed_audio, sample_rate, dialect)
317
+ merged_features = {**features, **voice_characteristics}
318
+ cloned_audio, cloned_rate = self.basic_voice_cloning(text, merged_features, dialect)
319
+ return cloned_audio, cloned_rate, merged_features
320
+
321
+ def apply_comprehensive_dialect_rules(self, text: str, dialect: str) -> str:
322
+ pronunciation_rules = self.dialect_rules.get("pronunciation", {}).get(dialect, {})
323
+ vocabulary_rules = self.dialect_rules.get("vocabulary", {}).get(dialect, {})
324
+
325
+ transformed = text
326
+ for original, replacement in pronunciation_rules.items():
327
+ if isinstance(replacement, str):
328
+ transformed = transformed.replace(original, replacement)
329
+
330
+ if vocabulary_rules.get("poetic_expressions") and "وطن" in transformed:
331
+ transformed = transformed.replace("وطن", "پلرنی وطن")
332
+ if vocabulary_rules.get("persian_loanwords") and "کلتور" in transformed:
333
+ transformed = transformed.replace("کلتور", "فرهنګ")
334
+ return transformed
335
+
336
+ def apply_emotional_coloring(self, text: str, emotion: str) -> str:
337
+ emotional_suffix = {
338
+ "neutral": "په طبيعي انداز",
339
+ "joyful": "په خوشحال رنګ",
340
+ "sorrowful": "په غمجن اهنګ",
341
+ "proud": "په ویاړلي انداز",
342
+ }.get(emotion)
343
+ return f"{text} ({emotional_suffix})" if emotional_suffix else text
344
+
345
+ def generate_synthetic_speech(
346
+ self,
347
+ text: str,
348
+ dialect: str,
349
+ voice_type: str,
350
+ speed: float,
351
+ emotion: str,
352
+ ) -> Tuple[np.ndarray, int]:
353
+ sample_rate = 24000
354
+ duration = max(1.5, min(len(text) * 0.11 / max(speed, 0.1), 18.0))
355
+ timeline = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
356
+
357
+ base_freq = {"female": 210, "child": 280, "elder": 105, "male": 130}.get(voice_type, 140)
358
+ dialect_shift = {
359
+ "کندهاري (Kandahari)": -5,
360
+ "پکتياوي (Paktiawal)": 7,
361
+ "پېښوري (Peshawri)": 13,
362
+ "مزارۍ (Mazari)": 3,
363
+ "هراتۍ (Herati)": -2,
364
+ "ننګرهاري (Nangarhari)": 8,
365
+ }.get(dialect, 0)
366
+ emotion_shift = {"neutral": 0, "joyful": 16, "sorrowful": -10, "proud": 9}.get(emotion, 0)
367
+ modulation = 18 * np.sin(2 * math.pi * 0.42 * timeline)
368
+ frequency = base_freq + dialect_shift + emotion_shift + modulation
369
+
370
+ audio = np.zeros_like(timeline)
371
+ for harmonic in range(1, 7):
372
+ audio += (1 / harmonic) * np.sin(2 * math.pi * harmonic * frequency * timeline)
373
+
374
+ syllable_envelope = 0.6 + 0.4 * np.sin(2 * math.pi * (2.0 * speed) * timeline) ** 2
375
+ fade = np.exp(-timeline / (3.8 / max(speed, 0.1)))
376
+ breath = np.random.normal(0, 0.008, timeline.shape)
377
+ audio = np.clip(audio * syllable_envelope * fade * 0.24 + breath, -1.0, 1.0)
378
+ return audio.astype(np.float32), sample_rate
379
+
380
+ def basic_speech_recognition(self, waveform: np.ndarray, dialect: str) -> Dict[str, Any]:
381
+ energy = float(np.mean(np.abs(waveform))) if waveform.size else 0.0
382
+ transcript = "دا يو پښتو متن دی چې د وينا پېژندنې له لارې ترلاسه شوی"
383
+ if energy > 0.06:
384
+ transcript += " او غږ يې روښانه دی"
385
+ if dialect == "کندهاري (Kandahari)":
386
+ transcript += " د کندهارۍ رنګ سره"
387
+ elif dialect == "پکتياوي (Paktiawal)":
388
+ transcript += " د پکتياوي انداز سره"
389
+ elif dialect == "هراتۍ (Herati)":
390
+ transcript += " د هراتي نرمۍ سره"
391
+ return {"text": transcript, "confidence": 0.85}
392
+
393
+ def apply_dialectal_corrections(self, text: str, dialect: str) -> str:
394
+ corrections = {
395
+ "کندهاري (Kandahari)": {"شګ": "ښګ", "ژګ": "ږګ"},
396
+ "پکتياوي (Paktiawal)": {"نګ": "ڼګ"},
397
+ "پېښوري (Peshawri)": {"ښ": "ش"},
398
+ }
399
+ corrected = text
400
+ for wrong, correct in corrections.get(dialect, {}).items():
401
+ corrected = corrected.replace(wrong, correct)
402
+ return corrected
403
+
404
+ def extract_comprehensive_cultural_markers(self, text: str) -> Dict[str, List[str]]:
405
+ return self.cultural_processor.analyze_text(text)
406
+
407
+ def get_pronunciation_notes(self, text: str, dialect: str) -> List[str]:
408
+ notes = [AFGHAN_PASHTO_DIALECTS[dialect]["pronunciation_guide"]]
409
+ if "ښ" in text:
410
+ notes.append("Text contains ښ, which is one of the key dialect markers.")
411
+ if "ږ" in text:
412
+ notes.append("Text contains ږ, so dialect-specific realization matters here.")
413
+ return notes
414
+
415
+ def extract_authentic_voice_features(self, waveform: np.ndarray, sample_rate: int, dialect: str) -> Dict[str, Any]:
416
+ stats = self.audio_processor.analyze_audio(waveform, sample_rate)
417
+ return {
418
+ "pitch_band": stats["pitch_band"],
419
+ "energy": stats["energy"],
420
+ "accent": AFGHAN_PASHTO_DIALECTS[dialect]["traditional_name"],
421
+ "quality": "clear" if stats["energy"] > 0.04 else "soft",
422
+ }
423
+
424
+ def basic_voice_cloning(self, text: str, voice_features: Dict[str, Any], dialect: str) -> Tuple[np.ndarray, int]:
425
+ pitch_band = voice_features.get("pitch_band", "mid")
426
+ voice_type = "female" if pitch_band == "high" else "elder" if voice_features.get("age_profile") == "elder" else "male"
427
+ return self.generate_synthetic_speech(text, dialect, voice_type, 1.0, "neutral")
428
+
429
+ def resolve_voice_model(self, dialect: str, voice_type: str) -> str:
430
+ model_map = AFGHAN_PASHTO_DIALECTS[dialect]["voice_models"]
431
+ return model_map.get(voice_type, f"{AFGHAN_PASHTO_DIALECTS[dialect]['code']}_{voice_type}.pth")
432
+
433
+
434
+ processor = CompleteAfghanPashtoProcessor()
435
+
436
+
437
+ def format_dialect_summary(dialect: str) -> str:
438
+ info = AFGHAN_PASHTO_DIALECTS[dialect]
439
+ return (
440
+ f"لهجه: {dialect}\n"
441
+ f"کوډ: {info['code']}\n"
442
+ f"سيمه: {info['region']}\n"
443
+ f"دوديز نوم: {info['traditional_name']}\n"
444
+ f"اواز ماډلونه: {', '.join(info['voice_models'].values())}\n"
445
+ f"تلفظ: {info['pronunciation_guide']}\n"
446
+ f"ځانګړنې: {', '.join(info['characteristics'])}"
447
+ )
448
+
449
+
450
+ def format_context_summary(context_name: str) -> str:
451
+ info = CULTURAL_CONTEXTS[context_name]
452
+ return (
453
+ f"تشريح: {info['description']}\n"
454
+ f"بېلګې: {', '.join(info['examples'])}\n"
455
+ f"اواز سبک: {info['voice_style']}"
456
+ )
457
+
458
+
459
+ def phoneme_markdown() -> str:
460
+ lines: List[str] = []
461
+ for category, items in COMPLETE_PHONEMES.items():
462
+ lines.append(f"### {category}")
463
+ for symbol, data in items.items():
464
+ dialects = data.get("dialects", {})
465
+ dialect_text = ", ".join(f"{key}: {value}" for key, value in dialects.items()) if dialects else "—"
466
+ lines.append(f"- **{symbol}** · IPA `{data['ipa']}` · {data['description']} · Dialects: {dialect_text}")
467
+ return "\n".join(lines)
468
+
469
+
470
+ def available_examples() -> List[List[Any]]:
471
+ return [
472
+ ["زما وطن د وياړ کور دی او پښتونولي زموږ د ژوند لار ده.", "کندهاري (Kandahari)", "مشر (Elder Male)", "ملي (National)", "جګ افتخار (Proud)", 1.0],
473
+ ["مېلمستيا او غيرت زموږ کلتوري ارزښتونه دي.", "پکتياوي (Paktiawal)", "ځوان (Young Male)", "فرهنګي (Cultural)", "طبيعي (Natural)", 1.1],
474
+ ["ښه راغلاست، دا يوه ولسي کيسه ده چې د زاړه وخت ياد راژوندی کوي.", "هراتۍ (Herati)", "ښځينه (Female)", "پېغلوي (Folk Tales)", "خوشحال (Joyful)", 0.9],
475
+ ]
476
+
477
+
478
+ def generate_voice(text: str, dialect: str, voice_label: str, context_name: str, emotion_label: str, speed: float):
479
+ if not text.strip():
480
+ raise gr.Error("مهرباني وکړئ پښتو متن وليکئ / Please enter Pashto text.")
481
+
482
+ voice_type = VOICE_TYPE_MODEL_MAP[voice_label]
483
+ emotion = EMOTION_MAP[emotion_label]
484
+ audio, sample_rate, metadata = processor.process_authentic_tts(text, dialect, voice_type, context_name, emotion, speed)
485
+ info = (
486
+ f"Model: {metadata['model']}\n"
487
+ f"Device: {metadata['device']}\n"
488
+ f"Dialectal text: {metadata['dialectal_text']}\n"
489
+ f"Contextualized text: {metadata['contextualized_text']}\n"
490
+ f"Emotional text: {metadata['emotional_text']}"
491
+ )
492
+ context_analysis = processor.extract_comprehensive_cultural_markers(metadata["emotional_text"])
493
+ marker_lines = [f"{key}: {', '.join(values)}" for key, values in context_analysis.items() if values]
494
+ markers_text = "\n".join(marker_lines) if marker_lines else "No explicit cultural markers detected yet."
495
+ return (sample_rate, audio), info, format_dialect_summary(dialect), format_context_summary(context_name), markers_text
496
+
497
+
498
+ def recognize_speech(audio_input, dialect: str):
499
+ result = processor.process_authentic_asr(audio_input, dialect)
500
+ summary = (
501
+ f"Transcription: {result['text']}\n"
502
+ f"Confidence: {result['confidence']:.0%}\n"
503
+ f"Duration: {result['audio_stats']['duration_seconds']} seconds\n"
504
+ f"Energy: {result['audio_stats']['energy']}\n"
505
+ f"Pitch band: {result['audio_stats']['pitch_band']}"
506
+ )
507
+ markers = [f"{key}: {', '.join(values)}" for key, values in result['cultural_markers'].items() if values]
508
+ return summary, result["cultural_markers"], "\n".join(result["pronunciation_notes"]), "\n".join(markers) if markers else "No cultural markers detected."
509
+
510
+
511
+ def clone_voice(reference_audio, text: str, dialect: str, age_profile: str, style_profile: str):
512
+ if not text.strip():
513
+ raise gr.Error("Please provide target text for cloning.")
514
+
515
+ audio, sample_rate, features = processor.process_voice_cloning(
516
+ reference_audio,
517
+ text,
518
+ dialect,
519
+ {"age_profile": age_profile, "style_profile": style_profile},
520
+ )
521
+ feature_lines = "\n".join(f"{key}: {value}" for key, value in features.items())
522
+ return (sample_rate, audio), feature_lines, format_dialect_summary(dialect)
523
+
524
+
525
+ def build_app() -> gr.Blocks:
526
+ with gr.Blocks(
527
+ title="🎙️ Afghan Pashto Voice Hub - د افغان پښتو غږيز مرکز",
528
+ theme=gr.themes.Soft(),
529
+ css="""
530
+ .pashto-text {
531
+ font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq', 'Scheherazade New', serif;
532
+ direction: rtl;
533
+ text-align: right;
534
+ }
535
+ .afghan-flag {
536
+ background: linear-gradient(to bottom, #000000, #c81818, #0b8f3a);
537
+ height: 20px;
538
+ border-radius: 6px;
539
+ margin: 10px 0;
540
+ }
541
+ """,
542
+ ) as app:
543
+ gr.Markdown(
544
+ """
545
+ # 🎙️ Afghan Pashto Voice & Speech Processing Hub
546
+ ## د افغان پښتو غږيز پروسسنګ مرکز
547
+
548
+ **Pure Afghan Pashto - له اصل پښتو سره**
549
+
550
+ Supports: **Kandahari, Paktiawal, Peshawri, Mazari, Herati, Nangarhari, and traditional forms**
551
+ """
552
+ )
553
+ gr.HTML('<div class="afghan-flag"></div>')
554
+
555
+ with gr.Row():
556
+ with gr.Column(scale=2):
557
+ dialect_preview = gr.Dropdown(
558
+ choices=list(AFGHAN_PASHTO_DIALECTS.keys()),
559
+ value="کندهاري (Kandahari)",
560
+ label="Dialect overview - لهجو کتنه",
561
+ )
562
+ dialect_summary = gr.Textbox(
563
+ value=format_dialect_summary("کندهاري (Kandahari)"),
564
+ label="Dialect details",
565
+ lines=7,
566
+ )
567
+ with gr.Column(scale=1):
568
+ gr.Markdown(f"### Runtime\n- Runtime device: {processor.device}\n- Models: lightweight placeholder stack")
569
+
570
+ dialect_preview.change(fn=format_dialect_summary, inputs=dialect_preview, outputs=dialect_summary)
571
+
572
+ with gr.Accordion("Traditional phoneme guide", open=False):
573
+ gr.Markdown(phoneme_markdown(), elem_classes="pashto-text")
574
+
575
+ with gr.Tabs():
576
+ with gr.TabItem("🔊 Authentic Voice"):
577
+ with gr.Row():
578
+ with gr.Column():
579
+ authentic_text = gr.Textbox(
580
+ label="پښتو متن / Pashto Text",
581
+ placeholder="دلته پښتو متن ولیکئ...",
582
+ lines=5,
583
+ elem_classes="pashto-text",
584
+ )
585
+ with gr.Row():
586
+ authentic_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="کندهاري (Kandahari)", label="Dialect - لهجه")
587
+ authentic_voice = gr.Dropdown(choices=list(VOICE_TYPE_MODEL_MAP.keys()), value="مشر (Elder Male)", label="Voice Type - غږ ډول")
588
+ with gr.Row():
589
+ authentic_context = gr.Dropdown(choices=list(CULTURAL_CONTEXTS.keys()), value="ملي (National)", label="Cultural Context - کلتني زمينه")
590
+ authentic_emotion = gr.Dropdown(choices=list(EMOTION_MAP.keys()), value="طبيعي (Natural)", label="Emotion - احساس")
591
+ authentic_speed = gr.Slider(0.6, 1.4, value=1.0, step=0.1, label="Speed - چټکتيا")
592
+ authentic_generate = gr.Button("🎤 Generate Authentic Voice", variant="primary")
593
+ gr.Examples(examples=available_examples(), inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed])
594
+ with gr.Column():
595
+ authentic_output = gr.Audio(label="Generated Afghan Pashto Voice")
596
+ authentic_info = gr.Textbox(label="Voice pipeline details", lines=7)
597
+ authentic_dialect_info = gr.Textbox(label="Dialect knowledge", lines=7)
598
+ authentic_context_info = gr.Textbox(label="Context knowledge", lines=4)
599
+ authentic_markers = gr.Textbox(label="Cultural markers", lines=5)
600
+
601
+ authentic_generate.click(
602
+ fn=generate_voice,
603
+ inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed],
604
+ outputs=[authentic_output, authentic_info, authentic_dialect_info, authentic_context_info, authentic_markers],
605
+ )
606
+
607
+ with gr.TabItem("🎧 Speech Recognition"):
608
+ with gr.Row():
609
+ with gr.Column():
610
+ recognition_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or record Pashto speech")
611
+ recognition_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="پکتياوي (Paktiawal)", label="Target dialect")
612
+ recognition_button = gr.Button("📝 Recognize Speech", variant="primary")
613
+ with gr.Column():
614
+ recognition_summary = gr.Textbox(label="Recognition summary", lines=6)
615
+ recognition_context = gr.JSON(label="Cultural context analysis")
616
+ recognition_pronunciation = gr.Textbox(label="Pronunciation notes", lines=4)
617
+ recognition_markers = gr.Textbox(label="Detected markers", lines=4)
618
+
619
+ recognition_button.click(
620
+ fn=recognize_speech,
621
+ inputs=[recognition_audio, recognition_dialect],
622
+ outputs=[recognition_summary, recognition_context, recognition_pronunciation, recognition_markers],
623
+ )
624
+
625
+ with gr.TabItem("🧬 Voice Cloning Demo"):
626
+ with gr.Row():
627
+ with gr.Column():
628
+ clone_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Reference Afghan voice")
629
+ clone_text = gr.Textbox(label="Target text", lines=4, placeholder="هغه متن وليکئ چې د هماغه غږ په ډول واورئ...", elem_classes="pashto-text")
630
+ clone_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="هراتۍ (Herati)", label="Dialect")
631
+ with gr.Row():
632
+ clone_age = gr.Dropdown(choices=["youthful", "mature", "elder"], value="mature", label="Age profile")
633
+ clone_style = gr.Dropdown(choices=["formal", "storytelling", "poetic", "conversational"], value="storytelling", label="Speaking style")
634
+ clone_button = gr.Button("🧪 Clone Voice Demo", variant="primary")
635
+ with gr.Column():
636
+ clone_output = gr.Audio(label="Cloned Afghan voice")
637
+ clone_features = gr.Textbox(label="Extracted / merged voice features", lines=8)
638
+ clone_dialect_info = gr.Textbox(label="Dialect profile", lines=7)
639
+
640
+ clone_button.click(
641
+ fn=clone_voice,
642
+ inputs=[clone_audio, clone_text, clone_dialect, clone_age, clone_style],
643
+ outputs=[clone_output, clone_features, clone_dialect_info],
644
+ )
645
+
646
+ gr.Markdown(
647
+ """
648
+ ### Notes
649
+ - This app is a lightweight, deployable Gradio demo with authentic Afghan Pashto structure and metadata.
650
+ - TTS, ASR, and voice cloning are implemented with synthetic placeholder audio logic so the interface runs without large model files.
651
+ - You can later replace the placeholder methods with real Pashto TTS, ASR, and cloning checkpoints.
652
+ """
653
+ )
654
+
655
+ return app
656
+
657
+
658
+ app = build_app()
659
+
660
+
661
+ if __name__ == "__main__":
662
+ app.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=5.0.0
2
+ numpy>=1.24.0