Spaces:
Sleeping
Sleeping
Commit ·
e40895a
0
Parent(s):
Add initial implementation of Afghan Pashto Voice Processing and update .gitignore
Browse files- .gitignore +4 -0
- app.py +662 -0
- requirements.txt +2 -0
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
#Ignore insiders AI rules
|
| 4 |
+
.github/instructions/codacy.instructions.md
|
app.py
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
COMPLETE Afghan Pashto Voice & Speech Processing Space
|
| 3 |
+
Pure Afghan Pashto - له اصل پښتو سره
|
| 4 |
+
Author: Afghan Voice Technology Initiative
|
| 5 |
+
Version: 2.0 - Lightweight Complete Demo
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import math
|
| 11 |
+
from typing import Any, Dict, List, Tuple
|
| 12 |
+
|
| 13 |
+
import gradio as gr
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
import torch
|
| 18 |
+
except Exception: # pragma: no cover - optional dependency
|
| 19 |
+
torch = None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Catalogue of the supported dialects, keyed by the display name
# "<Pashto name> (<Latin name>)". Each entry carries:
#   code                - short dialect code (e.g. "ps-kan")
#   region              - region names, written in Pashto
#   characteristics     - short descriptive traits
#   traditional_name    - traditional name of the dialect, in Pashto
#   voice_models        - voice type ("male"/"female"/"elder") -> model file name
#   pronunciation_guide - one-line pronunciation summary
AFGHAN_PASHTO_DIALECTS: Dict[str, Dict[str, Any]] = {
    "کندهاري (Kandahari)": {
        "code": "ps-kan",
        "region": "کندهار، زابل، ارزگان",
        "characteristics": ["Hard ږ (g)", "ښ as خ", "Emphatic consonants", "Traditional poetry"],
        "traditional_name": "کندهاري غه",
        "voice_models": {"male": "kan_male_v2.pth", "female": "kan_female_v2.pth", "elder": "kan_elder_v2.pth"},
        "pronunciation_guide": "ږ = hard 'g', ښ = 'kh', Retroflex sounds preserved",
    },
    "پکتياوي (Paktiawal)": {
        "code": "ps-pak",
        "region": "پکتيا، پکتيکا، خوست",
        "characteristics": ["Retroflex ڼ", "Nasal vowels", "Tribal vocabulary", "Mountain accent"],
        "traditional_name": "پکتياوي خښه",
        "voice_models": {"male": "pak_male_v2.pth", "female": "pak_female_v2.pth", "elder": "pak_elder_v2.pth"},
        "pronunciation_guide": "ڼ = retroflex 'n', Nasalized vowels, Tribal words",
    },
    "پېښوري (Peshawri)": {
        "code": "ps-pes",
        "region": "پېښور، مردان، سوات",
        "characteristics": ["ښ as ش", "Soft ږ (zh)", "Urban vocabulary", "Trade language"],
        "traditional_name": "پېښوري ژبه",
        "voice_models": {"male": "pes_male_v2.pth", "female": "pes_female_v2.pth", "elder": "pes_elder_v2.pth"},
        "pronunciation_guide": "ښ = 'sh', ږ = soft 'zh', Urban expressions",
    },
    "مزارۍ (Mazari)": {
        "code": "ps-maz",
        "region": "مزار شريف، بلخ، جوزجان",
        "characteristics": ["Uzbek influence", "Northern vowels", "Turkic loanwords", "Plains accent"],
        "traditional_name": "مزارۍ غږ",
        "voice_models": {"male": "maz_male_v2.pth", "female": "maz_female_v2.pth", "elder": "maz_elder_v2.pth"},
        "pronunciation_guide": "Uzbek-influenced vowels, Turkic words, Northern tone",
    },
    "هراتۍ (Herati)": {
        "code": "ps-her",
        "region": "هرات، فراه، نيمروز",
        "characteristics": ["Persian influence", "Western vowels", "Herati accent", "Cultural sophistication"],
        "traditional_name": "هراتۍ لهجه",
        "voice_models": {"male": "her_male_v2.pth", "female": "her_female_v2.pth", "elder": "her_elder_v2.pth"},
        "pronunciation_guide": "Persian-influenced sounds, Western vowels, Cultural words",
    },
    "ننګرهاري (Nangarhari)": {
        "code": "ps-nan",
        "region": "جلال اباد، ننګرهار، کنړ",
        "characteristics": ["Eastern dialect", "Khattak influence", "Jalalabad accent", "Border influences"],
        "traditional_name": "ننګرهاري وړاندې",
        "voice_models": {"male": "nan_male_v2.pth", "female": "nan_female_v2.pth", "elder": "nan_elder_v2.pth"},
        "pronunciation_guide": "Eastern sounds, Khattak influence, Border variations",
    },
}
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Cultural contexts keyed by the display name "<Pashto name> (<English name>)".
# Each entry carries:
#   description - English summary of the kind of content
#   examples    - example content types, in Pashto
#   voice_style - comma-separated style keywords
#   suffix      - Pashto phrase appended (in parentheses) to text by
#                 CulturalContextProcessor.apply_cultural_context
CULTURAL_CONTEXTS: Dict[str, Dict[str, Any]] = {
    "ملي (National)": {
        "description": "National songs, anthems, patriotic poetry",
        "examples": ["ملي سرود", "وطن شعرونه", "غازي قومي"],
        "voice_style": "proud, formal, clear",
        "suffix": "د ملي غرور سره",
    },
    "قومي (Tribal)": {
        "description": "Tribal traditions, ethnic heritage, clan stories",
        "examples": ["قومي کیسې", "نسب او شجره", "قبیلوي ویاړونه"],
        "voice_style": "traditional, elder-like, respectful",
        "suffix": "د قومي وياړ سره",
    },
    "مذهبي (Religious)": {
        "description": "Religious content, spiritual guidance, Islamic teachings",
        "examples": ["دیني دروس", "اخلاقي کیسې", "روحاني مواعظ"],
        "voice_style": "soft, respectful, spiritual",
        "suffix": "د مذهبي احترام سره",
    },
    "فرهنګي (Cultural)": {
        "description": "Cultural education, traditional values, customs",
        "examples": ["فرهنګي ارزښتونه", "دودونه او دستورونه", "کلتني کیسې"],
        "voice_style": "educational, warm, cultural",
        "suffix": "د فرهنګي ارزښتونو سره",
    },
    "تاريخي (Historical)": {
        "description": "Historical narratives, ancient stories, past events",
        "examples": ["تاريخي کیسې", "پخوانۍ پیښې", "قدیم افسانې"],
        "voice_style": "storyteller, dramatic, engaging",
        "suffix": "د تاريخي روايت په انداز",
    },
    "سنګي (Musical)": {
        "description": "Traditional music, folk songs, cultural rhythms",
        "examples": ["سنګي ملودۍ", "فولکلوري سندرې", "کلاسیکي موسیقي"],
        "voice_style": "melodic, rhythmic, artistic",
        "suffix": "د دوديزې نغمې په رنګ",
    },
    "پېغلوي (Folk Tales)": {
        "description": "Folk tales, traditional stories, cultural narratives",
        "examples": ["پېغلوي کیسې", "افسانوي کیسې", "کلتني حکیات"],
        "voice_style": "storyteller, engaging, traditional",
        "suffix": "د ولسي کيسې له خوند سره",
    },
}
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# Phoneme reference rendered by phoneme_markdown().
# Layout: category name (Pashto) -> letter -> record with
#   symbol      - the letter itself
#   ipa         - IPA transcription between slashes
#   description - English phonetic description
#   dialects    - dialect name -> how that dialect realises the letter
#                 (empty when no dialect variation is listed)
COMPLETE_PHONEMES: Dict[str, Dict[str, Dict[str, Any]]] = {
    "پښتني حروف": {
        "ښ": {"symbol": "ښ", "ipa": "/ʂ/", "description": "Voiceless retroflex fricative", "dialects": {"کندهاري": "خ", "پېښوري": "ش"}},
        "ږ": {"symbol": "ږ", "ipa": "/ʐ/", "description": "Voiced retroflex fricative", "dialects": {"کندهاري": "گ", "پېښوري": "ژ"}},
        "ڼ": {"symbol": "ڼ", "ipa": "/ɳ/", "description": "Retroflex nasal", "dialects": {"پکتياوي": "ڼ", "کندهاري": "ن"}},
        "ړ": {"symbol": "ړ", "ipa": "/ɽ/", "description": "Retroflex flap", "dialects": {"ټول": "ړ"}},
        "ټ": {"symbol": "ټ", "ipa": "/ʈ/", "description": "Voiceless retroflex stop", "dialects": {"ټول": "ټ"}},
        "ډ": {"symbol": "ډ", "ipa": "/ɖ/", "description": "Voiced retroflex stop", "dialects": {"ټول": "ډ"}},
    },
    "عربي حروف": {
        "ص": {"symbol": "ص", "ipa": "/sˤ/", "description": "Emphatic voiceless alveolar fricative", "dialects": {}},
        "ض": {"symbol": "ض", "ipa": "/dˤ/", "description": "Emphatic voiced alveolar stop", "dialects": {}},
        "ط": {"symbol": "ط", "ipa": "/tˤ/", "description": "Emphatic voiceless alveolar stop", "dialects": {}},
        "ظ": {"symbol": "ظ", "ipa": "/zˤ/", "description": "Emphatic voiced alveolar fricative", "dialects": {}},
    },
    "ويي": {
        "ا": {"symbol": "ا", "ipa": "/a/", "description": "Open front vowel", "dialects": {}},
        "ې": {"symbol": "ې", "ipa": "/e/", "description": "Close-mid front vowel", "dialects": {}},
        "ۍ": {"symbol": "ۍ", "ipa": "/ei/", "description": "Diphthong", "dialects": {}},
        "و": {"symbol": "و", "ipa": "/o/", "description": "Close-mid back rounded vowel", "dialects": {}},
        "ۀ": {"symbol": "ۀ", "ipa": "/ə/", "description": "Schwa", "dialects": {}},
    },
}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# Translates the voice label received by generate_voice into the internal
# voice-type key used by the speech generator and the model lookup.
# NOTE(review): "child" has no entry in any dialect's "voice_models" table, so
# resolve_voice_model builds a fallback file name for it.
VOICE_TYPE_MODEL_MAP: Dict[str, str] = {
    "مشر (Elder Male)": "elder",
    "ځوان (Young Male)": "male",
    "ښځينه (Female)": "female",
    "وړکتي (Child)": "child",
}
|
| 151 |
+
|
| 152 |
+
# Translates the emotion label received by generate_voice into the internal
# emotion key ("neutral", "joyful", "sorrowful", "proud") that selects the
# text suffix and the pitch shift during synthesis.
EMOTION_MAP: Dict[str, str] = {
    "طبيعي (Natural)": "neutral",
    "خوشحال (Joyful)": "joyful",
    "غميز (Sorrowful)": "sorrowful",
    "جګ افتخار (Proud)": "proud",
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class AudioProcessor:
    """Waveform clean-up and coarse signal statistics for input audio."""

    def preprocess_audio(self, audio_input: Tuple[int, np.ndarray] | np.ndarray | None) -> np.ndarray:
        """Return a squeezed, peak-normalised float32 version of the input.

        Args:
            audio_input: Either a ``(sample_rate, samples)`` tuple or the
                samples alone. The sample rate of a tuple is ignored here.

        Returns:
            The samples as float32, collapsed to one dimension and divided by
            their absolute peak when that peak is positive.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
        """
        if audio_input is None:
            raise ValueError("No audio input was provided.")

        if isinstance(audio_input, tuple):
            _rate, samples = audio_input
        else:
            samples = audio_input

        samples = np.asarray(samples, dtype=np.float32).squeeze()
        if samples.ndim > 1:
            # Average along axis 1; assumes a (samples, channels) layout — TODO confirm.
            samples = samples.mean(axis=1)

        if samples.size:
            loudest = float(np.abs(samples).max())
        else:
            loudest = 0.0

        # All-zero (or empty) input is returned unscaled to avoid dividing by zero.
        return samples / loudest if loudest > 0 else samples

    def analyze_audio(self, waveform: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Summarise a waveform as duration, mean absolute level and pitch band.

        Args:
            waveform: Sample array to inspect.
            sample_rate: Samples per second; values below 1 are treated as 1.

        Returns:
            A dict with ``duration_seconds``, ``energy`` and ``pitch_band``
            (``"high"``, ``"mid"``, ``"low"``, or ``"unknown"`` for empty input).
        """
        total = waveform.size
        if total == 0:
            return {"duration_seconds": 0.0, "energy": 0.0, "pitch_band": "unknown"}

        # Fraction of neighbouring samples whose sign differs.
        if total > 1:
            negative = np.signbit(waveform)
            crossing_rate = float(np.mean(np.abs(np.diff(negative))))
        else:
            crossing_rate = 0.0

        if crossing_rate > 0.12:
            band = "high"
        elif crossing_rate > 0.05:
            band = "mid"
        else:
            band = "low"

        mean_level = float(np.mean(np.abs(waveform)))
        return {
            "duration_seconds": round(total / max(sample_rate, 1), 2),
            "energy": round(mean_level, 4),
            "pitch_band": band,
        }
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class CulturalContextProcessor:
    """Adds cultural-context suffixes to text and spots known cultural terms."""

    # Vocabulary that analyze_text looks for, grouped by reporting category.
    tribal_terms = ["احمدزي", "محسود", "خټک", "یوسفزي", "دواني", "ننګيال"]
    cultural_terms = ["پښتونولي", "مېلمستيا", "ننګ", "غيرت", "توره", "نګاه"]
    traditional_expressions = ["ښه راغلاست", "په خير", "الله دې مل شه", "ستړی مه شې"]
    honorifics = ["صاحب", "ملا", "خان", "استاد"]

    def apply_cultural_context(self, text: str, context: str) -> str:
        """Append the context's suffix in parentheses; unknown contexts leave text unchanged."""
        context_info = CULTURAL_CONTEXTS.get(context)
        if not context_info:
            return text
        return f"{text} ({context_info['suffix']})"

    @staticmethod
    def _whole_word_matches(text: str, terms: List[str]) -> List[str]:
        """Return the terms that occur in ``text`` as complete words, in list order."""
        import re  # local import: the module's import block does not provide ``re``

        found = []
        for term in terms:
            # Reject hits glued to neighbouring letters, so a short term such
            # as "ننګ" is not reported for the longer word "ننګرهار".
            pattern = r"(?<!\w)" + re.escape(term) + r"(?!\w)"
            if re.search(pattern, text):
                found.append(term)
        return found

    def analyze_text(self, text: str) -> Dict[str, List[str]]:
        """Report which known terms appear in ``text`` as whole words.

        Args:
            text: Text to scan; ``None`` is treated as empty.

        Returns:
            A dict with the keys ``tribal_references``, ``cultural_concepts``,
            ``traditional_expressions`` and ``honorifics``; each value lists
            the matching terms in the order of the class vocabulary.
        """
        text = text or ""
        return {
            "tribal_references": self._whole_word_matches(text, self.tribal_terms),
            "cultural_concepts": self._whole_word_matches(text, self.cultural_terms),
            "traditional_expressions": self._whole_word_matches(text, self.traditional_expressions),
            "honorifics": self._whole_word_matches(text, self.honorifics),
        }
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
class CompleteAfghanPashtoProcessor:
    """Placeholder pipeline for Pashto text-to-speech, recognition and cloning.

    Nothing here loads a trained model: the ``load_*`` methods return
    descriptive placeholders, synthesis produces a harmonic tone and
    recognition returns a canned transcript.
    """

    def __init__(self) -> None:
        """Select the runtime device and build helpers, rules and model table."""
        # torch is an optional import; without it (or without CUDA) use the CPU.
        self.device = "cuda" if torch is not None and torch.cuda.is_available() else "cpu"
        self.models: Dict[str, Any] = {}
        self.audio_processor = AudioProcessor()
        self.cultural_processor = CulturalContextProcessor()
        self.dialect_rules = self.load_dialect_rules()
        self.load_all_models()

    def load_all_models(self) -> None:
        """Fill ``self.models`` with the TTS, ASR and voice-clone placeholders."""
        self.models = {
            "tts": {"base": self.load_tts_model(), "dialects": self.load_dialectal_tts_models()},
            "asr": {"base": self.load_asr_model(), "dialectal": self.load_dialectal_asr_models()},
            "voice_clone": self.load_voice_cloning_model(),
        }

    def load_tts_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor of the base TTS model."""
        return {"model": "base_tts", "status": "placeholder"}

    def load_dialectal_tts_models(self) -> Dict[str, Dict[str, str]]:
        """Return a per-dialect copy of the voice-type -> model-file tables."""
        # Copy each table so later edits to self.models cannot alter the
        # module-level dialect catalogue.
        return {dialect: dict(info["voice_models"]) for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_asr_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor of the base ASR model."""
        return {"model": "base_asr", "status": "placeholder"}

    def load_dialectal_asr_models(self) -> Dict[str, str]:
        """Return a per-dialect ASR model name of the form ``<code>_asr``."""
        return {dialect: f"{info['code']}_asr" for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_voice_cloning_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor of the voice-cloning model."""
        return {"model": "voice_clone", "status": "placeholder"}

    def load_dialect_rules(self) -> Dict[str, Dict[str, Dict[str, Any]]]:
        """Return the dialect rule tables grouped by rule family.

        Under ``"pronunciation"`` string values are letter substitutions and
        boolean values are feature flags. ``"vocabulary"`` and ``"grammar"``
        hold flags and word lists.
        """
        return {
            "pronunciation": {
                "کندهاري (Kandahari)": {"ښ": "خ", "ږ": "گ", "emphatic_consonants": True},
                "پکتياوي (Paktiawal)": {"ڼ": "ڼ", "nasal_vowels": True, "tribal_pronunciation": True},
                "پېښوري (Peshawri)": {"ښ": "ش", "ږ": "ژ", "urban_influence": True},
                "هراتۍ (Herati)": {"ښ": "خ", "ږ": "گ", "western_vowels": True},
            },
            "vocabulary": {
                "کندهاري (Kandahari)": {"traditional_words": ["غه", "خه", "ګه"], "poetic_expressions": True},
                "پکتياوي (Paktiawal)": {"tribal_words": ["خېل", "قوم", "نګهبان"], "mountain_vocabulary": True},
                "هراتۍ (Herati)": {"persian_loanwords": True, "cultural_terms": ["فرهنګ", "تمدن", "ادب"]},
            },
            "grammar": {
                "ننګرهاري (Nangarhari)": {"eastern_constructions": True, "border_influences": True},
                "مزارۍ (Mazari)": {"uzbek_influence": True, "northern_constructions": True},
            },
        }

    def process_authentic_tts(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        context: str,
        emotion: str,
        speed: float,
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Run text through dialect, context and emotion steps, then synthesise.

        Args:
            text: Input text.
            dialect: Dialect display name; unknown names skip dialect rules.
            voice_type: Internal voice key (``"male"``, ``"female"``, ...).
            context: Cultural context display name.
            emotion: Internal emotion key (``"neutral"``, ``"joyful"``, ...).
            speed: Speaking-rate multiplier.

        Returns:
            ``(audio, sample_rate, metadata)`` where ``metadata`` holds the
            intermediate texts, the resolved model name and the device.
        """
        dialectal_text = self.apply_comprehensive_dialect_rules(text, dialect)
        contextualized_text = self.cultural_processor.apply_cultural_context(dialectal_text, context)
        emotional_text = self.apply_emotional_coloring(contextualized_text, emotion)
        audio, sample_rate = self.generate_synthetic_speech(emotional_text, dialect, voice_type, speed, emotion)
        return audio, sample_rate, {
            "dialectal_text": dialectal_text,
            "contextualized_text": contextualized_text,
            "emotional_text": emotional_text,
            "model": self.resolve_voice_model(dialect, voice_type),
            "device": self.device,
        }

    def process_authentic_asr(self, audio_input: Tuple[int, np.ndarray] | None, dialect: str) -> Dict[str, Any]:
        """Produce a transcript plus audio statistics for recorded speech.

        Args:
            audio_input: ``(sample_rate, samples)`` tuple.
            dialect: Dialect display name.

        Returns:
            A dict with ``text``, ``confidence``, ``dialect``, ``audio_stats``,
            ``cultural_markers`` and ``pronunciation_notes``.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
        """
        if audio_input is None:
            raise ValueError("Please record or upload Pashto speech first.")

        sample_rate, waveform = audio_input
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        result = self.basic_speech_recognition(processed_audio, dialect)
        corrected_text = self.apply_dialectal_corrections(result["text"], dialect)
        cultural_info = self.extract_comprehensive_cultural_markers(corrected_text)
        audio_stats = self.audio_processor.analyze_audio(processed_audio, sample_rate)
        return {
            "text": corrected_text,
            "confidence": result.get("confidence", 0.85),
            "dialect": dialect,
            "audio_stats": audio_stats,
            "cultural_markers": cultural_info,
            "pronunciation_notes": self.get_pronunciation_notes(corrected_text, dialect),
        }

    def process_voice_cloning(
        self,
        reference_audio: Tuple[int, np.ndarray] | None,
        text: str,
        dialect: str,
        voice_characteristics: Dict[str, str],
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Synthesise ``text`` using features taken from a reference recording.

        Args:
            reference_audio: ``(sample_rate, samples)`` tuple.
            text: Text to speak.
            dialect: Dialect display name.
            voice_characteristics: Caller-supplied features; on a key clash
                they override the features extracted from the audio.

        Returns:
            ``(audio, sample_rate, merged_features)``.

        Raises:
            ValueError: If ``reference_audio`` is ``None``.
        """
        if reference_audio is None:
            raise ValueError("Reference audio is required for voice cloning.")

        sample_rate, waveform = reference_audio
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        features = self.extract_authentic_voice_features(processed_audio, sample_rate, dialect)
        merged_features = {**features, **voice_characteristics}
        cloned_audio, cloned_rate = self.basic_voice_cloning(text, merged_features, dialect)
        return cloned_audio, cloned_rate, merged_features

    def apply_comprehensive_dialect_rules(self, text: str, dialect: str) -> str:
        """Apply the dialect's letter substitutions and vocabulary swaps to ``text``."""
        pronunciation_rules = self.dialect_rules.get("pronunciation", {}).get(dialect, {})
        vocabulary_rules = self.dialect_rules.get("vocabulary", {}).get(dialect, {})

        transformed = text
        for original, replacement in pronunciation_rules.items():
            # Boolean entries are feature flags, not substitutions.
            if isinstance(replacement, str):
                transformed = transformed.replace(original, replacement)

        if vocabulary_rules.get("poetic_expressions") and "وطن" in transformed:
            transformed = transformed.replace("وطن", "پلرنی وطن")
        if vocabulary_rules.get("persian_loanwords") and "کلتور" in transformed:
            transformed = transformed.replace("کلتور", "فرهنګ")
        return transformed

    def apply_emotional_coloring(self, text: str, emotion: str) -> str:
        """Append the emotion's Pashto tag in parentheses; unknown emotions leave text unchanged."""
        emotional_suffix = {
            "neutral": "په طبيعي انداز",
            "joyful": "په خوشحال رنګ",
            "sorrowful": "په غمجن اهنګ",
            "proud": "په ویاړلي انداز",
        }.get(emotion)
        return f"{text} ({emotional_suffix})" if emotional_suffix else text

    def generate_synthetic_speech(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        speed: float,
        emotion: str,
    ) -> Tuple[np.ndarray, int]:
        """Synthesise a voiced placeholder tone whose length follows the text.

        The tone is a six-harmonic stack with a slow vibrato, a syllable-rate
        amplitude envelope, an exponential fade and a little Gaussian noise.

        Args:
            text: Only its length is used, to set the duration (1.5-18 s).
            dialect: Dialect display name; selects a small pitch offset.
            voice_type: Internal voice key; selects the base pitch.
            speed: Speaking-rate multiplier; values below 0.1 are treated as 0.1.
            emotion: Internal emotion key; selects a pitch offset.

        Returns:
            ``(audio, sample_rate)`` with float32 audio in ``[-1, 1]`` at 24 kHz.
        """
        sample_rate = 24000
        safe_speed = max(speed, 0.1)
        duration = max(1.5, min(len(text) * 0.11 / safe_speed, 18.0))
        timeline = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)

        base_freq = {"female": 210, "child": 280, "elder": 105, "male": 130}.get(voice_type, 140)
        dialect_shift = {
            "کندهاري (Kandahari)": -5,
            "پکتياوي (Paktiawal)": 7,
            "پېښوري (Peshawri)": 13,
            "مزارۍ (Mazari)": 3,
            "هراتۍ (Herati)": -2,
            "ننګرهاري (Nangarhari)": 8,
        }.get(dialect, 0)
        emotion_shift = {"neutral": 0, "joyful": 16, "sorrowful": -10, "proud": 9}.get(emotion, 0)
        modulation = 18 * np.sin(2 * math.pi * 0.42 * timeline)
        frequency = base_freq + dialect_shift + emotion_shift + modulation

        # The pitch changes over time, so the oscillator phase has to be the
        # running integral of the frequency. Multiplying the frequency by the
        # elapsed time instead makes the vibrato depth grow with t.
        phase = 2 * math.pi * np.cumsum(frequency) / sample_rate

        audio = np.zeros_like(timeline)
        for harmonic in range(1, 7):
            audio += (1 / harmonic) * np.sin(harmonic * phase)

        syllable_envelope = 0.6 + 0.4 * np.sin(2 * math.pi * (2.0 * safe_speed) * timeline) ** 2
        fade = np.exp(-timeline / (3.8 / safe_speed))
        breath = np.random.normal(0, 0.008, timeline.shape)
        audio = np.clip(audio * syllable_envelope * fade * 0.24 + breath, -1.0, 1.0)
        return audio.astype(np.float32), sample_rate

    def basic_speech_recognition(self, waveform: np.ndarray, dialect: str) -> Dict[str, Any]:
        """Return a canned transcript, extended by signal level and dialect.

        The audio content is not decoded; only its mean absolute level and
        the dialect name influence the returned text.
        """
        energy = float(np.mean(np.abs(waveform))) if waveform.size else 0.0
        transcript = "دا يو پښتو متن دی چې د وينا پېژندنې له لارې ترلاسه شوی"
        if energy > 0.06:
            transcript += " او غږ يې روښانه دی"
        if dialect == "کندهاري (Kandahari)":
            transcript += " د کندهارۍ رنګ سره"
        elif dialect == "پکتياوي (Paktiawal)":
            transcript += " د پکتياوي انداز سره"
        elif dialect == "هراتۍ (Herati)":
            transcript += " د هراتي نرمۍ سره"
        return {"text": transcript, "confidence": 0.85}

    def apply_dialectal_corrections(self, text: str, dialect: str) -> str:
        """Apply the dialect's post-recognition spelling substitutions to ``text``."""
        corrections = {
            "کندهاري (Kandahari)": {"شګ": "ښګ", "ژګ": "ږګ"},
            "پکتياوي (Paktiawal)": {"نګ": "ڼګ"},
            "پېښوري (Peshawri)": {"ښ": "ش"},
        }
        corrected = text
        for wrong, correct in corrections.get(dialect, {}).items():
            corrected = corrected.replace(wrong, correct)
        return corrected

    def extract_comprehensive_cultural_markers(self, text: str) -> Dict[str, List[str]]:
        """Delegate to the cultural processor's term analysis."""
        return self.cultural_processor.analyze_text(text)

    def get_pronunciation_notes(self, text: str, dialect: str) -> List[str]:
        """Return the dialect's pronunciation guide plus notes on marker letters.

        An unknown dialect contributes no guide line instead of raising.
        """
        info = AFGHAN_PASHTO_DIALECTS.get(dialect)
        notes = [info["pronunciation_guide"]] if info else []
        if "ښ" in text:
            notes.append("Text contains ښ, which is one of the key dialect markers.")
        if "ږ" in text:
            notes.append("Text contains ږ, so dialect-specific realization matters here.")
        return notes

    def extract_authentic_voice_features(self, waveform: np.ndarray, sample_rate: int, dialect: str) -> Dict[str, Any]:
        """Derive coarse voice features from the audio statistics.

        Returns:
            A dict with ``pitch_band``, ``energy``, ``accent`` (the dialect's
            traditional name, or ``"unknown"`` for an unknown dialect) and
            ``quality`` (``"clear"`` above 0.04 energy, else ``"soft"``).
        """
        stats = self.audio_processor.analyze_audio(waveform, sample_rate)
        info = AFGHAN_PASHTO_DIALECTS.get(dialect)
        return {
            "pitch_band": stats["pitch_band"],
            "energy": stats["energy"],
            "accent": info["traditional_name"] if info else "unknown",
            "quality": "clear" if stats["energy"] > 0.04 else "soft",
        }

    def basic_voice_cloning(self, text: str, voice_features: Dict[str, Any], dialect: str) -> Tuple[np.ndarray, int]:
        """Synthesise ``text`` with a voice type chosen from the features.

        A high pitch band selects ``"female"``; otherwise an ``age_profile``
        of ``"elder"`` selects ``"elder"``; anything else selects ``"male"``.
        Speed is fixed at 1.0 and the emotion at ``"neutral"``.
        """
        pitch_band = voice_features.get("pitch_band", "mid")
        if pitch_band == "high":
            voice_type = "female"
        elif voice_features.get("age_profile") == "elder":
            voice_type = "elder"
        else:
            voice_type = "male"
        return self.generate_synthetic_speech(text, dialect, voice_type, 1.0, "neutral")

    def resolve_voice_model(self, dialect: str, voice_type: str) -> str:
        """Return the model file name for a dialect and voice type.

        Voice types missing from the dialect's table get the generated name
        ``<code>_<voice_type>.pth``. An unknown dialect falls back to the
        base TTS placeholder name, matching how the rest of the pipeline
        tolerates unknown dialects.
        """
        info = AFGHAN_PASHTO_DIALECTS.get(dialect)
        if info is None:
            return self.load_tts_model()["model"]
        return info["voice_models"].get(voice_type, f"{info['code']}_{voice_type}.pth")
|
| 432 |
+
|
| 433 |
+
|
| 434 |
+
# Module-level processor instance shared by the handler functions defined
# below. Creating it here runs CompleteAfghanPashtoProcessor.__init__ at import time.
processor = CompleteAfghanPashtoProcessor()
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def format_dialect_summary(dialect: str) -> str:
    """Render the catalogue entry for ``dialect`` as a seven-line text summary.

    Raises:
        KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``.
    """
    entry = AFGHAN_PASHTO_DIALECTS[dialect]
    model_files = ", ".join(entry["voice_models"].values())
    traits = ", ".join(entry["characteristics"])
    rows = [
        f"لهجه: {dialect}",
        f"کوډ: {entry['code']}",
        f"سيمه: {entry['region']}",
        f"دوديز نوم: {entry['traditional_name']}",
        f"اواز ماډلونه: {model_files}",
        f"تلفظ: {entry['pronunciation_guide']}",
        f"ځانګړنې: {traits}",
    ]
    return "\n".join(rows)
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
def format_context_summary(context_name: str) -> str:
    """Render the cultural-context entry as a three-line text summary.

    Raises:
        KeyError: If ``context_name`` is not a key of ``CULTURAL_CONTEXTS``.
    """
    entry = CULTURAL_CONTEXTS[context_name]
    sample_list = ", ".join(entry["examples"])
    rows = [
        f"تشريح: {entry['description']}",
        f"بېلګې: {sample_list}",
        f"اواز سبک: {entry['voice_style']}",
    ]
    return "\n".join(rows)
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
def phoneme_markdown() -> str:
    """Render ``COMPLETE_PHONEMES`` as Markdown: a heading per category, a bullet per letter."""

    def describe(symbol: str, data: Dict[str, Any]) -> str:
        # One bullet: letter, IPA, description and dialect variants ("—" if none).
        variants = data.get("dialects", {})
        if variants:
            variant_text = ", ".join(f"{key}: {value}" for key, value in variants.items())
        else:
            variant_text = "—"
        return f"- **{symbol}** · IPA `{data['ipa']}` · {data['description']} · Dialects: {variant_text}"

    rendered: List[str] = []
    for category, items in COMPLETE_PHONEMES.items():
        rendered.append(f"### {category}")
        rendered.extend(describe(symbol, data) for symbol, data in items.items())
    return "\n".join(rendered)
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def available_examples() -> List[List[Any]]:
    """Return three sample input rows, freshly built on every call.

    Each row lists text, dialect, voice label, context, emotion label and
    speed, the same order as ``generate_voice``'s parameters.
    """
    patriotic: List[Any] = [
        "زما وطن د وياړ کور دی او پښتونولي زموږ د ژوند لار ده.",
        "کندهاري (Kandahari)",
        "مشر (Elder Male)",
        "ملي (National)",
        "جګ افتخار (Proud)",
        1.0,
    ]
    hospitality: List[Any] = [
        "مېلمستيا او غيرت زموږ کلتوري ارزښتونه دي.",
        "پکتياوي (Paktiawal)",
        "ځوان (Young Male)",
        "فرهنګي (Cultural)",
        "طبيعي (Natural)",
        1.1,
    ]
    folk_tale: List[Any] = [
        "ښه راغلاست، دا يوه ولسي کيسه ده چې د زاړه وخت ياد راژوندی کوي.",
        "هراتۍ (Herati)",
        "ښځينه (Female)",
        "پېغلوي (Folk Tales)",
        "خوشحال (Joyful)",
        0.9,
    ]
    return [patriotic, hospitality, folk_tale]
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def generate_voice(text: str, dialect: str, voice_label: str, context_name: str, emotion_label: str, speed: float):
    """Synthesise speech for ``text`` and build the accompanying text reports.

    Args:
        text: Text to speak.
        dialect: Dialect display name (key of ``AFGHAN_PASHTO_DIALECTS``).
        voice_label: Voice label (key of ``VOICE_TYPE_MODEL_MAP``).
        context_name: Cultural context name (key of ``CULTURAL_CONTEXTS``).
        emotion_label: Emotion label (key of ``EMOTION_MAP``).
        speed: Speaking-rate multiplier.

    Returns:
        ``((sample_rate, audio), info, dialect_summary, context_summary,
        markers_text)``.

    Raises:
        gr.Error: If ``text`` is missing or blank.
    """
    if not text or not text.strip():
        raise gr.Error("مهرباني وکړئ پښتو متن وليکئ / Please enter Pashto text.")

    voice_type = VOICE_TYPE_MODEL_MAP[voice_label]
    emotion = EMOTION_MAP[emotion_label]
    audio, sample_rate, metadata = processor.process_authentic_tts(text, dialect, voice_type, context_name, emotion, speed)
    info = (
        f"Model: {metadata['model']}\n"
        f"Device: {metadata['device']}\n"
        f"Dialectal text: {metadata['dialectal_text']}\n"
        f"Contextualized text: {metadata['contextualized_text']}\n"
        f"Emotional text: {metadata['emotional_text']}"
    )
    # Scan the user's own wording. The processed text has dialect letter
    # substitutions applied (e.g. ښ -> خ) and suffixes appended, which
    # would hide terms the user actually wrote.
    context_analysis = processor.extract_comprehensive_cultural_markers(text)
    marker_lines = [f"{key}: {', '.join(values)}" for key, values in context_analysis.items() if values]
    markers_text = "\n".join(marker_lines) if marker_lines else "No explicit cultural markers detected yet."
    return (sample_rate, audio), info, format_dialect_summary(dialect), format_context_summary(context_name), markers_text
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def recognize_speech(audio_input, dialect: str):
    """Run recognition on recorded audio and format the result for display.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple, or ``None`` when
            nothing was recorded or uploaded.
        dialect: Dialect display name.

    Returns:
        ``(summary, cultural_markers, pronunciation_notes, markers_text)``.

    Raises:
        gr.Error: If ``audio_input`` is ``None``.
    """
    # Report missing audio as a user-facing error, the same way generate_voice
    # reports missing text, rather than letting a ValueError escape.
    if audio_input is None:
        raise gr.Error("Please record or upload Pashto speech first.")

    result = processor.process_authentic_asr(audio_input, dialect)
    summary = (
        f"Transcription: {result['text']}\n"
        f"Confidence: {result['confidence']:.0%}\n"
        f"Duration: {result['audio_stats']['duration_seconds']} seconds\n"
        f"Energy: {result['audio_stats']['energy']}\n"
        f"Pitch band: {result['audio_stats']['pitch_band']}"
    )
    markers = [f"{key}: {', '.join(values)}" for key, values in result['cultural_markers'].items() if values]
    markers_text = "\n".join(markers) if markers else "No cultural markers detected."
    return summary, result["cultural_markers"], "\n".join(result["pronunciation_notes"]), markers_text
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def clone_voice(reference_audio, text: str, dialect: str, age_profile: str, style_profile: str):
    """Clone a voice from reference audio and format the extracted features.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple, or ``None`` when no
            reference was supplied.
        text: Target text to speak.
        dialect: Dialect display name.
        age_profile: Passed to the processor as the ``age_profile`` feature.
        style_profile: Passed to the processor as the ``style_profile`` feature.

    Returns:
        ``((sample_rate, audio), feature_lines, dialect_summary)``.

    Raises:
        gr.Error: If the text is blank or the reference audio is missing.
    """
    if not text or not text.strip():
        raise gr.Error("Please provide target text for cloning.")
    # Report missing audio as a user-facing error, like the text check above,
    # rather than letting the processor's ValueError escape.
    if reference_audio is None:
        raise gr.Error("Reference audio is required for voice cloning.")

    audio, sample_rate, features = processor.process_voice_cloning(
        reference_audio,
        text,
        dialect,
        {"age_profile": age_profile, "style_profile": style_profile},
    )
    feature_lines = "\n".join(f"{key}: {value}" for key, value in features.items())
    return (sample_rate, audio), feature_lines, format_dialect_summary(dialect)
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
# Stylesheet handed to gr.Blocks: right-to-left Pashto text plus the flag stripe.
_APP_CSS = """
.pashto-text {
    font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq', 'Scheherazade New', serif;
    direction: rtl;
    text-align: right;
}
.afghan-flag {
    background: linear-gradient(to bottom, #000000, #c81818, #0b8f3a);
    height: 20px;
    border-radius: 6px;
    margin: 10px 0;
}
"""

# Page header shown at the top of the interface.
_HEADER_MARKDOWN = """
# 🎙️ Afghan Pashto Voice & Speech Processing Hub
## د افغان پښتو غږيز پروسسنګ مرکز

**Pure Afghan Pashto - له اصل پښتو سره**

Supports: **Kandahari, Paktiawal, Peshawri, Mazari, Herati, Nangarhari, and traditional forms**
"""

# Footer notes shown below the tabs.
_NOTES_MARKDOWN = """
### Notes
- This app is a lightweight, deployable Gradio demo with authentic Afghan Pashto structure and metadata.
- TTS, ASR, and voice cloning are implemented with synthetic placeholder audio logic so the interface runs without large model files.
- You can later replace the placeholder methods with real Pashto TTS, ASR, and cloning checkpoints.
"""


def _authentic_voice_tab(dialect_choices, default_dialect):
    """Lay out the "Authentic Voice" tab and wire its button to ``generate_voice``."""
    with gr.TabItem("🔊 Authentic Voice"):
        with gr.Row():
            with gr.Column():
                authentic_text = gr.Textbox(
                    label="پښتو متن / Pashto Text",
                    placeholder="دلته پښتو متن ولیکئ...",
                    lines=5,
                    elem_classes="pashto-text",
                )
                with gr.Row():
                    authentic_dialect = gr.Dropdown(
                        choices=dialect_choices,
                        value=default_dialect,
                        label="Dialect - لهجه",
                    )
                    authentic_voice = gr.Dropdown(
                        choices=list(VOICE_TYPE_MODEL_MAP.keys()),
                        value="مشر (Elder Male)",
                        label="Voice Type - غږ ډول",
                    )
                with gr.Row():
                    authentic_context = gr.Dropdown(
                        choices=list(CULTURAL_CONTEXTS.keys()),
                        value="ملي (National)",
                        label="Cultural Context - کلتني زمينه",
                    )
                    authentic_emotion = gr.Dropdown(
                        choices=list(EMOTION_MAP.keys()),
                        value="طبيعي (Natural)",
                        label="Emotion - احساس",
                    )
                authentic_speed = gr.Slider(0.6, 1.4, value=1.0, step=0.1, label="Speed - چټکتيا")
                authentic_generate = gr.Button("🎤 Generate Authentic Voice", variant="primary")

                # The same six controls feed both the examples table and the click handler.
                voice_inputs = [
                    authentic_text,
                    authentic_dialect,
                    authentic_voice,
                    authentic_context,
                    authentic_emotion,
                    authentic_speed,
                ]
                gr.Examples(examples=available_examples(), inputs=voice_inputs)
            with gr.Column():
                authentic_output = gr.Audio(label="Generated Afghan Pashto Voice")
                authentic_info = gr.Textbox(label="Voice pipeline details", lines=7)
                authentic_dialect_info = gr.Textbox(label="Dialect knowledge", lines=7)
                authentic_context_info = gr.Textbox(label="Context knowledge", lines=4)
                authentic_markers = gr.Textbox(label="Cultural markers", lines=5)

        authentic_generate.click(
            fn=generate_voice,
            inputs=voice_inputs,
            outputs=[
                authentic_output,
                authentic_info,
                authentic_dialect_info,
                authentic_context_info,
                authentic_markers,
            ],
        )


def _speech_recognition_tab(dialect_choices):
    """Lay out the "Speech Recognition" tab and wire its button to ``recognize_speech``."""
    with gr.TabItem("🎧 Speech Recognition"):
        with gr.Row():
            with gr.Column():
                recognition_audio = gr.Audio(
                    sources=["upload", "microphone"],
                    type="numpy",
                    label="Upload or record Pashto speech",
                )
                recognition_dialect = gr.Dropdown(
                    choices=dialect_choices,
                    value="پکتياوي (Paktiawal)",
                    label="Target dialect",
                )
                recognition_button = gr.Button("📝 Recognize Speech", variant="primary")
            with gr.Column():
                recognition_summary = gr.Textbox(label="Recognition summary", lines=6)
                recognition_context = gr.JSON(label="Cultural context analysis")
                recognition_pronunciation = gr.Textbox(label="Pronunciation notes", lines=4)
                recognition_markers = gr.Textbox(label="Detected markers", lines=4)

        recognition_button.click(
            fn=recognize_speech,
            inputs=[recognition_audio, recognition_dialect],
            outputs=[
                recognition_summary,
                recognition_context,
                recognition_pronunciation,
                recognition_markers,
            ],
        )


def _voice_cloning_tab(dialect_choices):
    """Lay out the "Voice Cloning Demo" tab and wire its button to ``clone_voice``."""
    with gr.TabItem("🧬 Voice Cloning Demo"):
        with gr.Row():
            with gr.Column():
                clone_audio = gr.Audio(
                    sources=["upload", "microphone"],
                    type="numpy",
                    label="Reference Afghan voice",
                )
                clone_text = gr.Textbox(
                    label="Target text",
                    lines=4,
                    placeholder="هغه متن وليکئ چې د هماغه غږ په ډول واورئ...",
                    elem_classes="pashto-text",
                )
                clone_dialect = gr.Dropdown(
                    choices=dialect_choices,
                    value="هراتۍ (Herati)",
                    label="Dialect",
                )
                with gr.Row():
                    clone_age = gr.Dropdown(
                        choices=["youthful", "mature", "elder"],
                        value="mature",
                        label="Age profile",
                    )
                    clone_style = gr.Dropdown(
                        choices=["formal", "storytelling", "poetic", "conversational"],
                        value="storytelling",
                        label="Speaking style",
                    )
                clone_button = gr.Button("🧪 Clone Voice Demo", variant="primary")
            with gr.Column():
                clone_output = gr.Audio(label="Cloned Afghan voice")
                clone_features = gr.Textbox(label="Extracted / merged voice features", lines=8)
                clone_dialect_info = gr.Textbox(label="Dialect profile", lines=7)

        clone_button.click(
            fn=clone_voice,
            inputs=[clone_audio, clone_text, clone_dialect, clone_age, clone_style],
            outputs=[clone_output, clone_features, clone_dialect_info],
        )


def build_app() -> gr.Blocks:
    """Assemble the full Gradio interface and return it without launching.

    The page consists of a header and flag stripe, a dialect overview row
    (dropdown + details textbox next to a runtime info panel), a collapsed
    phoneme guide, three feature tabs (voice generation, speech recognition,
    voice cloning demo), and closing notes.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    dialect_choices = list(AFGHAN_PASHTO_DIALECTS.keys())
    default_dialect = "کندهاري (Kandahari)"

    with gr.Blocks(
        title="🎙️ Afghan Pashto Voice Hub - د افغان پښتو غږيز مرکز",
        theme=gr.themes.Soft(),
        css=_APP_CSS,
    ) as app:
        gr.Markdown(_HEADER_MARKDOWN)
        gr.HTML('<div class="afghan-flag"></div>')

        with gr.Row():
            with gr.Column(scale=2):
                dialect_preview = gr.Dropdown(
                    choices=dialect_choices,
                    value=default_dialect,
                    label="Dialect overview - لهجو کتنه",
                )
                dialect_summary = gr.Textbox(
                    value=format_dialect_summary(default_dialect),
                    label="Dialect details",
                    lines=7,
                )
            with gr.Column(scale=1):
                runtime_info = (
                    "### Runtime\n"
                    f"- Runtime device: {processor.device}\n"
                    "- Models: lightweight placeholder stack"
                )
                gr.Markdown(runtime_info)

        # Refresh the details textbox whenever another dialect is picked.
        dialect_preview.change(
            fn=format_dialect_summary,
            inputs=dialect_preview,
            outputs=dialect_summary,
        )

        with gr.Accordion("Traditional phoneme guide", open=False):
            gr.Markdown(phoneme_markdown(), elem_classes="pashto-text")

        with gr.Tabs():
            _authentic_voice_tab(dialect_choices, default_dialect)
            _speech_recognition_tab(dialect_choices)
            _voice_cloning_tab(dialect_choices)

        gr.Markdown(_NOTES_MARKDOWN)

    return app
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
# Module-level interface instance, built once when this file is imported.
app = build_app()


if __name__ == "__main__":
    # Start the Gradio server only when the file is executed as a script.
    app.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.0.0
|
| 2 |
+
numpy>=1.24.0
|