Spaces:
Sleeping
Sleeping
| """ | |
| COMPLETE Afghan Pashto Voice & Speech Processing Space | |
| Pure Afghan Pashto - له اصل پښتو سره | |
| Author: Afghan Voice Technology Initiative | |
| Version: 2.0 - Lightweight Complete Demo | |
| """ | |
| from __future__ import annotations | |
| import math | |
| from typing import Any, Dict, List, Tuple | |
| import gradio as gr | |
| import numpy as np | |
| try: | |
| import torch | |
| except Exception: # pragma: no cover - optional dependency | |
| torch = None | |
# Registry of the dialects offered by the app, keyed by the bilingual
# "Pashto (English)" label that the dropdowns display.
# Every entry has the same fields:
#   code                - short dialect identifier (e.g. "ps-kan")
#   region              - provinces / cities, written in Pashto
#   characteristics     - short English feature notes
#   traditional_name    - Pashto name, reused as the "accent" feature in cloning
#   voice_models        - model filename per voice type (male / female / elder)
#   pronunciation_guide - one-line guide returned with recognition results
AFGHAN_PASHTO_DIALECTS: Dict[str, Dict[str, Any]] = {
    "کندهاري (Kandahari)": {
        "code": "ps-kan",
        "region": "کندهار، زابل، ارزگان",
        "characteristics": ["Hard ږ (g)", "ښ as خ", "Emphatic consonants", "Traditional poetry"],
        "traditional_name": "کندهاري غه",
        "voice_models": {"male": "kan_male_v2.pth", "female": "kan_female_v2.pth", "elder": "kan_elder_v2.pth"},
        "pronunciation_guide": "ږ = hard 'g', ښ = 'kh', Retroflex sounds preserved",
    },
    "پکتياوي (Paktiawal)": {
        "code": "ps-pak",
        "region": "پکتيا، پکتيکا، خوست",
        "characteristics": ["Retroflex ڼ", "Nasal vowels", "Tribal vocabulary", "Mountain accent"],
        "traditional_name": "پکتياوي خښه",
        "voice_models": {"male": "pak_male_v2.pth", "female": "pak_female_v2.pth", "elder": "pak_elder_v2.pth"},
        "pronunciation_guide": "ڼ = retroflex 'n', Nasalized vowels, Tribal words",
    },
    "پېښوري (Peshawri)": {
        "code": "ps-pes",
        "region": "پېښور، مردان، سوات",
        "characteristics": ["ښ as ش", "Soft ږ (zh)", "Urban vocabulary", "Trade language"],
        "traditional_name": "پېښوري ژبه",
        "voice_models": {"male": "pes_male_v2.pth", "female": "pes_female_v2.pth", "elder": "pes_elder_v2.pth"},
        "pronunciation_guide": "ښ = 'sh', ږ = soft 'zh', Urban expressions",
    },
    "مزارۍ (Mazari)": {
        "code": "ps-maz",
        "region": "مزار شريف، بلخ، جوزجان",
        "characteristics": ["Uzbek influence", "Northern vowels", "Turkic loanwords", "Plains accent"],
        "traditional_name": "مزارۍ غږ",
        "voice_models": {"male": "maz_male_v2.pth", "female": "maz_female_v2.pth", "elder": "maz_elder_v2.pth"},
        "pronunciation_guide": "Uzbek-influenced vowels, Turkic words, Northern tone",
    },
    "هراتۍ (Herati)": {
        "code": "ps-her",
        "region": "هرات، فراه، نيمروز",
        "characteristics": ["Persian influence", "Western vowels", "Herati accent", "Cultural sophistication"],
        "traditional_name": "هراتۍ لهجه",
        "voice_models": {"male": "her_male_v2.pth", "female": "her_female_v2.pth", "elder": "her_elder_v2.pth"},
        "pronunciation_guide": "Persian-influenced sounds, Western vowels, Cultural words",
    },
    "ننګرهاري (Nangarhari)": {
        "code": "ps-nan",
        "region": "جلال اباد، ننګرهار، کنړ",
        "characteristics": ["Eastern dialect", "Khattak influence", "Jalalabad accent", "Border influences"],
        "traditional_name": "ننګرهاري وړاندې",
        "voice_models": {"male": "nan_male_v2.pth", "female": "nan_female_v2.pth", "elder": "nan_elder_v2.pth"},
        "pronunciation_guide": "Eastern sounds, Khattak influence, Border variations",
    },
}
# Cultural framing options, keyed by the bilingual label shown in the
# "Cultural Context" dropdown.
# Fields per entry:
#   description - English summary shown in the context panel
#   examples    - Pashto sample topics shown in the context panel
#   voice_style - English style keywords shown in the context panel
#   suffix      - Pashto phrase appended in parentheses to the text by
#                 CulturalContextProcessor.apply_cultural_context
CULTURAL_CONTEXTS: Dict[str, Dict[str, Any]] = {
    "ملي (National)": {
        "description": "National songs, anthems, patriotic poetry",
        "examples": ["ملي سرود", "وطن شعرونه", "غازي قومي"],
        "voice_style": "proud, formal, clear",
        "suffix": "د ملي غرور سره",
    },
    "قومي (Tribal)": {
        "description": "Tribal traditions, ethnic heritage, clan stories",
        "examples": ["قومي کیسې", "نسب او شجره", "قبیلوي ویاړونه"],
        "voice_style": "traditional, elder-like, respectful",
        "suffix": "د قومي وياړ سره",
    },
    "مذهبي (Religious)": {
        "description": "Religious content, spiritual guidance, Islamic teachings",
        "examples": ["دیني دروس", "اخلاقي کیسې", "روحاني مواعظ"],
        "voice_style": "soft, respectful, spiritual",
        "suffix": "د مذهبي احترام سره",
    },
    "فرهنګي (Cultural)": {
        "description": "Cultural education, traditional values, customs",
        "examples": ["فرهنګي ارزښتونه", "دودونه او دستورونه", "کلتني کیسې"],
        "voice_style": "educational, warm, cultural",
        "suffix": "د فرهنګي ارزښتونو سره",
    },
    "تاريخي (Historical)": {
        "description": "Historical narratives, ancient stories, past events",
        "examples": ["تاريخي کیسې", "پخوانۍ پیښې", "قدیم افسانې"],
        "voice_style": "storyteller, dramatic, engaging",
        "suffix": "د تاريخي روايت په انداز",
    },
    "سنګي (Musical)": {
        "description": "Traditional music, folk songs, cultural rhythms",
        "examples": ["سنګي ملودۍ", "فولکلوري سندرې", "کلاسیکي موسیقي"],
        "voice_style": "melodic, rhythmic, artistic",
        "suffix": "د دوديزې نغمې په رنګ",
    },
    "پېغلوي (Folk Tales)": {
        "description": "Folk tales, traditional stories, cultural narratives",
        "examples": ["پېغلوي کیسې", "افسانوي کیسې", "کلتني حکیات"],
        "voice_style": "storyteller, engaging, traditional",
        "suffix": "د ولسي کيسې له خوند سره",
    },
}
# Phoneme reference rendered by phoneme_markdown().
# Structure: category heading (Pashto) -> letter -> record with
#   symbol      - the letter itself (same as the key)
#   ipa         - IPA transcription, slash-delimited
#   description - English articulatory description
#   dialects    - dialect name -> how that dialect realises the letter;
#                 empty when no dialect-specific variant is listed
COMPLETE_PHONEMES: Dict[str, Dict[str, Dict[str, Any]]] = {
    "پښتني حروف": {
        "ښ": {"symbol": "ښ", "ipa": "/ʂ/", "description": "Voiceless retroflex fricative", "dialects": {"کندهاري": "خ", "پېښوري": "ش"}},
        "ږ": {"symbol": "ږ", "ipa": "/ʐ/", "description": "Voiced retroflex fricative", "dialects": {"کندهاري": "گ", "پېښوري": "ژ"}},
        "ڼ": {"symbol": "ڼ", "ipa": "/ɳ/", "description": "Retroflex nasal", "dialects": {"پکتياوي": "ڼ", "کندهاري": "ن"}},
        "ړ": {"symbol": "ړ", "ipa": "/ɽ/", "description": "Retroflex flap", "dialects": {"ټول": "ړ"}},
        "ټ": {"symbol": "ټ", "ipa": "/ʈ/", "description": "Voiceless retroflex stop", "dialects": {"ټول": "ټ"}},
        "ډ": {"symbol": "ډ", "ipa": "/ɖ/", "description": "Voiced retroflex stop", "dialects": {"ټول": "ډ"}},
    },
    "عربي حروف": {
        "ص": {"symbol": "ص", "ipa": "/sˤ/", "description": "Emphatic voiceless alveolar fricative", "dialects": {}},
        "ض": {"symbol": "ض", "ipa": "/dˤ/", "description": "Emphatic voiced alveolar stop", "dialects": {}},
        "ط": {"symbol": "ط", "ipa": "/tˤ/", "description": "Emphatic voiceless alveolar stop", "dialects": {}},
        "ظ": {"symbol": "ظ", "ipa": "/zˤ/", "description": "Emphatic voiced alveolar fricative", "dialects": {}},
    },
    "ويي": {
        "ا": {"symbol": "ا", "ipa": "/a/", "description": "Open front vowel", "dialects": {}},
        "ې": {"symbol": "ې", "ipa": "/e/", "description": "Close-mid front vowel", "dialects": {}},
        "ۍ": {"symbol": "ۍ", "ipa": "/ei/", "description": "Diphthong", "dialects": {}},
        "و": {"symbol": "و", "ipa": "/o/", "description": "Close-mid back rounded vowel", "dialects": {}},
        "ۀ": {"symbol": "ۀ", "ipa": "/ə/", "description": "Schwa", "dialects": {}},
    },
}
# Maps the bilingual "Voice Type" dropdown label to the internal voice key
# used for pitch selection and voice-model lookup.
# NOTE(review): "child" has no entry in any dialect's "voice_models" table, so
# CompleteAfghanPashtoProcessor.resolve_voice_model falls back to a generated
# "<code>_child.pth" name for it.
VOICE_TYPE_MODEL_MAP = {
    "مشر (Elder Male)": "elder",
    "ځوان (Young Male)": "male",
    "ښځينه (Female)": "female",
    "وړکتي (Child)": "child",
}
# Maps the bilingual "Emotion" dropdown label to the internal emotion key
# consumed by apply_emotional_coloring and generate_synthetic_speech.
EMOTION_MAP = {
    "طبيعي (Natural)": "neutral",
    "خوشحال (Joyful)": "joyful",
    "غميز (Sorrowful)": "sorrowful",
    "جګ افتخار (Proud)": "proud",
}
class AudioProcessor:
    """Normalise raw audio buffers and derive coarse signal statistics."""

    def preprocess_audio(self, audio_input: Tuple[int, np.ndarray] | np.ndarray | None) -> np.ndarray:
        """Return a mono, peak-normalised float32 waveform.

        Args:
            audio_input: Either a ``(sample_rate, samples)`` tuple or a bare
                sample array. Multi-channel data is expected with samples on
                the first axis (``(samples, channels)``).

        Returns:
            A 1-D float32 array scaled so its largest absolute sample is 1.0.
            All-zero or empty input is returned unscaled.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
        """
        if audio_input is None:
            raise ValueError("No audio input was provided.")
        if isinstance(audio_input, tuple):
            _, waveform = audio_input
        else:
            waveform = audio_input

        waveform = np.asarray(waveform, dtype=np.float32).squeeze()
        if waveform.ndim > 1:
            # Down-mix every non-time axis at once; for the usual
            # (samples, channels) layout this equals mean(axis=1).
            waveform = waveform.reshape(waveform.shape[0], -1).mean(axis=1)
        # squeeze() turns a single-sample buffer into a 0-d scalar; keep it 1-D.
        waveform = np.atleast_1d(waveform)
        # NaN/inf samples would make the peak NaN, silently skipping the
        # normalisation and poisoning every statistic computed afterwards.
        waveform = np.nan_to_num(waveform, nan=0.0, posinf=0.0, neginf=0.0)

        peak = float(np.max(np.abs(waveform))) if waveform.size else 0.0
        if peak > 0:
            waveform = waveform / peak
        return waveform

    def analyze_audio(self, waveform: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Summarise a waveform as duration, mean energy and a rough pitch band.

        Args:
            waveform: Sample array.
            sample_rate: Samples per second; values below 1 are treated as 1
                so the duration never divides by zero.

        Returns:
            A dict with ``duration_seconds`` (rounded to 2 places), ``energy``
            (mean absolute amplitude, rounded to 4 places) and ``pitch_band``
            (``"high"``, ``"mid"``, ``"low"``, or ``"unknown"`` for empty input).
        """
        if waveform.size == 0:
            return {"duration_seconds": 0.0, "energy": 0.0, "pitch_band": "unknown"}

        energy = float(np.mean(np.abs(waveform)))
        if waveform.size > 1:
            # Fraction of neighbouring sample pairs whose sign differs.
            negative = np.signbit(waveform)
            zero_crossing = float(np.mean(negative[1:] != negative[:-1]))
        else:
            zero_crossing = 0.0

        if zero_crossing > 0.12:
            pitch_band = "high"
        elif zero_crossing > 0.05:
            pitch_band = "mid"
        else:
            pitch_band = "low"

        return {
            "duration_seconds": round(waveform.size / max(sample_rate, 1), 2),
            "energy": round(energy, 4),
            "pitch_band": pitch_band,
        }
class CulturalContextProcessor:
    """Decorate text with a cultural suffix and spot known cultural vocabulary."""

    # Vocabulary lists scanned by analyze_text (plain substring matching).
    tribal_terms = ["احمدزي", "محسود", "خټک", "یوسفزي", "دواني", "ننګيال"]
    cultural_terms = ["پښتونولي", "مېلمستيا", "ننګ", "غيرت", "توره", "نګاه"]
    traditional_expressions = ["ښه راغلاست", "په خير", "الله دې مل شه", "ستړی مه شې"]
    honorifics = ["صاحب", "ملا", "خان", "استاد"]

    def apply_cultural_context(self, text: str, context: str) -> str:
        """Append the context's suffix in parentheses; unknown contexts leave text as is."""
        entry = CULTURAL_CONTEXTS.get(context)
        if entry:
            return f"{text} ({entry['suffix']})"
        return text

    def analyze_text(self, text: str) -> Dict[str, List[str]]:
        """Report which vocabulary entries occur in ``text``, grouped by category.

        Each category keeps the order of its vocabulary list; a term counts as
        present when it appears anywhere in ``text`` as a substring.
        """
        vocabularies = (
            ("tribal_references", self.tribal_terms),
            ("cultural_concepts", self.cultural_terms),
            ("traditional_expressions", self.traditional_expressions),
            ("honorifics", self.honorifics),
        )
        found: Dict[str, List[str]] = {}
        for category, vocabulary in vocabularies:
            found[category] = [entry for entry in vocabulary if entry in text]
        return found
class CompleteAfghanPashtoProcessor:
    """Lightweight stand-in for a Pashto TTS / ASR / voice-cloning pipeline.

    No model files are loaded: the ``load_*`` methods return descriptive
    placeholders, "speech" is a synthesised harmonic tone and recognition
    returns a canned transcript adjusted by signal energy and dialect.
    """

    def __init__(self) -> None:
        """Select the compute device and build the placeholder registries."""
        # torch is an optional import; without it the device is always "cpu".
        self.device = "cuda" if torch is not None and torch.cuda.is_available() else "cpu"
        self.models: Dict[str, Any] = {}
        self.audio_processor = AudioProcessor()
        self.cultural_processor = CulturalContextProcessor()
        self.dialect_rules = self.load_dialect_rules()
        self.load_all_models()

    def load_all_models(self) -> None:
        """Populate ``self.models`` with the TTS, ASR and cloning placeholders."""
        self.models = {
            "tts": {"base": self.load_tts_model(), "dialects": self.load_dialectal_tts_models()},
            "asr": {"base": self.load_asr_model(), "dialectal": self.load_dialectal_asr_models()},
            "voice_clone": self.load_voice_cloning_model(),
        }

    def load_tts_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the base TTS model."""
        return {"model": "base_tts", "status": "placeholder"}

    def load_dialectal_tts_models(self) -> Dict[str, Dict[str, str]]:
        """Return each dialect's ``voice_models`` table keyed by dialect label."""
        return {dialect: info["voice_models"] for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_asr_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the base ASR model."""
        return {"model": "base_asr", "status": "placeholder"}

    def load_dialectal_asr_models(self) -> Dict[str, str]:
        """Return a ``"<code>_asr"`` name for every dialect."""
        return {dialect: f"{info['code']}_asr" for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_voice_cloning_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the voice-cloning model."""
        return {"model": "voice_clone", "status": "placeholder"}

    def load_dialect_rules(self) -> Dict[str, Dict[str, Dict[str, Any]]]:
        """Return the per-dialect rule tables.

        The result has three sections (``pronunciation``, ``vocabulary``,
        ``grammar``), each keyed by dialect label. In ``pronunciation``,
        string values are letter substitutions; boolean values are flags.
        """
        return {
            "pronunciation": {
                "کندهاري (Kandahari)": {"ښ": "خ", "ږ": "گ", "emphatic_consonants": True},
                "پکتياوي (Paktiawal)": {"ڼ": "ڼ", "nasal_vowels": True, "tribal_pronunciation": True},
                "پېښوري (Peshawri)": {"ښ": "ش", "ږ": "ژ", "urban_influence": True},
                "هراتۍ (Herati)": {"ښ": "خ", "ږ": "گ", "western_vowels": True},
            },
            "vocabulary": {
                "کندهاري (Kandahari)": {"traditional_words": ["غه", "خه", "ګه"], "poetic_expressions": True},
                "پکتياوي (Paktiawal)": {"tribal_words": ["خېل", "قوم", "نګهبان"], "mountain_vocabulary": True},
                "هراتۍ (Herati)": {"persian_loanwords": True, "cultural_terms": ["فرهنګ", "تمدن", "ادب"]},
            },
            "grammar": {
                "ننګرهاري (Nangarhari)": {"eastern_constructions": True, "border_influences": True},
                "مزارۍ (Mazari)": {"uzbek_influence": True, "northern_constructions": True},
            },
        }

    def process_authentic_tts(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        context: str,
        emotion: str,
        speed: float,
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Run the text through dialect, context and emotion steps, then synthesise.

        Returns:
            ``(audio, sample_rate, metadata)`` where ``metadata`` holds the
            text after each step plus the resolved voice-model name and device.

        Raises:
            KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``
                (raised while resolving the voice model).
        """
        dialectal_text = self.apply_comprehensive_dialect_rules(text, dialect)
        contextualized_text = self.cultural_processor.apply_cultural_context(dialectal_text, context)
        emotional_text = self.apply_emotional_coloring(contextualized_text, emotion)
        audio, sample_rate = self.generate_synthetic_speech(emotional_text, dialect, voice_type, speed, emotion)
        return audio, sample_rate, {
            "dialectal_text": dialectal_text,
            "contextualized_text": contextualized_text,
            "emotional_text": emotional_text,
            "model": self.resolve_voice_model(dialect, voice_type),
            "device": self.device,
        }

    def process_authentic_asr(self, audio_input: Tuple[int, np.ndarray] | None, dialect: str) -> Dict[str, Any]:
        """Produce a placeholder transcription with audio and cultural analysis.

        Args:
            audio_input: ``(sample_rate, samples)`` tuple.
            dialect: Dialect label; must be a key of ``AFGHAN_PASHTO_DIALECTS``.

        Returns:
            A dict with ``text``, ``confidence``, ``dialect``, ``audio_stats``,
            ``cultural_markers`` and ``pronunciation_notes``.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
            KeyError: If ``dialect`` is unknown.
        """
        if audio_input is None:
            raise ValueError("Please record or upload Pashto speech first.")
        sample_rate, waveform = audio_input
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        result = self.basic_speech_recognition(processed_audio, dialect)
        corrected_text = self.apply_dialectal_corrections(result["text"], dialect)
        cultural_info = self.extract_comprehensive_cultural_markers(corrected_text)
        audio_stats = self.audio_processor.analyze_audio(processed_audio, sample_rate)
        return {
            "text": corrected_text,
            "confidence": result.get("confidence", 0.85),
            "dialect": dialect,
            "audio_stats": audio_stats,
            "cultural_markers": cultural_info,
            "pronunciation_notes": self.get_pronunciation_notes(corrected_text, dialect),
        }

    def process_voice_cloning(
        self,
        reference_audio: Tuple[int, np.ndarray] | None,
        text: str,
        dialect: str,
        voice_characteristics: Dict[str, str],
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Synthesise ``text`` using features measured from a reference recording.

        Features extracted from the reference are merged with
        ``voice_characteristics``; on key clashes the caller's values win.

        Returns:
            ``(audio, sample_rate, merged_features)``.

        Raises:
            ValueError: If ``reference_audio`` is ``None``.
            KeyError: If ``dialect`` is unknown.
        """
        if reference_audio is None:
            raise ValueError("Reference audio is required for voice cloning.")
        sample_rate, waveform = reference_audio
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        features = self.extract_authentic_voice_features(processed_audio, sample_rate, dialect)
        merged_features = {**features, **voice_characteristics}
        cloned_audio, cloned_rate = self.basic_voice_cloning(text, merged_features, dialect)
        return cloned_audio, cloned_rate, merged_features

    def apply_comprehensive_dialect_rules(self, text: str, dialect: str) -> str:
        """Apply the dialect's letter substitutions and vocabulary swaps to ``text``.

        Dialects without rules leave the text unchanged.
        """
        pronunciation_rules = self.dialect_rules.get("pronunciation", {}).get(dialect, {})
        vocabulary_rules = self.dialect_rules.get("vocabulary", {}).get(dialect, {})
        transformed = text
        for original, replacement in pronunciation_rules.items():
            # Boolean entries are feature flags, not substitutions.
            if isinstance(replacement, str):
                transformed = transformed.replace(original, replacement)
        if vocabulary_rules.get("poetic_expressions") and "وطن" in transformed:
            transformed = transformed.replace("وطن", "پلرنی وطن")
        if vocabulary_rules.get("persian_loanwords") and "کلتور" in transformed:
            transformed = transformed.replace("کلتور", "فرهنګ")
        return transformed

    def apply_emotional_coloring(self, text: str, emotion: str) -> str:
        """Append a parenthesised Pashto emotion phrase; unknown emotions add nothing."""
        emotional_suffix = {
            "neutral": "په طبيعي انداز",
            "joyful": "په خوشحال رنګ",
            "sorrowful": "په غمجن اهنګ",
            "proud": "په ویاړلي انداز",
        }.get(emotion)
        return f"{text} ({emotional_suffix})" if emotional_suffix else text

    def generate_synthetic_speech(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        speed: float,
        emotion: str,
    ) -> Tuple[np.ndarray, int]:
        """Synthesise a placeholder voice: a six-harmonic tone with slow vibrato.

        Length scales with ``len(text)`` and inversely with ``speed``, clamped
        to 1.5-18 seconds. Pitch is set by ``voice_type`` and nudged by
        ``dialect`` and ``emotion``; unknown values fall back to defaults.
        Low-level Gaussian noise is added, so the output is not deterministic
        unless NumPy's global random state is seeded.

        Returns:
            ``(audio, sample_rate)`` with float32 audio in ``[-1, 1]`` at 24 kHz.
        """
        sample_rate = 24000
        safe_speed = max(speed, 0.1)  # guards the divisions below against 0 / negative speed
        duration = max(1.5, min(len(text) * 0.11 / safe_speed, 18.0))
        timeline = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
        base_freq = {"female": 210, "child": 280, "elder": 105, "male": 130}.get(voice_type, 140)
        dialect_shift = {
            "کندهاري (Kandahari)": -5,
            "پکتياوي (Paktiawal)": 7,
            "پېښوري (Peshawri)": 13,
            "مزارۍ (Mazari)": 3,
            "هراتۍ (Herati)": -2,
            "ننګرهاري (Nangarhari)": 8,
        }.get(dialect, 0)
        emotion_shift = {"neutral": 0, "joyful": 16, "sorrowful": -10, "proud": 9}.get(emotion, 0)
        modulation = 18 * np.sin(2 * math.pi * 0.42 * timeline)
        frequency = base_freq + dialect_shift + emotion_shift + modulation
        # Phase is the running integral of instantaneous frequency. Multiplying
        # a time-varying frequency by t instead adds a t * f'(t) term, which
        # makes the pitch swing by hundreds of Hz late in the clip.
        phase = 2 * math.pi * np.cumsum(frequency) / sample_rate
        audio = np.zeros_like(timeline)
        for harmonic in range(1, 7):
            audio += (1 / harmonic) * np.sin(harmonic * phase)
        syllable_envelope = 0.6 + 0.4 * np.sin(2 * math.pi * (2.0 * speed) * timeline) ** 2
        fade = np.exp(-timeline / (3.8 / safe_speed))
        breath = np.random.normal(0, 0.008, timeline.shape)
        audio = np.clip(audio * syllable_envelope * fade * 0.24 + breath, -1.0, 1.0)
        return audio.astype(np.float32), sample_rate

    def basic_speech_recognition(self, waveform: np.ndarray, dialect: str) -> Dict[str, Any]:
        """Return a canned Pashto transcript extended by energy and dialect.

        The audio content is not decoded; only its mean absolute amplitude is
        used. Confidence is fixed at 0.85.
        """
        energy = float(np.mean(np.abs(waveform))) if waveform.size else 0.0
        transcript = "دا يو پښتو متن دی چې د وينا پېژندنې له لارې ترلاسه شوی"
        if energy > 0.06:
            transcript += " او غږ يې روښانه دی"
        if dialect == "کندهاري (Kandahari)":
            transcript += " د کندهارۍ رنګ سره"
        elif dialect == "پکتياوي (Paktiawal)":
            transcript += " د پکتياوي انداز سره"
        elif dialect == "هراتۍ (Herati)":
            transcript += " د هراتي نرمۍ سره"
        return {"text": transcript, "confidence": 0.85}

    def apply_dialectal_corrections(self, text: str, dialect: str) -> str:
        """Apply the dialect's substring replacements to recognised text.

        Dialects without a correction table leave the text unchanged.
        """
        corrections = {
            "کندهاري (Kandahari)": {"شګ": "ښګ", "ژګ": "ږګ"},
            "پکتياوي (Paktiawal)": {"نګ": "ڼګ"},
            "پېښوري (Peshawri)": {"ښ": "ش"},
        }
        corrected = text
        for wrong, correct in corrections.get(dialect, {}).items():
            corrected = corrected.replace(wrong, correct)
        return corrected

    def extract_comprehensive_cultural_markers(self, text: str) -> Dict[str, List[str]]:
        """Delegate to ``CulturalContextProcessor.analyze_text``."""
        return self.cultural_processor.analyze_text(text)

    def get_pronunciation_notes(self, text: str, dialect: str) -> List[str]:
        """Return the dialect's guide plus notes for ښ / ږ when they occur in ``text``.

        Raises:
            KeyError: If ``dialect`` is unknown.
        """
        notes = [AFGHAN_PASHTO_DIALECTS[dialect]["pronunciation_guide"]]
        if "ښ" in text:
            notes.append("Text contains ښ, which is one of the key dialect markers.")
        if "ږ" in text:
            notes.append("Text contains ږ, so dialect-specific realization matters here.")
        return notes

    def extract_authentic_voice_features(self, waveform: np.ndarray, sample_rate: int, dialect: str) -> Dict[str, Any]:
        """Derive pitch band, energy, accent name and a clear/soft quality label.

        Raises:
            KeyError: If ``dialect`` is unknown.
        """
        stats = self.audio_processor.analyze_audio(waveform, sample_rate)
        return {
            "pitch_band": stats["pitch_band"],
            "energy": stats["energy"],
            "accent": AFGHAN_PASHTO_DIALECTS[dialect]["traditional_name"],
            "quality": "clear" if stats["energy"] > 0.04 else "soft",
        }

    def basic_voice_cloning(self, text: str, voice_features: Dict[str, Any], dialect: str) -> Tuple[np.ndarray, int]:
        """Pick a voice type from the features and synthesise ``text`` with it.

        A high pitch band selects "female"; otherwise an "elder" age profile
        selects "elder"; everything else selects "male". Speed is fixed at 1.0
        and emotion at "neutral".
        """
        pitch_band = voice_features.get("pitch_band", "mid")
        voice_type = "female" if pitch_band == "high" else "elder" if voice_features.get("age_profile") == "elder" else "male"
        return self.generate_synthetic_speech(text, dialect, voice_type, 1.0, "neutral")

    def resolve_voice_model(self, dialect: str, voice_type: str) -> str:
        """Return the model filename for ``voice_type`` in ``dialect``.

        Voice types missing from the dialect's table get a generated
        ``"<code>_<voice_type>.pth"`` name.

        Raises:
            KeyError: If ``dialect`` is unknown.
        """
        model_map = AFGHAN_PASHTO_DIALECTS[dialect]["voice_models"]
        return model_map.get(voice_type, f"{AFGHAN_PASHTO_DIALECTS[dialect]['code']}_{voice_type}.pth")
# Module-level pipeline instance shared by every UI callback below; it is
# built at import time.
processor = CompleteAfghanPashtoProcessor()
def format_dialect_summary(dialect: str) -> str:
    """Render one dialect's registry entry as a multi-line Pashto-labelled summary.

    Raises:
        KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``.
    """
    entry = AFGHAN_PASHTO_DIALECTS[dialect]
    model_names = ", ".join(entry["voice_models"].values())
    traits = ", ".join(entry["characteristics"])
    summary_lines = [
        f"لهجه: {dialect}",
        f"کوډ: {entry['code']}",
        f"سيمه: {entry['region']}",
        f"دوديز نوم: {entry['traditional_name']}",
        f"اواز ماډلونه: {model_names}",
        f"تلفظ: {entry['pronunciation_guide']}",
        f"ځانګړنې: {traits}",
    ]
    return "\n".join(summary_lines)
def format_context_summary(context_name: str) -> str:
    """Render a cultural context's description, examples and voice style.

    Raises:
        KeyError: If ``context_name`` is not a key of ``CULTURAL_CONTEXTS``.
    """
    entry = CULTURAL_CONTEXTS[context_name]
    sample_topics = ", ".join(entry["examples"])
    summary_lines = [
        f"تشريح: {entry['description']}",
        f"بېلګې: {sample_topics}",
        f"اواز سبک: {entry['voice_style']}",
    ]
    return "\n".join(summary_lines)
def phoneme_markdown() -> str:
    """Render ``COMPLETE_PHONEMES`` as Markdown: a heading per category, a bullet per letter."""

    def describe_variants(variants: Dict[str, Any]) -> str:
        # An em dash stands in when a letter lists no dialect variants.
        if not variants:
            return "—"
        return ", ".join(f"{name}: {form}" for name, form in variants.items())

    rendered: List[str] = []
    for heading, letters in COMPLETE_PHONEMES.items():
        rendered.append(f"### {heading}")
        rendered.extend(
            f"- **{letter}** · IPA `{record['ipa']}` · {record['description']} · "
            f"Dialects: {describe_variants(record.get('dialects', {}))}"
            for letter, record in letters.items()
        )
    return "\n".join(rendered)
def available_examples() -> List[List[Any]]:
    """Return the sample rows offered in the "Authentic Voice" tab.

    Each row is ``[text, dialect, voice label, context, emotion label, speed]``,
    matching the input order of ``generate_voice``.
    """
    presets = (
        (
            "زما وطن د وياړ کور دی او پښتونولي زموږ د ژوند لار ده.",
            "کندهاري (Kandahari)",
            "مشر (Elder Male)",
            "ملي (National)",
            "جګ افتخار (Proud)",
            1.0,
        ),
        (
            "مېلمستيا او غيرت زموږ کلتوري ارزښتونه دي.",
            "پکتياوي (Paktiawal)",
            "ځوان (Young Male)",
            "فرهنګي (Cultural)",
            "طبيعي (Natural)",
            1.1,
        ),
        (
            "ښه راغلاست، دا يوه ولسي کيسه ده چې د زاړه وخت ياد راژوندی کوي.",
            "هراتۍ (Herati)",
            "ښځينه (Female)",
            "پېغلوي (Folk Tales)",
            "خوشحال (Joyful)",
            0.9,
        ),
    )
    # Hand back fresh lists so callers can mutate a row without side effects.
    return [list(preset) for preset in presets]
def generate_voice(text: str, dialect: str, voice_label: str, context_name: str, emotion_label: str, speed: float):
    """Gradio callback for the "Authentic Voice" tab.

    Args:
        text: Pashto text to speak.
        dialect: Key of ``AFGHAN_PASHTO_DIALECTS``.
        voice_label: Key of ``VOICE_TYPE_MODEL_MAP``.
        context_name: Key of ``CULTURAL_CONTEXTS``.
        emotion_label: Key of ``EMOTION_MAP``.
        speed: Speaking-rate multiplier.

    Returns:
        ``((sample_rate, audio), pipeline info, dialect summary,
        context summary, cultural-marker text)`` in the order of the tab's
        output components.

    Raises:
        gr.Error: If ``text`` is missing or only whitespace.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("مهرباني وکړئ پښتو متن وليکئ / Please enter Pashto text.")
    voice_type = VOICE_TYPE_MODEL_MAP[voice_label]
    emotion = EMOTION_MAP[emotion_label]
    audio, sample_rate, metadata = processor.process_authentic_tts(text, dialect, voice_type, context_name, emotion, speed)
    info = (
        f"Model: {metadata['model']}\n"
        f"Device: {metadata['device']}\n"
        f"Dialectal text: {metadata['dialectal_text']}\n"
        f"Contextualized text: {metadata['contextualized_text']}\n"
        f"Emotional text: {metadata['emotional_text']}"
    )
    # Scan what the user actually wrote. The synthesised text has already had
    # dialect letter substitutions applied (e.g. ښ -> خ), which rewrites marker
    # words such as پښتونولي so they would no longer be found.
    context_analysis = processor.extract_comprehensive_cultural_markers(text)
    marker_lines = [f"{key}: {', '.join(values)}" for key, values in context_analysis.items() if values]
    markers_text = "\n".join(marker_lines) if marker_lines else "No explicit cultural markers detected yet."
    return (sample_rate, audio), info, format_dialect_summary(dialect), format_context_summary(context_name), markers_text
def recognize_speech(audio_input, dialect: str):
    """Gradio callback for the "Speech Recognition" tab.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple from the audio
            component, or ``None`` when nothing was recorded or uploaded.
        dialect: Key of ``AFGHAN_PASHTO_DIALECTS``.

    Returns:
        ``(summary text, cultural-marker dict, pronunciation notes,
        marker text)`` in the order of the tab's output components.

    Raises:
        gr.Error: If no audio was supplied.
    """
    # Report missing audio the same way generate_voice reports missing text,
    # instead of letting the pipeline's ValueError reach the UI.
    if audio_input is None:
        raise gr.Error("Please record or upload Pashto speech first.")
    result = processor.process_authentic_asr(audio_input, dialect)
    audio_stats = result["audio_stats"]
    summary = (
        f"Transcription: {result['text']}\n"
        f"Confidence: {result['confidence']:.0%}\n"
        f"Duration: {audio_stats['duration_seconds']} seconds\n"
        f"Energy: {audio_stats['energy']}\n"
        f"Pitch band: {audio_stats['pitch_band']}"
    )
    markers = [f"{key}: {', '.join(values)}" for key, values in result["cultural_markers"].items() if values]
    markers_text = "\n".join(markers) if markers else "No cultural markers detected."
    return summary, result["cultural_markers"], "\n".join(result["pronunciation_notes"]), markers_text
def clone_voice(reference_audio, text: str, dialect: str, age_profile: str, style_profile: str):
    """Gradio callback for the "Voice Cloning Demo" tab.

    Args:
        reference_audio: ``(sample_rate, samples)`` tuple from the audio
            component, or ``None`` when nothing was recorded or uploaded.
        text: Target text to synthesise.
        dialect: Key of ``AFGHAN_PASHTO_DIALECTS``.
        age_profile: Value of the "Age profile" dropdown.
        style_profile: Value of the "Speaking style" dropdown.

    Returns:
        ``((sample_rate, audio), feature listing, dialect summary)`` in the
        order of the tab's output components.

    Raises:
        gr.Error: If the target text is empty or no reference audio was given.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please provide target text for cloning.")
    # Report missing audio as a UI error rather than the pipeline's ValueError.
    if reference_audio is None:
        raise gr.Error("Reference audio is required for voice cloning.")
    audio, sample_rate, features = processor.process_voice_cloning(
        reference_audio,
        text,
        dialect,
        {"age_profile": age_profile, "style_profile": style_profile},
    )
    feature_lines = "\n".join(f"{key}: {value}" for key, value in features.items())
    return (sample_rate, audio), feature_lines, format_dialect_summary(dialect)
def build_app() -> gr.Blocks:
    """Assemble the Gradio interface and return it without launching.

    The page has a header, a dialect overview with a runtime panel, a
    collapsible phoneme guide and three tabs (voice generation, speech
    recognition, voice cloning) wired to ``generate_voice``,
    ``recognize_speech`` and ``clone_voice``.
    """
    # NOTE(review): the nesting of the `with` blocks below is reconstructed
    # from statement order; confirm against the deployed layout.
    with gr.Blocks(
        title="🎙️ Afghan Pashto Voice Hub - د افغان پښتو غږيز مرکز",
        theme=gr.themes.Soft(),
        css="""
.pashto-text {
font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq', 'Scheherazade New', serif;
direction: rtl;
text-align: right;
}
.afghan-flag {
background: linear-gradient(to bottom, #000000, #c81818, #0b8f3a);
height: 20px;
border-radius: 6px;
margin: 10px 0;
}
""",
    ) as app:
        # Page header.
        gr.Markdown(
            """
# 🎙️ Afghan Pashto Voice & Speech Processing Hub
## د افغان پښتو غږيز پروسسنګ مرکز
**Pure Afghan Pashto - له اصل پښتو سره**
Supports: **Kandahari, Paktiawal, Peshawri, Mazari, Herati, Nangarhari, and traditional forms**
"""
        )
        gr.HTML('<div class="afghan-flag"></div>')
        # Dialect overview (left) and runtime information (right).
        with gr.Row():
            with gr.Column(scale=2):
                dialect_preview = gr.Dropdown(
                    choices=list(AFGHAN_PASHTO_DIALECTS.keys()),
                    value="کندهاري (Kandahari)",
                    label="Dialect overview - لهجو کتنه",
                )
                dialect_summary = gr.Textbox(
                    value=format_dialect_summary("کندهاري (Kandahari)"),
                    label="Dialect details",
                    lines=7,
                )
            with gr.Column(scale=1):
                gr.Markdown(f"### Runtime\n- Runtime device: {processor.device}\n- Models: lightweight placeholder stack")
        # Refresh the details box whenever another dialect is picked.
        dialect_preview.change(fn=format_dialect_summary, inputs=dialect_preview, outputs=dialect_summary)
        with gr.Accordion("Traditional phoneme guide", open=False):
            gr.Markdown(phoneme_markdown(), elem_classes="pashto-text")
        with gr.Tabs():
            # Tab 1: text-to-speech.
            with gr.TabItem("🔊 Authentic Voice"):
                with gr.Row():
                    with gr.Column():
                        authentic_text = gr.Textbox(
                            label="پښتو متن / Pashto Text",
                            placeholder="دلته پښتو متن ولیکئ...",
                            lines=5,
                            elem_classes="pashto-text",
                        )
                        with gr.Row():
                            authentic_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="کندهاري (Kandahari)", label="Dialect - لهجه")
                            authentic_voice = gr.Dropdown(choices=list(VOICE_TYPE_MODEL_MAP.keys()), value="مشر (Elder Male)", label="Voice Type - غږ ډول")
                        with gr.Row():
                            authentic_context = gr.Dropdown(choices=list(CULTURAL_CONTEXTS.keys()), value="ملي (National)", label="Cultural Context - کلتني زمينه")
                            authentic_emotion = gr.Dropdown(choices=list(EMOTION_MAP.keys()), value="طبيعي (Natural)", label="Emotion - احساس")
                        authentic_speed = gr.Slider(0.6, 1.4, value=1.0, step=0.1, label="Speed - چټکتيا")
                        authentic_generate = gr.Button("🎤 Generate Authentic Voice", variant="primary")
                        # Example rows follow the same field order as the inputs list.
                        gr.Examples(examples=available_examples(), inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed])
                    with gr.Column():
                        authentic_output = gr.Audio(label="Generated Afghan Pashto Voice")
                        authentic_info = gr.Textbox(label="Voice pipeline details", lines=7)
                        authentic_dialect_info = gr.Textbox(label="Dialect knowledge", lines=7)
                        authentic_context_info = gr.Textbox(label="Context knowledge", lines=4)
                        authentic_markers = gr.Textbox(label="Cultural markers", lines=5)
                # Output order must match the tuple returned by generate_voice.
                authentic_generate.click(
                    fn=generate_voice,
                    inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed],
                    outputs=[authentic_output, authentic_info, authentic_dialect_info, authentic_context_info, authentic_markers],
                )
            # Tab 2: speech recognition.
            with gr.TabItem("🎧 Speech Recognition"):
                with gr.Row():
                    with gr.Column():
                        recognition_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or record Pashto speech")
                        recognition_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="پکتياوي (Paktiawal)", label="Target dialect")
                        recognition_button = gr.Button("📝 Recognize Speech", variant="primary")
                    with gr.Column():
                        recognition_summary = gr.Textbox(label="Recognition summary", lines=6)
                        recognition_context = gr.JSON(label="Cultural context analysis")
                        recognition_pronunciation = gr.Textbox(label="Pronunciation notes", lines=4)
                        recognition_markers = gr.Textbox(label="Detected markers", lines=4)
                # Output order must match the tuple returned by recognize_speech.
                recognition_button.click(
                    fn=recognize_speech,
                    inputs=[recognition_audio, recognition_dialect],
                    outputs=[recognition_summary, recognition_context, recognition_pronunciation, recognition_markers],
                )
            # Tab 3: voice cloning demo.
            with gr.TabItem("🧬 Voice Cloning Demo"):
                with gr.Row():
                    with gr.Column():
                        clone_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Reference Afghan voice")
                        clone_text = gr.Textbox(label="Target text", lines=4, placeholder="هغه متن وليکئ چې د هماغه غږ په ډول واورئ...", elem_classes="pashto-text")
                        clone_dialect = gr.Dropdown(choices=list(AFGHAN_PASHTO_DIALECTS.keys()), value="هراتۍ (Herati)", label="Dialect")
                        with gr.Row():
                            clone_age = gr.Dropdown(choices=["youthful", "mature", "elder"], value="mature", label="Age profile")
                            clone_style = gr.Dropdown(choices=["formal", "storytelling", "poetic", "conversational"], value="storytelling", label="Speaking style")
                        clone_button = gr.Button("🧪 Clone Voice Demo", variant="primary")
                    with gr.Column():
                        clone_output = gr.Audio(label="Cloned Afghan voice")
                        clone_features = gr.Textbox(label="Extracted / merged voice features", lines=8)
                        clone_dialect_info = gr.Textbox(label="Dialect profile", lines=7)
                # Output order must match the tuple returned by clone_voice.
                clone_button.click(
                    fn=clone_voice,
                    inputs=[clone_audio, clone_text, clone_dialect, clone_age, clone_style],
                    outputs=[clone_output, clone_features, clone_dialect_info],
                )
        # Footer explaining that the models are placeholders.
        gr.Markdown(
            """
### Notes
- This app is a lightweight, deployable Gradio demo with authentic Afghan Pashto structure and metadata.
- TTS, ASR, and voice cloning are implemented with synthetic placeholder audio logic so the interface runs without large model files.
- You can later replace the placeholder methods with real Pashto TTS, ASR, and cloning checkpoints.
"""
        )
    return app
# Build the interface at import time so `app` is available as a module
# attribute; it is only launched when the file is run as a script.
app = build_app()
if __name__ == "__main__":
    app.launch()