""" COMPLETE Afghan Pashto Voice & Speech Processing Space Pure Afghan Pashto - له اصل پښتو سره Author: Afghan Voice Technology Initiative Version: 2.0 - Lightweight Complete Demo """ from __future__ import annotations # IMPORTANT: `spaces` must be imported before any CUDA-related packages try: import spaces # noqa: F401 except Exception: pass import math from typing import Any, Dict, List, Tuple import gradio as gr import numpy as np try: import torch except Exception: # pragma: no cover - optional dependency torch = None AFGHAN_PASHTO_DIALECTS: Dict[str, Dict[str, Any]] = { "کندهاري (Kandahari)": { "code": "ps-kan", "region": "کندهار، زابل، ارزگان", "characteristics": ["Hard ږ (g)", "ښ as خ", "Emphatic consonants", "Traditional poetry"], "traditional_name": "کندهاري غه", "voice_models": {"male": "kan_male_v2.pth", "female": "kan_female_v2.pth", "elder": "kan_elder_v2.pth"}, "pronunciation_guide": "ږ = hard 'g', ښ = 'kh', Retroflex sounds preserved", }, "پکتياوي (Paktiawal)": { "code": "ps-pak", "region": "پکتيا، پکتيکا، خوست", "characteristics": ["Retroflex ڼ", "Nasal vowels", "Tribal vocabulary", "Mountain accent"], "traditional_name": "پکتياوي خښه", "voice_models": {"male": "pak_male_v2.pth", "female": "pak_female_v2.pth", "elder": "pak_elder_v2.pth"}, "pronunciation_guide": "ڼ = retroflex 'n', Nasalized vowels, Tribal words", }, "مزارۍ (Mazari)": { "code": "ps-maz", "region": "مزار شريف، بلخ، جوزجان", "characteristics": ["Uzbek influence", "Northern vowels", "Turkic loanwords", "Plains accent"], "traditional_name": "مزارۍ غږ", "voice_models": {"male": "maz_male_v2.pth", "female": "maz_female_v2.pth", "elder": "maz_elder_v2.pth"}, "pronunciation_guide": "Uzbek-influenced vowels, Turkic words, Northern tone", }, "هراتۍ (Herati)": { "code": "ps-her", "region": "هرات، فراه، نيمروز", "characteristics": ["Persian influence", "Western vowels", "Herati accent", "Cultural sophistication"], "traditional_name": "هراتۍ لهجه", "voice_models": {"male": "her_male_v2.pth", 
"female": "her_female_v2.pth", "elder": "her_elder_v2.pth"}, "pronunciation_guide": "Persian-influenced sounds, Western vowels, Cultural words", }, "ننګرهاري (Nangarhari)": { "code": "ps-nan", "region": "جلال اباد، ننګرهار، کنړ", "characteristics": ["Eastern dialect", "Khattak influence", "Jalalabad accent", "Border influences"], "traditional_name": "ننګرهاري وړاندې", "voice_models": {"male": "nan_male_v2.pth", "female": "nan_female_v2.pth", "elder": "nan_elder_v2.pth"}, "pronunciation_guide": "Eastern sounds, Khattak influence, Border variations", }, } CULTURAL_CONTEXTS: Dict[str, Dict[str, Any]] = { "ملي (National)": { "description": "National songs, anthems, patriotic poetry", "examples": ["ملي سرود", "وطن شعرونه", "غازي قومي"], "voice_style": "proud, formal, clear", "suffix": "د ملي غرور سره", }, "قومي (Tribal)": { "description": "Tribal traditions, ethnic heritage, clan stories", "examples": ["قومي کیسې", "نسب او شجره", "قبیلوي ویاړونه"], "voice_style": "traditional, elder-like, respectful", "suffix": "د قومي وياړ سره", }, "مذهبي (Religious)": { "description": "Religious content, spiritual guidance, Islamic teachings", "examples": ["دیني دروس", "اخلاقي کیسې", "روحاني مواعظ"], "voice_style": "soft, respectful, spiritual", "suffix": "د مذهبي احترام سره", }, "فرهنګي (Cultural)": { "description": "Cultural education, traditional values, customs", "examples": ["فرهنګي ارزښتونه", "دودونه او دستورونه", "کلتني کیسې"], "voice_style": "educational, warm, cultural", "suffix": "د فرهنګي ارزښتونو سره", }, "تاريخي (Historical)": { "description": "Historical narratives, ancient stories, past events", "examples": ["تاريخي کیسې", "پخوانۍ پیښې", "قدیم افسانې"], "voice_style": "storyteller, dramatic, engaging", "suffix": "د تاريخي روايت په انداز", }, "سنګي (Musical)": { "description": "Traditional music, folk songs, cultural rhythms", "examples": ["سنګي ملودۍ", "فولکلوري سندرې", "کلاسیکي موسیقي"], "voice_style": "melodic, rhythmic, artistic", "suffix": "د دوديزې نغمې په رنګ", }, 
"پېغلوي (Folk Tales)": { "description": "Folk tales, traditional stories, cultural narratives", "examples": ["پېغلوي کیسې", "افسانوي کیسې", "کلتني حکیات"], "voice_style": "storyteller, engaging, traditional", "suffix": "د ولسي کيسې له خوند سره", }, } COMPLETE_PHONEMES: Dict[str, Dict[str, Dict[str, Any]]] = { "پښتني حروف": { "ښ": {"symbol": "ښ", "ipa": "/ʂ/", "description": "Voiceless retroflex fricative", "dialects": {"کندهاري": "خ"}}, "ږ": {"symbol": "ږ", "ipa": "/ʐ/", "description": "Voiced retroflex fricative", "dialects": {"کندهاري": "گ"}}, "ڼ": {"symbol": "ڼ", "ipa": "/ɳ/", "description": "Retroflex nasal", "dialects": {"پکتياوي": "ڼ", "کندهاري": "ن"}}, "ړ": {"symbol": "ړ", "ipa": "/ɽ/", "description": "Retroflex flap", "dialects": {"ټول": "ړ"}}, "ټ": {"symbol": "ټ", "ipa": "/ʈ/", "description": "Voiceless retroflex stop", "dialects": {"ټول": "ټ"}}, "ډ": {"symbol": "ډ", "ipa": "/ɖ/", "description": "Voiced retroflex stop", "dialects": {"ټول": "ډ"}}, }, "عربي حروف": { "ص": {"symbol": "ص", "ipa": "/sˤ/", "description": "Emphatic voiceless alveolar fricative", "dialects": {}}, "ض": {"symbol": "ض", "ipa": "/dˤ/", "description": "Emphatic voiced alveolar stop", "dialects": {}}, "ط": {"symbol": "ط", "ipa": "/tˤ/", "description": "Emphatic voiceless alveolar stop", "dialects": {}}, "ظ": {"symbol": "ظ", "ipa": "/zˤ/", "description": "Emphatic voiced alveolar fricative", "dialects": {}}, }, "ويي": { "ا": {"symbol": "ا", "ipa": "/a/", "description": "Open front vowel", "dialects": {}}, "ې": {"symbol": "ې", "ipa": "/e/", "description": "Close-mid front vowel", "dialects": {}}, "ۍ": {"symbol": "ۍ", "ipa": "/ei/", "description": "Diphthong", "dialects": {}}, "و": {"symbol": "و", "ipa": "/o/", "description": "Close-mid back rounded vowel", "dialects": {}}, "ۀ": {"symbol": "ۀ", "ipa": "/ə/", "description": "Schwa", "dialects": {}}, }, } VOICE_TYPE_MODEL_MAP = { "مشر (Elder Male)": "elder", "ځوان (Young Male)": "male", "ښځينه (Female)": "female", "وړکتي (Child)": "child", } 
# Maps the emotion dropdown label to the internal emotion key.
EMOTION_MAP = {
    "طبيعي (Natural)": "neutral",
    "خوشحال (Joyful)": "joyful",
    "غميز (Sorrowful)": "sorrowful",
    "جګ افتخار (Proud)": "proud",
}


class AudioProcessor:
    """Waveform clean-up and simple descriptive statistics."""

    def preprocess_audio(self, audio_input: Tuple[int, np.ndarray] | np.ndarray | None) -> np.ndarray:
        """Return a float32, peak-normalised waveform.

        Args:
            audio_input: Either a ``(sample_rate, waveform)`` tuple or a bare
                waveform array.

        Returns:
            The waveform as float32 with singleton axes removed. Arrays that
            are still multi-dimensional are averaged over axis 1, and a
            non-silent signal is scaled so its absolute peak is 1.0.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
        """
        if audio_input is None:
            raise ValueError("No audio input was provided.")
        if isinstance(audio_input, tuple):
            _, waveform = audio_input
        else:
            waveform = audio_input
        waveform = np.asarray(waveform, dtype=np.float32).squeeze()
        if waveform.ndim > 1:
            # assumes (samples, channels) layout — TODO confirm against callers
            waveform = waveform.mean(axis=1)
        peak = float(np.max(np.abs(waveform))) if waveform.size else 0.0
        if peak > 0:
            waveform = waveform / peak
        return waveform

    def analyze_audio(self, waveform: np.ndarray, sample_rate: int) -> Dict[str, Any]:
        """Summarise a waveform as duration, mean energy and a coarse pitch band.

        The pitch band is derived from the fraction of adjacent samples whose
        sign differs: above 0.12 is "high", above 0.05 is "mid", else "low".
        An empty waveform yields zeros and the band "unknown".
        """
        if waveform.size == 0:
            return {"duration_seconds": 0.0, "energy": 0.0, "pitch_band": "unknown"}
        energy = float(np.mean(np.abs(waveform)))
        if waveform.size > 1:
            zero_crossing = float(np.mean(np.abs(np.diff(np.signbit(waveform)))))
        else:
            zero_crossing = 0.0
        if zero_crossing > 0.12:
            pitch_band = "high"
        elif zero_crossing > 0.05:
            pitch_band = "mid"
        else:
            pitch_band = "low"
        return {
            # max(..., 1) guards against division by a zero/negative rate.
            "duration_seconds": round(waveform.size / max(sample_rate, 1), 2),
            "energy": round(energy, 4),
            "pitch_band": pitch_band,
        }


class CulturalContextProcessor:
    """Substring-based cultural tagging and context suffixing of text."""

    # Term lists searched (by substring) in analyze_text.
    tribal_terms = ["احمدزي", "محسود", "خټک", "یوسفزي", "دواني", "ننګيال"]
    cultural_terms = ["پښتونولي", "مېلمستيا", "ننګ", "غيرت", "توره", "نګاه"]
    traditional_expressions = ["ښه راغلاست", "په خير", "الله دې مل شه", "ستړی مه شې"]
    honorifics = ["صاحب", "ملا", "خان", "استاد"]

    def apply_cultural_context(self, text: str, context: str) -> str:
        """Append the context's suffix in parentheses; unknown contexts return ``text`` unchanged."""
        context_info = CULTURAL_CONTEXTS.get(context)
        if not context_info:
            return text
        return f"{text} ({context_info['suffix']})"

    def analyze_text(self, text: str) -> Dict[str, List[str]]:
        """Return, per category, the known terms that occur as substrings of ``text``."""
        return {
            "tribal_references": [term for term in self.tribal_terms if term in text],
            "cultural_concepts": [term for term in self.cultural_terms if term in text],
            "traditional_expressions": [term for term in self.traditional_expressions if term in text],
            "honorifics": [term for term in self.honorifics if term in text],
        }


class CompleteAfghanPashtoProcessor:
    """Facade for the demo TTS, ASR and voice-cloning pipelines.

    All "models" are placeholder dictionaries; speech is produced by a
    procedural synthesizer and recognition returns a canned transcript.
    """

    def __init__(self) -> None:
        """Pick the runtime device, build helpers and load the placeholder models."""
        self.device = "cuda" if torch is not None and torch.cuda.is_available() else "cpu"
        self.models: Dict[str, Any] = {}
        self.audio_processor = AudioProcessor()
        self.cultural_processor = CulturalContextProcessor()
        self.dialect_rules = self.load_dialect_rules()
        self.load_all_models()

    def load_all_models(self) -> None:
        """Populate ``self.models`` with the TTS, ASR and voice-clone placeholders."""
        self.models = {
            "tts": {"base": self.load_tts_model(), "dialects": self.load_dialectal_tts_models()},
            "asr": {"base": self.load_asr_model(), "dialectal": self.load_dialectal_asr_models()},
            "voice_clone": self.load_voice_cloning_model(),
        }

    def load_tts_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the base TTS model."""
        return {"model": "base_tts", "status": "placeholder"}

    def load_dialectal_tts_models(self) -> Dict[str, Dict[str, str]]:
        """Return each dialect's ``voice_models`` mapping keyed by dialect label."""
        return {dialect: info["voice_models"] for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_asr_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the base ASR model."""
        return {"model": "base_asr", "status": "placeholder"}

    def load_dialectal_asr_models(self) -> Dict[str, str]:
        """Return an ``<code>_asr`` model name for every dialect."""
        return {dialect: f"{info['code']}_asr" for dialect, info in AFGHAN_PASHTO_DIALECTS.items()}

    def load_voice_cloning_model(self) -> Dict[str, str]:
        """Return the placeholder descriptor for the voice-cloning model."""
        return {"model": "voice_clone", "status": "placeholder"}

    def load_dialect_rules(self) -> Dict[str, Dict[str, Dict[str, Any]]]:
        """Return the static pronunciation / vocabulary / grammar rule tables.

        In the pronunciation table, string values are character replacements;
        boolean values are feature flags.
        """
        return {
            "pronunciation": {
                "کندهاري (Kandahari)": {"ښ": "خ", "ږ": "گ", "emphatic_consonants": True},
                "پکتياوي (Paktiawal)": {"ڼ": "ڼ", "nasal_vowels": True, "tribal_pronunciation": True},
                "هراتۍ (Herati)": {"ښ": "خ", "ږ": "گ", "western_vowels": True},
            },
            "vocabulary": {
                "کندهاري (Kandahari)": {"traditional_words": ["غه", "خه", "ګه"], "poetic_expressions": True},
                "پکتياوي (Paktiawal)": {"tribal_words": ["خېل", "قوم", "نګهبان"], "mountain_vocabulary": True},
                "هراتۍ (Herati)": {"persian_loanwords": True, "cultural_terms": ["فرهنګ", "تمدن", "ادب"]},
            },
            "grammar": {
                "ننګرهاري (Nangarhari)": {"eastern_constructions": True, "border_influences": True},
                "مزارۍ (Mazari)": {"uzbek_influence": True, "northern_constructions": True},
            },
        }

    def process_authentic_tts(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        context: str,
        emotion: str,
        speed: float,
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Run the TTS pipeline: dialect rules, context suffix, emotion suffix, synthesis.

        Returns:
            ``(audio, sample_rate, metadata)`` where ``metadata`` holds the
            intermediate texts, the resolved model name and the device.
        """
        dialectal_text = self.apply_comprehensive_dialect_rules(text, dialect)
        contextualized_text = self.cultural_processor.apply_cultural_context(dialectal_text, context)
        emotional_text = self.apply_emotional_coloring(contextualized_text, emotion)
        audio, sample_rate = self.generate_synthetic_speech(emotional_text, dialect, voice_type, speed, emotion)
        return audio, sample_rate, {
            "dialectal_text": dialectal_text,
            "contextualized_text": contextualized_text,
            "emotional_text": emotional_text,
            "model": self.resolve_voice_model(dialect, voice_type),
            "device": self.device,
        }

    def process_authentic_asr(self, audio_input: Tuple[int, np.ndarray] | None, dialect: str) -> Dict[str, Any]:
        """Run the ASR pipeline on ``(sample_rate, waveform)`` input.

        Returns:
            A dict with the corrected transcript, confidence, dialect, audio
            statistics, cultural markers and pronunciation notes.

        Raises:
            ValueError: If ``audio_input`` is ``None``.
        """
        if audio_input is None:
            raise ValueError("Please record or upload Pashto speech first.")
        sample_rate, waveform = audio_input
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        result = self.basic_speech_recognition(processed_audio, dialect)
        corrected_text = self.apply_dialectal_corrections(result["text"], dialect)
        cultural_info = self.extract_comprehensive_cultural_markers(corrected_text)
        audio_stats = self.audio_processor.analyze_audio(processed_audio, sample_rate)
        return {
            "text": corrected_text,
            "confidence": result.get("confidence", 0.85),
            "dialect": dialect,
            "audio_stats": audio_stats,
            "cultural_markers": cultural_info,
            "pronunciation_notes": self.get_pronunciation_notes(corrected_text, dialect),
        }

    def process_voice_cloning(
        self,
        reference_audio: Tuple[int, np.ndarray] | None,
        text: str,
        dialect: str,
        voice_characteristics: Dict[str, str],
    ) -> Tuple[np.ndarray, int, Dict[str, Any]]:
        """Derive features from the reference audio and synthesize ``text`` with them.

        Caller-supplied ``voice_characteristics`` override extracted features
        that share the same key.

        Returns:
            ``(audio, sample_rate, merged_features)``.

        Raises:
            ValueError: If ``reference_audio`` is ``None``.
        """
        if reference_audio is None:
            raise ValueError("Reference audio is required for voice cloning.")
        sample_rate, waveform = reference_audio
        processed_audio = self.audio_processor.preprocess_audio((sample_rate, waveform))
        features = self.extract_authentic_voice_features(processed_audio, sample_rate, dialect)
        merged_features = {**features, **voice_characteristics}
        cloned_audio, cloned_rate = self.basic_voice_cloning(text, merged_features, dialect)
        return cloned_audio, cloned_rate, merged_features

    def apply_comprehensive_dialect_rules(self, text: str, dialect: str) -> str:
        """Apply the dialect's character replacements and vocabulary substitutions."""
        pronunciation_rules = self.dialect_rules.get("pronunciation", {}).get(dialect, {})
        vocabulary_rules = self.dialect_rules.get("vocabulary", {}).get(dialect, {})
        transformed = text
        for original, replacement in pronunciation_rules.items():
            # Boolean entries are feature flags, not replacements.
            if isinstance(replacement, str):
                transformed = transformed.replace(original, replacement)
        if vocabulary_rules.get("poetic_expressions") and "وطن" in transformed:
            transformed = transformed.replace("وطن", "پلرنی وطن")
        if vocabulary_rules.get("persian_loanwords") and "کلتور" in transformed:
            transformed = transformed.replace("کلتور", "فرهنګ")
        return transformed

    def apply_emotional_coloring(self, text: str, emotion: str) -> str:
        """Append the emotion's descriptive suffix; unknown emotions return ``text`` unchanged."""
        emotional_suffix = {
            "neutral": "په طبيعي انداز",
            "joyful": "په خوشحال رنګ",
            "sorrowful": "په غمجن اهنګ",
            "proud": "په ویاړلي انداز",
        }.get(emotion)
        return f"{text} ({emotional_suffix})" if emotional_suffix else text

    def generate_synthetic_speech(
        self,
        text: str,
        dialect: str,
        voice_type: str,
        speed: float,
        emotion: str,
    ) -> Tuple[np.ndarray, int]:
        """Synthesize placeholder speech-like audio for ``text``.

        Duration scales with ``len(text)`` and inversely with ``speed``,
        clamped to 1.5-18 seconds. The fundamental frequency is a base value
        chosen by ``voice_type`` plus dialect and emotion offsets. Output is
        random (uses ``np.random``) and therefore not reproducible.

        Returns:
            ``(audio, sample_rate)`` with float32 audio clipped to [-1, 1]
            at 24 kHz.
        """
        sample_rate = 24000
        duration = max(1.5, min(len(text) * 0.11 / max(speed, 0.1), 18.0))
        timeline = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
        base_freq = {"female": 210, "child": 280, "elder": 105, "male": 130}.get(voice_type, 140)
        dialect_shift = {
            "کندهاري (Kandahari)": -5,
            "پکتياوي (Paktiawal)": 7,
            "مزارۍ (Mazari)": 3,
            "هراتۍ (Herati)": -2,
            "ننګرهاري (Nangarhari)": 8,
        }.get(dialect, 0)
        emotion_shift = {"neutral": 0, "joyful": 16, "sorrowful": -10, "proud": 9}.get(emotion, 0)
        f0 = base_freq + dialect_shift + emotion_shift

        # Quasi-periodic glottal excitation with natural jitter.
        # The jittered instantaneous frequency is integrated into a phase.
        # Multiplying per-sample jitter by absolute time instead makes the
        # phase error grow with t and degrades the harmonics into noise.
        jitter = 1.0 + 0.003 * np.random.randn(timeline.shape[0])
        sample_period = duration / timeline.shape[0]
        glottal_phase = 2 * math.pi * np.cumsum(f0 * jitter) * sample_period
        glottal = np.zeros_like(timeline)
        for h in range(1, 15):
            glottal += (1.0 / h) * np.sin(h * glottal_phase)

        # Slow F0 contour (sentence-level intonation)
        f0_contour = 1.0 + 0.08 * np.sin(2 * math.pi * 0.15 * timeline)
        f0_contour *= np.exp(-timeline / (duration * 0.8))  # Declination

        # Formant-based speech synthesis (vowel-like resonances)
        formants = [
            {"freq": 500, "bw": 80, "amp": 1.0},  # F1
            {"freq": 1500, "bw": 120, "amp": 0.6},  # F2
            {"freq": 2500, "bw": 150, "amp": 0.35},  # F3
            {"freq": 3500, "bw": 200, "amp": 0.2},  # F4
        ]
        audio = np.zeros_like(timeline)
        for fm in formants:
            phase = 2 * math.pi * fm["freq"] * timeline * f0_contour
            damped = np.exp(-timeline * fm["bw"] * 0.001)
            audio += fm["amp"] * np.sin(phase) * damped

        # Combine with glottal source
        audio = 0.4 * audio + 0.6 * glottal

        # Syllable rhythm (3-5 syllables per second)
        syllable_rate = 3.5 * speed
        syllable_env = 0.5 + 0.5 * np.abs(np.sin(2 * math.pi * syllable_rate * timeline)) ** 1.5

        # Word-level pauses (brief dips every ~1.2 seconds)
        word_pause = 1.0 - 0.3 * np.exp(-0.5 * ((timeline % 1.2) / 0.08) ** 2)

        # Amplitude envelope with natural fade
        fade = np.ones_like(timeline)
        attack_len = int(0.02 * sample_rate)
        release_start = int(0.8 * len(fade))
        fade[:attack_len] = np.linspace(0, 1, attack_len)  # Attack
        fade[release_start:] = np.linspace(1, 0.01, len(fade) - release_start)  # Release

        # Breath noise (fricative-like), smoothed with a short moving average
        breath = np.random.normal(0, 0.012, timeline.shape)
        kernel_len = int(0.002 * sample_rate)
        breath = np.convolve(breath, np.ones(kernel_len) / kernel_len, mode="same")

        audio = np.clip(audio * syllable_env * word_pause * fade * 0.3 + breath * syllable_env, -1.0, 1.0)
        return audio.astype(np.float32), sample_rate

    def basic_speech_recognition(self, waveform: np.ndarray, dialect: str) -> Dict[str, Any]:
        """Return a canned transcript, extended by signal energy and dialect.

        This is a placeholder: the audio content is not decoded. Confidence is
        fixed at 0.85.
        """
        energy = float(np.mean(np.abs(waveform))) if waveform.size else 0.0
        transcript = "دا يو پښتو متن دی چې د وينا پېژندنې له لارې ترلاسه شوی"
        if energy > 0.06:
            transcript += " او غږ يې روښانه دی"
        if dialect == "کندهاري (Kandahari)":
            transcript += " د کندهارۍ رنګ سره"
        elif dialect == "پکتياوي (Paktiawal)":
            transcript += " د پکتياوي انداز سره"
        elif dialect == "هراتۍ (Herati)":
            transcript += " د هراتي نرمۍ سره"
        return {"text": transcript, "confidence": 0.85}

    def apply_dialectal_corrections(self, text: str, dialect: str) -> str:
        """Apply the dialect's substring corrections; other dialects return ``text`` unchanged."""
        corrections = {
            "کندهاري (Kandahari)": {"شګ": "ښګ", "ژګ": "ږګ"},
            "پکتياوي (Paktiawal)": {"نګ": "ڼګ"},
        }
        corrected = text
        for wrong, correct in corrections.get(dialect, {}).items():
            corrected = corrected.replace(wrong, correct)
        return corrected

    def extract_comprehensive_cultural_markers(self, text: str) -> Dict[str, List[str]]:
        """Delegate to ``CulturalContextProcessor.analyze_text``."""
        return self.cultural_processor.analyze_text(text)

    def get_pronunciation_notes(self, text: str, dialect: str) -> List[str]:
        """Return the dialect's pronunciation guide plus notes for ښ / ږ found in ``text``.

        Raises:
            KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``.
        """
        notes = [AFGHAN_PASHTO_DIALECTS[dialect]["pronunciation_guide"]]
        if "ښ" in text:
            notes.append("Text contains ښ, which is one of the key dialect markers.")
        if "ږ" in text:
            notes.append("Text contains ږ, so dialect-specific realization matters here.")
        return notes

    def extract_authentic_voice_features(self, waveform: np.ndarray, sample_rate: int, dialect: str) -> Dict[str, Any]:
        """Return pitch band, energy, dialect accent name and a clear/soft quality label.

        Raises:
            KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``.
        """
        stats = self.audio_processor.analyze_audio(waveform, sample_rate)
        return {
            "pitch_band": stats["pitch_band"],
            "energy": stats["energy"],
            "accent": AFGHAN_PASHTO_DIALECTS[dialect]["traditional_name"],
            "quality": "clear" if stats["energy"] > 0.04 else "soft",
        }

    def basic_voice_cloning(self, text: str, voice_features: Dict[str, Any], dialect: str) -> Tuple[np.ndarray, int]:
        """Pick a voice type from the features and synthesize ``text`` at normal speed.

        A "high" pitch band selects "female"; otherwise an "elder" age profile
        selects "elder"; everything else falls back to "male".
        """
        pitch_band = voice_features.get("pitch_band", "mid")
        if pitch_band == "high":
            voice_type = "female"
        elif voice_features.get("age_profile") == "elder":
            voice_type = "elder"
        else:
            voice_type = "male"
        return self.generate_synthetic_speech(text, dialect, voice_type, 1.0, "neutral")

    def resolve_voice_model(self, dialect: str, voice_type: str) -> str:
        """Return the model file for ``voice_type``, or ``<code>_<voice_type>.pth`` if none is listed.

        Raises:
            KeyError: If ``dialect`` is not a key of ``AFGHAN_PASHTO_DIALECTS``.
        """
        model_map = AFGHAN_PASHTO_DIALECTS[dialect]["voice_models"]
        return model_map.get(voice_type, f"{AFGHAN_PASHTO_DIALECTS[dialect]['code']}_{voice_type}.pth")


# Shared processor instance used by every UI callback below.
processor = CompleteAfghanPashtoProcessor()


def format_dialect_summary(dialect: str) -> str:
    """Render the dialect's metadata as a multi-line Pashto-labelled summary."""
    info = AFGHAN_PASHTO_DIALECTS[dialect]
    return (
        f"لهجه: {dialect}\n"
        f"کوډ: {info['code']}\n"
        f"سيمه: {info['region']}\n"
        f"دوديز نوم: {info['traditional_name']}\n"
        f"اواز ماډلونه: {', '.join(info['voice_models'].values())}\n"
        f"تلفظ: {info['pronunciation_guide']}\n"
        f"ځانګړنې: {', '.join(info['characteristics'])}"
    )


def format_context_summary(context_name: str) -> str:
    """Render a cultural context's description, examples and voice style."""
    info = CULTURAL_CONTEXTS[context_name]
    return (
        f"تشريح: {info['description']}\n"
        f"بېلګې: {', '.join(info['examples'])}\n"
        f"اواز سبک: {info['voice_style']}"
    )


def phoneme_markdown() -> str:
    """Render ``COMPLETE_PHONEMES`` as Markdown: one heading per category, one bullet per letter."""
    lines: List[str] = []
    for category, items in COMPLETE_PHONEMES.items():
        lines.append(f"### {category}")
        for symbol, data in items.items():
            dialects = data.get("dialects", {})
            dialect_text = ", ".join(f"{key}: {value}" for key, value in dialects.items()) if dialects else "—"
            lines.append(f"- **{symbol}** · IPA `{data['ipa']}` · {data['description']} · Dialects: {dialect_text}")
    return "\n".join(lines)


def available_examples() -> List[List[Any]]:
    """Return example rows: text, dialect, voice, context, emotion, speed."""
    return [
        ["زما وطن د وياړ کور دی او پښتونولي زموږ د ژوند لار ده.", "کندهاري (Kandahari)", "مشر (Elder Male)", "ملي (National)", "جګ افتخار (Proud)", 1.0],
        ["مېلمستيا او غيرت زموږ کلتوري ارزښتونه دي.", "پکتياوي (Paktiawal)", "ځوان (Young Male)", "فرهنګي (Cultural)", "طبيعي (Natural)", 1.1],
        ["ښه راغلاست، دا يوه ولسي کيسه ده چې د زاړه وخت ياد راژوندی کوي.", "هراتۍ (Herati)", "ښځينه (Female)", "پېغلوي (Folk Tales)", "خوشحال (Joyful)", 0.9],
    ]


def generate_voice(text: str, dialect: str, voice_label: str, context_name: str, emotion_label: str, speed: float):
    """UI callback: synthesize speech and build the accompanying text panels.

    Returns:
        ``((sample_rate, audio), pipeline_info, dialect_summary,
        context_summary, markers_text)``.

    Raises:
        gr.Error: If ``text`` is empty, whitespace-only or ``None``.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("مهرباني وکړئ پښتو متن وليکئ / Please enter Pashto text.")
    voice_type = VOICE_TYPE_MODEL_MAP[voice_label]
    emotion = EMOTION_MAP[emotion_label]
    audio, sample_rate, metadata = processor.process_authentic_tts(text, dialect, voice_type, context_name, emotion, speed)
    info = (
        f"Model: {metadata['model']}\n"
        f"Device: {metadata['device']}\n"
        f"Dialectal text: {metadata['dialectal_text']}\n"
        f"Contextualized text: {metadata['contextualized_text']}\n"
        f"Emotional text: {metadata['emotional_text']}"
    )
    context_analysis = processor.extract_comprehensive_cultural_markers(metadata["emotional_text"])
    marker_lines = [f"{key}: {', '.join(values)}" for key, values in context_analysis.items() if values]
    markers_text = "\n".join(marker_lines) if marker_lines else "No explicit cultural markers detected yet."
    return (sample_rate, audio), info, format_dialect_summary(dialect), format_context_summary(context_name), markers_text


def recognize_speech(audio_input, dialect: str):
    """UI callback: run recognition and format the result panels.

    Returns:
        ``(summary, cultural_markers, pronunciation_notes, markers_text)``.

    Raises:
        gr.Error: If the pipeline rejects the input (e.g. no audio provided).
    """
    try:
        result = processor.process_authentic_asr(audio_input, dialect)
    except ValueError as exc:
        # Report input problems as gr.Error, the same way generate_voice does.
        raise gr.Error(str(exc)) from exc
    summary = (
        f"Transcription: {result['text']}\n"
        f"Confidence: {result['confidence']:.0%}\n"
        f"Duration: {result['audio_stats']['duration_seconds']} seconds\n"
        f"Energy: {result['audio_stats']['energy']}\n"
        f"Pitch band: {result['audio_stats']['pitch_band']}"
    )
    markers = [f"{key}: {', '.join(values)}" for key, values in result['cultural_markers'].items() if values]
    markers_text = "\n".join(markers) if markers else "No cultural markers detected."
    return summary, result["cultural_markers"], "\n".join(result["pronunciation_notes"]), markers_text
def clone_voice(reference_audio, text: str, dialect: str, age_profile: str, style_profile: str):
    """UI callback: clone the reference voice onto ``text``.

    Returns:
        ``((sample_rate, audio), feature_lines, dialect_summary)`` where
        ``feature_lines`` lists the merged voice features one per line.

    Raises:
        gr.Error: If ``text`` is empty/``None`` or the pipeline rejects the
            input (e.g. no reference audio provided).
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please provide target text for cloning.")
    try:
        audio, sample_rate, features = processor.process_voice_cloning(
            reference_audio,
            text,
            dialect,
            {"age_profile": age_profile, "style_profile": style_profile},
        )
    except ValueError as exc:
        # Report input problems as gr.Error so the message is user-facing.
        raise gr.Error(str(exc)) from exc
    feature_lines = "\n".join(f"{key}: {value}" for key, value in features.items())
    return (sample_rate, audio), feature_lines, format_dialect_summary(dialect)


def _gpu_if_available(fn):
    """Decorate ``fn`` with ``spaces.GPU`` when available, else return it unchanged.

    The module is looked up via ``globals()`` so a failed or missing
    ``import spaces`` (name unbound or ``None``) cannot raise NameError.
    """
    gpu_decorator = getattr(globals().get("spaces"), "GPU", None)
    return gpu_decorator(fn) if gpu_decorator is not None else fn


# GPU-decorated wrappers for Hugging Face Spaces
@_gpu_if_available
def generate_voice_gpu(text: str, dialect: str, voice_label: str, context_name: str, emotion_label: str, speed: float):
    """GPU-scheduled entry point for ``generate_voice``."""
    return generate_voice(text, dialect, voice_label, context_name, emotion_label, speed)


@_gpu_if_available
def recognize_speech_gpu(audio_input, dialect: str):
    """GPU-scheduled entry point for ``recognize_speech``."""
    return recognize_speech(audio_input, dialect)


@_gpu_if_available
def clone_voice_gpu(reference_audio, text: str, dialect: str, age_profile: str, style_profile: str):
    """GPU-scheduled entry point for ``clone_voice``."""
    return clone_voice(reference_audio, text, dialect, age_profile, style_profile)


def build_app() -> gr.Blocks:
    """Build the Gradio UI: dialect overview, phoneme guide and three feature tabs.

    Returns:
        The constructed ``gr.Blocks`` application (not yet launched).
    """
    dialect_choices = list(AFGHAN_PASHTO_DIALECTS.keys())

    with gr.Blocks(
        title="🎙️ Afghan Pashto Voice Hub - د افغان پښتو غږيز مرکز",
    ) as app:
        gr.Markdown(
            """
            # 🎙️ Afghan Pashto Voice & Speech Processing Hub
            ## د افغان پښتو غږيز پروسسنګ مرکز

            **Pure Afghan Pashto - له اصل پښتو سره**

            Supports: **Kandahari, Paktiawal, Mazari, Herati, Nangarhari, and traditional forms**
            """
        )
        # Banner element styled by the `.afghan-flag` CSS rule passed to launch().
        gr.HTML('<div class="afghan-flag"></div>')

        with gr.Row():
            with gr.Column(scale=2):
                dialect_preview = gr.Dropdown(
                    choices=dialect_choices,
                    value="کندهاري (Kandahari)",
                    label="Dialect overview - لهجو کتنه",
                )
                dialect_summary = gr.Textbox(
                    value=format_dialect_summary("کندهاري (Kandahari)"),
                    label="Dialect details",
                    lines=7,
                )
            with gr.Column(scale=1):
                gr.Markdown(f"### Runtime\n- Runtime device: {processor.device}\n- Models: lightweight placeholder stack")

        dialect_preview.change(fn=format_dialect_summary, inputs=dialect_preview, outputs=dialect_summary)

        with gr.Accordion("Traditional phoneme guide", open=False):
            gr.Markdown(phoneme_markdown(), elem_classes="pashto-text")

        with gr.Tabs():
            with gr.TabItem("🔊 Authentic Voice"):
                with gr.Row():
                    with gr.Column():
                        authentic_text = gr.Textbox(
                            label="پښتو متن / Pashto Text",
                            placeholder="دلته پښتو متن ولیکئ...",
                            lines=5,
                            elem_classes="pashto-text",
                        )
                        with gr.Row():
                            authentic_dialect = gr.Dropdown(choices=dialect_choices, value="کندهاري (Kandahari)", label="Dialect - لهجه")
                            authentic_voice = gr.Dropdown(choices=list(VOICE_TYPE_MODEL_MAP.keys()), value="مشر (Elder Male)", label="Voice Type - غږ ډول")
                        with gr.Row():
                            authentic_context = gr.Dropdown(choices=list(CULTURAL_CONTEXTS.keys()), value="ملي (National)", label="Cultural Context - کلتني زمينه")
                            authentic_emotion = gr.Dropdown(choices=list(EMOTION_MAP.keys()), value="طبيعي (Natural)", label="Emotion - احساس")
                        authentic_speed = gr.Slider(0.6, 1.4, value=1.0, step=0.1, label="Speed - چټکتيا")
                        authentic_generate = gr.Button("🎤 Generate Authentic Voice", variant="primary")
                        gr.Examples(
                            examples=available_examples(),
                            inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed],
                        )
                    with gr.Column():
                        authentic_output = gr.Audio(label="Generated Afghan Pashto Voice")
                        authentic_info = gr.Textbox(label="Voice pipeline details", lines=7)
                        authentic_dialect_info = gr.Textbox(label="Dialect knowledge", lines=7)
                        authentic_context_info = gr.Textbox(label="Context knowledge", lines=4)
                        authentic_markers = gr.Textbox(label="Cultural markers", lines=5)
                authentic_generate.click(
                    fn=generate_voice_gpu,
                    inputs=[authentic_text, authentic_dialect, authentic_voice, authentic_context, authentic_emotion, authentic_speed],
                    outputs=[authentic_output, authentic_info, authentic_dialect_info, authentic_context_info, authentic_markers],
                )

            with gr.TabItem("🎧 Speech Recognition"):
                with gr.Row():
                    with gr.Column():
                        recognition_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or record Pashto speech")
                        recognition_dialect = gr.Dropdown(choices=dialect_choices, value="پکتياوي (Paktiawal)", label="Target dialect")
                        recognition_button = gr.Button("📝 Recognize Speech", variant="primary")
                    with gr.Column():
                        recognition_summary = gr.Textbox(label="Recognition summary", lines=6)
                        recognition_context = gr.JSON(label="Cultural context analysis")
                        recognition_pronunciation = gr.Textbox(label="Pronunciation notes", lines=4)
                        recognition_markers = gr.Textbox(label="Detected markers", lines=4)
                recognition_button.click(
                    fn=recognize_speech_gpu,
                    inputs=[recognition_audio, recognition_dialect],
                    outputs=[recognition_summary, recognition_context, recognition_pronunciation, recognition_markers],
                )

            with gr.TabItem("🧬 Voice Cloning Demo"):
                with gr.Row():
                    with gr.Column():
                        clone_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Reference Afghan voice")
                        clone_text = gr.Textbox(
                            label="Target text",
                            lines=4,
                            placeholder="هغه متن وليکئ چې د هماغه غږ په ډول واورئ...",
                            elem_classes="pashto-text",
                        )
                        clone_dialect = gr.Dropdown(choices=dialect_choices, value="هراتۍ (Herati)", label="Dialect")
                        with gr.Row():
                            clone_age = gr.Dropdown(choices=["youthful", "mature", "elder"], value="mature", label="Age profile")
                            clone_style = gr.Dropdown(choices=["formal", "storytelling", "poetic", "conversational"], value="storytelling", label="Speaking style")
                        clone_button = gr.Button("🧪 Clone Voice Demo", variant="primary")
                    with gr.Column():
                        clone_output = gr.Audio(label="Cloned Afghan voice")
                        clone_features = gr.Textbox(label="Extracted / merged voice features", lines=8)
                        clone_dialect_info = gr.Textbox(label="Dialect profile", lines=7)
                clone_button.click(
                    fn=clone_voice_gpu,
                    inputs=[clone_audio, clone_text, clone_dialect, clone_age, clone_style],
                    outputs=[clone_output, clone_features, clone_dialect_info],
                )

        gr.Markdown(
            """
            ### Notes
            - This app is a lightweight, deployable Gradio demo with authentic Afghan Pashto structure and metadata.
            - TTS, ASR, and voice cloning are implemented with synthetic placeholder audio logic so the interface runs without large model files.
            - You can later replace the placeholder methods with real Pashto TTS, ASR, and cloning checkpoints.
            """
        )
    return app


# Built at import time so hosting runtimes can pick up the module-level `app`.
app = build_app()

if __name__ == "__main__":
    # NOTE(review): theme/css are passed to launch() rather than gr.Blocks();
    # confirm this matches the Gradio version pinned for the Space.
    app.launch(
        theme=gr.themes.Soft(),
        css="""
        .pashto-text {
            font-family: 'Noto Nastaliq Urdu', 'Jameel Noori Nastaleeq', 'Scheherazade New', serif;
            direction: rtl;
            text-align: right;
        }
        .afghan-flag {
            background: linear-gradient(to bottom, #000000, #c81818, #0b8f3a);
            height: 20px;
            border-radius: 6px;
            margin: 10px 0;
        }
        """,
    )