Spaces:
Build error
Build error
| import random | |
| import numpy as np | |
| import torch | |
| from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES | |
| import gradio as gr | |
| import spaces | |
# --- Runtime device ---
# Inference is pinned to the CPU. The auto-detecting alternative is left
# here, disabled, for reference:
#   DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE: str = "cpu"
print(f"🚀 Running on device: {DEVICE}")

# --- Global Model Initialization ---
# Module-wide model handle; it starts out empty and is filled in by
# get_or_load_model() on the first successful load.
MODEL = None
# Per-language UI defaults, keyed by language code.
# Each entry holds:
#   "audio": URL of the default reference clip for that language
#   "text":  sample sentence pre-filled into the text box for that language
LANGUAGE_CONFIG = {
    "ar": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
        "text": "في الشهر الماضي، وصلنا إلى معلم جديد بمليارين من المشاهدات على قناتنا على يوتيوب.",
    },
    "da": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/da_m1.flac",
        "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal.",
    },
    "de": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
        "text": "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe auf unserem YouTube-Kanal.",
    },
    "el": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/el_m.flac",
        "text": "Τον περασμένο μήνα, φτάσαμε σε ένα νέο ορόσημο με δύο δισεκατομμύρια προβολές στο κανάλι μας στο YouTube.",
    },
    "en": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
        "text": "Last month, we reached a new milestone with two billion views on our YouTube channel.",
    },
    "es": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
        "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube.",
    },
    "fi": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fi_m.flac",
        "text": "Viime kuussa saavutimme uuden virstanpylvään kahden miljardin katselukerran kanssa YouTube-kanavallamme.",
    },
    "fr": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
        "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube.",
    },
    "he": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/he_m1.flac",
        "text": "בחודש שעבר הגענו לאבן דרך חדשה עם שני מיליארד צפיות בערוץ היוטיוב שלנו.",
    },
    "hi": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
        "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।",
    },
    "it": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/it_m1.flac",
        "text": "Il mese scorso abbiamo raggiunto un nuovo traguardo: due miliardi di visualizzazioni sul nostro canale YouTube.",
    },
    "ja": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
        "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。",
    },
    "ko": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ko_f.flac",
        "text": "지난달 우리는 유튜브 채널에서 이십억 조회수라는 새로운 이정표에 도달했습니다.",
    },
    "ms": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ms_f.flac",
        "text": "Bulan lepas, kami mencapai pencapaian baru dengan dua bilion tontonan di saluran YouTube kami.",
    },
    "nl": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/nl_m.flac",
        "text": "Vorige maand bereikten we een nieuwe mijlpaal met twee miljard weergaven op ons YouTube-kanaal.",
    },
    "no": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/no_f1.flac",
        "text": "Forrige måned nådde vi en ny milepæl med to milliarder visninger på YouTube-kanalen vår.",
    },
    "pl": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pl_m.flac",
        "text": "W zeszłym miesiącu osiągnęliśmy nowy kamień milowy z dwoma miliardami wyświetleń na naszym kanale YouTube.",
    },
    "pt": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pt_m1.flac",
        "text": "No mês passado, alcançámos um novo marco: dois mil milhões de visualizações no nosso canal do YouTube.",
    },
    "ru": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ru_m.flac",
        "text": "В прошлом месяце мы достигли нового рубежа: два миллиарда просмотров на нашем YouTube-канале.",
    },
    "sv": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sv_f.flac",
        "text": "Förra månaden nådde vi en ny milstolpe med två miljarder visningar på vår YouTube-kanal.",
    },
    "sw": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sw_m.flac",
        # "kwenye" (on/at) was previously misspelled "kweny"; since this text
        # is the default synthesis input, the typo would be spoken as written.
        "text": "Mwezi uliopita, tulifika hatua mpya ya maoni ya bilioni mbili kwenye kituo chetu cha YouTube.",
    },
    "tr": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/tr_m.flac",
        "text": "Geçen ay YouTube kanalımızda iki milyar görüntüleme ile yeni bir dönüm noktasına ulaştık.",
    },
    "zh": {
        "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
        "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。",
    },
}
# --- UI Helpers ---
def default_audio_for_ui(lang: str) -> str | None:
    """Return the default reference-audio URL configured for ``lang``.

    The value is the ``"audio"`` entry of ``LANGUAGE_CONFIG[lang]``; ``None``
    is returned when the language, or its audio entry, is not configured.
    """
    language_entry = LANGUAGE_CONFIG.get(lang, {})
    return language_entry.get("audio")
def default_text_for_ui(lang: str) -> str:
    """Return the sample sentence configured for ``lang``.

    The value is the ``"text"`` entry of ``LANGUAGE_CONFIG[lang]``; an empty
    string is returned when the language, or its text entry, is not configured.
    """
    language_entry = LANGUAGE_CONFIG.get(lang, {})
    return language_entry.get("text", "")
def get_supported_languages_display() -> str:
    """Build the Markdown snippet listing every supported language.

    Each language is rendered as ``**Name** (`code`)``, ordered by language
    code, and the entries are split across two bullet-separated lines placed
    under a heading that states the total number of languages.
    """
    entries = [
        f"**{name}** (`{code}`)"
        for code, name in sorted(SUPPORTED_LANGUAGES.items())
    ]

    # Split the entries into two lines; the second line takes the extra
    # entry when the count is odd.
    half = len(entries) // 2
    separator = " • "
    line1 = separator.join(entries[:half])
    line2 = separator.join(entries[half:])

    return f"""
### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
{line1}
{line2}
"""
def get_or_load_model():
    """Return the shared ChatterboxMultilingualTTS instance, loading it on first use.

    On the first call the model is created with ``from_pretrained(DEVICE)``
    and, when it exposes ``to`` and reports a device other than ``DEVICE``,
    moved to ``DEVICE``. Later calls return the cached global ``MODEL``.

    Raises:
        Exception: Any error raised while loading is printed and re-raised.
    """
    global MODEL

    # Fast path: the model has already been loaded.
    if MODEL is not None:
        return MODEL

    print("Model not loaded, initializing...")
    try:
        MODEL = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
        needs_move = hasattr(MODEL, 'to') and str(MODEL.device) != DEVICE
        if needs_move:
            MODEL.to(DEVICE)
        print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
    return MODEL
# Attempt to load the model at startup.
# A failure here is only reported, not re-raised, so the remainder of the
# module is still executed.
try:
    get_or_load_model()
except Exception as startup_error:
    print(f"CRITICAL: Failed to load model on startup. Application may not function. Error: {startup_error}")
def set_seed(seed: int):
    """Seed Python's ``random``, NumPy and torch with ``seed`` for reproducibility.

    The CUDA generators are seeded as well, but only when ``DEVICE`` is ``"cuda"``.
    """
    # The generators are independent of each other, so the order is irrelevant.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if DEVICE != "cuda":
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
| def resolve_audio_prompt(language_id: str, provided_path: str | None) -> str | None: | |
| """ | |
| Decide which audio prompt to use: | |
| - If user provided a path (upload/mic/url), use it. | |
| - Else, fall back to language-specific default (if any). | |
| """ | |
| if provided_path and str(provided_path).strip(): | |
| return provided_path | |
| return LANGUAGE_CONFIG.get(language_id, {}).get("audio") | |
def generate_tts_audio(
    text_input: str,
    language_id: str,
    audio_prompt_path_input: str | None = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.5,
) -> tuple[int, np.ndarray]:
    """
    Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.

    Supported languages: every code offered by ChatterboxMultilingualTTS.get_supported_languages()
    (for example en, fr, de, es, it, pt, hi, ja, zh, ar).

    This tool synthesizes natural-sounding speech from input text. When a reference audio file
    is provided, it is passed to the model as the audio prompt that defines the target voice.
    When none is provided, the language's default reference clip is used if one is configured;
    otherwise the model's default voice is used.

    Args:
        text_input (str): The text to synthesize into speech (input longer than 500 characters is truncated)
        language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
        audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
        exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
        temperature_input (float, optional): Controls randomness in generation (0.05-5.0, higher=more varied). Defaults to 0.8.
        seed_num_input (int, optional): Random seed for reproducible results (0 for random generation). Defaults to 0.
        cfgw_input (float, optional): CFG/Pace weight controlling generation guidance (typically 0.2-1.0; use 0 for language transfer). Defaults to 0.5.

    Returns:
        tuple[int, np.ndarray]: A tuple containing the sample rate (int) and the generated audio waveform (numpy.ndarray)

    Raises:
        RuntimeError: If no model instance is available.
    """
    current_model = get_or_load_model()
    if current_model is None:
        raise RuntimeError("TTS model is not loaded.")

    # The seed may arrive as a float, or as None (e.g. an emptied number
    # field); both None and 0 mean "do not seed".
    if seed_num_input:
        set_seed(int(seed_num_input))

    print(f"Generating audio for text: '{text_input[:50]}...'")

    # Use the shared helper so blank / whitespace-only paths fall back to the
    # language default instead of being handed to the model as a file path.
    chosen_prompt = resolve_audio_prompt(language_id, audio_prompt_path_input)

    generate_kwargs = {
        "exaggeration": exaggeration_input,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
    }
    if chosen_prompt:
        generate_kwargs["audio_prompt_path"] = chosen_prompt
        print(f"Using audio prompt: {chosen_prompt}")
    else:
        print("No audio prompt provided; using default voice.")

    wav = current_model.generate(
        text_input[:500],  # Truncate text to max chars
        language_id=language_id,
        **generate_kwargs,
    )
    print("Audio generation complete.")

    # detach()/cpu() leave a plain CPU tensor untouched, but keep the NumPy
    # conversion valid if the tensor tracks gradients or DEVICE is not "cpu".
    waveform = wav.squeeze(0).detach().cpu().numpy()
    return (current_model.sr, waveform)
# --- Gradio UI ---
# Left column: inputs (text, language, reference audio, tuning controls).
# Right column: the synthesized audio.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Chatterbox Multilingual Demo para CPU
Genera Voz de alta calidad multilingue con referencia de audio + estilo, soporta 23 languajes.
Como usa CPU es bastante lento, como 150 caracteres por 2 minutos
Para una version tuneada, visita [resemble.ai](https://app.resemble.ai)
"""
    )

    # Display supported languages
    gr.Markdown(get_supported_languages_display())

    with gr.Row():
        with gr.Column():
            # Language whose sample text / reference clip pre-fill the form.
            initial_lang = "es"
            text = gr.Textbox(
                value=default_text_for_ui(initial_lang),
                label="Texto a sintetizar (max 500 caracteres)",
                max_lines=5,
            )
            language_id = gr.Dropdown(
                choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                value=initial_lang,
                label="Language",
                info="Seleccion el lenguaje para la sintesis",
            )
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio de Referencia (Opcional)",
                value=default_audio_for_ui(initial_lang),
            )
            gr.Markdown(
                "💡 **Nota**: Asegurarse que el audio de referencia y el del texto sean el mismo. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
                elem_classes=["audio-note"],
            )
            exaggeration = gr.Slider(
                0.25, 2, step=.05, label="Exageracion (Neutral = 0.5, valores extremos son inestables)", value=.5
            )
            # The minimum is 0 (not 0.2) so that the advice in the note above
            # ("set the CFG weight to 0") can actually be followed in the UI.
            cfg_weight = gr.Slider(
                0.0, 1, step=.05, label="CFG/Pace", value=0.5
            )
            with gr.Accordion("Mas Opciones", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
            run_btn = gr.Button("Generar", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Salida de Audio")

    def on_language_change(lang):
        """Return the default reference clip and sample text for ``lang``."""
        return default_audio_for_ui(lang), default_text_for_ui(lang)

    # Switching language replaces both the reference clip and the text with
    # that language's defaults; only the new language code is needed as input.
    language_id.change(
        fn=on_language_change,
        inputs=[language_id],
        outputs=[ref_wav, text],
        show_progress=False,
    )

    # The input order must match generate_tts_audio's positional parameters.
    run_btn.click(
        fn=generate_tts_audio,
        inputs=[
            text,
            language_id,
            ref_wav,
            exaggeration,
            temp,
            seed_num,
            cfg_weight,
        ],
        outputs=[audio_output],
    )

demo.launch(mcp_server=True)