Chatterbox-Multilingual-TTS

Build error

App Files Community

JymNils committed on Oct 25, 2025

Commit

5e4e0c2

verified ·

1 Parent(s): a930ed8

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -12

app.py CHANGED Viewed

@@ -196,7 +196,7 @@ def generate_tts_audio(
     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
-        text_input (str): The text to synthesize into speech (maximum 300 characters)
         language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
         audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
         exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
@@ -232,7 +232,7 @@ def generate_tts_audio(
         print("No audio prompt provided; using default voice.")
     wav = current_model.generate(
-        text_input[:300],  # Truncate text to max chars
         language_id=language_id,
         **generate_kwargs
     )
@@ -242,10 +242,10 @@ def generate_tts_audio(
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        # Chatterbox Multilingual Demo
-        Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
-        For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
         """
     )
@@ -253,10 +253,10 @@ with gr.Blocks() as demo:
     gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
-            initial_lang = "fr"
             text = gr.Textbox(
                 value=default_text_for_ui(initial_lang),
-                label="Text to synthesize (max chars 300)",
                 max_lines=5
             )
@@ -264,23 +264,23 @@ with gr.Blocks() as demo:
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
                 label="Language",
-                info="Select the language for text-to-speech synthesis"
             )
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
-                label="Reference Audio File (Optional)",
                 value=default_audio_for_ui(initial_lang)
             )
             gr.Markdown(
-                "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
                 elem_classes=["audio-note"]
             )
             exaggeration = gr.Slider(
-                0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
             )
             cfg_weight = gr.Slider(
                 0.2, 1, step=.05, label="CFG/Pace", value=0.5
@@ -290,7 +290,7 @@ with gr.Blocks() as demo:
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
-            run_btn = gr.Button("Generate", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")

     maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
     Args:
+        text_input (str): The text to synthesize into speech (maximum 1000 characters)
         language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
         audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
         exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
         print("No audio prompt provided; using default voice.")
     wav = current_model.generate(
+        text_input[:1000],  # Truncate text to max chars
         language_id=language_id,
         **generate_kwargs
     )
 with gr.Blocks() as demo:
     gr.Markdown(
         """
+        # Chatterbox Multilingual Demo for CPU
+        Genera Voz de alta calidad multilingue con referencia deaudio + stylo, suporta 23 languajes.
+        Para una version tuneada, visita [resemble.ai](https://app.resemble.ai)
         """
     )
     gr.Markdown(get_supported_languages_display())
     with gr.Row():
         with gr.Column():
+            initial_lang = "es"
             text = gr.Textbox(
                 value=default_text_for_ui(initial_lang),
+                label="Texto a sintetizar (max 1000 caracteres)",
                 max_lines=5
             )
                 choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
                 value=initial_lang,
                 label="Language",
+                info="Seleccion el laeguagj para la sintesis"
             )
             ref_wav = gr.Audio(
                 sources=["upload", "microphone"],
                 type="filepath",
+                label="Audio de Referencia (Opcional)",
                 value=default_audio_for_ui(initial_lang)
             )
             gr.Markdown(
+                "💡 **Note**: Asegurarse que el audio de referencia y el del texto sean el mismo. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
                 elem_classes=["audio-note"]
             )
             exaggeration = gr.Slider(
+                0.25, 2, step=.05, label="Exageracion (Neutral = 0.5, valores extremos son inestables)", value=.5
             )
             cfg_weight = gr.Slider(
                 0.2, 1, step=.05, label="CFG/Pace", value=0.5
                 seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                 temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
+            run_btn = gr.Button("Generar", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Output Audio")