Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -196,7 +196,7 @@ def generate_tts_audio(
|
|
| 196 |
maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
|
| 197 |
|
| 198 |
Args:
|
| 199 |
-
text_input (str): The text to synthesize into speech (maximum
|
| 200 |
language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
|
| 201 |
audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
|
| 202 |
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
|
@@ -232,7 +232,7 @@ def generate_tts_audio(
|
|
| 232 |
print("No audio prompt provided; using default voice.")
|
| 233 |
|
| 234 |
wav = current_model.generate(
|
| 235 |
-
text_input[:
|
| 236 |
language_id=language_id,
|
| 237 |
**generate_kwargs
|
| 238 |
)
|
|
@@ -242,10 +242,10 @@ def generate_tts_audio(
|
|
| 242 |
with gr.Blocks() as demo:
|
| 243 |
gr.Markdown(
|
| 244 |
"""
|
| 245 |
-
# Chatterbox Multilingual Demo
|
| 246 |
-
|
| 247 |
|
| 248 |
-
|
| 249 |
"""
|
| 250 |
)
|
| 251 |
|
|
@@ -253,10 +253,10 @@ with gr.Blocks() as demo:
|
|
| 253 |
gr.Markdown(get_supported_languages_display())
|
| 254 |
with gr.Row():
|
| 255 |
with gr.Column():
|
| 256 |
-
initial_lang = "
|
| 257 |
text = gr.Textbox(
|
| 258 |
value=default_text_for_ui(initial_lang),
|
| 259 |
-
label="
|
| 260 |
max_lines=5
|
| 261 |
)
|
| 262 |
|
|
@@ -264,23 +264,23 @@ with gr.Blocks() as demo:
|
|
| 264 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 265 |
value=initial_lang,
|
| 266 |
label="Language",
|
| 267 |
-
info="
|
| 268 |
)
|
| 269 |
|
| 270 |
ref_wav = gr.Audio(
|
| 271 |
sources=["upload", "microphone"],
|
| 272 |
type="filepath",
|
| 273 |
-
label="
|
| 274 |
value=default_audio_for_ui(initial_lang)
|
| 275 |
)
|
| 276 |
|
| 277 |
gr.Markdown(
|
| 278 |
-
"💡 **Note**:
|
| 279 |
elem_classes=["audio-note"]
|
| 280 |
)
|
| 281 |
|
| 282 |
exaggeration = gr.Slider(
|
| 283 |
-
0.25, 2, step=.05, label="
|
| 284 |
)
|
| 285 |
cfg_weight = gr.Slider(
|
| 286 |
0.2, 1, step=.05, label="CFG/Pace", value=0.5
|
|
@@ -290,7 +290,7 @@ with gr.Blocks() as demo:
|
|
| 290 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 291 |
temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
|
| 292 |
|
| 293 |
-
run_btn = gr.Button("
|
| 294 |
|
| 295 |
with gr.Column():
|
| 296 |
audio_output = gr.Audio(label="Output Audio")
|
|
|
|
| 196 |
maintains the prosody, tone, and vocal qualities of the reference speaker, or uses default voice if no reference is provided.
|
| 197 |
|
| 198 |
Args:
|
| 199 |
+
text_input (str): The text to synthesize into speech (maximum 1000 characters)
|
| 200 |
language_id (str): The language code for synthesis (eg. en, fr, de, es, it, pt, hi)
|
| 201 |
audio_prompt_path_input (str, optional): File path or URL to the reference audio file that defines the target voice style. Defaults to None.
|
| 202 |
exaggeration_input (float, optional): Controls speech expressiveness (0.25-2.0, neutral=0.5, extreme values may be unstable). Defaults to 0.5.
|
|
|
|
| 232 |
print("No audio prompt provided; using default voice.")
|
| 233 |
|
| 234 |
wav = current_model.generate(
|
| 235 |
+
text_input[:1000], # Truncate text to max chars
|
| 236 |
language_id=language_id,
|
| 237 |
**generate_kwargs
|
| 238 |
)
|
|
|
|
| 242 |
with gr.Blocks() as demo:
|
| 243 |
gr.Markdown(
|
| 244 |
"""
|
| 245 |
+
# Chatterbox Multilingual Demo for CPU
|
| 246 |
+
Genera voz multilingüe de alta calidad con audio de referencia y estilo; soporta 23 idiomas.
|
| 247 |
|
| 248 |
+
Para una version tuneada, visita [resemble.ai](https://app.resemble.ai)
|
| 249 |
"""
|
| 250 |
)
|
| 251 |
|
|
|
|
| 253 |
gr.Markdown(get_supported_languages_display())
|
| 254 |
with gr.Row():
|
| 255 |
with gr.Column():
|
| 256 |
+
initial_lang = "es"
|
| 257 |
text = gr.Textbox(
|
| 258 |
value=default_text_for_ui(initial_lang),
|
| 259 |
+
label="Texto a sintetizar (max 1000 caracteres)",
|
| 260 |
max_lines=5
|
| 261 |
)
|
| 262 |
|
|
|
|
| 264 |
choices=list(ChatterboxMultilingualTTS.get_supported_languages().keys()),
|
| 265 |
value=initial_lang,
|
| 266 |
label="Language",
|
| 267 |
+
info="Seleccione el idioma para la síntesis"
|
| 268 |
)
|
| 269 |
|
| 270 |
ref_wav = gr.Audio(
|
| 271 |
sources=["upload", "microphone"],
|
| 272 |
type="filepath",
|
| 273 |
+
label="Audio de Referencia (Opcional)",
|
| 274 |
value=default_audio_for_ui(initial_lang)
|
| 275 |
)
|
| 276 |
|
| 277 |
gr.Markdown(
|
| 278 |
+
"💡 **Note**: Asegúrese de que el idioma del audio de referencia y el del texto sean el mismo. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
|
| 279 |
elem_classes=["audio-note"]
|
| 280 |
)
|
| 281 |
|
| 282 |
exaggeration = gr.Slider(
|
| 283 |
+
0.25, 2, step=.05, label="Exageracion (Neutral = 0.5, valores extremos son inestables)", value=.5
|
| 284 |
)
|
| 285 |
cfg_weight = gr.Slider(
|
| 286 |
0.2, 1, step=.05, label="CFG/Pace", value=0.5
|
|
|
|
| 290 |
seed_num = gr.Number(value=0, label="Random seed (0 for random)")
|
| 291 |
temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
|
| 292 |
|
| 293 |
+
run_btn = gr.Button("Generar", variant="primary")
|
| 294 |
|
| 295 |
with gr.Column():
|
| 296 |
audio_output = gr.Audio(label="Output Audio")
|