Spaces:
Running on Zero
Running on Zero
File size: 4,750 Bytes
c30faec 0d2648c c30faec f707318 c30faec e871805 c30faec f707318 c30faec 0d2648c c30faec 2519e76 c30faec 2519e76 c30faec 0d2648c c30faec 2519e76 c30faec e871805 c30faec e871805 c30faec 2519e76 c30faec 0d2648c c30faec e871805 c30faec | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | import random
import numpy as np
import torch
from chatterbox.src.chatterbox.tts import ChatterboxTTS
import gradio as gr
import spaces
# Process-wide model handle; stays None until get_or_load_model() fills it in.
MODEL = None

# ZeroGPU supports CUDA placement at module load time via CUDA emulation.
TARGET_DEVICE = "cuda"

# Language code passed to the model as `language_id` on every generation.
FIXED_LANGUAGE_ID = 'es'

# Shared literals so the UI defaults and the first examples cannot drift apart.
_DEFAULT_PROMPT_URL = 'https://storage.googleapis.com/chatterbox-demo-samples/mtl-v3-single-language-prompts/es-latam/es_mx_f1.wav'
_DEFAULT_TEXT = '¡Hola! ¿Qué onda? Hoy hace un clima padrísimo para salir a caminar.'

# Initial values shown in the text box and the reference-audio widget.
DEFAULT_CONFIG = {
    "audio": _DEFAULT_PROMPT_URL,
    "text": _DEFAULT_TEXT,
}

# Each row follows the UI input order:
# [text, reference audio, exaggeration, temperature, seed, cfg weight]
EXAMPLES = [
    [_DEFAULT_TEXT, _DEFAULT_PROMPT_URL, 0.5, 0.8, 0, 0.5],
    [
        'En la Ciudad de México hay tantas cosas por descubrir, desde tacos al pastor hasta murales de Diego Rivera.',
        _DEFAULT_PROMPT_URL,
        0.5,
        0.8,
        0,
        0.5,
    ],
    [
        '¿Vamos por unos tacos y un agua de horchata después del trabajo?',
        'https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_m1.flac',
        0.5,
        0.8,
        0,
        0.5,
    ],
]
def default_audio_for_ui():
    """Return the "audio" entry of DEFAULT_CONFIG, or None when it is absent."""
    if "audio" in DEFAULT_CONFIG:
        return DEFAULT_CONFIG["audio"]
    return None
def default_text_for_ui():
    """Return the "text" entry of DEFAULT_CONFIG, or an empty string when it is absent."""
    try:
        return DEFAULT_CONFIG["text"]
    except KeyError:
        return ""
def get_or_load_model():
    """Return the shared ChatterboxTTS instance, loading it on first use.

    The model is stored in the module-level ``MODEL`` so later calls reuse
    it. Loading is attempted on ``TARGET_DEVICE``; if that device is "cuda"
    and loading fails, a second attempt is made on the CPU. Failures on any
    other target device are re-raised unchanged.
    """
    global MODEL

    # Fast path: a previous call already populated the cache.
    if MODEL is not None:
        return MODEL

    print(f"Model not loaded, initializing on {TARGET_DEVICE}...")
    try:
        MODEL = ChatterboxTTS.from_pretrained(TARGET_DEVICE)
    except Exception as exc:
        cuda_was_requested = TARGET_DEVICE == "cuda"
        if not cuda_was_requested:
            # No fallback is defined for non-CUDA targets.
            raise
        print(f"CUDA model initialization failed, falling back to CPU: {exc}")
        MODEL = ChatterboxTTS.from_pretrained("cpu")

    print(f"Model loaded on {MODEL.device}.")
    return MODEL
def set_seed(seed: int, device: str):
    """Seed the torch, Python ``random`` and NumPy random number generators.

    Args:
        seed: Integer seed. Values outside NumPy's accepted range
            (0 .. 2**32 - 1), including negative numbers, are wrapped
            modulo 2**32 for NumPy only; torch and ``random`` receive the
            value unchanged.
        device: Device the model runs on. Anything whose string form starts
            with "cuda" (for example "cuda", "cuda:0" or a ``torch.device``)
            additionally seeds the CUDA generators when CUDA is available.
    """
    torch.manual_seed(seed)

    # Compare on the string form so "cuda:0" and torch.device objects are
    # recognised as well as the bare "cuda" string.
    if str(device).startswith("cuda") and torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    random.seed(seed)

    # np.random.seed raises ValueError for seeds outside [0, 2**32 - 1];
    # wrap instead of crashing on a negative or very large UI value.
    np.random.seed(seed % (2 ** 32))
@spaces.GPU
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.5,
):
    """Generate speech from text with optional reference audio styling.

    Args:
        text_input: Text to synthesize. Only the first 300 characters are
            passed to the model.
        audio_prompt_path_input: Reference audio for the generation. When
            empty or None, the default prompt from ``default_audio_for_ui()``
            is used instead.
        exaggeration_input: Forwarded to the model as ``exaggeration``.
        temperature_input: Forwarded to the model as ``temperature``.
        seed_num_input: Random seed. ``0`` or an empty value (None) leaves
            the random number generators unseeded.
        cfgw_input: Forwarded to the model as ``cfg_weight``.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where the waveform is the
        model output converted to a NumPy array.
    """
    current_model = get_or_load_model()
    device = current_model.device

    # A cleared gr.Number field arrives as None; treat it like 0 ("random")
    # instead of letting int(None) raise a TypeError.
    if seed_num_input:
        set_seed(int(seed_num_input), device)

    chosen_prompt = audio_prompt_path_input or default_audio_for_ui()
    lang = FIXED_LANGUAGE_ID
    print(f"Generating on {device} (lang={lang}) for text: '{text_input[:50]}...'")

    generate_kwargs = {
        "exaggeration": exaggeration_input,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
        "language_id": lang,
    }
    # Only pass a prompt when one is actually configured.
    if chosen_prompt:
        generate_kwargs["audio_prompt_path"] = chosen_prompt

    # The UI advertises a 300-character limit; enforce it here.
    wav = current_model.generate(text_input[:300], **generate_kwargs)
    return (current_model.sr, wav.squeeze(0).cpu().numpy())
# Load the model eagerly at import time rather than on the first request.
get_or_load_model()

with gr.Blocks() as demo:
    gr.Markdown(
        """
# Chatterbox Multilingual TTS — Spanish (LatAm)
Chatterbox TTS for LatAm Spanish (es-MX).
Powered by model [`ResembleAI/Chatterbox-Multilingual-es-mx-latam`](https://huggingface.co/ResembleAI/Chatterbox-Multilingual-es-mx-latam).
"""
    )

    with gr.Row():
        # Left column: every input control plus the trigger button.
        with gr.Column():
            text = gr.Textbox(
                value=default_text_for_ui(),
                label="Text to synthesize (max chars 300)",
                max_lines=5,
            )
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value=default_audio_for_ui(),
            )
            exaggeration = gr.Slider(
                minimum=0.25,
                maximum=2,
                step=0.05,
                label="Exaggeration (Neutral = 0.5)",
                value=0.5,
            )
            cfg_weight = gr.Slider(
                minimum=0.2,
                maximum=1,
                step=0.05,
                label="CFG/Pace",
                value=0.5,
            )

            with gr.Accordion("More options", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(
                    minimum=0.05,
                    maximum=5,
                    step=0.05,
                    label="Temperature",
                    value=0.8,
                )

            run_btn = gr.Button("Generate", variant="primary")

        # Right column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio")

    # Order must match the positional parameters of generate_tts_audio and
    # the column order of each EXAMPLES row.
    inputs = [text, ref_wav, exaggeration, temp, seed_num, cfg_weight]

    run_btn.click(fn=generate_tts_audio, inputs=inputs, outputs=[audio_output])

    gr.Examples(
        examples=EXAMPLES,
        inputs=inputs,
        label="Examples",
    )

demo.launch(mcp_server=True)
|