File size: 4,750 Bytes
c30faec
 
 
 
 
 
 
 
0d2648c
 
c30faec
 
f707318
c30faec
 
 
e871805
 
c30faec
f707318
 
c30faec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d2648c
 
 
 
 
 
 
 
 
c30faec
 
 
2519e76
c30faec
2519e76
c30faec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d2648c
c30faec
2519e76
c30faec
e871805
 
c30faec
 
 
 
e871805
c30faec
 
 
 
2519e76
c30faec
 
0d2648c
 
 
c30faec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e871805
c30faec
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import random
import numpy as np
import torch
from chatterbox.src.chatterbox.tts import ChatterboxTTS
import gradio as gr
import spaces

# Process-wide cache for the loaded TTS model; None until get_or_load_model() fills it.
MODEL = None
# ZeroGPU supports CUDA placement at module load time via CUDA emulation.
TARGET_DEVICE = "cuda"

# Language code passed to the model on every generation request.
FIXED_LANGUAGE_ID = "es"

# Reference prompts and the greeting sentence are shared between the UI
# defaults and the example rows, so each literal is spelled out only once.
_PROMPT_ES_MX_F1 = "https://storage.googleapis.com/chatterbox-demo-samples/mtl-v3-single-language-prompts/es-latam/es_mx_f1.wav"
_PROMPT_ES_M1 = "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_m1.flac"
_GREETING_TEXT = "¡Hola! ¿Qué onda? Hoy hace un clima padrísimo para salir a caminar."

# Values pre-filled into the text box and the reference-audio widget.
DEFAULT_CONFIG = {
    "audio": _PROMPT_ES_MX_F1,
    "text": _GREETING_TEXT,
}

# Each row is: text, reference audio, exaggeration, temperature, seed, cfg weight.
EXAMPLES = [
    [_GREETING_TEXT, _PROMPT_ES_MX_F1, 0.5, 0.8, 0, 0.5],
    [
        "En la Ciudad de México hay tantas cosas por descubrir, desde tacos al pastor hasta murales de Diego Rivera.",
        _PROMPT_ES_MX_F1,
        0.5,
        0.8,
        0,
        0.5,
    ],
    [
        "¿Vamos por unos tacos y un agua de horchata después del trabajo?",
        _PROMPT_ES_M1,
        0.5,
        0.8,
        0,
        0.5,
    ],
]


def default_audio_for_ui():
    """Return the default reference audio from DEFAULT_CONFIG, or None if unset."""
    audio_default = DEFAULT_CONFIG.get("audio", None)
    return audio_default


def default_text_for_ui():
    """Return the default text from DEFAULT_CONFIG, or an empty string if unset."""
    if "text" in DEFAULT_CONFIG:
        return DEFAULT_CONFIG["text"]
    return ""


def get_or_load_model():
    """Return the shared TTS model, loading it on first use.

    The model is cached in the module-level ``MODEL``. Loading is attempted on
    ``TARGET_DEVICE``; if that fails and the target was ``"cuda"``, a second
    attempt is made on the CPU. Failures for any other target are re-raised.
    """
    global MODEL

    # Fast path: already initialised by an earlier call.
    if MODEL is not None:
        return MODEL

    print(f"Model not loaded, initializing on {TARGET_DEVICE}...")
    try:
        loaded = ChatterboxTTS.from_pretrained(TARGET_DEVICE)
    except Exception as exc:
        # Only a failed CUDA load has a fallback; anything else propagates.
        if TARGET_DEVICE != "cuda":
            raise
        print(f"CUDA model initialization failed, falling back to CPU: {exc}")
        loaded = ChatterboxTTS.from_pretrained("cpu")

    MODEL = loaded
    print(f"Model loaded on {MODEL.device}.")
    return MODEL


def set_seed(seed: int, device: str):
    """Seed every random number generator used by this module.

    Seeds PyTorch, Python's ``random`` module and NumPy's global generator so
    that repeated calls with the same seed draw the same random numbers.

    Args:
        seed: Seed value applied to all generators.
        device: Device the model runs on. May be a plain string such as
            ``"cuda"`` or ``"cuda:0"``, or a ``torch.device``.
    """
    torch.manual_seed(seed)

    # Compare on the device *type* so indexed devices ("cuda:0") and
    # torch.device objects are recognised, not just the literal "cuda".
    if isinstance(device, torch.device):
        device_type = device.type
    else:
        device_type = str(device).split(":", 1)[0]

    if device_type == "cuda" and torch.cuda.is_available():
        # manual_seed_all covers every GPU, including the current one.
        torch.cuda.manual_seed_all(seed)

    random.seed(seed)
    np.random.seed(seed)


@spaces.GPU
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.5,
):
    """Generate speech from text with optional reference audio styling.

    Args:
        text_input: Text to synthesize. Only the first 300 characters are used.
        audio_prompt_path_input: Reference audio used to style the voice. When
            empty, the default reference audio is used instead.
        exaggeration_input: Exaggeration value forwarded to the model.
        temperature_input: Sampling temperature forwarded to the model.
        seed_num_input: Random seed. ``0`` (or an empty value) leaves the
            random number generators unseeded.
        cfgw_input: CFG weight forwarded to the model.

    Returns:
        A ``(sample_rate, waveform)`` tuple built from the model's ``sr``
        attribute and the generated audio converted to a NumPy array.
    """
    current_model = get_or_load_model()
    device = current_model.device

    # A cleared number field can arrive as None; treat it like 0 ("random")
    # instead of crashing in int(None).
    seed = int(seed_num_input or 0)
    if seed != 0:
        set_seed(seed, device)

    # Guard against a missing text value so the slicing below cannot raise.
    text = text_input or ""

    chosen_prompt = audio_prompt_path_input or default_audio_for_ui()
    lang = FIXED_LANGUAGE_ID
    print(f"Generating on {device} (lang={lang}) for text: '{text[:50]}...'")
    generate_kwargs = {
        "exaggeration": exaggeration_input,
        "temperature": temperature_input,
        "cfg_weight": cfgw_input,
        "language_id": lang,
    }
    # Only pass a prompt when one is available (uploaded or default).
    if chosen_prompt:
        generate_kwargs["audio_prompt_path"] = chosen_prompt
    # The UI advertises a 300-character limit; enforce it here.
    wav = current_model.generate(text[:300], **generate_kwargs)
    return (current_model.sr, wav.squeeze(0).cpu().numpy())


# Load the model once at import time so the MODEL cache is already populated
# when the UI below is built and launched.
get_or_load_model()


# Build the Gradio interface: a header, one row with two columns (input
# controls and the audio output), a Generate button and a set of examples.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Chatterbox Multilingual TTS — Spanish (LatAm)
        Chatterbox TTS for LatAm Spanish (es-MX).
        Powered by model [`ResembleAI/Chatterbox-Multilingual-es-mx-latam`](https://huggingface.co/ResembleAI/Chatterbox-Multilingual-es-mx-latam).
        """
    )
    with gr.Row():
        # First column: everything the user can set before generating.
        with gr.Column():
            text = gr.Textbox(
                value=default_text_for_ui(),
                label="Text to synthesize (max chars 300)",
                max_lines=5,
            )
            # type="filepath" hands the handler a path string rather than raw audio data.
            ref_wav = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio File (Optional)",
                value=default_audio_for_ui(),
            )
            exaggeration = gr.Slider(0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5)", value=.5)
            cfg_weight = gr.Slider(0.2, 1, step=.05, label="CFG/Pace", value=0.5)
            # Less frequently used controls are tucked into a collapsed accordion.
            with gr.Accordion("More options", open=False):
                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
            run_btn = gr.Button("Generate", variant="primary")
        # Second column: the generated audio.
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio")

    # Order must match the positional parameters of generate_tts_audio:
    # text, reference audio, exaggeration, temperature, seed, cfg weight.
    inputs = [text, ref_wav, exaggeration, temp, seed_num, cfg_weight]
    run_btn.click(fn=generate_tts_audio, inputs=inputs, outputs=[audio_output])

    # Each EXAMPLES row follows the same column order as `inputs`.
    gr.Examples(
        examples=EXAMPLES,
        inputs=inputs,
        label="Examples",
    )

# Start the app with Gradio's MCP server enabled.
demo.launch(mcp_server=True)