File size: 6,257 Bytes
4428f5f
 
8b5510c
 
4428f5f
71ee5ef
 
8b5510c
71ee5ef
 
8b5510c
71ee5ef
d39efb0
8b5510c
 
58fd207
8b5510c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71ee5ef
8b5510c
 
 
 
71ee5ef
8b5510c
52c31e0
8b5510c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6620877
8b5510c
 
 
 
52c31e0
8b5510c
 
 
 
 
6620877
8b5510c
 
 
 
 
 
 
6620877
8b5510c
 
 
 
 
 
6620877
8b5510c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c92f551
6699d9a
8b5510c
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloning Space - VoxCPM V3
Voice-cloning inference using the VoxCPM 2 model.
"""

import os
import gradio as gr
import tempfile
import soundfile as sf
import traceback
from pathlib import Path

# Hugging Face access token: prefer HF_TOKEN, fall back to HUGGINGFACE_TOKEN.
# The value is None when neither variable is set.
HF_TOKEN = os.environ.get("HF_TOKEN")
if HF_TOKEN is None:
    HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

def load_model():
    """Load the VoxCPM 2 checkpoint ("openbmb/VoxCPM2").

    Returns:
        A ``(model, device, error)`` tuple. On success ``error`` is ``None``
        and ``device`` is ``"cuda"`` or ``"cpu"``. If anything raises
        (including the imports below), the traceback is printed and
        ``(None, "cpu", <exception text>)`` is returned instead.
    """
    try:
        # Imported inside the try block so that a failing import is reported
        # through the returned error string rather than raised to the caller.
        from voxcpm import VoxCPM
        import torch

        # The device is only logged and returned; it is not passed to
        # from_pretrained here.
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility problems.
        pretrained_options = {"load_denoiser": False, "optimize": False}
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", **pretrained_options)
        print("Model loaded successfully!")
    except Exception as exc:
        print(f"Error loading model: {exc}")
        traceback.print_exc()
        return None, "cpu", str(exc)
    return model, device, None

# Global model state shared by every caller of ensure_model():
#   model   - the loaded model object, or None while not loaded
#   device  - device string reported by load_model()
#   error   - text of the last load error, or None
#   loading - True while a load is in progress
MODEL_STATE = dict(
    model=None,
    device="cpu",
    error=None,
    loading=False,
)

def ensure_model():
    """Make sure a model load has been attempted and report the current state.

    A load is started only when no model is cached and no load is flagged as
    in progress; otherwise the current state is returned unchanged (so the
    model may be ``None`` while another caller is still loading it).

    Returns:
        ``(model, device, error)`` as currently stored in ``MODEL_STATE``.
    """
    state = MODEL_STATE

    if state["model"] is not None or state["loading"]:
        return state["model"], state["device"], state["error"]

    state["loading"] = True
    try:
        model, device, error = load_model()
        state.update(model=model, device=device, error=error)
    except Exception as exc:
        state["error"] = str(exc)
        traceback.print_exc()
    finally:
        # Always clear the flag so a later call may retry after a failure.
        state["loading"] = False

    return state["model"], state["device"], state["error"]

def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` using ``reference_audio`` as the voice reference.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the reference recording, read with
            ``soundfile``.
        cfg_value: Forwarded to ``model.generate`` as ``cfg_value`` after
            conversion to ``float``.
        steps: Forwarded to ``model.generate`` as ``inference_timesteps``
            after conversion to ``int``.

    Returns:
        ``(output_path, message)`` on success, or ``(None, message)`` when the
        input is invalid, the model is unavailable, or generation fails.
        Exceptions are not propagated: they are printed with a traceback and
        reported through the message.
    """
    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"

    if not text.strip():
        return None, "❌ 文本不能为空"

    # Path of the temporary mono reference file; stays None until it exists so
    # the cleanup in ``finally`` knows whether there is anything to remove.
    ref_path = None
    try:
        model, _device, error = ensure_model()
        if error:
            return None, f"❌ 模型加载失败: {error}"
        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)

        # Multi-channel input: keep only the first channel.
        if ref_audio.ndim > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a temporary file name, close the handle, then write to the
        # path. Recording the path before writing guarantees it is cleaned up
        # even if the write itself fails.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Run generation and time it.
        import time
        t0 = time.time()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.time() - t0

        # Save the result.
        sample_rate = model.tts_model.sample_rate
        output_path = "/tmp/voxcpm_output.wav"
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)

        return output_path, msg

    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        return None, error_msg

    finally:
        # Remove the temporary reference file on every exit path. Cleanup is
        # best effort: a failure here must not turn a successful generation
        # into an error.
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except OSError as cleanup_error:
                print(f"Warning: could not remove {ref_path}: {cleanup_error}")

# Preset texts offered as one-click buttons in the UI (button label -> text).
PRESET_TEXTS = {
    "问候": (
        "Hello! I am One! I am the first Numberblock, "
        "and I love being number one!"
    ),
    "计数": (
        "One, two, three, four, five! Counting is so much fun! "
        "I can count all the way to ten!"
    ),
    "情感": (
        "Sometimes I feel a little lonely being just one, "
        "but then I remember that one is the start of everything!"
    ),
}

# Build the Gradio interface. Components are declared in display order inside
# nested Row/Column contexts, so statement order here defines the layout.
with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
    gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")

    with gr.Row():
        # Left column: text to synthesize plus preset buttons.
        with gr.Column():
            text_input = gr.Textbox(
                label="输入文本",
                placeholder="输入要合成的文本...",
                lines=3,
                value=PRESET_TEXTS["问候"]
            )

            with gr.Row():
                # One button per preset. The default argument `t=txt` binds the
                # current preset text at definition time, so each button writes
                # its own text into the textbox.
                for name, txt in PRESET_TEXTS.items():
                    gr.Button(name).click(lambda t=txt: t, inputs=None, outputs=text_input)

        # Right column: reference recording, handed to the callback as a file path.
        with gr.Column():
            ref_audio_input = gr.Audio(
                label="参考音频 (One 的声音)",
                type="filepath"
            )

    # Generation parameters; defaults match generate_audio's defaults (2.0 and 10).
    with gr.Row():
        cfg_slider = gr.Slider(
            minimum=0.5,
            maximum=5.0,
            value=2.0,
            step=0.1,
            label="CFG Value (越高越像参考音色)"
        )
        steps_slider = gr.Slider(
            minimum=5,
            maximum=50,
            value=10,
            step=1,
            label="推理步数 (越高质量越好但越慢)"
        )

    generate_btn = gr.Button("🎙️ 生成音频", variant="primary")

    # Outputs: the generated audio and a status line.
    with gr.Row():
        output_audio = gr.Audio(label="生成结果")
        status_msg = gr.Markdown(value="⏸️ 等待生成...")

    # generate_audio returns (output_path, message), mapped onto the two outputs.
    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, ref_audio_input, cfg_slider, steps_slider],
        outputs=[output_audio, status_msg]
    )

    # Static usage notes shown below the controls.
    gr.Markdown("---")
    gr.Markdown("### 说明")
    gr.Markdown("""
    - **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
    - **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
    - **推理步数**: 默认 10,越高质量越好但生成越慢
    - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
    """)

if __name__ == "__main__":
    import threading

    def preload():
        """Load the model in the background so the UI can start immediately."""
        print("Preloading VoxCPM model...")
        ensure_model()

    # Daemon thread: it does not keep the process alive once the app exits.
    preload_thread = threading.Thread(target=preload, daemon=True)
    preload_thread.start()

    demo.launch()