Spaces:
Sleeping
Sleeping
File size: 6,257 Bytes
4428f5f 8b5510c 4428f5f 71ee5ef 8b5510c 71ee5ef 8b5510c 71ee5ef d39efb0 8b5510c 58fd207 8b5510c 71ee5ef 8b5510c 71ee5ef 8b5510c 52c31e0 8b5510c 6620877 8b5510c 52c31e0 8b5510c 6620877 8b5510c 6620877 8b5510c 6620877 8b5510c c92f551 6699d9a 8b5510c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | #!/usr/bin/env python3
"""
NumberBlocks One Voice Cloning Space - VoxCPM V3
使用 VoxCPM 2 模型进行音色克隆推理
"""
import os
import gradio as gr
import tempfile
import soundfile as sf
import traceback
from pathlib import Path
# Environment check: read the Hugging Face token from HF_TOKEN, falling back
# to HUGGINGFACE_TOKEN when HF_TOKEN is not set (None if neither is set).
HF_TOKEN = os.getenv("HF_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
def load_model():
    """Import VoxCPM lazily and load the pretrained ``openbmb/VoxCPM2`` model.

    The heavy dependencies (``voxcpm``, ``torch``) are imported inside the
    function so that a missing or broken install is reported as a load error
    instead of failing at module import time.

    Returns:
        A ``(model, device, error)`` tuple. On success ``error`` is ``None``
        and ``device`` is ``"cuda"`` or ``"cpu"`` as detected via torch (the
        value is only reported; it is not passed to ``from_pretrained``).
        On any exception the traceback is printed and
        ``(None, "cpu", str(exception))`` is returned.
    """
    try:
        from voxcpm import VoxCPM
        import torch

        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility issues.
        load_options = {"load_denoiser": False, "optimize": False}
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", **load_options)
        print("Model loaded successfully!")
    except Exception as exc:
        print(f"Error loading model: {exc}")
        traceback.print_exc()
        return None, "cpu", str(exc)
    return model, device, None
# Global model state shared by every caller in this module:
#   model   - the loaded model object, or None while not loaded
#   device  - device string reported by the loader
#   error   - message from the last failed load attempt, or None
#   loading - True while a load attempt is in progress
MODEL_STATE = dict(
    model=None,
    device="cpu",
    error=None,
    loading=False,
)
def ensure_model():
    """Make sure the model is loaded and return the current global state.

    A load is attempted only when no model is cached and no other load is
    flagged as in progress; in every other case the cached state is returned
    unchanged. Because a failed load leaves ``model`` as ``None``, the next
    call attempts the load again.

    Returns:
        A ``(model, device, error)`` tuple read from ``MODEL_STATE``.
        ``model`` is ``None`` when loading failed or is still in progress.
    """
    state = MODEL_STATE
    needs_load = state["model"] is None and not state["loading"]

    if needs_load:
        state["loading"] = True
        try:
            loaded, where, problem = load_model()
            state.update(model=loaded, device=where, error=problem)
        except Exception as exc:
            state["error"] = str(exc)
            traceback.print_exc()
        finally:
            # Always clear the flag so a later call can attempt another load.
            state["loading"] = False

    return state["model"], state["device"], state["error"]
def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` using ``reference_audio`` as the voice reference.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        reference_audio: Path of the reference clip (the UI supplies a
            filepath). Multi-channel audio is reduced to its first channel.
        cfg_value: Converted to ``float`` and passed to ``model.generate``
            as ``cfg_value``.
        steps: Converted to ``int`` and passed to ``model.generate`` as
            ``inference_timesteps``.

    Returns:
        ``(output_path, status_message)`` on success, or
        ``(None, error_message)`` when validation, model loading, or
        generation fails. Exceptions are caught, logged with a traceback and
        reported through the message; this function does not raise them.
    """
    import time  # local import: only this handler needs it

    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"
    if not text.strip():
        return None, "❌ 文本不能为空"

    ref_path = None
    try:
        model, _device, error = ensure_model()
        if error:
            return None, f"❌ 模型加载失败: {error}"
        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)

        # Multi-channel input: keep only the first channel.
        if ref_audio.ndim > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a temp file name and close the handle before writing, so
        # the write also works on platforms that forbid opening a file twice.
        # ref_path is recorded first so a failed write is still cleaned up.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Generate the audio and time it with a monotonic clock.
        t0 = time.perf_counter()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.perf_counter() - t0

        # Save the output.
        # NOTE(review): the output path is fixed, so every call overwrites
        # the previous result — confirm requests are never served in parallel.
        sample_rate = model.tts_model.sample_rate
        output_path = "/tmp/voxcpm_output.wav"
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)
        return output_path, msg
    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        return None, error_msg
    finally:
        # Remove the temporary reference copy on success AND failure; a
        # cleanup problem is logged but must not turn a success into an error.
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except OSError as cleanup_err:
                print(f"Warning: could not remove temp file {ref_path}: {cleanup_err}")
# Preset texts: each key is a button label, each value the text that the
# button places into the input box.
PRESET_TEXTS = {
    "问候": (
        "Hello! I am One! "
        "I am the first Numberblock, and I love being number one!"
    ),
    "计数": (
        "One, two, three, four, five! Counting is so much fun! "
        "I can count all the way to ten!"
    ),
    "情感": (
        "Sometimes I feel a little lonely being just one, "
        "but then I remember that one is the start of everything!"
    ),
}
# Build the Gradio interface.
# NOTE(review): the nesting of the Row/Column containers below was
# reconstructed from a copy of this file whose indentation had been lost —
# confirm which container each widget belongs to against the running app.
with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
    gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
    with gr.Row():
        with gr.Column():
            # Text to synthesize; pre-filled with the first preset.
            text_input = gr.Textbox(
                label="输入文本",
                placeholder="输入要合成的文本...",
                lines=3,
                value=PRESET_TEXTS["问候"]
            )
            with gr.Row():
                # One button per preset. The default argument (t=txt) binds
                # the current text to each lambda so every button returns its
                # own preset rather than the last one of the loop.
                for name, txt in PRESET_TEXTS.items():
                    gr.Button(name).click(lambda t=txt: t, inputs=None, outputs=text_input)
        with gr.Column():
            # type="filepath" makes the handler receive a path string.
            ref_audio_input = gr.Audio(
                label="参考音频 (One 的声音)",
                type="filepath"
            )
            with gr.Row():
                cfg_slider = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value (越高越像参考音色)"
                )
                steps_slider = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="推理步数 (越高质量越好但越慢)"
                )
    generate_btn = gr.Button("🎙️ 生成音频", variant="primary")
    with gr.Row():
        output_audio = gr.Audio(label="生成结果")
        status_msg = gr.Markdown(value="⏸️ 等待生成...")
    # Wire the button: inputs map positionally onto
    # generate_audio(text, reference_audio, cfg_value, steps) and its two
    # return values fill the audio player and the status line.
    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, ref_audio_input, cfg_slider, steps_slider],
        outputs=[output_audio, status_msg]
    )
    gr.Markdown("---")
    gr.Markdown("### 说明")
    # Usage notes shown at the bottom of the page.
    gr.Markdown("""
- **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
- **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
- **推理步数**: 默认 10,越高质量越好但生成越慢
- **模型**: VoxCPM 2 (openbmb/VoxCPM2)
""")
if __name__ == "__main__":
    import threading

    def preload():
        """Load the model once in the background at startup."""
        print("Preloading VoxCPM model...")
        ensure_model()

    # Preload at startup on a daemon thread so the UI can launch immediately
    # and the thread never blocks interpreter exit.
    preload_thread = threading.Thread(target=preload, daemon=True)
    preload_thread.start()

    demo.launch()
|