ayf3's picture
Upload app.py with huggingface_hub
8b5510c verified
raw
history blame
6.26 kB
#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloning Space - VoxCPM V3
使用 VoxCPM 2 模型进行音色克隆推理
"""
import os
import tempfile
import threading
import traceback
from pathlib import Path

import gradio as gr
import soundfile as sf
# Hugging Face access token taken from the environment. ``HF_TOKEN`` wins
# whenever it is set (even to an empty string); otherwise the value of
# ``HUGGINGFACE_TOKEN`` is used, which is None when that is unset as well.
HF_TOKEN = (
    os.environ["HF_TOKEN"]
    if "HF_TOKEN" in os.environ
    else os.environ.get("HUGGINGFACE_TOKEN")
)
def load_model():
    """Load the VoxCPM model and report the outcome as a tuple.

    ``voxcpm`` and ``torch`` are imported inside the function, so a missing
    package is reported through the return value like any other failure.

    Returns:
        ``(model, device, None)`` on success, where ``device`` is ``"cuda"``
        when ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
        ``(None, "cpu", message)`` when anything raises; the exception is
        printed together with its traceback and never propagated.
    """
    try:
        from voxcpm import VoxCPM
        import torch

        # The device is only logged and returned to the caller; it is not
        # passed to ``from_pretrained``.
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility problems.
        load_options = {"load_denoiser": False, "optimize": False}
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", **load_options)
        print("Model loaded successfully!")
        return model, device, None
    except Exception as exc:
        print(f"Error loading model: {exc}")
        traceback.print_exc()
        return None, "cpu", str(exc)
# Global model state shared by the background preload thread and the request
# handlers.
MODEL_STATE = {
    "model": None,     # loaded model object, or None while not (yet) loaded
    "device": "cpu",   # device string reported by load_model()
    "error": None,     # message of the most recent load failure, if any
    "loading": False,  # True while some thread is inside load_model()
}

# Makes the "is a load needed?" check and the setting of the ``loading`` flag
# one atomic step. Without it, the preload thread and a request handler could
# both observe ``loading == False`` and load the model twice.
_MODEL_STATE_LOCK = threading.Lock()


def ensure_model():
    """Make sure the model is loaded, loading it at most once at a time.

    The first caller that finds no model and no load in progress runs
    ``load_model()`` itself. Callers that arrive while a load is running do
    not wait: they get the current state back immediately, with ``model``
    still None. The lock is never held during the load itself.

    Returns:
        ``(model, device, error)`` as currently stored in ``MODEL_STATE``.
    """
    with _MODEL_STATE_LOCK:
        should_load = (
            MODEL_STATE["model"] is None and not MODEL_STATE["loading"]
        )
        if should_load:
            MODEL_STATE["loading"] = True

    if should_load:
        try:
            model, device, error = load_model()
            # Publish all three fields together so a concurrent reader never
            # sees a new model paired with a stale error.
            with _MODEL_STATE_LOCK:
                MODEL_STATE["model"] = model
                MODEL_STATE["device"] = device
                MODEL_STATE["error"] = error
        except Exception as e:
            MODEL_STATE["error"] = str(e)
            traceback.print_exc()
        finally:
            # Always clear the flag so a failed load can be retried later.
            MODEL_STATE["loading"] = False

    with _MODEL_STATE_LOCK:
        return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
def _discard_temp_file(path):
    """Best-effort removal of a temporary file; a None path is ignored."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError as cleanup_error:
        # Failing to clean up must not turn a finished request into an error.
        print(f"Warning: could not remove temp file {path}: {cleanup_error}")


def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` using ``reference_audio`` as the voice reference.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the reference recording, read with
            ``soundfile``. Only the first channel of multi-channel audio is
            used.
        cfg_value: Guidance value, passed to ``model.generate`` as a float.
        steps: Number of inference timesteps, passed as an int.

    Returns:
        ``(output_path, message)`` on success and ``(None, message)``
        otherwise. Failures are reported through the message; nothing is
        raised to the caller.
    """
    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"
    if not text.strip():
        return None, "❌ 文本不能为空"

    ref_path = None
    output_path = None
    try:
        model, _device, error = ensure_model()
        if error:
            return None, f"❌ 模型加载失败: {error}"
        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)
        # Multi-channel input: keep the first channel only.
        if ref_audio.ndim > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a temp file name, close the handle, then write to it. The
        # path is recorded first so the ``finally`` clause removes the file
        # even if a later step fails.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Generate the audio.
        import time
        t0 = time.time()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.time() - t0

        # Save the result to a file unique to this request. A fixed path
        # would let overlapping requests overwrite each other's output.
        sample_rate = model.tts_model.sample_rate
        with tempfile.NamedTemporaryFile(
            prefix="voxcpm_output_", suffix=".wav", delete=False
        ) as out_file:
            output_path = out_file.name
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)
        return output_path, msg
    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        # Do not leave a partially written output file behind.
        _discard_temp_file(output_path)
        return None, error_msg
    finally:
        # The reference copy is removed on success and on failure alike.
        _discard_temp_file(ref_path)
# Preset sample texts. The UI creates one button per entry, labelled with the
# key, which copies the value into the text box.
PRESET_TEXTS = {
    "问候": (
        "Hello! I am One! I am the first Numberblock, "
        "and I love being number one!"
    ),
    "计数": (
        "One, two, three, four, five! Counting is so much fun! "
        "I can count all the way to ten!"
    ),
    "情感": (
        "Sometimes I feel a little lonely being just one, "
        "but then I remember that one is the start of everything!"
    ),
}
# Build the Gradio interface.
# NOTE(review): leading indentation is missing in this copy of the file, so
# the nesting of the Blocks/Row/Column containers below cannot be determined
# from here. Restore it from the original source before running; the code is
# deliberately left untouched rather than guessing a layout.
with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")
with gr.Row():
with gr.Column():
# Text to synthesize, pre-filled with the first preset.
text_input = gr.Textbox(
label="输入文本",
placeholder="输入要合成的文本...",
lines=3,
value=PRESET_TEXTS["问候"]
)
with gr.Row():
# One button per preset. ``t=txt`` binds the current preset text as a
# default argument, so each button writes its own text into the text box
# instead of the last value of the loop variable.
for name, txt in PRESET_TEXTS.items():
gr.Button(name).click(lambda t=txt: t, inputs=None, outputs=text_input)
with gr.Column():
# Reference recording; its value is passed to generate_audio, which reads
# it as a file path.
ref_audio_input = gr.Audio(
label="参考音频 (One 的声音)",
type="filepath"
)
with gr.Row():
# Forwarded to generate_audio as ``cfg_value``.
cfg_slider = gr.Slider(
minimum=0.5,
maximum=5.0,
value=2.0,
step=0.1,
label="CFG Value (越高越像参考音色)"
)
# Forwarded to generate_audio as ``steps``.
steps_slider = gr.Slider(
minimum=5,
maximum=50,
value=10,
step=1,
label="推理步数 (越高质量越好但越慢)"
)
generate_btn = gr.Button("🎙️ 生成音频", variant="primary")
with gr.Row():
output_audio = gr.Audio(label="生成结果")
status_msg = gr.Markdown(value="⏸️ 等待生成...")
# Inputs are listed in the parameter order of generate_audio; its
# (path, message) result fills the audio player and the status line.
generate_btn.click(
fn=generate_audio,
inputs=[text_input, ref_audio_input, cfg_slider, steps_slider],
outputs=[output_audio, status_msg]
)
# Static usage notes shown in the UI.
gr.Markdown("---")
gr.Markdown("### 说明")
gr.Markdown("""
- **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
- **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
- **推理步数**: 默认 10,越高质量越好但生成越慢
- **模型**: VoxCPM 2 (openbmb/VoxCPM2)
""")
if __name__ == "__main__":
    import threading

    def preload():
        """Load the model ahead of the first request."""
        print("Preloading VoxCPM model...")
        ensure_model()

    # Preload the model in a daemon thread at start-up, so launching the UI
    # does not wait for the load to finish.
    preload_thread = threading.Thread(target=preload, daemon=True)
    preload_thread.start()

    demo.launch()