#!/usr/bin/env python3
"""
NumberBlocks One Voice Cloning Space - VoxCPM V3

Runs voice-cloning inference with the VoxCPM 2 model behind a Gradio UI.
"""

import os
import tempfile
import threading
import time
import traceback
from pathlib import Path

import gradio as gr
import soundfile as sf

# Environment variable check: token is read from HF_TOKEN, falling back to
# HUGGINGFACE_TOKEN. It is None when neither variable is set.
HF_TOKEN = os.environ.get("HF_TOKEN", os.environ.get("HUGGINGFACE_TOKEN"))


def load_model():
    """Load the VoxCPM model.

    Returns:
        A ``(model, device, error)`` tuple. On success ``error`` is ``None``.
        On failure ``model`` is ``None``, ``device`` is ``"cpu"`` and
        ``error`` is the exception message.
    """
    try:
        # Imported lazily so that a missing or broken install is reported via
        # the returned error value instead of failing at module import time.
        from voxcpm import VoxCPM
        import torch

        # The device string is only logged and returned here; it is not
        # passed to from_pretrained.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility problems.
        model = VoxCPM.from_pretrained(
            "openbmb/VoxCPM2", load_denoiser=False, optimize=False
        )
        print("Model loaded successfully!")
        return model, device, None
    except Exception as e:
        print(f"Error loading model: {e}")
        traceback.print_exc()
        return None, "cpu", str(e)


# Global model state shared by the preload thread and the request handlers.
MODEL_STATE = {
    "model": None,
    "device": "cpu",
    "error": None,
    "loading": False,
}

# Guards the "is a load needed?" check-and-claim in ensure_model() so that the
# background preload thread and a request handler cannot both start a load.
_MODEL_LOCK = threading.Lock()


def ensure_model():
    """Make sure the model is loaded, loading it if nobody else is doing so.

    The call does not wait for a load that another thread has already
    started; in that case the current (possibly still empty) state is
    returned immediately.

    Returns:
        The ``(model, device, error)`` values currently stored in
        ``MODEL_STATE``.
    """
    # Only the decision to load is made under the lock; the slow load itself
    # runs outside it so concurrent callers are never blocked.
    with _MODEL_LOCK:
        should_load = (
            MODEL_STATE["model"] is None and not MODEL_STATE["loading"]
        )
        if should_load:
            MODEL_STATE["loading"] = True

    if should_load:
        try:
            model, device, error = load_model()
            MODEL_STATE["model"] = model
            MODEL_STATE["device"] = device
            MODEL_STATE["error"] = error
        except Exception as e:
            MODEL_STATE["error"] = str(e)
            traceback.print_exc()
        finally:
            MODEL_STATE["loading"] = False

    return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]


def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Args:
        text: Text to synthesize.
        reference_audio: Path to the reference audio file (the Gradio audio
            input is configured with ``type="filepath"``).
        cfg_value: Guidance value forwarded to ``model.generate`` as a float.
        steps: Number of inference timesteps, forwarded as an int.

    Returns:
        An ``(output_path, message)`` tuple. ``output_path`` is ``None`` when
        validation, model loading or generation fails; ``message`` is the
        status text shown to the user.
    """
    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"

    if not text.strip():
        return None, "❌ 文本不能为空"

    # Set once the temporary reference file exists so that the ``finally``
    # clause can remove it on every exit path, including failures.
    ref_path = None
    try:
        model, _device, error = ensure_model()

        if error:
            return None, f"❌ 模型加载失败: {error}"

        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)

        # Multi-channel input: keep only the first channel.
        if len(ref_audio.shape) > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a temp file name, close the handle, then write the audio.
        # Writing through a second handle while the first is still open is
        # not portable.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Generate the audio and time the call.
        t0 = time.time()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.time() - t0

        # Save the output.
        sample_rate = model.tts_model.sample_rate
        output_path = "/tmp/voxcpm_output.wav"
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)

        return output_path, msg

    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        return None, error_msg

    finally:
        # Best-effort cleanup of the temporary reference file. A failure here
        # must not turn a successful generation into an error.
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except OSError as cleanup_err:
                print(
                    f"Warning: could not remove temp file {ref_path}: "
                    f"{cleanup_err}"
                )


# Preset texts offered as one-click buttons in the UI.
PRESET_TEXTS = {
    "问候": "Hello! I am One! I am the first Numberblock, and I love being number one!",
    "计数": "One, two, three, four, five! Counting is so much fun! I can count all the way to ten!",
    "情感": "Sometimes I feel a little lonely being just one, but then I remember that one is the start of everything!",
}

# Build the Gradio interface.
with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
    gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="输入文本",
                placeholder="输入要合成的文本...",
                lines=3,
                value=PRESET_TEXTS["问候"],
            )

            with gr.Row():
                for name, txt in PRESET_TEXTS.items():
                    # ``t=txt`` binds the current text as a default argument;
                    # a plain closure would make every button return the
                    # last preset.
                    gr.Button(name).click(
                        lambda t=txt: t, inputs=None, outputs=text_input
                    )

        with gr.Column():
            ref_audio_input = gr.Audio(
                label="参考音频 (One 的声音)",
                type="filepath",
            )

    with gr.Row():
        cfg_slider = gr.Slider(
            minimum=0.5,
            maximum=5.0,
            value=2.0,
            step=0.1,
            label="CFG Value (越高越像参考音色)",
        )
        steps_slider = gr.Slider(
            minimum=5,
            maximum=50,
            value=10,
            step=1,
            label="推理步数 (越高质量越好但越慢)",
        )

    generate_btn = gr.Button("🎙️ 生成音频", variant="primary")

    with gr.Row():
        output_audio = gr.Audio(label="生成结果")
        status_msg = gr.Markdown(value="⏸️ 等待生成...")

    generate_btn.click(
        fn=generate_audio,
        inputs=[text_input, ref_audio_input, cfg_slider, steps_slider],
        outputs=[output_audio, status_msg],
    )

    gr.Markdown("---")
    gr.Markdown("### 说明")
    gr.Markdown("""
    - **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
    - **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
    - **推理步数**: 默认 10,越高质量越好但生成越慢
    - **模型**: VoxCPM 2 (openbmb/VoxCPM2)
    """)


if __name__ == "__main__":
    # Preload the model at startup in a daemon thread, so the UI is launched
    # without waiting for the load to finish.
    def preload():
        print("Preloading VoxCPM model...")
        ensure_model()

    threading.Thread(target=preload, daemon=True).start()

    demo.launch()