Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| NumberBlocks One Voice Cloning Space - VoxCPM V3 | |
| 使用 VoxCPM 2 模型进行音色克隆推理 | |
| """ | |
import os
import tempfile
import threading
import traceback
from pathlib import Path

import gradio as gr
import soundfile as sf
# Hugging Face token from the environment: HF_TOKEN is preferred, with
# HUGGINGFACE_TOKEN as the fallback (None when neither is set).
HF_TOKEN = os.getenv("HF_TOKEN", os.getenv("HUGGINGFACE_TOKEN"))
def load_model():
    """Load the pretrained VoxCPM model.

    Returns:
        tuple: ``(model, device, error)``. On success ``error`` is None.
        If anything raises (including the imports), the traceback is
        printed and ``(None, "cpu", str(exception))`` is returned instead.
    """
    try:
        # Imported here so a missing dependency is reported through the
        # returned error string rather than failing at module import.
        from voxcpm import VoxCPM
        import torch

        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        # NOTE: `device` is only printed and returned; it is not passed to
        # from_pretrained below.
        print(f"Loading VoxCPM model on {device}...")

        # V3: optimize=False avoids compatibility problems.
        load_options = {"load_denoiser": False, "optimize": False}
        model = VoxCPM.from_pretrained("openbmb/VoxCPM2", **load_options)
        print("Model loaded successfully!")
        return model, device, None
    except Exception as exc:
        print(f"Error loading model: {exc}")
        traceback.print_exc()
        return None, "cpu", str(exc)
# Global, mutable model state shared by every caller in this module.
#   model   - the loaded model object, or None when not loaded
#   device  - device string reported by the loader ("cpu" until a load succeeds)
#   error   - message from the last failed load attempt, or None
#   loading - True while a load is in progress
MODEL_STATE = dict(model=None, device="cpu", error=None, loading=False)
# Guards the model load so that only one thread performs it at a time.
_MODEL_LOAD_LOCK = threading.Lock()


def ensure_model():
    """Make sure the model is loaded and return the current model state.

    This is called both from a background preload thread and from request
    handlers. The previous check-then-set on ``MODEL_STATE["loading"]`` was
    not atomic, so two threads could both start a load. The lock is taken
    without blocking: a caller that arrives while another thread is loading
    does not wait, it just returns the current state (``model`` is still
    None).

    Returns:
        tuple: ``(model, device, error)`` read from ``MODEL_STATE``.
        ``error`` is the message of the most recent failed load attempt,
        or None.
    """
    if MODEL_STATE["model"] is None and _MODEL_LOAD_LOCK.acquire(blocking=False):
        try:
            # Re-check under the lock: another thread may have finished
            # loading between the test above and acquiring the lock.
            if MODEL_STATE["model"] is None:
                MODEL_STATE["loading"] = True
                try:
                    model, device, error = load_model()
                    MODEL_STATE["model"] = model
                    MODEL_STATE["device"] = device
                    MODEL_STATE["error"] = error
                except Exception as exc:
                    MODEL_STATE["error"] = str(exc)
                    traceback.print_exc()
                finally:
                    MODEL_STATE["loading"] = False
        finally:
            _MODEL_LOAD_LOCK.release()
    return MODEL_STATE["model"], MODEL_STATE["device"], MODEL_STATE["error"]
def generate_audio(text, reference_audio, cfg_value=2.0, steps=10):
    """Synthesize ``text`` using ``reference_audio`` as the voice reference.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the reference recording, read with
            ``soundfile``.
        cfg_value: Guidance value, passed to ``model.generate`` as a float.
        steps: Inference timesteps, passed to ``model.generate`` as an int.

    Returns:
        tuple: ``(output_path, message)``. ``output_path`` is the written
        WAV file on success and None on any failure; ``message`` is the
        user-facing status text. Exceptions are caught, printed with a
        traceback, and reported through ``message``.
    """
    if not text or not reference_audio:
        return None, "❌ 请输入文本和参考音频"
    if not text.strip():
        return None, "❌ 文本不能为空"

    # Set once the temporary reference file exists, so `finally` knows
    # whether there is anything to delete.
    ref_path = None
    try:
        model, _device, error = ensure_model()
        if error:
            return None, f"❌ 模型加载失败: {error}"
        if model is None:
            return None, "❌ 模型正在加载中,请稍候..."

        # Read the reference audio.
        ref_audio, sr = sf.read(reference_audio)
        # Multi-channel input: keep only the first channel.
        if len(ref_audio.shape) > 1:
            ref_audio = ref_audio[:, 0]

        # Reserve a unique temp path, then write after the handle is closed
        # so the file is never open twice at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            ref_path = tmp.name
        sf.write(ref_path, ref_audio, sr)

        print(f"Generating with text: {text[:50]}...")
        print(f"Reference audio: {len(ref_audio)/sr:.2f}s at {sr}Hz")

        # Run generation and time it.
        import time
        t0 = time.time()
        wav = model.generate(
            text=text,
            reference_wav_path=ref_path,
            cfg_value=float(cfg_value),
            inference_timesteps=int(steps),
        )
        elapsed = time.time() - t0

        # Save the result.
        # NOTE(review): this fixed path is shared by every call; concurrent
        # generations would overwrite each other's output — confirm the app
        # only ever runs one generation at a time.
        sample_rate = model.tts_model.sample_rate
        output_path = "/tmp/voxcpm_output.wav"
        sf.write(output_path, wav, sample_rate)

        duration = len(wav) / sample_rate
        msg = f"✅ 生成成功! 时长: {duration:.2f}s, 耗时: {elapsed:.1f}s"
        print(msg)
        return output_path, msg
    except Exception as e:
        error_msg = f"❌ 生成失败: {str(e)}"
        print(f"Error: {e}")
        traceback.print_exc()
        return None, error_msg
    finally:
        # Remove the temporary reference file on every exit path; before,
        # it was only deleted on success and leaked on any failure.
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except OSError as cleanup_err:
                # Best-effort cleanup: a leftover temp file must not turn a
                # finished request into an error.
                print(f"Warning: could not remove temp file {ref_path}: {cleanup_err}")
# Preset sample texts, keyed by the label shown on each preset button.
PRESET_TEXTS = {
    "问候": (
        "Hello! I am One! "
        "I am the first Numberblock, and I love being number one!"
    ),
    "计数": (
        "One, two, three, four, five! Counting is so much fun! "
        "I can count all the way to ten!"
    ),
    "情感": (
        "Sometimes I feel a little lonely being just one, "
        "but then I remember that one is the start of everything!"
    ),
}
# Build the Gradio interface.
# NOTE(review): the indentation of this section was lost in the copy this was
# recovered from, so the row/column nesting below is a best-effort
# reconstruction — confirm it against the intended layout.

# Help text shown at the bottom of the page (Markdown).
_USAGE_NOTES = """
- **参考音频**: 上传 One 的声音片段(建议 5-15 秒清晰语音)
- **CFG Value**: 控制音色相似度,默认 2.0,越高越像参考音色
- **推理步数**: 默认 10,越高质量越好但生成越慢
- **模型**: VoxCPM 2 (openbmb/VoxCPM2)
"""

with gr.Blocks(title="NumberBlocks One Voice Cloning") as demo:
    gr.Markdown("# 🎭 NumberBlocks One Voice Cloning (VoxCPM V3)")
    gr.Markdown("### 使用 VoxCPM 2 模型克隆 One 的声音")

    with gr.Row():
        # Text to synthesize, plus one button per preset.
        with gr.Column():
            text_input = gr.Textbox(
                label="输入文本",
                placeholder="输入要合成的文本...",
                lines=3,
                value=PRESET_TEXTS["问候"],
            )
            with gr.Row():
                for name, txt in PRESET_TEXTS.items():
                    # Bind the text as a default argument so each button
                    # keeps its own preset instead of the loop's last value.
                    preset_btn = gr.Button(name)
                    preset_btn.click(
                        lambda preset=txt: preset,
                        inputs=None,
                        outputs=text_input,
                    )

        # Reference audio, generation parameters and the trigger button.
        with gr.Column():
            ref_audio_input = gr.Audio(
                label="参考音频 (One 的声音)",
                type="filepath",
            )
            with gr.Row():
                cfg_slider = gr.Slider(
                    label="CFG Value (越高越像参考音色)",
                    minimum=0.5,
                    maximum=5.0,
                    step=0.1,
                    value=2.0,
                )
                steps_slider = gr.Slider(
                    label="推理步数 (越高质量越好但越慢)",
                    minimum=5,
                    maximum=50,
                    step=1,
                    value=10,
                )
            generate_btn = gr.Button("🎙️ 生成音频", variant="primary")

    # Result row: generated audio and a status line.
    with gr.Row():
        output_audio = gr.Audio(label="生成结果")
        status_msg = gr.Markdown(value="⏸️ 等待生成...")

    generation_inputs = [text_input, ref_audio_input, cfg_slider, steps_slider]
    generation_outputs = [output_audio, status_msg]
    generate_btn.click(
        fn=generate_audio,
        inputs=generation_inputs,
        outputs=generation_outputs,
    )

    gr.Markdown("---")
    gr.Markdown("### 说明")
    gr.Markdown(_USAGE_NOTES)
if __name__ == "__main__":
    import threading

    def _warm_up():
        """Preload the model at startup."""
        print("Preloading VoxCPM model...")
        ensure_model()

    # Start the preload on a daemon thread, then launch the UI.
    warm_up_thread = threading.Thread(target=_warm_up, daemon=True)
    warm_up_thread.start()

    demo.launch()