Numberblocks1Voice

Sleeping

App Files Files Community

Numberblocks1Voice / app.py

ayf3

Upload app.py with huggingface_hub

71ee5ef verified about 2 months ago

raw

history blame

19.6 kB

	#!/usr/bin/env python3
	"""
	NumberBlocks One Voice Cloner - HiFi-GAN V2
	集成 HiFi-GAN vocoder 提升推理音质

	功能：
	1. 上传音频 → RVC 音色转换（使用 HiFi-GAN vocoder）
	2. 随机采样生成 One 的语音
	3. 音高调节

	技术栈：
	- RVC 模型 (one_voice_rvc_v2.pth, 60.7MB VITS-like)
	- HiFi-GAN Universal Vocoder (预训练)
	- Gradio UI
	"""

	import os
	import json
	import random
	import tempfile
	import numpy as np
	import soundfile as sf
	import librosa
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import gradio as gr
	from pathlib import Path
	from huggingface_hub import hf_hub_download, HfApi

	# ============================================================
	# Model definition - VITS-like RVC Model
	# ============================================================

	class PosteriorEncoder(nn.Module):
	    """Residual stack of gated 1-D convolutions followed by a split projection.

	    The input is mapped to ``hidden_channels`` by a 1x1 convolution, refined
	    by ``n_layers`` residual gated-convolution blocks and finally projected
	    to ``2 * hidden_channels`` channels, which ``forward`` splits into two
	    equally sized tensors ``m`` and ``logs``.

	    Args:
	        in_channels: Channel count of the tensor passed to ``forward``.
	        hidden_channels: Width of the residual stack and of each output.
	        kernel_size: Kernel size of every gated convolution.
	        dilation_rate: Dilation of every gated convolution.
	        n_layers: Number of residual gated-convolution blocks.
	    """

	    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
	        padding = (kernel_size - 1) * dilation_rate // 2
	        self.enc = nn.ModuleList()
	        for _ in range(n_layers):
	            self.enc.append(nn.Sequential(
	                # nn.GLU halves the channel axis, so the convolution must
	                # emit 2 * hidden_channels; otherwise the residual add in
	                # forward() mixes hidden_channels with hidden_channels // 2
	                # and raises a size-mismatch error.
	                # NOTE(review): this widens enc.*.0.weight/bias, so a
	                # checkpoint saved with the narrower shape will not load —
	                # confirm against the training code.
	                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
	                          padding=padding, dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)

	    def forward(self, x):
	        """Return ``(m, logs)``: the two channel-wise halves of the projection."""
	        x = self.pre(x)
	        for layer in self.enc:
	            x = x + layer(x)
	        stats = self.proj(x)
	        m, logs = stats.chunk(2, dim=1)
	        return m, logs


	class ResidualCouplingBlock(nn.Module):
	    """Sequence of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

	    ``forward`` walks the sequence front to back and keeps only the
	    transformed tensor of each step; with ``reverse=True`` it walks the same
	    sequence back to front, calling every step in reverse mode.
	    """

	    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
	        super().__init__()
	        self.flows = nn.ModuleList()
	        for _ in range(n_flows):
	            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
	            self.flows.append(coupling)
	            self.flows.append(Flip())

	    def forward(self, x, reverse=False):
	        """Apply every step in order, or in reversed order when ``reverse``."""
	        if reverse:
	            for step in reversed(self.flows):
	                x = step(x, reverse=reverse)
	            return x
	        for step in self.flows:
	            # Forward-mode steps return (tensor, logdet); logdet is dropped.
	            x, _logdet = step(x, reverse=reverse)
	        return x


	class ResidualCouplingLayer(nn.Module):
	    """Affine transform whose shift and log-scale come from a gated conv stack.

	    Forward mode computes ``y = m + x * exp(log_s)`` and returns
	    ``(y, sum(log_s))``; reverse mode returns ``(x - m) * exp(-log_s)``.
	    ``log_s`` is the predicted log-scale clamped to ``[-5, 5]``.

	    NOTE: in both directions ``m`` and ``log_s`` are computed from the tensor
	    that is passed in, so reverse mode undoes forward mode exactly only when
	    both calls predict the same statistics (as with the zero-initialised
	    ``post`` projection, where ``m = 0`` and ``log_s = 0``).

	    Args:
	        channels: Channel count of the transformed tensor.
	        hidden_channels: Width of the internal gated-convolution stack.
	        kernel_size: Kernel size of every gated convolution.
	        dilation_rate: Dilation of every gated convolution.
	        n_layers: Number of residual gated-convolution blocks.
	    """

	    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(channels, hidden_channels, 1)
	        padding = (kernel_size - 1) * dilation_rate // 2
	        self.enc = nn.ModuleList()
	        for _ in range(n_layers):
	            self.enc.append(nn.Sequential(
	                # nn.GLU halves the channel axis, so the convolution must
	                # emit 2 * hidden_channels for the residual add in forward()
	                # to match its hidden_channels-wide input.
	                # NOTE(review): this widens enc.*.0.weight/bias, so a
	                # checkpoint saved with the narrower shape will not load —
	                # confirm against the training code.
	                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
	                          padding=padding, dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
	        # Zero-initialise the projection so a fresh layer is the identity map.
	        self.post.weight.data.zero_()
	        self.post.bias.data.zero_()

	    def forward(self, x, reverse=False):
	        """Transform ``x``; returns ``(y, logdet)`` forward, ``y`` in reverse."""
	        h = self.pre(x)
	        for layer in self.enc:
	            h = h + layer(h)
	        m, logs = self.post(h).chunk(2, dim=1)
	        # Clamp once for both directions to keep exp() in a safe range.
	        log_s = torch.clamp(logs, -5.0, 5.0)
	        if reverse:
	            return (x - m) * torch.exp(-log_s)
	        y = m + x * torch.exp(log_s)
	        # Sum runs over every element, including the batch axis.
	        logdet = torch.sum(log_s)
	        return y, logdet


	class Flip(nn.Module):
	    """Parameter-free step that reverses the order of dimension 1."""

	    def forward(self, x, reverse=False):
	        """Return ``(flipped, 0)`` in forward mode, ``flipped`` in reverse mode."""
	        flipped = torch.flip(x, [1])
	        if reverse:
	            return flipped
	        # The second element is the constant placeholder returned alongside
	        # the tensor in forward mode.
	        return flipped, 0


	class Decoder(nn.Module):
	    """Residual stack of gated 1-D convolutions with an output projection.

	    Args:
	        hidden_channels: Channel count of the input and of the residual stack.
	        out_channels: Channel count produced by the final 1x1 projection.
	        kernel_size: Kernel size of every gated convolution.
	        dilation_rate: Dilation of every gated convolution.
	        n_layers: Number of residual gated-convolution blocks.
	    """

	    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
	        padding = (kernel_size - 1) * dilation_rate // 2
	        self.dec = nn.ModuleList()
	        for _ in range(n_layers):
	            self.dec.append(nn.Sequential(
	                # nn.GLU halves the channel axis, so the convolution must
	                # emit 2 * hidden_channels for the residual add in forward()
	                # to match its hidden_channels-wide input.
	                # NOTE(review): this widens dec.*.0.weight/bias, so a
	                # checkpoint saved with the narrower shape will not load —
	                # confirm against the training code.
	                nn.Conv1d(hidden_channels, hidden_channels * 2, kernel_size,
	                          padding=padding, dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

	    def forward(self, x):
	        """Run the residual stack and project to ``out_channels``."""
	        x = self.pre(x)
	        for layer in self.dec:
	            x = x + layer(x)
	        return self.proj(x)


	class RVCModel(nn.Module):
	    """Mel-to-mel network: encoder, flow round trip, decoder.

	    ``forward`` is the noise-free case of ``infer``.
	    """

	    def __init__(self, n_mels=80, hidden_channels=192):
	        super().__init__()
	        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
	        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
	        self.dec = Decoder(hidden_channels, n_mels)
	        self.n_mels = n_mels

	    def forward(self, mel):
	        """Convert ``mel`` with the sampling noise scaled by zero."""
	        return self.infer(mel, noise_scale=0.0)

	    def infer(self, mel, noise_scale=0.0):
	        """Convert ``mel``; ``noise_scale`` multiplies the sampled noise term."""
	        mean, log_std = self.enc_p(mel)
	        noise = torch.randn_like(log_std) * torch.exp(log_std) * noise_scale
	        latent = mean + noise
	        # The latent is pushed through the flow and straight back again
	        # before decoding.
	        prior_latent = self.flow(latent)
	        restored = self.flow(prior_latent, reverse=True)
	        return self.dec(restored)


	# ============================================================
	# HiFi-GAN Vocoder Definition
	# ============================================================

	class ResBlock1(nn.Module):
	    """Chain of residual branches, one per entry of ``dilation``.

	    Each branch is LeakyReLU(0.1) → dilated Conv1d → LeakyReLU(0.1) →
	    undilated Conv1d, with padding chosen so the length is preserved for odd
	    kernel sizes.
	    """

	    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
	        super().__init__()

	        def make_branch(rate):
	            # One residual branch for a single dilation rate.
	            return nn.Sequential(
	                nn.LeakyReLU(0.1),
	                nn.Conv1d(channels, channels, kernel_size, dilation=rate,
	                          padding=(kernel_size - 1) * rate // 2),
	                nn.LeakyReLU(0.1),
	                nn.Conv1d(channels, channels, kernel_size, dilation=1,
	                          padding=(kernel_size - 1) // 2),
	            )

	        self.convs = nn.ModuleList([make_branch(rate) for rate in dilation])

	    def forward(self, x):
	        """Add each branch's output onto the running tensor, in order."""
	        out = x
	        for branch in self.convs:
	            out = out + branch(out)
	        return out


	class HiFiGANGenerator(nn.Module):
	    """Upsampling generator: conv_pre → (upsample + averaged ResBlock1s)* → conv_post.

	    Every upsampling stage halves the channel count and is followed by one
	    ResBlock1 per entry of ``resblock_kernel_sizes``; the outputs of those
	    blocks are averaged. ``conv_post`` maps to a single channel through
	    LeakyReLU(0.1), Conv1d and Tanh.
	    """

	    def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
	                 upsample_kernel_sizes=(16, 16, 4, 4),
	                 upsample_initial_channel=512,
	                 resblock_kernel_sizes=(3, 7, 11),
	                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
	        super().__init__()
	        self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)

	        self.num_upsamples = len(upsample_rates)
	        self.num_kernels = len(resblock_kernel_sizes)

	        self.ups = nn.ModuleList()
	        self.resblocks = nn.ModuleList()

	        width = upsample_initial_channel
	        for rate, kernel in zip(upsample_rates, upsample_kernel_sizes):
	            narrowed = width // 2
	            self.ups.append(
	                nn.ConvTranspose1d(width, narrowed, kernel, rate, padding=(kernel - rate) // 2)
	            )
	            # Residual blocks are stored flat: stage s owns indices
	            # [s * num_kernels, (s + 1) * num_kernels).
	            for res_kernel, res_dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
	                self.resblocks.append(ResBlock1(narrowed, res_kernel, res_dilation))
	            width = narrowed

	        self.conv_post = nn.Sequential(
	            nn.LeakyReLU(0.1),
	            nn.Conv1d(width, 1, 7, padding=3),
	            nn.Tanh(),
	        )

	    def forward(self, x):
	        """Run the generator on ``x`` and return the single-channel output."""
	        x = self.conv_pre(x)
	        for stage in range(self.num_upsamples):
	            x = self.ups[stage](F.leaky_relu(x, 0.1))
	            first = stage * self.num_kernels
	            branch_sum = sum(
	                self.resblocks[first + offset](x) for offset in range(self.num_kernels)
	            )
	            x = branch_sum / self.num_kernels
	        return self.conv_post(x)


	# ============================================================
	# Mel-spectrogram utilities
	# ============================================================

	def mel_spectrogram(y, n_fft=1024, hop_length=256, win_length=1024,
	                    n_mels=80, sample_rate=40000, fmin=0, fmax=None):
	    """Compute a natural-log mel *magnitude* spectrogram.

	    Args:
	        y: Floating-point waveform tensor, either ``(T,)`` or ``(B, T)``.
	        n_fft: FFT size.
	        hop_length: Hop between successive frames, in samples.
	        win_length: Length of the Hann window.
	        n_mels: Number of mel bands.
	        sample_rate: Sample rate used to build the mel filterbank.
	        fmin: Lowest filterbank frequency.
	        fmax: Highest filterbank frequency; defaults to ``sample_rate // 2``.

	    Returns:
	        ``log(clamp(mel_basis @ |STFT|, min=1e-5))`` with shape
	        ``(n_mels, frames)`` for 1-D input or ``(B, n_mels, frames)`` for
	        2-D input.
	    """
	    if fmax is None:
	        fmax = sample_rate // 2
	    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
	                                    fmin=fmin, fmax=fmax)
	    # Build the window next to the signal so non-default dtypes/devices work.
	    window = torch.hann_window(win_length, dtype=y.dtype, device=y.device)

	    # Pad signal. Reflect padding of the last dimension is documented for
	    # 2-D/3-D inputs, so a bare (T,) waveform gets a temporary leading axis.
	    pad_length = (win_length - hop_length) // 2
	    unbatched = y.dim() == 1
	    if unbatched:
	        y = y.unsqueeze(0)
	    y = torch.nn.functional.pad(y, (pad_length, pad_length), mode='reflect')
	    if unbatched:
	        y = y.squeeze(0)

	    # STFT (the signal is already padded, hence center=False)
	    stft = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_length,
	                      window=window, center=False, return_complex=True)
	    # Magnitude with a small epsilon under the square root.
	    magnitudes = torch.sqrt(stft.real ** 2 + stft.imag ** 2 + 1e-7)

	    # Mel filterbank
	    mel_basis_t = torch.tensor(mel_basis, dtype=magnitudes.dtype,
	                               device=magnitudes.device)
	    mel = torch.matmul(mel_basis_t, magnitudes)

	    # Log, with a floor so silent bins do not produce -inf
	    mel = torch.log(torch.clamp(mel, min=1e-5))
	    return mel


	# ============================================================
	# Inference Engine
	# ============================================================

	class VoiceCloner:
	    """Inference engine: loads the RVC model and a vocoder, converts audio.

	    Construction immediately calls ``load_models``, which downloads
	    checkpoints from the Hugging Face Hub. Each loading step is best-effort:
	    failures are printed and leave the matching attribute at ``None`` or
	    empty instead of raising.
	    """

	    def __init__(self):
	        self.device = torch.device('cpu')
	        self.rvc_model = None       # RVCModel, or None when loading failed
	        self.hifigan = None         # HiFiGANGenerator, or None → Griffin-Lim fallback
	        self.sample_rate = 40000
	        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
	        self.model_loaded = False   # True once rvc_model is available
	        self.samples = []           # dataset file names used by generate_random
	        self.load_models()

	    def load_models(self):
	        """Load RVC model + HiFi-GAN vocoder and list the sample audio files."""
	        self._load_rvc()
	        self._load_hifigan()
	        self._load_sample_list()
	        self.model_loaded = self.rvc_model is not None

	    @staticmethod
	    def _report_key_mismatch(label, result):
	        """Print how many keys ``load_state_dict(strict=False)`` could not match."""
	        missing = list(result.missing_keys)
	        unexpected = list(result.unexpected_keys)
	        if missing or unexpected:
	            # strict=False hides these silently; unmatched parameters keep
	            # their random initial values, so make the situation visible.
	            print(f"⚠️ {label} checkpoint mismatch: "
	                  f"{len(missing)} missing, {len(unexpected)} unexpected keys")

	    def _load_rvc(self):
	        """Download and load the RVC checkpoint into ``self.rvc_model``."""
	        print("Loading RVC model...")
	        try:
	            model_path = hf_hub_download(
	                repo_id=self.dataset_id,
	                filename="models/one_voice_rvc_v2.pth",
	                repo_type="dataset",
	            )

	            # SECURITY NOTE: weights_only=False unpickles arbitrary objects
	            # from the downloaded file; only safe for a trusted repository.
	            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)

	            # The checkpoint may wrap the weights under 'model' or 'state_dict'.
	            if isinstance(ckpt, dict) and 'model' in ckpt:
	                state_dict = ckpt['model']
	            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
	                state_dict = ckpt['state_dict']
	            else:
	                state_dict = ckpt

	            # Auto-detect hidden channels from state_dict
	            hidden_ch = 192
	            for k, v in state_dict.items():
	                if 'enc_p.pre.weight' in k:
	                    hidden_ch = v.shape[0]
	                    break

	            model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
	            self._report_key_mismatch("RVC", model.load_state_dict(state_dict, strict=False))
	            model.eval()
	            self.rvc_model = model
	            print(f"✅ RVC model loaded (hidden={hidden_ch})")

	        except Exception as e:
	            print(f"❌ RVC model load failed: {e}")
	            self.rvc_model = None

	    def _load_hifigan(self):
	        """Download and load the HiFi-GAN generator into ``self.hifigan``."""
	        print("Loading HiFi-GAN vocoder...")
	        try:
	            hifigan_path = self._get_hifigan()
	            if not hifigan_path:
	                print("⚠️ HiFi-GAN not available, will use Griffin-Lim fallback")
	                return

	            # SECURITY NOTE: see _load_rvc — full unpickling of a download.
	            ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
	            if isinstance(ckpt, dict) and 'generator' in ckpt:
	                state_dict = ckpt['generator']
	            elif isinstance(ckpt, dict) and 'state_dict' in ckpt:
	                prefix = 'generator.'
	                # Strip only the leading prefix, not later occurrences.
	                state_dict = {k[len(prefix):]: v
	                              for k, v in ckpt['state_dict'].items()
	                              if k.startswith(prefix)}
	            else:
	                state_dict = ckpt

	            # NOTE(review): the pipeline runs at self.sample_rate (40 kHz);
	            # confirm the downloaded vocoder was trained for the same rate
	            # and mel settings.
	            generator = HiFiGANGenerator()
	            self._report_key_mismatch("HiFi-GAN", generator.load_state_dict(state_dict, strict=False))
	            generator.eval()
	            self.hifigan = generator
	            print("✅ HiFi-GAN vocoder loaded")
	        except Exception as e:
	            print(f"⚠️ HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
	            self.hifigan = None

	    def _load_sample_list(self):
	        """Collect the un-augmented ``models/top_*.wav`` files of the dataset."""
	        try:
	            api = HfApi()
	            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
	            # Names containing _p+, _p- or _s+ are filtered out.
	            self.samples = [f for f in files if f.startswith('models/top_')
	                            and f.endswith('.wav')
	                            and '_p+' not in f and '_p-' not in f and '_s+' not in f]
	            print(f"✅ Found {len(self.samples)} sample audio files")
	        except Exception as e:
	            print(f"⚠️ Could not list samples: {e}")
	            self.samples = []

	    def _get_hifigan(self):
	        """Return a local path to a HiFi-GAN checkpoint, or None if none downloads."""
	        candidates = (
	            ("jik876/hifi-gan", "UNIVERSAL_V1/g_02500000"),
	            ("facebook/hifigan-universal-v1", "hifigan.pt"),
	        )
	        for repo_id, filename in candidates:
	            try:
	                return hf_hub_download(repo_id=repo_id, filename=filename)
	            except Exception as e:
	                # Report and try the next location; a bare ``except`` would
	                # also swallow KeyboardInterrupt/SystemExit.
	                print(f"⚠️ HiFi-GAN download failed ({repo_id}/{filename}): {e}")
	        return None

	    def mel_to_audio_hifigan(self, mel):
	        """Convert mel spectrogram to audio using HiFi-GAN; returns a NumPy array."""
	        with torch.no_grad():
	            audio = self.hifigan(mel.unsqueeze(0))
	        return audio.squeeze(0).squeeze(0).cpu().numpy()

	    def mel_to_audio_griffinlim(self, mel, sr=40000, n_fft=1024, hop_length=256, n_iter=32):
	        """Fallback: convert a log-magnitude mel to audio using Griffin-Lim.

	        ``mel`` is expected in the domain produced by ``mel_spectrogram``
	        (natural log of a mel-weighted magnitude spectrogram).
	        NOTE(review): assumes the RVC output stays in that domain — confirm
	        against the training setup.
	        """
	        mel_np = mel.cpu().numpy()
	        # Undo the log first; the mel was built from magnitudes (not power),
	        # so the inverse uses power=1.0.
	        mel_linear = np.exp(mel_np)
	        S = librosa.feature.inverse.mel_to_stft(
	            mel_linear, sr=sr, n_fft=n_fft, power=1.0
	        )
	        y = librosa.griffinlim(S, n_iter=n_iter, hop_length=hop_length, win_length=n_fft)
	        return y

	    def process_audio(self, input_audio, pitch_shift=0):
	        """
	        Process audio through RVC model + HiFi-GAN vocoder

	        Args:
	            input_audio: path to input audio file
	            pitch_shift: semitone shift

	        Returns:
	            (output audio path, status message); the path is None on failure
	        """
	        if not self.model_loaded:
	            return None, "❌ 模型未加载"
	        if not input_audio:
	            # Nothing was uploaded/recorded; give a clear message instead of
	            # a cryptic loader error.
	            return None, "❌ 请先上传音频"

	        try:
	            # Load audio
	            y, sr = librosa.load(input_audio, sr=self.sample_rate)

	            # Apply pitch shift
	            if pitch_shift != 0:
	                y = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch_shift)

	            # Trim silence
	            y, _ = librosa.effects.trim(y, top_db=20)

	            # Limit length
	            max_len = 10 * self.sample_rate  # 10 seconds max
	            if len(y) > max_len:
	                y = y[:max_len]

	            # Compute mel spectrogram
	            y_tensor = torch.tensor(y, dtype=torch.float32)
	            mel = mel_spectrogram(y_tensor, sample_rate=self.sample_rate, n_mels=80)

	            # RVC inference
	            with torch.no_grad():
	                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
	            mel_out = mel_out.squeeze(0)

	            # Vocoder
	            if self.hifigan is not None:
	                audio_out = self.mel_to_audio_hifigan(mel_out)
	                vocoder_name = "HiFi-GAN"
	            else:
	                audio_out = self.mel_to_audio_griffinlim(mel_out, sr=self.sample_rate)
	                vocoder_name = "Griffin-Lim"

	            # Normalize to a peak of 0.95 (epsilon guards against silence)
	            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95

	            # Save. NamedTemporaryFile creates the file atomically, unlike
	            # the deprecated, race-prone tempfile.mktemp.
	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
	                output_path = tmp.name
	            sf.write(output_path, audio_out, self.sample_rate)

	            return output_path, f"✅ 转换成功 ({vocoder_name}) | 输入: {len(y)/sr:.1f}s → 输出: {len(audio_out)/self.sample_rate:.1f}s"

	        except Exception as e:
	            return None, f"❌ 转换失败: {str(e)}"

	    def generate_random(self):
	        """Convert a randomly chosen dataset sample; returns (path, status)."""
	        if not self.samples:
	            return None, "❌ 没有可用的样本"

	        try:
	            sample = random.choice(self.samples)
	            sample_path = hf_hub_download(
	                repo_id=self.dataset_id,
	                filename=sample,
	                repo_type="dataset"
	            )
	            output, msg = self.process_audio(sample_path)
	            if output:
	                # msg already starts with its own status mark.
	                return output, f"{msg}\n采样: {Path(sample).name}"
	            return output, msg
	        except Exception as e:
	            return None, f"❌ 生成失败: {str(e)}"


	# ============================================================
	# Gradio UI
	# ============================================================

	# Build the inference engine once at import time; the UI callbacks below are
	# bound to this single instance.
	print("🚀 Initializing NumberBlocks One Voice Cloner...")
	cloner = VoiceCloner()

	# Stylesheet and banner markup are named up front so the layout code that
	# follows reads as structure only.
	_PAGE_CSS = """
	.header { text-align: center; margin-bottom: 1rem; }
	.header h1 { color: #ff6b6b; }
	"""
	_HEADER_HTML = """
	<div class="header">
	<h1>🎭 NumberBlocks One Voice Cloner</h1>
	<p>RVC v2 Model (60.7MB) + HiFi-GAN Vocoder</p>
	</div>
	"""

	with gr.Blocks(title="NumberBlocks One Voice", theme=gr.themes.Soft(), css=_PAGE_CSS) as demo:
	    gr.HTML(_HEADER_HTML)

	    # Tab 1: convert an uploaded or recorded clip.
	    with gr.Tab("🎤 Voice Conversion"):
	        gr.Markdown("### 上传音频 → 转换为 One 的声音")
	        with gr.Row():
	            with gr.Column():
	                vc_input = gr.Audio(label="上传音频", type="filepath", sources=["upload", "microphone"])
	                vc_pitch = gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="音高偏移 (半音)")
	                vc_btn = gr.Button("🎙️ 转换", variant="primary", size="lg")
	            with gr.Column():
	                vc_output = gr.Audio(label="转换结果", type="filepath")
	                vc_status = gr.Textbox(label="状态")

	        vc_btn.click(fn=cloner.process_audio, inputs=[vc_input, vc_pitch], outputs=[vc_output, vc_status])

	    # Tab 2: convert a randomly chosen dataset sample.
	    with gr.Tab("🎲 Random Sample"):
	        gr.Markdown("### 随机采样 + RVC 转换")
	        with gr.Row():
	            rand_btn = gr.Button("🎲 随机生成", variant="primary", size="lg")
	        with gr.Row():
	            rand_output = gr.Audio(label="生成结果", type="filepath")
	            rand_status = gr.Textbox(label="状态")

	        rand_btn.click(fn=cloner.generate_random, inputs=[], outputs=[rand_output, rand_status])

	    # Tab 3: static description, with the load status captured at start-up.
	    with gr.Tab("ℹ️ About"):
	        if cloner.model_loaded:
	            model_status = "✅ 已加载"
	        else:
	            model_status = "❌ 未加载"
	        if cloner.hifigan:
	            hifigan_status = "✅ HiFi-GAN"
	        else:
	            hifigan_status = "⚠️ Griffin-Lim (fallback)"
	        gr.Markdown(f"""
	### NumberBlocks One Voice Cloner V2

	模型: RVC v3.0 (VITS-like, 5.3M params, 60.7MB)
	Vocoder: {hifigan_status}
	采样率: 40kHz
	模型状态: {model_status}
	训练数据: 100 源文件 → 1,334 chunks, 500 steps
	Dataset: [ayf3/numberblocks-one-voice-dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)

	功能:
	- ✅ 上传音频 → One 音色转换
	- ✅ 随机采样生成
	- ✅ 音高调节 (-12 ~ +12 半音)
	- ✅ HiFi-GAN 高品质 vocoder

	限制:
	- CPU 推理，速度较慢
	- 输入建议 < 10 秒
	- 音质取决于输入质量
	""")

	# Serve on all interfaces, port 7860, when run as a script.
	if __name__ == "__main__":
	    demo.launch(server_name="0.0.0.0", server_port=7860)