Numberblocks1Voice

Sleeping

App Files Files Community

Numberblocks1Voice / app.py

ayf3

Upload app.py with huggingface_hub

6699d9a verified about 2 months ago

raw

history blame

15.9 kB

	#!/usr/bin/env python3
	"""
	NumberBlocks One Voice Cloner - V5 Fixed
	Pinned gradio version to avoid jinja2 schema bugs
	"""

	import os
	import random
	import tempfile
	import numpy as np
	import soundfile as sf
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torchaudio
	from pathlib import Path
	from huggingface_hub import hf_hub_download, HfApi
	import gradio as gr

	# ============================================================
	# Model definitions - VITS-like RVC model
	# ============================================================

	class PosteriorEncoder(nn.Module):
	    """Gated-convolution encoder that predicts two statistics tensors.

	    The input is projected to ``hidden_channels`` with a 1x1 convolution,
	    refined by ``n_layers`` residual gated-convolution layers, and projected
	    to ``2 * hidden_channels`` channels that are split in half along dim 1.

	    Args:
	        in_channels: Channel count (dim 1) of the input tensor.
	        hidden_channels: Channel count used inside the encoder and for each
	            of the two returned tensors.
	        kernel_size: Kernel size of the gated convolutions.
	        dilation_rate: Dilation of the gated convolutions.
	        n_layers: Number of residual gated-convolution layers.
	    """

	    def __init__(self, in_channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
	        self.enc = nn.ModuleList()
	        for _ in range(n_layers):
	            self.enc.append(nn.Sequential(
	                # GLU halves the channel dimension, so the convolution has to
	                # emit 2 * hidden_channels; otherwise the residual add in
	                # forward() fails with a shape mismatch.
	                # NOTE(review): the conv weight is now (2*hidden, hidden, k);
	                # confirm this matches the layout of the trained checkpoint.
	                nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
	                          padding=(kernel_size - 1) * dilation_rate // 2,
	                          dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.proj = nn.Conv1d(hidden_channels, hidden_channels * 2, 1)

	    def forward(self, x):
	        """Encode ``x`` and return ``(m, logs)``.

	        Args:
	            x: Tensor whose dim 1 has ``in_channels`` entries; it is fed to
	                1-D convolutions, so a (batch, channels, time) layout is
	                expected.

	        Returns:
	            Tuple ``(m, logs)``: the two halves (along dim 1) of the final
	            projection, each with ``hidden_channels`` channels.
	        """
	        x = self.pre(x)
	        for layer in self.enc:
	            x = x + layer(x)
	        stats = self.proj(x)
	        m, logs = stats.chunk(2, dim=1)
	        return m, logs


	class ResidualCouplingLayer(nn.Module):
	    """Affine flow step whose shift and log-scale come from a gated conv stack.

	    ``post`` is zero-initialised, so a freshly constructed layer predicts
	    ``m = 0`` and ``logs = 0`` and therefore acts as the identity in both
	    directions.

	    NOTE(review): both directions derive ``m``/``logs`` from the tensor they
	    are handed, so ``reverse=True`` undoes the forward pass exactly only when
	    those statistics coincide (as they do at the zero initialisation).

	    Args:
	        channels: Channel count (dim 1) of the transformed tensor.
	        hidden_channels: Channel count inside the conv stack.
	        kernel_size: Kernel size of the gated convolutions.
	        dilation_rate: Dilation of the gated convolutions.
	        n_layers: Number of residual gated-convolution layers.
	    """

	    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(channels, hidden_channels, 1)
	        self.enc = nn.ModuleList()
	        for _ in range(n_layers):
	            self.enc.append(nn.Sequential(
	                # GLU halves the channel dimension, so the convolution has to
	                # emit 2 * hidden_channels; otherwise the residual add in
	                # forward() fails with a shape mismatch.
	                # NOTE(review): the conv weight is now (2*hidden, hidden, k);
	                # confirm this matches the layout of the trained checkpoint.
	                nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
	                          padding=(kernel_size - 1) * dilation_rate // 2,
	                          dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.post = nn.Conv1d(hidden_channels, channels * 2, 1)
	        # Start as the identity transform.
	        self.post.weight.data.zero_()
	        self.post.bias.data.zero_()

	    def forward(self, x, reverse=False):
	        """Apply the affine transform, or its reverse-direction formula.

	        Args:
	            x: Tensor with ``channels`` entries along dim 1.
	            reverse: When False, return ``(m + x * exp(log_s), logdet)``;
	                when True, return ``(x - m) * exp(-log_s)`` only.

	        Returns:
	            ``(y, logdet)`` in the forward direction, where ``logdet`` is the
	            sum of ``log_s`` over every element (batch included); just ``y``
	            in the reverse direction.
	        """
	        h = self.pre(x)
	        for layer in self.enc:
	            h = h + layer(h)
	        m, logs = self.post(h).chunk(2, dim=1)
	        # Clamp the log-scale so exp() stays in a numerically safe range.
	        log_s = torch.clamp(logs, -5.0, 5.0)
	        if reverse:
	            return (x - m) * torch.exp(-log_s)
	        y = m + x * torch.exp(log_s)
	        logdet = torch.sum(log_s)
	        return y, logdet


	class Flip(nn.Module):
	    """Reverse the order of the entries along dim 1 of a tensor."""

	    def forward(self, x, reverse=False):
	        """Flip ``x`` along dim 1.

	        Returns ``(flipped, 0)`` when ``reverse`` is False and only
	        ``flipped`` when it is True; the flip itself is the same either way.
	        """
	        flipped = x.flip(1)
	        if reverse:
	            return flipped
	        # The forward direction returns a pair whose second item is always 0.
	        return flipped, 0


	class ResidualCouplingBlock(nn.Module):
	    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

	    Args:
	        channels: Channel count of the tensor passed through the flows.
	        hidden_channels: Hidden width handed to every coupling layer.
	        kernel_size: Kernel size handed to every coupling layer.
	        dilation_rate: Dilation handed to every coupling layer.
	        n_flows: Number of (coupling layer, flip) pairs.
	        n_layers: Conv layers inside each coupling layer.
	    """

	    def __init__(self, channels, hidden_channels, kernel_size=5, dilation_rate=1, n_flows=4, n_layers=4):
	        super().__init__()
	        stages = []
	        for _ in range(n_flows):
	            stages.append(
	                ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers)
	            )
	            stages.append(Flip())
	        self.flows = nn.ModuleList(stages)

	    def forward(self, x, reverse=False):
	        """Run ``x`` through every stage, in reversed order when ``reverse``.

	        Only the transformed tensor is returned; the second value each stage
	        yields in the forward direction is discarded.
	        """
	        if reverse:
	            for stage in reversed(self.flows):
	                x = stage(x, reverse=reverse)
	            return x
	        for stage in self.flows:
	            x, _unused = stage(x, reverse=reverse)
	        return x


	class Decoder(nn.Module):
	    """Gated-convolution stack mapping hidden features to output channels.

	    Args:
	        hidden_channels: Channel count (dim 1) of the input and of the
	            internal features.
	        out_channels: Channel count of the returned tensor.
	        kernel_size: Kernel size of the gated convolutions.
	        dilation_rate: Dilation of the gated convolutions.
	        n_layers: Number of residual gated-convolution layers.
	    """

	    def __init__(self, hidden_channels, out_channels, kernel_size=5, dilation_rate=1, n_layers=4):
	        super().__init__()
	        self.pre = nn.Conv1d(hidden_channels, hidden_channels, 1)
	        self.dec = nn.ModuleList()
	        for _ in range(n_layers):
	            self.dec.append(nn.Sequential(
	                # GLU halves the channel dimension, so the convolution has to
	                # emit 2 * hidden_channels; otherwise the residual add in
	                # forward() fails with a shape mismatch.
	                # NOTE(review): the conv weight is now (2*hidden, hidden, k);
	                # confirm this matches the layout of the trained checkpoint.
	                nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
	                          padding=(kernel_size - 1) * dilation_rate // 2,
	                          dilation=dilation_rate),
	                nn.GLU(dim=1),
	            ))
	        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

	    def forward(self, x):
	        """Return the ``out_channels``-wide projection of the refined ``x``."""
	        x = self.pre(x)
	        for layer in self.dec:
	            x = x + layer(x)
	        return self.proj(x)


	class RVCModel(nn.Module):
	    """VITS-like RVC v3.0 Model.

	    Chains a PosteriorEncoder, a ResidualCouplingBlock (applied forward and
	    then in reverse) and a Decoder to map a mel tensor to a mel tensor with
	    the same number of mel channels.
	    """

	    def __init__(self, n_mels=80, hidden_channels=192):
	        super().__init__()
	        self.enc_p = PosteriorEncoder(n_mels, hidden_channels)
	        self.flow = ResidualCouplingBlock(hidden_channels, hidden_channels)
	        self.dec = Decoder(hidden_channels, n_mels)
	        self.n_mels = n_mels

	    def _encode_decode(self, mel, noise_scale):
	        """Shared pipeline behind ``forward`` and ``infer``."""
	        mean, log_scale = self.enc_p(mel)
	        # Noise is always drawn, then scaled; a scale of 0.0 cancels it.
	        noise = torch.randn_like(log_scale)
	        latent = mean + noise * torch.exp(log_scale) * noise_scale
	        flowed = self.flow(latent)
	        restored = self.flow(flowed, reverse=True)
	        return self.dec(restored)

	    def forward(self, mel):
	        """Run the pipeline with the sampled noise multiplied by 0.0."""
	        return self._encode_decode(mel, 0.0)

	    def infer(self, mel, noise_scale=0.0):
	        """Run the pipeline with the sampled noise scaled by ``noise_scale``."""
	        return self._encode_decode(mel, noise_scale)


	# ============================================================
	# HiFi-GAN Vocoder
	# ============================================================

	class ResBlock1(nn.Module):
	    """Residual block with one two-convolution branch per dilation value.

	    Each branch is LeakyReLU -> dilated Conv1d -> LeakyReLU -> undilated
	    Conv1d, padded so the sequence length is unchanged for odd kernel sizes.

	    Args:
	        channels: Channel count of input and output.
	        kernel_size: Kernel size of every convolution.
	        dilation: Iterable of dilation values, one branch per entry.
	    """

	    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
	        super().__init__()
	        plain_pad = (kernel_size - 1) // 2
	        branches = []
	        for rate in dilation:
	            dilated_conv = nn.Conv1d(
	                channels, channels, kernel_size,
	                dilation=rate, padding=(kernel_size - 1) * rate // 2,
	            )
	            plain_conv = nn.Conv1d(
	                channels, channels, kernel_size,
	                dilation=1, padding=plain_pad,
	            )
	            branches.append(nn.Sequential(
	                nn.LeakyReLU(0.1),
	                dilated_conv,
	                nn.LeakyReLU(0.1),
	                plain_conv,
	            ))
	        self.convs = nn.ModuleList(branches)

	    def forward(self, x):
	        """Add each branch's output to ``x`` in turn and return the result."""
	        for branch in self.convs:
	            x = x + branch(x)
	        return x


	class HiFiGANGenerator(nn.Module):
	    """Upsampling generator that turns a mel tensor into a 1-channel signal.

	    Each upsampling stage halves the channel count with a transposed
	    convolution and then averages the outputs of one ResBlock1 per entry of
	    ``resblock_kernel_sizes``.

	    Args:
	        in_channels: Channel count of the input mel tensor.
	        upsample_rates: Stride of each transposed convolution.
	        upsample_kernel_sizes: Kernel size of each transposed convolution.
	        upsample_initial_channel: Channel count after ``conv_pre``.
	        resblock_kernel_sizes: Kernel size of each residual block per stage.
	        resblock_dilation_sizes: Dilation tuple of each residual block.
	    """

	    def __init__(self, in_channels=80, upsample_rates=(8, 8, 2, 2),
	                 upsample_kernel_sizes=(16, 16, 4, 4),
	                 upsample_initial_channel=512,
	                 resblock_kernel_sizes=(3, 7, 11),
	                 resblock_dilation_sizes=((1, 3, 5), (1, 3, 5), (1, 3, 5))):
	        super().__init__()
	        self.conv_pre = nn.Conv1d(in_channels, upsample_initial_channel, 7, padding=3)
	        self.num_upsamples = len(upsample_rates)
	        self.num_kernels = len(resblock_kernel_sizes)
	        self.ups = nn.ModuleList()
	        self.resblocks = nn.ModuleList()
	        width = upsample_initial_channel
	        for rate, up_kernel in zip(upsample_rates, upsample_kernel_sizes):
	            narrowed = width // 2
	            self.ups.append(
	                nn.ConvTranspose1d(width, narrowed, up_kernel, rate, padding=(up_kernel - rate) // 2)
	            )
	            # Residual blocks are stored flat: stage i owns the index range
	            # [i * num_kernels, (i + 1) * num_kernels).
	            for res_kernel, res_dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
	                self.resblocks.append(ResBlock1(narrowed, res_kernel, res_dilation))
	            width = narrowed
	        self.conv_post = nn.Sequential(
	            nn.LeakyReLU(0.1),
	            nn.Conv1d(width, 1, 7, padding=3),
	            nn.Tanh(),
	        )

	    def forward(self, x):
	        """Upsample ``x`` stage by stage and return the tanh-bounded output."""
	        x = self.conv_pre(x)
	        for stage in range(self.num_upsamples):
	            x = self.ups[stage](F.leaky_relu(x, 0.1))
	            offset = stage * self.num_kernels
	            # Average the residual blocks that belong to this stage.
	            total = sum(self.resblocks[offset + j](x) for j in range(self.num_kernels))
	            x = total / self.num_kernels
	        return self.conv_post(x)


	# ============================================================
	# Mel utilities (no librosa)
	# ============================================================

	def compute_mel(y, sample_rate=40000, n_fft=1024, hop_length=256, n_mels=80):
	    """Return the natural-log power mel spectrogram of waveform ``y``.

	    The spectrogram uses the HTK mel scale without filterbank normalisation,
	    spans 0 Hz to ``sample_rate // 2``, and is floored at 1e-5 before the
	    logarithm is taken.
	    """
	    upper_hz = float(sample_rate // 2)
	    to_mel = torchaudio.transforms.MelSpectrogram(
	        sample_rate=sample_rate,
	        n_fft=n_fft,
	        hop_length=hop_length,
	        n_mels=n_mels,
	        f_min=0.0,
	        f_max=upper_hz,
	        power=2.0,
	        norm=None,
	        mel_scale="htk",
	    )
	    power_mel = to_mel(y)
	    # Floor the power so log() never sees zero.
	    return torch.log(torch.clamp(power_mel, min=1e-5))


	def mel_to_audio_griffinlim(mel, sample_rate=40000, n_fft=1024, hop_length=256, n_iter=32):
	    """Turn a log-mel tensor into a waveform array with Griffin-Lim.

	    ``mel`` is exponentiated, mapped back to ``n_fft // 2 + 1`` linear
	    frequency bins with an HTK inverse mel scale, and passed through
	    ``n_iter`` Griffin-Lim iterations. ``mel.shape[0]`` is used as the
	    number of mel bins. Returns a NumPy array.
	    """
	    mel_bins = mel.shape[0]
	    to_linear = torchaudio.transforms.InverseMelScale(
	        n_stft=n_fft // 2 + 1,
	        n_mels=mel_bins,
	        sample_rate=sample_rate,
	        f_min=0,
	        f_max=float(sample_rate // 2),
	        mel_scale="htk",
	    )
	    linear_power = to_linear(torch.exp(mel))
	    to_wave = torchaudio.transforms.GriffinLim(
	        n_fft=n_fft, hop_length=hop_length, n_iter=n_iter
	    )
	    waveform = to_wave(linear_power)
	    return waveform.numpy()


	# ============================================================
	# Inference Engine
	# ============================================================

	class VoiceCloner:
	    """Load the conversion model and vocoder once, then convert audio files.

	    Construction downloads the checkpoints from the Hugging Face Hub and
	    lists the sample files of the dataset repository; failures are printed
	    and leave the corresponding attribute unset instead of raising.
	    """

	    def __init__(self):
	        self.device = torch.device('cpu')
	        self.rvc_model = None       # RVCModel, or None when loading failed
	        self.hifigan = None         # HiFiGANGenerator, or None -> Griffin-Lim
	        self.sample_rate = 40000    # rate every input is resampled to
	        self.dataset_id = "ayf3/numberblocks-one-voice-dataset"
	        self.model_loaded = False
	        self.samples = []           # repo paths of sample .wav files
	        self.load_models()

	    @staticmethod
	    def _load_weights(module, state_dict, label, require_complete=False):
	        """Copy ``state_dict`` into ``module`` non-strictly and report gaps.

	        Raises:
	            RuntimeError: If no weight of ``module`` was found in the
	                checkpoint, or if any weight is missing while
	                ``require_complete`` is set.
	        """
	        expected = len(module.state_dict())
	        result = module.load_state_dict(state_dict, strict=False)
	        missing = len(result.missing_keys)
	        unexpected = len(result.unexpected_keys)
	        if missing or unexpected:
	            print(f"{label}: {missing}/{expected} weights missing from checkpoint, "
	                  f"{unexpected} unexpected keys ignored")
	        if missing and (require_complete or missing == expected):
	            raise RuntimeError(f"{label} checkpoint does not match the model definition")

	    @staticmethod
	    def _shift_pitch(y, semitones):
	        """Resample ``y`` by linear interpolation to shift pitch.

	        Positive values shorten the signal, negative values lengthen it, so
	        the duration changes together with the pitch.
	        """
	        if semitones == 0:
	            return y
	        factor = 2.0 ** (abs(semitones) / 12.0)
	        new_len = int(len(y) / factor) if semitones > 0 else int(len(y) * factor)
	        stretched = F.interpolate(y.unsqueeze(0).unsqueeze(0), size=new_len, mode='linear')
	        return stretched.squeeze(0).squeeze(0)

	    @staticmethod
	    def _trim_silence(y, sr):
	        """Cut leading/trailing samples whose smoothed energy is 20 dB below peak."""
	        energy = y ** 2
	        window_size = int(0.1 * sr)  # 100 ms moving average
	        if len(energy) <= window_size:
	            return y
	        kernel = torch.ones(window_size) / window_size
	        smooth_energy = F.conv1d(
	            energy.unsqueeze(0).unsqueeze(0), kernel.unsqueeze(0).unsqueeze(0),
	            padding=window_size // 2,
	        ).squeeze()
	        threshold = smooth_energy.max() * (10 ** (-20 / 10))
	        active = torch.where(smooth_energy > threshold)[0]
	        if len(active) == 0:
	            return y
	        return y[active[0]:active[-1] + 1]

	    def load_models(self):
	        """Download and load the RVC model, the vocoder and the sample list."""
	        print("Loading RVC model...")
	        try:
	            model_path = hf_hub_download(
	                repo_id=self.dataset_id, filename="models/one_voice_rvc_v2.pth", repo_type="dataset"
	            )
	            # NOTE(security): weights_only=False unpickles arbitrary objects;
	            # only acceptable because the checkpoint repository is trusted.
	            ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
	            state_dict = ckpt.get('model', ckpt.get('state_dict', ckpt))
	            # Infer the hidden width from the first encoder convolution.
	            hidden_ch = 192
	            for k, v in state_dict.items():
	                if 'enc_p.pre.weight' in k:
	                    hidden_ch = v.shape[0]
	                    break
	            model = RVCModel(n_mels=80, hidden_channels=hidden_ch)
	            self._load_weights(model, state_dict, "RVC model")
	            model.eval()
	            # Publish the model only after its weights loaded, so a failed
	            # load never leaves a randomly initialised model in place.
	            self.rvc_model = model
	            print(f"RVC model loaded (hidden={hidden_ch})")
	        except Exception as e:
	            self.rvc_model = None
	            print(f"RVC model load failed: {e}")

	        print("Loading HiFi-GAN vocoder...")
	        try:
	            hifigan_path = hf_hub_download(repo_id="jik876/hifi-gan", filename="UNIVERSAL_V1/g_02500000")
	            # NOTE(security): see the weights_only remark above.
	            ckpt = torch.load(hifigan_path, map_location='cpu', weights_only=False)
	            state_dict = ckpt.get('generator', ckpt.get('state_dict', ckpt))
	            if any(k.startswith('generator.') for k in state_dict):
	                state_dict = {k.replace('generator.', ''): v for k, v in state_dict.items() if k.startswith('generator.')}
	            vocoder = HiFiGANGenerator()
	            # A generator with missing weights only produces noise, so an
	            # incomplete checkpoint triggers the Griffin-Lim fallback.
	            self._load_weights(vocoder, state_dict, "HiFi-GAN", require_complete=True)
	            vocoder.eval()
	            self.hifigan = vocoder
	            print("HiFi-GAN vocoder loaded")
	        except Exception as e:
	            print(f"HiFi-GAN load failed: {e}, using Griffin-Lim fallback")
	            self.hifigan = None

	        try:
	            api = HfApi()
	            files = api.list_repo_files(self.dataset_id, repo_type="dataset")
	            # Keep the base samples only; skip the '_p+', '_p-' and '_s+' variants.
	            self.samples = [f for f in files if f.startswith('models/top_')
	                            and f.endswith('.wav')
	                            and '_p+' not in f and '_p-' not in f and '_s+' not in f]
	            print(f"Found {len(self.samples)} sample audio files")
	        except Exception as e:
	            print(f"Could not list samples: {e}")

	        self.model_loaded = self.rvc_model is not None

	    def process_audio(self, input_audio, pitch_shift=0):
	        """Convert one audio file and write the result to a temporary WAV.

	        Args:
	            input_audio: Path of the audio file to convert, or None.
	            pitch_shift: Pitch shift in semitones applied before conversion.

	        Returns:
	            ``(output_path, status)``; ``output_path`` is None when the
	            model is not loaded, no input was given, or processing failed.
	        """
	        if not self.model_loaded:
	            return None, "Model not loaded"
	        if input_audio is None:
	            return None, "Please upload an audio file"
	        try:
	            y, sr = torchaudio.load(input_audio)
	            # Down-mix to a single channel.
	            y = y.mean(dim=0) if y.shape[0] > 1 else y.squeeze(0)
	            if sr != self.sample_rate:
	                y = torchaudio.transforms.Resample(sr, self.sample_rate)(y)
	                sr = self.sample_rate

	            y = self._shift_pitch(y, pitch_shift)
	            y = self._trim_silence(y, sr)
	            # Cap the input at 10 seconds.
	            y = y[:10 * self.sample_rate]

	            mel = compute_mel(y, sample_rate=self.sample_rate, n_mels=80)

	            with torch.no_grad():
	                mel_out = self.rvc_model.infer(mel.unsqueeze(0), noise_scale=0.0)
	                mel_out = mel_out.squeeze(0)

	            if self.hifigan is not None:
	                with torch.no_grad():
	                    audio_out = self.hifigan(mel_out.unsqueeze(0))
	                audio_out = audio_out.squeeze(0).squeeze(0).cpu().numpy()
	                vocoder_name = "HiFi-GAN"
	            else:
	                # The keyword is 'sample_rate'; passing 'sr' raised TypeError.
	                audio_out = mel_to_audio_griffinlim(mel_out, sample_rate=self.sample_rate)
	                vocoder_name = "Griffin-Lim"

	            # Peak-normalise to 0.95 full scale.
	            audio_out = audio_out / (np.max(np.abs(audio_out)) + 1e-7) * 0.95
	            # NamedTemporaryFile creates the file atomically (mktemp is
	            # deprecated and race-prone); delete=False keeps it for Gradio.
	            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
	                output_path = tmp.name
	            sf.write(output_path, audio_out, self.sample_rate)
	            return output_path, f"Success ({vocoder_name}) | {len(y)/sr:.1f}s -> {len(audio_out)/self.sample_rate:.1f}s"
	        except Exception as e:
	            import traceback
	            traceback.print_exc()
	            return None, f"Error: {str(e)}"

	    def generate_random(self):
	        """Convert a randomly chosen dataset sample; returns ``(path, status)``."""
	        if not self.samples:
	            return None, "No samples available"
	        try:
	            sample = random.choice(self.samples)
	            sample_path = hf_hub_download(repo_id=self.dataset_id, filename=sample, repo_type="dataset")
	            output, msg = self.process_audio(sample_path)
	            if output:
	                return output, f"{msg}\nSample: {Path(sample).name}"
	            return output, msg
	        except Exception as e:
	            return None, f"Error: {str(e)}"


	# ============================================================
	# Gradio UI
	# ============================================================

	# Build the single shared cloner at import time; this downloads the model
	# checkpoints and lists the dataset samples.
	print("Initializing NumberBlocks One Voice Cloner...")
	cloner = VoiceCloner()

	# Tab 1: convert an uploaded file, with an optional pitch shift.
	vc_interface = gr.Interface(
	    fn=cloner.process_audio,
	    inputs=[
	        gr.Audio(label="Upload Audio", type="filepath"),
	        gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)"),
	    ],
	    outputs=[
	        gr.Audio(label="Result", type="filepath"),
	        gr.Textbox(label="Status"),
	    ],
	    title="NumberBlocks One Voice Cloner",
	    # Plain '|' here: '\|' is an invalid escape sequence in a normal string.
	    description="RVC v2 Model (60.7MB) + HiFi-GAN Vocoder | Upload audio to convert to One's voice",
	    allow_flagging="never",
	)

	# Tab 2: convert a randomly chosen sample from the dataset repository.
	rand_interface = gr.Interface(
	    fn=cloner.generate_random,
	    inputs=[],
	    outputs=[
	        gr.Audio(label="Result", type="filepath"),
	        gr.Textbox(label="Status"),
	    ],
	    title="Random Sample Generation",
	    description="Generate from random dataset sample + RVC conversion",
	    allow_flagging="never",
	)

	demo = gr.TabbedInterface(
	    [vc_interface, rand_interface],
	    ["Voice Conversion", "Random Sample"],
	)

	if __name__ == "__main__":
	    # Listen on all interfaces on port 7860.
	    demo.launch(server_name="0.0.0.0", server_port=7860)