ayf3 committed on
Commit
4428f5f
·
verified ·
1 Parent(s): ad8ecfd

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +238 -0
app.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ NumberBlocks One Voice Cloner - RVC Inference Service
4
+ Uses the trained RVC v2 model for voice conversion.
5
+ """
6
+ import os, json, subprocess, sys
7
+ import gradio as gr
8
+ import numpy as np
9
+ import struct
10
+
11
+ # Install RVC on first run
12
def setup_rvc(rvc_dir="/app/RVC"):
    """Clone the RVC repository into ``rvc_dir`` unless a checkout already exists.

    The clone is best-effort: a failure (non-zero exit status, timeout, or a
    missing ``git`` executable) is reported on stderr instead of being silently
    ignored or crashing the app, and ``rvc_dir`` is returned either way.

    Args:
        rvc_dir: Target directory of the checkout. A ``.git`` entry inside it
            is treated as proof that the clone already happened.

    Returns:
        The ``rvc_dir`` path that was passed in.
    """
    if os.path.exists(os.path.join(rvc_dir, ".git")):
        return rvc_dir

    clone_cmd = [
        "git", "clone", "--depth", "1",
        "https://github.com/RVC-Project/Retrieval-based-Voice-Conversion.git",
        rvc_dir,
    ]
    try:
        # check=False keeps the clone best-effort; the exit status is
        # inspected below so that a failure is at least visible in the logs.
        result = subprocess.run(clone_cmd, check=False, timeout=300)
    except (OSError, subprocess.TimeoutExpired) as exc:
        print(f"[setup_rvc] git clone did not complete: {exc}", file=sys.stderr)
    else:
        if result.returncode != 0:
            print(
                f"[setup_rvc] git clone exited with status {result.returncode}",
                file=sys.stderr,
            )
    return rvc_dir
19
+
20
def download_model():
    """Fetch the trained checkpoint from the Hub dataset repo.

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file
        (used by the caller as the path handed to ``torch.load``).
    """
    # Imported lazily so the dependency is only needed when a download happens.
    from huggingface_hub import hf_hub_download

    return hf_hub_download(
        repo_type="dataset",
        repo_id="ayf3/numberblocks-one-voice-dataset",
        filename="models/one_voice_rvc_v2.pth",
    )
29
+
30
def load_model():
    """Download the checkpoint, rebuild the network and load its weights.

    The network is Encoder -> Posterior -> Flow -> Decoder. Layer attribute
    names (``conv1``, ``bn1``, ``net`` ...) define the state-dict keys and must
    stay in sync with the checkpoint.

    Returns:
        A ``(model, cfg)`` tuple: the ``VoiceModel`` in eval mode and the
        ``config`` dict stored in the checkpoint.

    Raises:
        KeyError: If the checkpoint lacks ``config`` / ``model_state_dict`` or
            the config lacks ``n_mels`` / ``hidden_dim``.
    """
    import torch
    import torch.nn as nn

    model_file = download_model()
    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, so
    # this is only safe while the checkpoint repository is fully trusted.
    # TODO(review): switch to weights_only=True if the checkpoint permits it.
    ckpt = torch.load(model_file, map_location="cpu", weights_only=False)
    cfg = ckpt['config']
    sd = ckpt['model_state_dict']
    n_mels, hd = cfg['n_mels'], cfg['hidden_dim']

    class Encoder(nn.Module):
        """Five Conv1d+BatchNorm+ReLU stages followed by a channel LayerNorm."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(n_mels, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv2, self.bn2 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv3, self.bn3 = nn.Conv1d(hd, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv5, self.bn5 = nn.Conv1d(hd * 2, hd * 2, 3, padding=1), nn.BatchNorm1d(hd * 2)
            self.ln = nn.LayerNorm(hd * 2)

        def forward(self, x):
            stages = (
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
                (self.conv5, self.bn5),
            )
            for conv, bn in stages:
                x = torch.relu(bn(conv(x)))
            # LayerNorm normalizes the last axis, so move channels there and back.
            return self.ln(x.transpose(1, 2)).transpose(1, 2)

    class Posterior(nn.Module):
        """Project encoder features to mean / log-scale and draw a sample."""

        def __init__(self):
            super().__init__()
            self.conv = nn.Conv1d(hd * 2, 384, 1)

        def forward(self, x):
            stats = self.conv(x)
            m, logs = torch.split(stats, 192, dim=1)
            # NOTE: noise is drawn on every call, so outputs are stochastic
            # even when the model is in eval mode.
            z = m + torch.randn_like(m) * torch.exp(logs)
            return z, m, logs

    class Flow(nn.Module):
        """Residual update of z predicted from its first 96 channels."""

        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Conv1d(96, hd, 1), nn.ReLU(),
                nn.Conv1d(hd, hd, 1), nn.ReLU(),
                nn.Conv1d(hd, 192, 1),
            )

        def forward(self, z):
            # Only the first half of the channels conditions the update; the
            # second half was split off but never used in the original code.
            z1 = z[:, :96]
            return z + self.net(z1)

    class Decoder(nn.Module):
        """Four Conv1d+BatchNorm+ReLU stages and a 1x1 projection to 128 channels."""

        def __init__(self):
            super().__init__()
            self.conv1, self.bn1 = nn.Conv1d(192, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv2, self.bn2 = nn.Conv1d(hd * 2, hd * 2, 5, padding=2), nn.BatchNorm1d(hd * 2)
            self.conv3, self.bn3 = nn.Conv1d(hd * 2, hd, 5, padding=2), nn.BatchNorm1d(hd)
            self.conv4, self.bn4 = nn.Conv1d(hd, hd, 3, padding=1), nn.BatchNorm1d(hd)
            self.conv5 = nn.Conv1d(hd, 128, 1)

        def forward(self, z):
            stages = (
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
                (self.conv4, self.bn4),
            )
            for conv, bn in stages:
                z = torch.relu(bn(conv(z)))
            return self.conv5(z)

    class VoiceModel(nn.Module):
        """Full pipeline: mel -> encoder -> posterior sample -> flow -> decoder."""

        def __init__(self):
            super().__init__()
            self.encoder = Encoder()
            self.posterior = Posterior()
            self.flow = Flow()
            self.decoder = Decoder()

        def forward(self, mel):
            h = self.encoder(mel)
            z, m, logs = self.posterior(h)
            z = self.flow(z)
            return self.decoder(z), m, logs

    model = VoiceModel()
    # strict=False tolerates key mismatches, but mismatched layers then keep
    # their random initialization; surface that instead of hiding it.
    load_report = model.load_state_dict(sd, strict=False)
    if load_report.missing_keys or load_report.unexpected_keys:
        import sys
        print(
            "[load_model] checkpoint/model key mismatch: "
            f"missing={list(load_report.missing_keys)} "
            f"unexpected={list(load_report.unexpected_keys)}",
            file=sys.stderr,
        )
    model.eval()
    return model, cfg
104
+
105
# Lazily populated by init(); both stay None until the first request.
model = None
config = None


def init():
    """Populate the module-level ``model`` and ``config`` on first use.

    Subsequent calls are no-ops once ``model`` has been set.
    """
    global model, config
    if model is not None:
        return
    model, config = load_model()
113
+
114
def mel_to_audio_simple(mel_np, sr=40000, hop=256, win=1024):
    """Turn a mel-like array into a crude energy-envelope waveform.

    This is a placeholder, not a vocoder: for every frame the mean of
    ``exp`` over the first 64 rows (clipped to [-10, 10]) is added as a
    constant over a ``win``-sample window starting at ``frame * hop``, and the
    result is peak-normalized to 0.5.

    Args:
        mel_np: Array of shape ``(n_bins, n_frames)``. A 1-D array is treated
            as a single frame.
        sr: Unused; kept for interface compatibility.
        hop: Number of samples between consecutive frame starts.
        win: Number of samples each frame contributes to.

    Returns:
        1-D float array of length ``n_frames * hop`` (empty for zero frames).
    """
    mel_np = np.asarray(mel_np)
    if mel_np.ndim == 1:
        # A single frame whose time axis was squeezed away by the caller.
        mel_np = mel_np[:, np.newaxis]

    n_frames = mel_np.shape[1]
    audio = np.zeros(n_frames * hop)
    if n_frames == 0:
        # Nothing to synthesize; np.max below would fail on an empty array.
        return audio

    # Per-frame energy, computed once for all frames.
    energies = np.exp(np.clip(mel_np[:64], -10, 10)).mean(axis=0)
    for i, energy in enumerate(energies):
        start = i * hop
        # Clip the window to the buffer instead of dropping the frame, so the
        # trailing frames still contribute to the output.
        stop = min(start + win, len(audio))
        audio[start:stop] += energy * 0.01

    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak * 0.5
    return audio
127
+
128
def convert_voice(audio_input, transpose=0):
    """Convert an uploaded/recorded clip with the voice model.

    Args:
        audio_input: ``(sample_rate, samples)`` pair as delivered by the
            Gradio audio component, or ``None`` when nothing was provided.
            ``samples`` may be mono ``(n,)`` or multi-channel ``(n, channels)``.
        transpose: Number of mel bins to shift the spectrogram by (positive
            moves content towards higher bins). This approximates a pitch
            shift; it is not an exact semitone shift.

    Returns:
        ``((sample_rate, audio), status_message)`` on success, or
        ``(None, error_message)`` when no usable audio was supplied.
    """
    # Validate before init() so a missing upload does not trigger a model load.
    if audio_input is None:
        return None, "❌ 请上传音频文件"

    sr_in, data = audio_input[0], np.asarray(audio_input[1])
    if data.size == 0:
        return None, "❌ 请上传音频文件"

    init()
    import torch
    import librosa

    sr = config['sample_rate']
    hop = config['hop_length']

    # Down-mix first so resampling works on a single channel.
    if data.ndim > 1:
        data = data.mean(axis=1)
    data = data.astype(np.float32)

    # Measure the input duration before resampling changes the sample count.
    in_seconds = len(data) / sr_in

    if sr_in != sr:
        # Linear interpolation onto the target sample grid (no extra
        # dependency; a band-limited resampler would be better still).
        n_samples = max(1, int(len(data) * sr / sr_in))
        positions = np.linspace(0, len(data) - 1, n_samples)
        data = np.interp(positions, np.arange(len(data)), data).astype(np.float32)

    # Compute the normalized log-mel spectrogram.
    mel = librosa.feature.melspectrogram(
        y=data,
        sr=sr,
        n_mels=config['n_mels'],
        hop_length=hop,
        win_length=config['win_length'],
        n_fft=config['n_fft'],
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db - mel_db.mean()) / (mel_db.std() + 1e-8)

    # Shift along the mel axis. Vacated bins are filled with the floor value
    # rather than wrapped around, so high-frequency content never reappears
    # at the bottom of the spectrum (or vice versa).
    shift = int(round(transpose))
    if shift != 0:
        shifted = np.full_like(mel_norm, mel_norm.min())
        if shift > 0:
            shifted[shift:] = mel_norm[:-shift]
        else:
            shifted[:shift] = mel_norm[-shift:]
        mel_norm = shifted

    with torch.no_grad():
        mel_tensor = torch.FloatTensor(mel_norm).unsqueeze(0)
        mel_out, _, _ = model(mel_tensor)

    # Drop only the batch axis; a bare squeeze() would also drop the time
    # axis for a single-frame result.
    mel_out_np = mel_out.squeeze(0).numpy()
    audio_out = mel_to_audio_simple(mel_out_np, sr, hop)

    return (sr, audio_out), f"✅ 转换完成! 输入: {in_seconds:.1f}s → 输出: {len(audio_out)/sr:.1f}s"
179
+
180
def generate_sample():
    """Synthesize a clip by decoding a random latent.

    A latent of 192 channels x 400 frames is drawn from a scaled normal
    distribution, passed through the model's flow and decoder, and rendered
    with ``mel_to_audio_simple`` using the sample rate and hop length from the
    loaded config, so the waveform matches the sample rate it is returned with.

    Returns:
        ``((sample_rate, audio), status_message)``.
    """
    init()
    import torch

    sr = config['sample_rate']
    hop = config['hop_length']

    n_frames = 400  # ~2.5s when hop is 256 at 40 kHz
    with torch.no_grad():
        z = torch.randn(1, 192, n_frames) * 0.5
        z = model.flow(z)
        mel_out = model.decoder(z)

    # Drop only the batch axis so the (bins, frames) layout is preserved.
    mel_np = mel_out.squeeze(0).numpy()
    audio = mel_to_audio_simple(mel_np, sr, hop)

    return (sr, audio), "✅ 生成完成! (随机采样模式)"
195
+
196
# Build the Gradio interface: one tab per feature plus an informational tab.
with gr.Blocks(title="🎙️ NumberBlocks One Voice", theme=gr.themes.Soft()) as demo:
    # Page header
    gr.HTML("""
    <div style="text-align:center; margin-bottom:1rem">
    <h1 style="color:#ff6b6b">🎙️ NumberBlocks One 语音克隆</h1>
    <p>RVC v2 Model — Voice Conversion & Generation</p>
    </div>
    """)

    # Tab 1: convert uploaded or recorded audio
    with gr.Tab("🔊 Voice Conversion"):
        gr.Markdown("上传音频,将其转换为 One 的声音")
        audio_in = gr.Audio(label="输入音频", sources=["upload", "microphone"])
        pitch = gr.Slider(
            minimum=-12,
            maximum=12,
            value=0,
            step=1,
            label="Pitch Shift (semitones)",
        )
        convert_btn = gr.Button("🔄 转换", variant="primary")
        audio_out = gr.Audio(label="输出音频")
        status = gr.Textbox(label="状态")
        convert_btn.click(
            fn=convert_voice,
            inputs=[audio_in, pitch],
            outputs=[audio_out, status],
        )

    # Tab 2: decode a random latent into a sample
    with gr.Tab("🎵 Sample Generation"):
        gr.Markdown("生成 One 的随机语音样本")
        gen_btn = gr.Button("🎵 生成样本", variant="primary")
        gen_out = gr.Audio(label="生成音频")
        gen_status = gr.Textbox(label="状态")
        gen_btn.click(fn=generate_sample, inputs=None, outputs=[gen_out, gen_status])

    # Tab 3: static model information
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### Model Info
        - **Architecture**: VITS-like (Encoder + Posterior + Flow + Decoder)
        - **Parameters**: 5,296,064 (5.3M)
        - **Sample Rate**: 40kHz
        - **Training Data**: 100 source files, 1,334 chunks
        - **Training Steps**: 500
        - **Final Loss**: 0.0009

        ### Links
        - [Dataset](https://huggingface.co/datasets/ayf3/numberblocks-one-voice-dataset)
        - [Training Space](https://huggingface.co/spaces/ayf3/rvc-cpu-trainer)

        ⚠️ Note: Audio quality is limited without a neural vocoder (HiFi-GAN).
        """)

# Serve on all interfaces at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)