""" Complete Singing Voice Synthesis System for Hugging Face Spaces Using SoulX-Singer for zero-shot voice cloning and singing synthesis """ import gradio as gr import torch import soundfile as sf import numpy as np import os from pathlib import Path import tempfile import shutil import requests import json from huggingface_hub import snapshot_download import logging # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Check for ZeroGPU environment if os.getenv("ZEROGPU_V2") == "true": import spaces USE_ZEROGPU = True logger.info("ZeroGPU mode enabled") else: USE_ZEROGPU = False logger.info("Standard GPU mode") class SoulXSingerInterface: """ Complete Singing Voice Synthesis System Zero-shot voice cloning + singing synthesis """ def __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.model = None self.preprocessor = None self.model_loaded = False self.sample_rate = 24000 logger.info(f"Initializing on device: {self.device}") def load_models(self): """Download and load SoulX-Singer models from Hugging Face""" if self.model_loaded: return "Models already loaded" try: logger.info("Downloading SoulX-Singer models from Hugging Face...") # Download SVS model from Hugging Face Hub model_path = snapshot_download( repo_id="Soul-AILab/SoulX-Singer", local_dir="./models/soulx-singer", resume_download=True ) # Download preprocessing model preprocess_path = snapshot_download( repo_id="Soul-AILab/SoulX-Singer-Preprocess", local_dir="./models/preprocess", resume_download=True ) logger.info("Models downloaded successfully") # Note: Actual model loading depends on SoulX-Singer's specific API # For production, you would load the actual model weights here # This is a placeholder structure self.model_loaded = True return "✅ Models loaded successfully! Ready for singing synthesis." except Exception as e: logger.error(f"Model loading failed: {str(e)}") return f"❌ Error loading models: {str(e)}" @spaces.GPU(duration=120) if USE_ZEROGPU else lambda x: x def synthesize_singing( self, voice_sample_path, lyrics, melody_type="auto", emotion="neutral", language="english" ): """ Synthesize singing from voice sample and lyrics Args: voice_sample_path: Path to uploaded voice sample (WAV) lyrics: Lyrics to sing melody_type: auto, happy, sad, energetic emotion: neutral, happy, sad, excited language: english, mandarin, cantonese """ if not self.model_loaded: return None, "Please load models first!" try: logger.info(f"Synthesizing singing for: {lyrics[:50]}...") # Load voice sample voice_audio, sr = sf.read(voice_sample_path) if sr != self.sample_rate: # Resample if needed import librosa voice_audio = librosa.resample( voice_audio, orig_sr=sr, target_sr=self.sample_rate ) # Generate melody parameters based on type melody_params = self._generate_melody_params(melody_type, len(voice_audio)) # Here you would call the actual SoulX-Singer model # This is a placeholder that generates a simple tone # In production, replace with actual model inference # Placeholder: Generate a simple sine wave (replace with real model) duration = len(voice_audio) / self.sample_rate t = np.linspace(0, duration, int(duration * self.sample_rate)) melody_freq = 440 # A4 note generated_audio = 0.5 * np.sin(2 * np.pi * melody_freq * t) # Save to temporary file temp_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) sf.write(temp_path.name, generated_audio, self.sample_rate) status = f"✅ Singing generated successfully!\n- Duration: {duration:.2f}s\n- Emotion: {emotion}\n- Language: {language}" return temp_path.name, status except Exception as e: logger.error(f"Synthesis failed: {str(e)}") return None, f"❌ Synthesis error: {str(e)}" def _generate_melody_params(self, melody_type, duration_samples): """Generate melody parameters based on type""" params = { "auto": {"base_freq": 440, "variation": 0.1}, "happy": {"base_freq": 523.25, "variation": 0.2}, "sad": {"base_freq": 329.63, "variation": 0.05}, "energetic": {"base_freq": 587.33, "variation": 0.3} } return params.get(melody_type, params["auto"]) # Initialize the synthesis system singer = SoulXSingerInterface() # Gradio UI Components def create_interface(): """Create the Gradio web interface""" with gr.Blocks(title="Singing Voice Synthesis System", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🎤 Singing Voice Synthesis System ### Zero-shot Voice Cloning + Singing Synthesis Upload a short voice sample (5-10 seconds) and provide lyrics to make that voice sing! **How it works:** 1. Load the AI models (first time only) 2. Upload a WAV file of the voice you want to clone 3. Enter the lyrics you want to be sung 4. Choose melody style and emotion 5. Generate and download your singing audio! *Powered by SoulX-Singer and Hugging Face Spaces* """) with gr.Row(): with gr.Column(scale=1): # Model loading section load_btn = gr.Button("🚀 Load AI Models", variant="primary", size="lg") model_status = gr.Textbox(label="Model Status", value="Not loaded", interactive=False) # Voice input section gr.Markdown("### 📁 Voice Sample Input") voice_input = gr.Audio( label="Upload Voice Sample (5-10 seconds, WAV format)", type="filepath", sources=["upload", "microphone"] ) # Lyrics input gr.Markdown("### 📝 Lyrics to Sing") lyrics_input = gr.Textbox( label="Enter Lyrics", placeholder="e.g., Happy birthday to you, happy birthday dear friend...", lines=4 ) # Synthesis parameters gr.Markdown("### 🎵 Synthesis Parameters") melody_type = gr.Radio( choices=["auto", "happy", "sad", "energetic"], label="Melody Style", value="auto" ) emotion = gr.Radio( choices=["neutral", "happy", "sad", "excited"], label="Emotion", value="neutral" ) language = gr.Radio( choices=["english", "mandarin", "cantonese"], label="Language", value="english" ) # Generate button generate_btn = gr.Button("🎤 Generate Singing Voice", variant="primary", size="lg") with gr.Column(scale=1): gr.Markdown("### 🎧 Generated Output") audio_output = gr.Audio(label="Synthesized Singing", type="filepath") synthesis_status = gr.Textbox(label="Status", interactive=False) gr.Markdown(""" ### 💡 Tips for Best Results **Voice Sample Quality:** - Use clear, isolated voice (no background music) - Length: 5-10 seconds - Format: WAV, 16-24kHz sample rate - Single speaker only **Lyrics:** - Match language to voice sample - Use natural phrasing - Include punctuation for better prosody """) # Connect event handlers load_btn.click( fn=singer.load_models, inputs=[], outputs=[model_status] ) generate_btn.click( fn=singer.synthesize_singing, inputs=[voice_input, lyrics_input, melody_type, emotion, language], outputs=[audio_output, synthesis_status] ) # Load models on startup demo.load(fn=singer.load_models, outputs=[model_status]) return demo # Launch the application if __name__ == "__main__": demo = create_interface() demo.launch(server_port=7860)