""" ============================================================================ AI-GENERATED CODE ============================================================================ """ """ Pocket TTS Web Application ========================== A Gradio-based web interface for the Pocket TTS text-to-speech model. This application provides an intuitive interface for generating speech from text using either preset voices or voice cloning capabilities. Features: --------- - Multiple preset voice options - Voice cloning from uploaded audio files - Configurable generation parameters (temperature, LSD steps, etc.) - Real-time character counting and validation - Temporary file management with automatic cleanup - Thread-safe generation state management Usage: ------ Run this script directly to launch the web application: $ python app.py The application will be available at http://localhost:7860 """ import os import time import torch import tempfile import threading import scipy.io.wavfile import gradio as gr from pocket_tts import TTSModel # ============================================================================= # ENVIRONMENT CONFIGURATION # ============================================================================= # Configure PyTorch threading behavior torch.set_num_threads(2) # Intra-op parallelism threads torch.set_num_interop_threads(2) # Inter-op parallelism threads # ============================================================================= # APPLICATION CONSTANTS # ============================================================================= # Define all configurable constants and default values used throughout # the application. These values control model behavior, UI constraints, # and resource management policies. 
# Preset voices that can be selected for speech generation
AVAILABLE_VOICES = [
    "alba", "marius", "javert", "jean",
    "fantine", "cosette", "eponine", "azelma",
]

# -----------------------------------------------------------------------------
# Defaults applied when the caller does not supply a value
# -----------------------------------------------------------------------------
DEFAULT_VOICE = "alba"                  # preset voice selected by default
DEFAULT_MODEL_VARIANT = "b6369a24"      # model variant identifier
DEFAULT_TEMPERATURE = 0.7               # generation temperature
DEFAULT_LSD_DECODE_STEPS = 1            # latent space decode steps
DEFAULT_EOS_THRESHOLD = -4.0            # end-of-sequence detection threshold
DEFAULT_NOISE_CLAMP = 0.0               # noise clamping value (0 = disabled)
DEFAULT_FRAMES_AFTER_EOS = 10           # additional frames after EOS

# -----------------------------------------------------------------------------
# Input constraints and resource management
# -----------------------------------------------------------------------------
MAXIMUM_INPUT_LENGTH = 1000                 # maximum text input characters
TEMPORARY_FILE_LIFETIME_SECONDS = 7200      # temp file retention (2 hours)

# -----------------------------------------------------------------------------
# Voice mode selection labels
# -----------------------------------------------------------------------------
VOICE_MODE_PRESET = "Preset Voices"     # use a predefined voice
VOICE_MODE_CLONE = "Voice Cloning"      # clone a voice from uploaded audio

# Demonstration prompts, each paired with the preset voice it should use.
# Every entry is a dict with the keys "text" and "voice".
EXAMPLE_PROMPTS_WITH_VOICES = [
    {"text": prompt_text, "voice": voice_name}
    for prompt_text, voice_name in (
        (
            "The quick brown fox jumps over the lazy dog near the riverbank.",
            "alba",
        ),
        (
            "Welcome to the future of text to speech technology powered by artificial intelligence.",
            "marius",
        ),
        (
            "Technology continues to push the boundaries of what we thought was possible.",
            "javert",
        ),
        (
            "The weather today is absolutely beautiful and perfect for a relaxing walk outside.",
            "fantine",
        ),
        (
            "Science and innovation are transforming how we interact with the world around us.",
            "jean",
        ),
    )
]

# =============================================================================
# THREAD SYNCHRONIZATION
# =============================================================================
# Global state management for thread-safe generation operations.
# These locks and flags prevent concurrent generation requests and
# enable graceful cancellation of ongoing operations.
generation_state_lock = threading.Lock()    # Lock for generation state access
is_currently_generating = False             # Flag indicating active generation
stop_generation_requested = False           # Flag for stop request signaling

# Temporary file registry for cleanup management
temporary_files_registry = {}               # Maps file paths to creation timestamps
temporary_files_lock = threading.Lock()     # Lock for registry access


# =============================================================================
# TEXT-TO-SPEECH MANAGER CLASS
# =============================================================================

class TextToSpeechManager:
    """
    Manages TTS model lifecycle and speech generation operations.

    The manager keeps at most one loaded model. The model is created lazily
    on the first call to ``load_or_get_model`` and replaced whenever the
    requested configuration differs from the current one. Voice states for
    preset voices are cached per loaded model.

    Attributes:
        loaded_model: Currently loaded TTS model instance, or None
        current_configuration: Dict of the configuration used to load it
        voice_state_cache: Maps preset voice names to computed voice states

    Example:
        >>> manager = TextToSpeechManager()
        >>> manager.load_or_get_model("b6369a24", 0.7, 1, None, -4.0)
        >>> voice_state = manager.get_voice_state_for_preset("alba")
        >>> audio = manager.generate_audio("Hello world", voice_state, 10, False)
    """

    def __init__(self):
        """Initialize the TTS manager with empty state."""
        self.loaded_model = None
        self.current_configuration = {}
        self.voice_state_cache = {}

    def _require_model(self):
        """
        Return the loaded model or fail with an explicit error.

        Raises:
            RuntimeError: If ``load_or_get_model`` has not been called yet
        """
        if self.loaded_model is None:
            raise RuntimeError(
                "No TTS model is loaded; call load_or_get_model() first."
            )
        return self.loaded_model

    def load_or_get_model(
        self,
        model_variant,
        temperature,
        lsd_decode_steps,
        noise_clamp,
        eos_threshold
    ):
        """
        Load a TTS model or return the cached instance if configuration matches.

        Missing (None) arguments fall back to the module-level defaults. A new
        model is loaded only when no model exists or the normalized
        configuration differs from the current one; in that case the preset
        voice state cache is cleared as well.

        Args:
            model_variant: Model variant identifier string
            temperature: Generation temperature
            lsd_decode_steps: Number of LSD decode steps
            noise_clamp: Maximum noise value; None, 0 or a negative value
                disables clamping
            eos_threshold: End-of-sequence detection threshold

        Returns:
            The loaded and configured TTS model instance
        """
        processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
        processed_temperature = (
            float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
        )
        processed_lsd_steps = (
            int(lsd_decode_steps)
            if lsd_decode_steps is not None
            else DEFAULT_LSD_DECODE_STEPS
        )
        processed_eos_threshold = (
            float(eos_threshold)
            if eos_threshold is not None
            else DEFAULT_EOS_THRESHOLD
        )

        # Only a strictly positive clamp is forwarded; everything else means
        # "disabled" and is passed to the model as None.
        processed_noise_clamp = None
        if noise_clamp:
            clamp_value = float(noise_clamp)
            if clamp_value > 0:
                processed_noise_clamp = clamp_value

        # The dict doubles as the cache key and as keyword arguments for
        # TTSModel.load_model, so its keys must match that signature.
        requested_configuration = {
            "variant": processed_variant,
            "temp": processed_temperature,
            "lsd_decode_steps": processed_lsd_steps,
            "noise_clamp": processed_noise_clamp,
            "eos_threshold": processed_eos_threshold
        }

        if (
            self.loaded_model is None
            or self.current_configuration != requested_configuration
        ):
            self.loaded_model = TTSModel.load_model(**requested_configuration)
            self.current_configuration = requested_configuration
            # Cached voice states were computed by the previous model
            self.voice_state_cache = {}

        return self.loaded_model

    def get_voice_state_for_preset(self, voice_name):
        """
        Get or compute the voice state for a preset voice.

        Names that are not in AVAILABLE_VOICES fall back to DEFAULT_VOICE.
        Results are cached per voice until the model is reloaded.

        Args:
            voice_name: Name of the preset voice

        Returns:
            Voice state object returned by the model for that preset

        Raises:
            RuntimeError: If no model has been loaded yet
        """
        validated_voice = (
            voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE
        )

        if validated_voice not in self.voice_state_cache:
            model = self._require_model()
            self.voice_state_cache[validated_voice] = model.get_state_for_audio_prompt(
                audio_conditioning=validated_voice,
                truncate=False
            )

        return self.voice_state_cache[validated_voice]

    def get_voice_state_for_clone(self, audio_file_path):
        """
        Compute the voice state from an uploaded audio file for voice cloning.

        Unlike preset voices, the result is not cached.

        Args:
            audio_file_path: Path to the uploaded audio file

        Returns:
            Voice state object returned by the model for that audio

        Raises:
            RuntimeError: If no model has been loaded yet
        """
        return self._require_model().get_state_for_audio_prompt(
            audio_conditioning=audio_file_path,
            truncate=False
        )

    def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
        """
        Generate speech audio from text using the specified voice state.

        Args:
            text_content: Text string to convert to speech
            voice_state: Pre-computed voice state
            frames_after_eos: Number of frames to generate after EOS
            enable_custom_frames: Whether ``frames_after_eos`` is applied; when
                False (or when no value was supplied) None is passed to the
                model instead

        Returns:
            Generated audio waveform as returned by the model

        Raises:
            RuntimeError: If no model has been loaded yet
        """
        # A cleared UI field arrives as None; treat it like "not customized"
        # instead of crashing on int(None).
        processed_frames = None
        if enable_custom_frames and frames_after_eos is not None:
            processed_frames = int(frames_after_eos)

        # NOTE(review): copy_state=True presumably keeps the cached voice
        # state unmodified across calls - confirm against pocket_tts.
        return self._require_model().generate_audio(
            model_state=voice_state,
            text_to_generate=text_content,
            frames_after_eos=processed_frames,
            copy_state=True
        )

    def save_audio_to_file(self, audio_tensor):
        """
        Save a generated audio tensor to a temporary WAV file.

        The file is registered in ``temporary_files_registry`` so that
        ``cleanup_expired_temporary_files`` can delete it later.

        Args:
            audio_tensor: PyTorch tensor containing the audio waveform

        Returns:
            str: Path to the saved temporary WAV file

        Raises:
            RuntimeError: If no model has been loaded yet
        """
        audio_sample_rate = self._require_model().sample_rate

        # detach()/cpu() are no-ops for a plain CPU tensor but keep .numpy()
        # from failing on tensors that track gradients or live elsewhere.
        audio_numpy_data = audio_tensor.detach().cpu().numpy()

        # Reserve a unique path, then close the handle before writing by name:
        # leaving it open leaks a descriptor and blocks the write on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            output_file_path = output_file.name

        try:
            scipy.io.wavfile.write(output_file_path, audio_sample_rate, audio_numpy_data)
        except Exception:
            # The file is not registered yet, so nothing else would delete it.
            try:
                os.remove(output_file_path)
            except OSError:
                pass
            raise

        with temporary_files_lock:
            temporary_files_registry[output_file_path] = time.time()

        return output_file_path


# Create global TTS manager instance
text_to_speech_manager = TextToSpeechManager()


# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def cleanup_expired_temporary_files():
    """
    Remove temporary files that have exceeded their lifetime.

    Registered files older than TEMPORARY_FILE_LIFETIME_SECONDS are deleted
    from disk and dropped from the registry. A file that has already vanished
    is simply unregistered. A file that cannot be deleted stays registered so
    that a later call retries it.
    """
    current_timestamp = time.time()

    with temporary_files_lock:
        expired_files = [
            file_path
            for file_path, creation_timestamp in temporary_files_registry.items()
            if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS
        ]

        for file_path in expired_files:
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass        # already gone: only the registry entry is stale
            except OSError:
                continue    # best effort: keep the entry and retry next time
            del temporary_files_registry[file_path]


def validate_text_input(text_content):
    """
    Validate and clean text input for speech generation.

    Args:
        text_content: Raw text input from user

    Returns:
        tuple: (is_valid: bool, result: str)
            - If valid: (True, cleaned_text)
            - If empty, missing or not a string: (False, "")
            - If too long: (False, error_message)
    """
    if not text_content or not isinstance(text_content, str):
        return False, ""

    cleaned_text = text_content.strip()

    if not cleaned_text:
        return False, ""

    if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
        return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."

    return True, cleaned_text


def request_generation_stop():
    """
    Signal a request to stop the current generation.

    The flag is only polled between the stages of
    ``perform_speech_generation``; a stage that is already running is not
    interrupted.

    Returns:
        gr.update: Update to disable the stop button
    """
    global stop_generation_requested
    stop_generation_requested = True
    return gr.update(interactive=False)


# =============================================================================
# SPEECH GENERATION FUNCTION
# =============================================================================

def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """
    Perform the complete speech generation workflow.

    Steps: expire old temporary files, validate the input, claim the
    single-generation slot, load the model, prepare the voice state, generate
    the audio and save it to a temporary WAV file. A stop request is honored
    after each of the model-loading, voice-state and generation steps.

    Args:
        text_input: Text to convert to speech
        voice_mode_selection: "Preset Voices" or "Voice Cloning"
        voice_preset_selection: Selected preset voice name
        voice_clone_audio_file: Path to uploaded audio for cloning
        model_variant: Model variant identifier
        lsd_decode_steps: Number of LSD decode steps
        temperature: Generation temperature
        noise_clamp: Noise clamping value
        eos_threshold: End-of-sequence threshold
        frames_after_eos: Frames to generate after EOS
        enable_custom_frames: Whether to use custom frame count

    Returns:
        str or None: Path to generated audio file, or None if stopped

    Raises:
        gr.Error: On validation failure, when another generation is already
            running, or when any generation step fails
    """
    global is_currently_generating, stop_generation_requested

    # Run cleanup before starting new generation
    cleanup_expired_temporary_files()

    is_valid, validation_result = validate_text_input(text_input)
    if not is_valid:
        # An empty result means "no usable text" rather than a specific error
        raise gr.Error(validation_result or "Please enter valid text to generate speech.")

    if voice_mode_selection == VOICE_MODE_CLONE and not voice_clone_audio_file:
        raise gr.Error("Please upload an audio file for voice cloning.")

    # Claim the generation slot; only one generation may run at a time
    with generation_state_lock:
        if is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")
        is_currently_generating = True
        stop_generation_requested = False

    try:
        text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )

        if stop_generation_requested:
            return None

        if voice_mode_selection == VOICE_MODE_CLONE:
            voice_state = text_to_speech_manager.get_voice_state_for_clone(
                voice_clone_audio_file
            )
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(
                voice_preset_selection
            )

        if stop_generation_requested:
            return None

        # validation_result holds the stripped text when validation succeeded
        generated_audio = text_to_speech_manager.generate_audio(
            validation_result,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )

        if stop_generation_requested:
            return None

        return text_to_speech_manager.save_audio_to_file(generated_audio)

    except gr.Error:
        raise

    except Exception as generation_error:
        # Chain the cause so the original traceback stays visible in logs
        raise gr.Error(
            f"Speech generation failed: {str(generation_error)}"
        ) from generation_error

    finally:
        # Always release the generation slot, whatever the outcome
        with generation_state_lock:
            is_currently_generating = False
            stop_generation_requested = False


# =============================================================================
# UI STATE MANAGEMENT FUNCTIONS
# =============================================================================

def check_generate_button_state(text_content):
    """
    Update generate button interactivity based on text validity.

    Args:
        text_content: Current text input content

    Returns:
        gr.update: Update whose ``interactive`` flag mirrors input validity
    """
    is_valid, _ = validate_text_input(text_content)
    return gr.update(interactive=is_valid)


# NOTE(review): the definition of calculate_character_count_display(text_content)
# begins at this point in the file. Its return expression is truncated in the
# reviewed excerpt, so it is deliberately not rewritten here.
Args: text_content: Current text input content Returns: str: HTML string for character count display """ character_count = len(text_content) if text_content else 0 # Use error color if over limit display_color = ( "var(--error-text-color)" if character_count > MAXIMUM_INPUT_LENGTH else "var(--body-text-color-subdued)" ) return f"
Click any example to generate speech with its assigned voice