""" ============================================================================ AI-GENERATED CODE ============================================================================ """ """ Pocket TTS Web Application ========================== A Gradio-based web interface for the Pocket TTS text-to-speech model. This application provides an intuitive interface for generating speech from text using either preset voices or voice cloning capabilities. Features: --------- - Multiple preset voice options - Voice cloning from uploaded audio files - Configurable generation parameters (temperature, LSD steps, etc.) - Real-time character counting and validation - Temporary file management with automatic cleanup - Thread-safe generation state management Usage: ------ Run this script directly to launch the web application: $ python app.py The application will be available at http://localhost:7860 """ import os import time import torch import tempfile import threading import scipy.io.wavfile import gradio as gr from pocket_tts import TTSModel # ============================================================================= # ENVIRONMENT CONFIGURATION # ============================================================================= # Configure PyTorch threading behavior torch.set_num_threads(2) # Intra-op parallelism threads torch.set_num_interop_threads(2) # Inter-op parallelism threads # ============================================================================= # APPLICATION CONSTANTS # ============================================================================= # Define all configurable constants and default values used throughout # the application. These values control model behavior, UI constraints, # and resource management policies. # Available preset voice options for speech generation AVAILABLE_VOICES = [ "alba", "marius", "javert", "jean", "fantine", "cosette", "eponine", "azelma" ] # Default configuration values DEFAULT_VOICE = "alba" # Default preset voice selection DEFAULT_MODEL_VARIANT = "b6369a24" # Model variant identifier DEFAULT_TEMPERATURE = 0.7 # Generation temperature DEFAULT_LSD_DECODE_STEPS = 1 # Latent space decode steps DEFAULT_EOS_THRESHOLD = -4.0 # End-of-sequence detection threshold DEFAULT_NOISE_CLAMP = 0.0 # Noise clamping value (0 = disabled) DEFAULT_FRAMES_AFTER_EOS = 10 # Additional frames after EOS # Input constraints and resource management MAXIMUM_INPUT_LENGTH = 1000 # Maximum text input characters TEMPORARY_FILE_LIFETIME_SECONDS = 7200 # Temp file retention (2 hours) # Voice mode selection options VOICE_MODE_PRESET = "Preset Voices" # Use predefined voice VOICE_MODE_CLONE = "Voice Cloning" # Clone voice from audio # Example prompts with associated voice presets for demonstration EXAMPLE_PROMPTS_WITH_VOICES = [ { "text": "The quick brown fox jumps over the lazy dog near the riverbank.", "voice": "alba" }, { "text": "Welcome to the future of text to speech technology powered by artificial intelligence.", "voice": "marius" }, { "text": "Technology continues to push the boundaries of what we thought was possible.", "voice": "javert" }, { "text": "The weather today is absolutely beautiful and perfect for a relaxing walk outside.", "voice": "fantine" }, { "text": "Science and innovation are transforming how we interact with the world around us.", "voice": "jean" } ] # ============================================================================= # THREAD SYNCHRONIZATION # ============================================================================= # Global state management for thread-safe generation operations. # These locks and flags prevent concurrent generation requests and # enable graceful cancellation of ongoing operations. generation_state_lock = threading.Lock() # Lock for generation state access is_currently_generating = False # Flag indicating active generation stop_generation_requested = False # Flag for stop request signaling # Temporary file registry for cleanup management temporary_files_registry = {} # Maps file paths to creation timestamps temporary_files_lock = threading.Lock() # Lock for registry access # ============================================================================= # TEXT-TO-SPEECH MANAGER CLASS # ============================================================================= class TextToSpeechManager: """ Manages TTS model lifecycle and speech generation operations. This class handles model loading, configuration caching, voice state management, and audio generation. It implements lazy loading and caching strategies to optimize performance and memory usage. Attributes: loaded_model: Currently loaded TTS model instance current_configuration: Dict of current model configuration voice_state_cache: Cache of computed voice states for preset voices Example: >>> manager = TextToSpeechManager() >>> manager.load_or_get_model("b6369a24", 0.7, 1, None, -4.0) >>> voice_state = manager.get_voice_state_for_preset("alba") >>> audio = manager.generate_audio("Hello world", voice_state, 10, False) """ def __init__(self): """Initialize the TTS manager with empty state.""" self.loaded_model = None self.current_configuration = {} self.voice_state_cache = {} def load_or_get_model( self, model_variant, temperature, lsd_decode_steps, noise_clamp, eos_threshold ): """ Load a TTS model or return cached instance if configuration matches. This method implements lazy loading with configuration-based caching. If the requested configuration differs from the currently loaded model, a new model instance is created and the voice state cache is cleared. Args: model_variant: Model variant identifier string temperature: Generation temperature (float, 0.1-2.0) lsd_decode_steps: Number of LSD decode steps (int, 1-20) noise_clamp: Maximum noise value or None to disable eos_threshold: End-of-sequence detection threshold (float) Returns: TTSModel: Loaded and configured TTS model instance """ # Process and validate input parameters with defaults processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip() processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD # Build configuration dictionary for comparison requested_configuration = { "variant": processed_variant, "temp": processed_temperature, "lsd_decode_steps": processed_lsd_steps, "noise_clamp": processed_noise_clamp, "eos_threshold": processed_eos_threshold } # Load new model if configuration changed or no model loaded if self.loaded_model is None or self.current_configuration != requested_configuration: self.loaded_model = TTSModel.load_model(**requested_configuration) self.current_configuration = requested_configuration self.voice_state_cache = {} # Clear cache on model change return self.loaded_model def get_voice_state_for_preset(self, voice_name): """ Get or compute voice state for a preset voice. Voice states are cached to avoid redundant computation for frequently used preset voices. Args: voice_name: Name of the preset voice (must be in AVAILABLE_VOICES) Returns: Voice state tensor for the specified preset voice """ # Validate voice name and fall back to default if invalid validated_voice = voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE # Compute and cache voice state if not already cached if validated_voice not in self.voice_state_cache: self.voice_state_cache[validated_voice] = self.loaded_model.get_state_for_audio_prompt( audio_conditioning=validated_voice, truncate=False ) return self.voice_state_cache[validated_voice] def get_voice_state_for_clone(self, audio_file_path): """ Compute voice state from an uploaded audio file for voice cloning. Unlike preset voices, cloned voice states are not cached as they are typically unique per request. Args: audio_file_path: Path to the uploaded audio file Returns: Voice state tensor extracted from the audio file """ return self.loaded_model.get_state_for_audio_prompt( audio_conditioning=audio_file_path, truncate=False ) def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames): """ Generate speech audio from text using the specified voice state. Args: text_content: Text string to convert to speech voice_state: Pre-computed voice state tensor frames_after_eos: Number of frames to generate after EOS enable_custom_frames: Whether to use custom frame count Returns: torch.Tensor: Generated audio waveform """ # Apply custom frames setting if enabled processed_frames = int(frames_after_eos) if enable_custom_frames else None return self.loaded_model.generate_audio( model_state=voice_state, text_to_generate=text_content, frames_after_eos=processed_frames, copy_state=True ) def save_audio_to_file(self, audio_tensor): """ Save generated audio tensor to a temporary WAV file. The file is registered for automatic cleanup after the configured lifetime expires. Args: audio_tensor: PyTorch tensor containing audio waveform Returns: str: Path to the saved temporary WAV file """ # Convert tensor to numpy array for scipy audio_numpy_data = audio_tensor.numpy() audio_sample_rate = self.loaded_model.sample_rate # Create temporary file and write audio data output_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) scipy.io.wavfile.write(output_file.name, audio_sample_rate, audio_numpy_data) # Register file for cleanup tracking with temporary_files_lock: temporary_files_registry[output_file.name] = time.time() return output_file.name # Create global TTS manager instance text_to_speech_manager = TextToSpeechManager() # ============================================================================= # UTILITY FUNCTIONS # ============================================================================= def cleanup_expired_temporary_files(): """ Remove temporary files that have exceeded their lifetime. This function is called periodically to prevent disk space exhaustion from accumulated temporary audio files. Files older than TEMPORARY_FILE_LIFETIME_SECONDS are removed from disk and registry. """ current_timestamp = time.time() expired_files = [] with temporary_files_lock: # Identify expired files for file_path, creation_timestamp in list(temporary_files_registry.items()): if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS: expired_files.append(file_path) # Remove expired files from disk and registry for file_path in expired_files: try: if os.path.exists(file_path): os.remove(file_path) del temporary_files_registry[file_path] except Exception: pass # Silently ignore deletion errors def validate_text_input(text_content): """ Validate and clean text input for speech generation. Args: text_content: Raw text input from user Returns: tuple: (is_valid: bool, result: str) - If valid: (True, cleaned_text) - If invalid: (False, error_message or empty string) """ # Check for None or non-string input if not text_content or not isinstance(text_content, str): return False, "" # Clean whitespace cleaned_text = text_content.strip() # Check for empty content if not cleaned_text: return False, "" # Check length constraint if len(cleaned_text) > MAXIMUM_INPUT_LENGTH: return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters." return True, cleaned_text def request_generation_stop(): """ Signal a request to stop the current generation. Returns: gr.update: Update to disable the stop button """ global stop_generation_requested stop_generation_requested = True return gr.update(interactive=False) # ============================================================================= # SPEECH GENERATION FUNCTION # ============================================================================= def perform_speech_generation( text_input, voice_mode_selection, voice_preset_selection, voice_clone_audio_file, model_variant, lsd_decode_steps, temperature, noise_clamp, eos_threshold, frames_after_eos, enable_custom_frames ): """ Perform the complete speech generation workflow. This function orchestrates the entire generation process including: validation, model loading, voice state preparation, audio generation, and file saving. It handles thread safety and stop requests. Args: text_input: Text to convert to speech voice_mode_selection: "Preset Voices" or "Voice Cloning" voice_preset_selection: Selected preset voice name voice_clone_audio_file: Path to uploaded audio for cloning model_variant: Model variant identifier lsd_decode_steps: Number of LSD decode steps temperature: Generation temperature noise_clamp: Noise clamping value eos_threshold: End-of-sequence threshold frames_after_eos: Frames to generate after EOS enable_custom_frames: Whether to use custom frame count Returns: str or None: Path to generated audio file, or None if stopped Raises: gr.Error: On validation failure or generation error """ global is_currently_generating, stop_generation_requested # Run cleanup before starting new generation cleanup_expired_temporary_files() # Validate text input is_valid, validation_result = validate_text_input(text_input) if not is_valid: if validation_result: raise gr.Error(validation_result) raise gr.Error("Please enter valid text to generate speech.") # Validate voice cloning audio if in clone mode if voice_mode_selection == VOICE_MODE_CLONE and not voice_clone_audio_file: raise gr.Error("Please upload an audio file for voice cloning.") # Acquire generation lock with generation_state_lock: if is_currently_generating: raise gr.Error("A generation is already in progress. Please wait.") is_currently_generating = True stop_generation_requested = False try: # Load or retrieve cached model text_to_speech_manager.load_or_get_model( model_variant, temperature, lsd_decode_steps, noise_clamp, eos_threshold ) # Check for stop request after model loading if stop_generation_requested: return None # Prepare voice state based on mode if voice_mode_selection == VOICE_MODE_CLONE: voice_state = text_to_speech_manager.get_voice_state_for_clone(voice_clone_audio_file) else: voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection) # Check for stop request after voice state preparation if stop_generation_requested: return None # Generate audio from text generated_audio = text_to_speech_manager.generate_audio( validation_result, voice_state, frames_after_eos, enable_custom_frames ) # Check for stop request after generation if stop_generation_requested: return None # Save audio to temporary file output_file_path = text_to_speech_manager.save_audio_to_file(generated_audio) return output_file_path except gr.Error: raise except Exception as generation_error: raise gr.Error(f"Speech generation failed: {str(generation_error)}") finally: # Always release generation lock with generation_state_lock: is_currently_generating = False stop_generation_requested = False # ============================================================================= # UI STATE MANAGEMENT FUNCTIONS # ============================================================================= def check_generate_button_state(text_content): """ Update generate button interactivity based on text validity. Args: text_content: Current text input content Returns: gr.update: Update with interactive state """ is_valid, _ = validate_text_input(text_content) return gr.update(interactive=is_valid) def calculate_character_count_display(text_content): """ Generate HTML for character count display with color coding. Args: text_content: Current text input content Returns: str: HTML string for character count display """ character_count = len(text_content) if text_content else 0 # Use error color if over limit display_color = ( "var(--error-text-color)" if character_count > MAXIMUM_INPUT_LENGTH else "var(--body-text-color-subdued)" ) return f"
{character_count} / {MAXIMUM_INPUT_LENGTH}
" def determine_clear_button_visibility_idle(text_content, audio_output): """ Determine clear button visibility based on content state. Args: text_content: Current text input content audio_output: Current audio output value Returns: gr.update: Update with visibility state """ has_text_content = bool(text_content and text_content.strip()) has_audio_output = audio_output is not None should_show_clear = has_text_content or has_audio_output return gr.update(visible=should_show_clear) def update_voice_mode_visibility(voice_mode_value): """ Update visibility of voice selection containers based on mode. Args: voice_mode_value: Selected voice mode Returns: tuple: (preset_container_update, clone_container_update) """ if voice_mode_value == VOICE_MODE_CLONE: return gr.update(visible=False), gr.update(visible=True) else: return gr.update(visible=True), gr.update(visible=False) def switch_to_generating_state(): """ Switch UI to generation-in-progress state. Returns: tuple: Updates for (generate_button, stop_button, clear_button) """ return ( gr.update(visible=False), # Hide generate button gr.update(visible=True, interactive=True), # Show stop button gr.update(visible=False) # Hide clear button ) def switch_to_idle_state(text_content, audio_output): """ Switch UI back to idle state after generation. Args: text_content: Current text input content audio_output: Current audio output value Returns: tuple: Updates for (generate_button, stop_button, clear_button) """ has_text_content = bool(text_content and text_content.strip()) has_audio_output = audio_output is not None should_show_clear = has_text_content or has_audio_output return ( gr.update(visible=True), # Show generate button gr.update(visible=False), # Hide stop button gr.update(visible=should_show_clear) # Show clear if content exists ) def perform_clear_action(): """ Clear all input and output fields. Returns: tuple: Reset values for all clearable components """ return ( "", # Clear text input None, # Clear audio output gr.update(visible=False), # Hide clear button VOICE_MODE_PRESET, # Reset voice mode DEFAULT_VOICE, # Reset voice preset None # Clear clone audio ) # ============================================================================= # EXAMPLE HANDLING FUNCTIONS # ============================================================================= def create_example_handler(example_text, example_voice): """ Create a handler function for example button clicks. Args: example_text: Example text to set example_voice: Example voice to select Returns: function: Handler that sets example values """ def set_example_values(): return example_text, VOICE_MODE_PRESET, example_voice return set_example_values def format_example_button_label(example_text, example_voice, max_text_length=40): """ Format example button label with voice and truncated text. Args: example_text: Full example text example_voice: Voice name max_text_length: Maximum text length before truncation Returns: str: Formatted button label """ truncated_text = ( example_text[:max_text_length] + "..." if len(example_text) > max_text_length else example_text ) return f"[{example_voice}] {truncated_text}" # ============================================================================= # GRADIO APPLICATION DEFINITION # ============================================================================= with gr.Blocks() as application: # ------------------------------------------------------------------------- # SIDEBAR SECTION # ------------------------------------------------------------------------- # Contains project information, description, and credits with gr.Sidebar(): gr.HTML( """

Audio Generation Playground part of the Demo Playground, and the UltimaX Intelligence project.


This space runs the Pocket TTS model from Kyutai.

A lightweight text-to-speech (TTS) application designed to run efficiently on CPUs. Forget about the hassle of using GPUs and web APIs serving TTS models.

Additionally, this space runs with a custom Docker image to maximize the model's potential and has been optimized for the limited scope of Hugging Face Spaces.

âš ī¸ This space was created entirely by the J.A.R.V.I.S. model operating in autonomous agent mode. All code was generated by AI without human review.

This is an experimental space and is not part of production. There may be minor bugs since the code was generated by AI. However, none have been found so far.

If you find a bug, please report it in the community tab.

Like this project? You can support me by buying a coffee """ ) # ------------------------------------------------------------------------- # AUDIO OUTPUT SECTION # ------------------------------------------------------------------------- audio_output_component = gr.Audio( label="Generated Speech Output", type="filepath", interactive=False, show_download_button=True ) # ------------------------------------------------------------------------- # VOICE SELECTION SECTION # ------------------------------------------------------------------------- with gr.Accordion("🎭 Voice Selection", open=True): # Voice mode selector (preset vs cloning) voice_mode_radio = gr.Radio( label="Voice Mode", choices=[VOICE_MODE_PRESET, VOICE_MODE_CLONE], value=VOICE_MODE_PRESET, info="Choose between preset voices or clone a voice from uploaded audio" ) # Container for preset voice selection with gr.Column(visible=True) as preset_voice_container: voice_preset_dropdown = gr.Dropdown( label="Select Preset Voice", choices=AVAILABLE_VOICES, value=DEFAULT_VOICE ) # Container for voice cloning audio upload with gr.Column(visible=False) as clone_voice_container: voice_clone_audio_input = gr.Audio( label="Upload Audio for Voice Cloning", type="filepath" ) # ------------------------------------------------------------------------- # GENERATION PARAMETERS SECTION # ------------------------------------------------------------------------- with gr.Accordion("âš™ī¸ Generation Parameters", open=False): with gr.Row(): temperature_slider = gr.Slider( label="Temperature", minimum=0.1, maximum=2.0, step=0.05, value=DEFAULT_TEMPERATURE, info="Higher values produce more expressive speech" ) lsd_decode_steps_slider = gr.Slider( label="LSD Decode Steps", minimum=1, maximum=20, step=1, value=DEFAULT_LSD_DECODE_STEPS, info="More steps may improve quality but slower" ) with gr.Row(): noise_clamp_slider = gr.Slider( label="Noise Clamp", minimum=0.0, maximum=2.0, step=0.05, value=DEFAULT_NOISE_CLAMP, info="Maximum noise sampling value (0 = disabled)" ) eos_threshold_slider = gr.Slider( label="End of Sequence Threshold", minimum=-10.0, maximum=0.0, step=0.25, value=DEFAULT_EOS_THRESHOLD, info="Smaller values cause earlier completion" ) # ------------------------------------------------------------------------- # ADVANCED SETTINGS SECTION # ------------------------------------------------------------------------- with gr.Accordion("🔧 Advanced Settings", open=False): model_variant_textbox = gr.Textbox( label="Model Variant Identifier", value=DEFAULT_MODEL_VARIANT, info="Model signature for generation" ) with gr.Row(): enable_custom_frames_checkbox = gr.Checkbox( label="Enable Custom Frames After EOS", value=False, info="Manually control post-EOS frame generation" ) frames_after_eos_slider = gr.Slider( label="Frames After EOS", minimum=0, maximum=100, step=1, value=DEFAULT_FRAMES_AFTER_EOS, info="Additional frames after end-of-sequence (80ms per frame)" ) # ------------------------------------------------------------------------- # TEXT INPUT SECTION # ------------------------------------------------------------------------- text_input_component = gr.Textbox( label="Prompt", placeholder="Enter the text you want to convert to speech...", lines=3, max_lines=20, max_length=MAXIMUM_INPUT_LENGTH, autoscroll=True ) # Character count display character_count_display = gr.HTML( f"
0 / {MAXIMUM_INPUT_LENGTH}
" ) # ------------------------------------------------------------------------- # ACTION BUTTONS SECTION # ------------------------------------------------------------------------- # Primary generate button generate_button = gr.Button( "đŸŽ™ī¸ Generate Speech", variant="primary", size="lg", interactive=False ) # Stop button (visible during generation) stop_button = gr.Button( "âšī¸ Stop Generation", variant="stop", size="lg", visible=False ) # Clear button (visible when content exists) clear_button = gr.Button( "đŸ—‘ī¸ Clear", variant="secondary", size="lg", visible=False ) # ------------------------------------------------------------------------- # EXAMPLE PROMPTS SECTION # ------------------------------------------------------------------------- gr.HTML("""

💡 Example Prompts

Click any example to generate speech with its assigned voice

""") # Create example buttons dynamically example_buttons_list = [] with gr.Row(): example_button_0 = gr.Button( format_example_button_label( EXAMPLE_PROMPTS_WITH_VOICES[0]["text"], EXAMPLE_PROMPTS_WITH_VOICES[0]["voice"] ), size="sm", variant="secondary" ) example_buttons_list.append(example_button_0) example_button_1 = gr.Button( format_example_button_label( EXAMPLE_PROMPTS_WITH_VOICES[1]["text"], EXAMPLE_PROMPTS_WITH_VOICES[1]["voice"] ), size="sm", variant="secondary" ) example_buttons_list.append(example_button_1) with gr.Row(): example_button_2 = gr.Button( format_example_button_label( EXAMPLE_PROMPTS_WITH_VOICES[2]["text"], EXAMPLE_PROMPTS_WITH_VOICES[2]["voice"] ), size="sm", variant="secondary" ) example_buttons_list.append(example_button_2) example_button_3 = gr.Button( format_example_button_label( EXAMPLE_PROMPTS_WITH_VOICES[3]["text"], EXAMPLE_PROMPTS_WITH_VOICES[3]["voice"] ), size="sm", variant="secondary" ) example_buttons_list.append(example_button_3) with gr.Row(): example_button_4 = gr.Button( format_example_button_label( EXAMPLE_PROMPTS_WITH_VOICES[4]["text"], EXAMPLE_PROMPTS_WITH_VOICES[4]["voice"] ), size="sm", variant="secondary" ) example_buttons_list.append(example_button_4) # ------------------------------------------------------------------------- # EVENT HANDLERS AND BINDINGS # ------------------------------------------------------------------------- # Define input components list for generation function generation_inputs = [ text_input_component, voice_mode_radio, voice_preset_dropdown, voice_clone_audio_input, model_variant_textbox, lsd_decode_steps_slider, temperature_slider, noise_clamp_slider, eos_threshold_slider, frames_after_eos_slider, enable_custom_frames_checkbox ] # Voice mode change handler voice_mode_radio.change( fn=update_voice_mode_visibility, inputs=[voice_mode_radio], outputs=[preset_voice_container, clone_voice_container] ) # Text input change handlers text_input_component.change( fn=calculate_character_count_display, inputs=[text_input_component], outputs=[character_count_display] ) text_input_component.change( fn=check_generate_button_state, inputs=[text_input_component], outputs=[generate_button] ) text_input_component.change( fn=determine_clear_button_visibility_idle, inputs=[text_input_component, audio_output_component], outputs=[clear_button] ) # Audio output change handler audio_output_component.change( fn=determine_clear_button_visibility_idle, inputs=[text_input_component, audio_output_component], outputs=[clear_button] ) # Generate button click handler chain generate_button.click( fn=switch_to_generating_state, outputs=[generate_button, stop_button, clear_button] ).then( fn=perform_speech_generation, inputs=generation_inputs, outputs=[audio_output_component] ).then( fn=switch_to_idle_state, inputs=[text_input_component, audio_output_component], outputs=[generate_button, stop_button, clear_button] ).then( fn=check_generate_button_state, inputs=[text_input_component], outputs=[generate_button] ) # Stop button handler stop_button.click( fn=request_generation_stop, outputs=[stop_button] ) # Clear button handler clear_button.click( fn=perform_clear_action, outputs=[ text_input_component, audio_output_component, clear_button, voice_mode_radio, voice_preset_dropdown, voice_clone_audio_input ] ) # Example button handlers for button_index, example_button in enumerate(example_buttons_list): example_text = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["text"] example_voice = EXAMPLE_PROMPTS_WITH_VOICES[button_index]["voice"] example_button.click( fn=create_example_handler(example_text, example_voice), outputs=[text_input_component, voice_mode_radio, voice_preset_dropdown] ).then( fn=switch_to_generating_state, outputs=[generate_button, stop_button, clear_button] ).then( fn=perform_speech_generation, inputs=generation_inputs, outputs=[audio_output_component] ).then( fn=switch_to_idle_state, inputs=[text_input_component, audio_output_component], outputs=[generate_button, stop_button, clear_button] ).then( fn=check_generate_button_state, inputs=[text_input_component], outputs=[generate_button] ) # ============================================================================= # APPLICATION ENTRY POINT # ============================================================================= if __name__ == "__main__": application.launch( server_name="0.0.0.0", share=False )