# http://localhost:7860
"""Gradio web UI for Hindi/English text-to-speech using a fine-tuned XTTS model."""

import os
import platform
import subprocess
import tempfile
import time
from pathlib import Path

import gradio as gr
import numpy as np
import torch
from TTS.api import TTS

from hing_bert_module import process_text


class TTSGenerator:
    """Load the XTTS model once and synthesize speech to temporary WAV files."""

    def __init__(self):
        """Configure model paths and defaults, then try to load the model.

        Loading failures are reported on stdout and recorded in
        ``self.model_loaded`` instead of being raised, so the UI can still
        start and show an error message on generation.
        """
        # Model configuration
        self.model_dir = "xtts_Hindi_FineTuned"
        # Build paths with os.path.join so they resolve on every OS, not only
        # on Windows (the separators used to be hard-coded backslashes).
        speakers_dir = os.path.join(self.model_dir, "speakers")
        self.reference_voices = {
            "Male": os.path.join(speakers_dir, "Reference_male.wav"),
            "Female": os.path.join(speakers_dir, "Reference_female.wav"),
        }

        # Default parameters
        self.params = {
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.85,
            "speed": 1.0,
            "language": "en",
        }

        # Load the model
        print("Loading XTTS model... (This may take a moment)")
        self.use_cuda = torch.cuda.is_available()
        # Defined up front so the attribute exists even when loading fails.
        self.tts = None
        try:
            self.tts = TTS(
                model_path=self.model_dir,
                config_path=os.path.join(self.model_dir, "config.json"),
                gpu=self.use_cuda,
            )
            print("Model loaded successfully!")
            self.model_loaded = True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            self.model_loaded = False

    @staticmethod
    def _discard_file(path):
        """Best-effort removal of a leftover output file; never raises."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError:
            pass

    def generate_speech(self, text, voice, language, temperature, top_k, top_p, speed):
        """Synthesize ``text`` into a temporary WAV file.

        Args:
            text: Raw input text; it is passed through ``process_text`` and
                the ``"final_text"`` entry of the result is synthesized.
            voice: Key into ``self.reference_voices`` ("Male" or "Female").
            language: Language code; lower-cased before being handed to the
                model. ``"en"`` is reported as English in the stats, anything
                else as Hindi.
            temperature: Sampling temperature (converted with ``float``).
            top_k: Top-K sampling value (converted with ``int``).
            top_p: Top-P sampling value (converted with ``float``).
            speed: Speech speed factor (converted with ``float``).

        Returns:
            ``(output_path, stats)`` on success, or ``(None, error_message)``
            on any failure. This method does not raise for generation errors.
        """
        if not self.model_loaded:
            return None, "Error: Model failed to load. Please check the model directory."

        # Guard against None as well as empty/whitespace-only input.
        if not text or not text.strip():
            return None, "Error: Please enter some text to convert to speech."

        if voice not in self.reference_voices:
            available = ", ".join(self.reference_voices)
            return None, f"Error: Unknown voice '{voice}'. Available voices: {available}."

        output_path = None
        try:
            # Process the text
            processed_text = process_text(text)
            input_text = processed_text["final_text"]

            # Create a temporary file for the output. delete=False keeps the
            # file on disk after the handle closes so it can be served back.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                output_path = temp_file.name

            # Get the reference audio path
            reference_audio = self.reference_voices[voice]

            print("\nGenerating speech...")
            start_time = time.perf_counter()

            # Generate the speech
            self.tts.tts_to_file(
                text=input_text,
                file_path=output_path,
                speaker_wav=reference_audio,
                language=language.lower(),
                temperature=float(temperature),
                top_k=int(top_k),
                top_p=float(top_p),
                speed=float(speed),
                split_sentences=True,
            )

            end_time = time.perf_counter()
            total_duration = end_time - start_time

            # Get file size in KB
            file_size = os.path.getsize(output_path) / 1024

            stats = (
                f"Generation complete!\n"
                f"• Processing time: {total_duration:.2f} seconds\n"
                f"• Output file: {os.path.basename(output_path)}\n"
                f"• File size: {file_size:.1f} KB\n"
                f"• Language: {'English' if language.lower() == 'en' else 'Hindi'}\n"
                f"• Voice: {voice}"
            )
            return output_path, stats
        except Exception as e:
            # Do not leave an empty or partial temp file behind on failure.
            self._discard_file(output_path)
            return None, f"Error during speech generation: {str(e)}"


def create_gradio_interface():
    """Build the Gradio Blocks UI wired to a freshly created ``TTSGenerator``.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    tts_generator = TTSGenerator()

    with gr.Blocks(title="Hindi/English TTS", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎙️ Hindi/English Text-to-Speech
        Convert text to natural-sounding speech in Hindi or English.
        """)

        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Enter your text",
                    placeholder="Type or paste your text here...",
                    lines=8,
                    max_lines=20,
                    interactive=True,
                )

                with gr.Row():
                    voice_choice = gr.Dropdown(
                        choices=["Female", "Male"],
                        value="Female",
                        label="Voice",
                    )
                    language_choice = gr.Dropdown(
                        choices=["English", "Hindi"],
                        value="English",
                        label="Language",
                    )

                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Row():
                        temperature = gr.Slider(
                            minimum=0.1,
                            maximum=1.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature (randomness)",
                            info="Higher values make output more random",
                        )
                        speed = gr.Slider(
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.1,
                            label="Speed",
                            info="1.0 = normal speed",
                        )
                    with gr.Row():
                        top_k = gr.Slider(
                            minimum=10,
                            maximum=100,
                            value=50,
                            step=5,
                            label="Top-K",
                            info="Lower values make output more focused",
                        )
                        top_p = gr.Slider(
                            minimum=0.5,
                            maximum=1.0,
                            value=0.85,
                            step=0.05,
                            label="Top-P",
                            info="Lower values make output more focused",
                        )

                generate_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="filepath",
                    interactive=False,
                )
                stats_output = gr.Textbox(
                    label="Generation Stats",
                    interactive=False,
                    lines=8,
                    max_lines=10,
                )

        with gr.Accordion("Example Texts", open=False):
            with gr.Row():
                example_english = gr.Button("English Example")
                example_hindi = gr.Button("Hindi Example")

        # Example text handlers. Each returns one value per component listed
        # in the matching ``outputs=[...]`` below, in the same order.
        def load_example_english():
            example = (
                "The quick brown fox jumps over the lazy dog. This is an example of English text "
                "to demonstrate the text-to-speech capabilities. The system can handle both "
                "short phrases and longer paragraphs with proper intonation and pacing."
            )
            return example, "English", 0.7, 50, 0.85, 1.0

        def load_example_hindi():
            example = (
                "यह हिंदी भाषा का एक उदाहरण है। यह प्रणाली हिंदी और अंग्रेजी दोनों भाषाओं में "
                "पाठ को स्वाभाविक लगने वाली आवाज में बदल सकती है। आप कोई भी पाठ डाल सकते हैं "
                "और इसे सुन सकते हैं।"
            )
            return example, "Hindi", 0.7, 50, 0.85, 1.0

        example_english.click(
            fn=load_example_english,
            outputs=[text_input, language_choice, temperature, top_k, top_p, speed],
        )
        example_hindi.click(
            fn=load_example_hindi,
            outputs=[text_input, language_choice, temperature, top_k, top_p, speed],
        )

        # Main generation function
        def generate_tts(text, voice, language, temp, k, p, speed_val):
            """Translate UI values into a ``generate_speech`` call."""
            # Guard against None as well as empty/whitespace-only input.
            if not text or not text.strip():
                return None, "Please enter some text to generate speech."

            # The dropdown shows display names; the generator expects codes.
            lang_code = "en" if language.lower() == "english" else "hi"

            audio_path, stats = tts_generator.generate_speech(
                text=text,
                voice=voice,
                language=lang_code,
                temperature=temp,
                top_k=k,
                top_p=p,
                speed=speed_val,
            )

            if audio_path and os.path.exists(audio_path):
                return audio_path, stats
            return None, stats or "Error generating speech. Please try again."

        # Connect the generate button
        generate_btn.click(
            fn=generate_tts,
            inputs=[
                text_input,
                voice_choice,
                language_choice,
                temperature,
                top_k,
                top_p,
                speed,
            ],
            outputs=[audio_output, stats_output],
        )

        # Add some footer text
        gr.Markdown("""
        ---
        *Note: First generation might take longer as the model warms up.*
        *For best results, keep text under 500 characters.*
        """)

    return demo


if __name__ == "__main__":
    # Create and launch the Gradio interface.
    # NOTE(review): server_name="0.0.0.0" listens on all interfaces and
    # share=True requests a public Gradio share link -- confirm that this
    # exposure is intended before running on an untrusted network.
    demo = create_gradio_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

# ==============================================
# HOW TO USE THIS SCRIPT:
# 1. Run this script using: python inference3.py
# 2. The Gradio interface will be available at: http://localhost:7860
# 3. In the interface:
#    - Enter your text in the text box
#    - Select the desired voice (Male/Female)
#    - Choose the language (English/Hindi)
#    - Adjust parameters if needed (temperature, top-k, top-p, speed)
#    - Click "Generate Speech" to create the audio
#    - The generated audio appears in the "Generated Speech" player when ready
#
# TO EXIT:
#    - Press Ctrl+C in the terminal where the script is running
#    - Or close the terminal window
# ==============================================