"""Gradio front end for Tortoise TTS with voice cloning (Hugging Face Zero-GPU Space)."""

import os
import tempfile
import traceback
import uuid

import gradio as gr
import numpy as np
import spaces
import torch
import torchaudio
from huggingface_hub import snapshot_download
from pydub import AudioSegment
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_audio, load_voice

# Sample rate (Hz) that reference voice clips are converted to before cloning.
REFERENCE_SAMPLE_RATE = 22050
# Sample rate (Hz) used when writing the generated speech to disk.
OUTPUT_SAMPLE_RATE = 24000

# Create output directory if it doesn't exist
os.makedirs("outputs", exist_ok=True)

# Check for CUDA availability (this may report CPU at import time on Zero-GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Initial device check: {device}")

# Create a tensor to verify Zero-GPU is working
zero = torch.Tensor([0])
if torch.cuda.is_available():
    zero = zero.cuda()
print(f"Zero tensor device: {zero.device}")

# Tortoise TTS model; created lazily on the first generation request.
tts = None

# Available preset voice options
PRESET_VOICES = [
    "random", "angie", "daniel", "deniro", "emma", "freeman", "geralt",
    "halle", "jlaw", "lj", "mol", "myself", "pat", "snakes", "tim_reynolds",
    "tom", "train_atkins", "train_daws", "train_dotrice", "train_dreams",
    "train_empire", "train_grace", "train_kennard", "train_lescault",
    "train_mouse", "weaver", "william",
]


def _new_temp_wav_path():
    """Reserve a temporary ``.wav`` file and return its path with the handle closed."""
    handle = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close immediately: only the path is needed, and an open handle would leak
    # a file descriptor (and block re-opening the file on some platforms).
    handle.close()
    return handle.name


def _remove_quietly(path):
    """Delete *path*, ignoring the error if it is already gone or not removable."""
    try:
        os.remove(path)
    except OSError:
        pass


def process_audio_file(audio_file_path):
    """Convert an audio file to a WAV file sampled at ``REFERENCE_SAMPLE_RATE``.

    Args:
        audio_file_path: Path to the uploaded or recorded audio file.

    Returns:
        Path to a WAV file at ``REFERENCE_SAMPLE_RATE``. This is the input path
        itself when no conversion was needed, otherwise a new temporary file
        that the caller is responsible for deleting.
    """
    converted_path = None

    # Convert to WAV format if it's not already
    if not audio_file_path.lower().endswith(".wav"):
        converted_path = _new_temp_wav_path()
        AudioSegment.from_file(audio_file_path).export(converted_path, format="wav")
        audio_file_path = converted_path

    # Resample when the clip is not already at the reference rate
    waveform, sample_rate = torchaudio.load(audio_file_path)
    if sample_rate != REFERENCE_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=REFERENCE_SAMPLE_RATE
        )
        resampled_path = _new_temp_wav_path()
        torchaudio.save(resampled_path, resampler(waveform), REFERENCE_SAMPLE_RATE)
        if converted_path is not None:
            # The intermediate WAV conversion is no longer needed.
            _remove_quietly(converted_path)
        audio_file_path = resampled_path

    return audio_file_path


def _save_recording(record_audio):
    """Write a Gradio numpy recording to a temporary WAV file and return its path.

    Args:
        record_audio: ``(sample_rate, samples)`` as produced by ``gr.Audio`` in
            numpy mode; ``samples`` is 1-D (mono) or ``(samples, channels)``.
    """
    sample_rate, samples = record_audio
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM -> float32 in [-1, 1] so the WAV is written at full scale.
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)

    waveform = torch.from_numpy(samples)
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)
    else:
        # torchaudio expects (channels, samples).
        waveform = waveform.T.contiguous()

    path = _new_temp_wav_path()
    # Save at the real recording rate; process_audio_file resamples later.
    torchaudio.save(path, waveform, int(sample_rate))
    return path


@spaces.GPU
def generate_tts_with_voice(text, voice_sample_path=None, preset_voice=None):
    """Generate TTS audio using Tortoise with either a custom voice or a preset.

    Args:
        text: Text to synthesize.
        voice_sample_path: Optional path to a reference clip to clone. Takes
            priority over ``preset_voice`` when given.
        preset_voice: Optional name of a built-in Tortoise voice. Empty, ``None``
            or ``"random"`` selects a random voice.

    Returns:
        ``(output_path, message)``; ``output_path`` is ``None`` on failure and
        ``message`` then starts with ``"Error: "``.
    """
    global tts

    if not text or not text.strip():
        return None, "Error: Please enter some text to speak."

    try:
        # Now that we're inside the @spaces.GPU decorated function, CUDA should be available
        print(f"GPU function device: {zero.device}")

        # Initialize TTS model if not already initialized
        if tts is None:
            tts = TextToSpeech(use_deepspeed=torch.cuda.is_available())
            print("TTS model initialized")

        if voice_sample_path:
            reference_path = process_audio_file(voice_sample_path)
            try:
                # load_audio returns a single waveform tensor (not a tuple).
                voice_samples = [load_audio(reference_path, REFERENCE_SAMPLE_RATE)]
            finally:
                if reference_path != voice_sample_path:
                    _remove_quietly(reference_path)
            conditioning_latents = None
        else:
            # Built-in voices are resolved by name; "random" yields (None, None),
            # which makes Tortoise pick random conditioning latents.
            voice_samples, conditioning_latents = load_voice(preset_voice or "random")

        # Generate the speech
        output_id = str(uuid.uuid4())[:8]
        output_path = f"outputs/tts_output_{output_id}.wav"

        # The ``preset`` argument of tts_with_preset is a quality preset, not a
        # voice name, so it is left at the library default here.
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )

        # Save the generated audio
        torchaudio.save(output_path, gen.squeeze(0).cpu(), OUTPUT_SAMPLE_RATE)

        return output_path, "Success: TTS generation completed."
    except Exception as e:
        # UI boundary: report the failure to the user and keep the trace in the logs.
        traceback.print_exc()
        return None, f"Error: {str(e)}"


@spaces.GPU
def tts_interface(text, audio_file, preset_voice, record_audio):
    """Interface function for Gradio with GPU acceleration.

    Voice selection priority: microphone recording, then uploaded file, then
    the preset voice (falling back to ``"random"``).

    Args:
        text: Text to synthesize.
        audio_file: Path of an uploaded voice sample, or ``None``.
        preset_voice: Selected preset voice name; ``""`` when none is selected.
        record_audio: ``(sample_rate, samples)`` microphone recording, or ``None``.

    Returns:
        ``(output_path, message)`` as produced by ``generate_tts_with_voice``.
    """
    print(f"Processing with device: {zero.device}")

    recording_path = None
    voice_sample_path = None

    # Determine which voice input to use
    if record_audio is not None:
        try:
            recording_path = _save_recording(record_audio)
        except Exception as e:
            return None, f"Error: {str(e)}"
        voice_sample_path = recording_path
    elif audio_file is not None:
        voice_sample_path = audio_file

    # If no custom voice is provided, use the preset
    if voice_sample_path is None and preset_voice == "":
        preset_voice = "random"

    try:
        return generate_tts_with_voice(text, voice_sample_path, preset_voice)
    finally:
        if recording_path is not None:
            _remove_quietly(recording_path)


# Create Gradio interface
with gr.Blocks(title="Tortoise TTS with Voice Cloning") as demo:
    gr.Markdown("# Tortoise Text-to-Speech with Voice Cloning")
    gr.Markdown("Enter text and either upload a voice sample, record your voice, or select a preset voice.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=5
            )
            preset_voice = gr.Dropdown(
                choices=[""] + PRESET_VOICES,
                label="Preset Voice (optional)",
                value=""
            )

        with gr.Column():
            gr.Markdown("### Voice Input Options")
            with gr.Tab("Upload Voice"):
                audio_file = gr.Audio(
                    label="Upload Voice Sample (optional)",
                    type="filepath"
                )

            with gr.Tab("Record Voice"):
                # NOTE(review): newer Gradio releases spell this
                # ``sources=["microphone"]`` - confirm against the pinned version.
                record_audio = gr.Audio(
                    label="Record Your Voice (optional)",
                    source="microphone"
                )

    generate_button = gr.Button("Generate Speech")

    with gr.Row():
        output_audio = gr.Audio(label="Generated Speech")
        output_message = gr.Textbox(label="Status")

    generate_button.click(
        fn=tts_interface,
        inputs=[text_input, audio_file, preset_voice, record_audio],
        outputs=[output_audio, output_message]
    )

    gr.Markdown("### About This App")
    gr.Markdown("""
    This app uses Tortoise-TTS to generate high-quality speech from text.

    You can:
    - Enter any text you want to be spoken
    - Upload or record a voice sample for voice cloning
    - Or select from pre-defined voice presets

    The app runs on Hugging Face Spaces with Zero-GPU optimization.
    """)

if __name__ == "__main__":
    demo.launch()