import os import time import shutil # Added shutil for potentially cleaning old files if needed, though not used in this version from huggingface_hub import snapshot_download import streamlit as st # Imports from your package # Ensure 'indextts' is correctly installed or available in your environment/requirements.txt from indextts.infer import IndexTTS # ------------------------------------------------------------------------------ # Configuration # ------------------------------------------------------------------------------ # Where to store model checkpoints and outputs # These paths are relative to the root directory of your Spaces repository CHECKPOINT_DIR = "checkpoints" OUTPUT_DIR = "outputs" PROMPTS_DIR = "prompts" # Directory to save uploaded reference audio # Ensure necessary directories exist. Hugging Face Spaces provides a writable filesystem. os.makedirs(CHECKPOINT_DIR, exist_ok=True) os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(PROMPTS_DIR, exist_ok=True) MODEL_REPO = "IndexTeam/IndexTTS-1.5" CFG_FILENAME = "config.yaml" # ------------------------------------------------------------------------------ # Model loading (cached so it only runs once per resource identifier) # ------------------------------------------------------------------------------ # @st.cache_resource is the recommended way in Streamlit to cache large objects # like ML models that should be loaded only once. # This is crucial for efficiency on platforms like Spaces, preventing re-loading # the model on every user interaction/script re-run. @st.cache_resource(show_spinner=False) def load_tts_model(): """ Downloads the model snapshot and initializes the IndexTTS model. Cached using st.cache_resource to load only once. """ st.write("β³ Loading model... This may take a moment.") # Download the model snapshot if not already present # local_dir_use_symlinks=False is often safer in containerized environments snapshot_download( repo_id=MODEL_REPO, local_dir=CHECKPOINT_DIR, local_dir_use_symlinks=False, ) # Initialize the TTS object # The underlying IndexTTS library should handle using the GPU if available # and if dependencies (like CUDA-enabled PyTorch/TensorFlow) are installed. tts = IndexTTS( model_dir=CHECKPOINT_DIR, cfg_path=os.path.join(CHECKPOINT_DIR, CFG_FILENAME) ) # Load any normalizer or auxiliary data required by the model tts.load_normalizer() st.write("β Model loaded!") return tts # Load the TTS model using the cached function # This line is executed on each script run, but the function body only runs # the first time or if the function signature/dependencies change. tts = load_tts_model() # ------------------------------------------------------------------------------ # Inference function # ------------------------------------------------------------------------------ def run_inference(reference_audio_path: str, text: str) -> str: """ Run TTS inference using the uploaded reference audio and the target text. Returns the path to the generated .wav file. """ if not os.path.exists(reference_audio_path): raise FileNotFoundError(f"Reference audio not found at {reference_audio_path}") # Generate a unique output filename timestamp = int(time.time()) output_filename = f"generated_{timestamp}.wav" output_path = os.path.join(OUTPUT_DIR, output_filename) # Perform the TTS inference # The efficiency of this step depends on the IndexTTS library and hardware tts.infer(reference_audio_path, text, output_path) # Optional: Clean up old files in output/prompts directories if space is limited # This can be added if you find directories filling up on Spaces. # E.g., a function to remove files older than X hours/days. # For a simple demo, may not be necessary initially. return output_path # ------------------------------------------------------------------------------ # Streamlit UI # ------------------------------------------------------------------------------ st.set_page_config(page_title="IndexTTS Demo", layout="wide") st.markdown( """