Update webui.py
Browse files — back to Gradio UI
webui.py
CHANGED
@@ -1,194 +1,82 @@
|
|
|
|
1 |
import os
|
2 |
-
import
|
|
|
3 |
import time
|
4 |
-
|
|
|
5 |
from huggingface_hub import snapshot_download
|
6 |
-
import streamlit as st
|
7 |
-
|
8 |
-
# ----------------------- Critical Path Configuration --------------------------
# Make the local package importable regardless of the working directory the
# app is launched from.
# NOTE(review): relies on `Path` and `sys` being imported earlier in the file —
# the import block above appears truncated; confirm both imports exist.
current_dir = Path(__file__).parent.resolve()  # Get absolute path to current file
sys.path.insert(0, str(current_dir))  # Add current directory to Python path
sys.path.insert(1, str(current_dir / "indextts"))  # Add indextts package
sys.path.insert(2, str(current_dir.parent))  # Add parent directory for utils

# Fail fast with a visible Streamlit error instead of an unhandled traceback
# when the indextts package cannot be resolved.
try:
    from indextts.infer import IndexTTS
except ModuleNotFoundError as e:
    st.error(f"Module import error: {str(e)}")
    st.stop()
|
19 |
-
|
20 |
-
# ----------------------- Rest of Your Original Code ---------------------------
CHECKPOINT_DIR = "checkpoints"  # destination for the downloaded model snapshot
OUTPUT_DIR = "outputs"          # generated .wav files are written here
PROMPTS_DIR = "prompts"         # uploaded reference clips

# Ensure necessary directories exist. Hugging Face Spaces provides a writable filesystem.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(PROMPTS_DIR, exist_ok=True)

MODEL_REPO = "IndexTeam/IndexTTS-1.5"  # Hugging Face Hub repo id of the model
CFG_FILENAME = "config.yaml"           # model config file inside the snapshot
|
32 |
-
|
33 |
-
# ------------------------------------------------------------------------------
# Model loading (cached so it only runs once per resource identifier)
# ------------------------------------------------------------------------------

# @st.cache_resource is the recommended way in Streamlit to cache large objects
# like ML models that should be loaded only once.
# This is crucial for efficiency on platforms like Spaces, preventing re-loading
# the model on every user interaction/script re-run.
@st.cache_resource(show_spinner=False)
def load_tts_model():
    """
    Download the IndexTTS model snapshot and initialize the TTS engine.

    Cached with ``st.cache_resource`` so the body executes only once per
    process, not on every Streamlit re-run.

    Returns:
        The initialized IndexTTS instance, with its text normalizer loaded.
    """
    st.write("⏳ Loading model... This may take a moment.")
    # Download the model snapshot if not already present
    # local_dir_use_symlinks=False is often safer in containerized environments
    snapshot_download(
        repo_id=MODEL_REPO,
        local_dir=CHECKPOINT_DIR,
        local_dir_use_symlinks=False,
    )
    # Initialize the TTS object
    # The underlying IndexTTS library should handle using the GPU if available
    # and if dependencies (like CUDA-enabled PyTorch/TensorFlow) are installed.
    tts = IndexTTS(
        model_dir=CHECKPOINT_DIR,
        cfg_path=os.path.join(CHECKPOINT_DIR, CFG_FILENAME)
    )
    # Load any normalizer or auxiliary data required by the model
    tts.load_normalizer()
    st.write("✅ Model loaded!")
    return tts
|
66 |
-
|
67 |
-
# Load the TTS model using the cached function
# This line is executed on each script run, but the function body only runs
# the first time or if the function signature/dependencies change.
tts = load_tts_model()
|
71 |
-
|
72 |
-
# ------------------------------------------------------------------------------
# Inference function
# ------------------------------------------------------------------------------

def run_inference(reference_audio_path: str, text: str) -> str:
    """
    Run TTS inference using the uploaded reference audio and the target text.

    Args:
        reference_audio_path: Path to the uploaded reference audio file.
        text: Target text to synthesize.

    Returns:
        Path to the generated .wav file inside ``OUTPUT_DIR``.

    Raises:
        FileNotFoundError: If ``reference_audio_path`` does not exist.
    """
    if not os.path.exists(reference_audio_path):
        raise FileNotFoundError(f"Reference audio not found at {reference_audio_path}")

    # Generate a unique output filename (second-resolution timestamp)
    timestamp = int(time.time())
    output_filename = f"generated_{timestamp}.wav"
    output_path = os.path.join(OUTPUT_DIR, output_filename)

    # Perform the TTS inference
    # The efficiency of this step depends on the IndexTTS library and hardware
    tts.infer(reference_audio_path, text, output_path)

    # Optional: Clean up old files in output/prompts directories if space is limited
    # This can be added if you find directories filling up on Spaces.
    # E.g., a function to remove files older than X hours/days.
    # For a simple demo, may not be necessary initially.
    return output_path
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
with
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
st.audio(ref_path, format="audio/wav") # Display the uploaded audio
|
149 |
-
|
150 |
-
|
151 |
-
st.header("2. Enter Text to Synthesize")
|
152 |
-
text_input = st.text_area(
|
153 |
-
label="Enter the text you want to convert to speech",
|
154 |
-
placeholder="Type your sentence here...",
|
155 |
-
key="text_input_area" # Added a key
|
156 |
-
)
|
157 |
-
|
158 |
-
# Button to trigger generation
|
159 |
-
generate_button = st.button("Generate Speech", key="generate_tts_button")
|
160 |
-
|
161 |
-
# ------------------------------------------------------------------------------
|
162 |
-
# Trigger Inference and Display Results
|
163 |
-
# ------------------------------------------------------------------------------
|
164 |
-
|
165 |
-
# This block runs only when the button is clicked AND inputs are valid
|
166 |
-
if generate_button:
|
167 |
-
if not ref_path or not os.path.exists(ref_path):
|
168 |
-
st.error("Please upload a reference audio first.")
|
169 |
-
elif not text_input or not text_input.strip():
|
170 |
-
st.error("Please enter some text to synthesize.")
|
171 |
-
else:
|
172 |
-
# Use st.spinner to indicate processing is happening
|
173 |
-
with st.spinner("🚀 Generating speech..."):
|
174 |
-
try:
|
175 |
-
# Call the inference function
|
176 |
-
output_wav_path = run_inference(ref_path, text_input)
|
177 |
-
|
178 |
-
# Check if output file was actually created
|
179 |
-
if os.path.exists(output_wav_path):
|
180 |
-
st.success("🎉 Done! Here’s your generated audio:")
|
181 |
-
# Display the generated audio
|
182 |
-
st.audio(output_wav_path, format="audio/wav")
|
183 |
-
else:
|
184 |
-
st.error("Generation failed: Output file was not created.")
|
185 |
-
|
186 |
-
except Exception as e:
|
187 |
-
st.error(f"An error occurred during inference: {e}")
|
188 |
-
# Optional: Log the full traceback for debugging on Spaces
|
189 |
-
# import traceback
|
190 |
-
# st.exception(e) # This shows traceback in the app
|
191 |
-
|
192 |
-
# Add a footer or more info
|
193 |
-
st.markdown("---")
|
194 |
-
st.markdown("Demo powered by [IndexTTS](https://arxiv.org/abs/2502.05512) and built with Streamlit.")
|
|
|
1 |
+
import spaces
|
2 |
import os
|
3 |
+
import shutil
|
4 |
+
import threading
|
5 |
import time
|
6 |
+
import sys
|
7 |
+
|
8 |
from huggingface_hub import snapshot_download
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
11 |
+
sys.path.append(current_dir)
|
12 |
+
sys.path.append(os.path.join(current_dir, "indextts"))
|
13 |
+
|
14 |
+
import gradio as gr
|
15 |
+
from indextts.infer import IndexTTS
|
16 |
+
from tools.i18n.i18n import I18nAuto
|
17 |
+
|
18 |
+
# Module-level setup: globals initialized at import time so the Gradio
# callbacks below can use them.
i18n = I18nAuto(language="zh_CN")  # UI string localization helper
# NOTE(review): MODE appears unused in this file — confirm before removing.
MODE = 'local'
# Fetch model weights/config into ./checkpoints (no-op if already downloaded).
snapshot_download("IndexTeam/IndexTTS-1.5",local_dir="checkpoints",)
tts = IndexTTS(model_dir="checkpoints", cfg_path="checkpoints/config.yaml")

# Writable working directories (Spaces provides a writable filesystem).
os.makedirs("outputs/tasks",exist_ok=True)
os.makedirs("prompts",exist_ok=True)
|
25 |
+
|
26 |
+
@spaces.GPU
def infer(voice, text, output_path=None):
    """
    Synthesize ``text`` in the voice of the reference clip ``voice``.

    Args:
        voice: Filesystem path to the reference audio clip (as delivered by
            the ``gr.Audio`` component with ``type="filepath"``).
        text: Target text to synthesize.
        output_path: Optional explicit destination for the generated .wav;
            defaults to a timestamped file under ./outputs.

    Returns:
        Path of the generated .wav file.

    Raises:
        RuntimeError: If the module-level model failed to initialize.
        FileNotFoundError: If no reference clip was provided or the path
            does not exist (Gradio passes None when nothing is uploaded).
    """
    if not tts:
        # RuntimeError is a subclass of Exception, so any existing broad
        # handlers around this call keep working.
        raise RuntimeError("Model not loaded")
    # Fail with a clear error instead of an obscure crash inside tts.infer
    # when the user clicks "generate" without uploading a reference clip.
    if not voice or not os.path.exists(voice):
        raise FileNotFoundError(f"Reference audio not found: {voice}")
    if not output_path:
        # Survive the outputs directory being absent or removed at runtime.
        os.makedirs("outputs", exist_ok=True)
        output_path = os.path.join("outputs", f"spk_{int(time.time())}.wav")
    tts.infer(voice, text, output_path)
    return output_path
|
34 |
|
35 |
+
def gen_single(prompt, text):
    """Run one synthesis job and reveal the output player with the result."""
    wav_path = infer(prompt, text)
    return gr.update(visible=True, value=wav_path)
|
38 |
+
|
39 |
+
def update_prompt_audio():
    """Return a Gradio update that re-enables the generate button."""
    return gr.update(interactive=True)
|
42 |
+
|
43 |
+
|
44 |
+
# UI definition: built once at import time; `demo` is launched from main().
with gr.Blocks() as demo:
    # NOTE(review): this lock is never used anywhere in the file — confirm
    # before removing.
    mutex = threading.Lock()
    gr.HTML('''
    <h2><center>Echo AI: High-Fidelity, Controllable, and Zero-Shot Text-to-Speech for the Real World</center></h2>

    <p align="center">
    <a href='https://arxiv.org/abs/2502.05512'><img src='https://img.shields.io/badge/ArXiv-2502.05512-red'></a>

    ''')
    with gr.Tab("audio generation"):
        with gr.Row():
            # NOTE(review): duplicates the os.makedirs("prompts", ...) already
            # done at import time — harmless but redundant.
            os.makedirs("prompts",exist_ok=True)
            # Reference voice: file upload or microphone recording, delivered
            # to callbacks as a filesystem path.
            prompt_audio = gr.Audio(label="Please upload reference audio",key="prompt_audio",
                                    sources=["upload","microphone"],type="filepath")
            # NOTE(review): prompt_list/default are computed but never used in
            # this file — confirm before removing.
            prompt_list = os.listdir("prompts")
            default = ''
            if prompt_list:
                default = prompt_list[0]
        input_text_single = gr.Textbox(label="Please enter target text",key="input_text_single")
        gen_button = gr.Button("generate speech",key="gen_button",interactive=True)
        # Hidden until a generation succeeds; gen_single reveals it.
        output_audio = gr.Audio(label="Generate results", visible=False,key="output_audio")

    # Re-enable the generate button whenever a new reference clip is uploaded.
    prompt_audio.upload(update_prompt_audio,
                        inputs=[],
                        outputs=[gen_button])

    # Main action: synthesize the entered text with the uploaded voice.
    gen_button.click(gen_single,
                     inputs=[prompt_audio, input_text_single],
                     outputs=[output_audio])
|
73 |
+
|
74 |
+
|
75 |
+
def main():
    """Finish model initialization and launch the Gradio app."""
    # The normalizer must be loaded before the first inference request.
    tts.load_normalizer()
    # NOTE(review): the positional argument to queue() maps to different
    # parameters across Gradio major versions (concurrency vs. status rate) —
    # confirm 20 does what is intended for the installed version.
    demo.queue(20)
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0")

if __name__ == "__main__":
    main()
|
82 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|