Spaces:

openfree
/

AI-Podcast

Running on Zero

App Files Community

openfree committed on May 25

Commit

6e12ecc

verified ·

1 Parent(s): a159e51

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -60

app.py CHANGED Viewed

@@ -11,6 +11,8 @@ import wave
 import base64
 import numpy as np
 import soundfile as sf
 from dataclasses import dataclass
 from typing import List, Tuple, Dict, Optional
 from pathlib import Path
@@ -34,7 +36,7 @@ from transformers import (
 # Spark TTS imports
 try:
-    from transformers import AutoModel
     SPARK_AVAILABLE = True
 except:
     SPARK_AVAILABLE = False
@@ -65,8 +67,7 @@ class UnifiedAudioConverter:
         self.local_model = None
         self.tokenizer = None
         self.melo_models = None
-        self.spark_model = None
-        self.spark_tokenizer = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def initialize_api_mode(self, api_key: str):
@@ -90,19 +91,31 @@ class UnifiedAudioConverter:
             )
     def initialize_spark_tts(self):
-        """Initialize Spark TTS model"""
-        if SPARK_AVAILABLE and self.spark_model is None:
             try:
-                self.spark_model = AutoModel.from_pretrained(
-                    "SparkAudio/Spark-TTS-0.5B",
-                    trust_remote_code=True,
-                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-                ).to(self.device)
-                # Spark TTS는 별도의 토크나이저가 필요하지 않을 수 있음
-                print("Spark TTS model loaded successfully")
             except Exception as e:
-                print(f"Failed to load Spark TTS: {e}")
     def initialize_melo_tts(self):
         """Initialize MeloTTS models"""
         if MELO_AVAILABLE and self.melo_models is None:
@@ -274,54 +287,79 @@ class UnifiedAudioConverter:
         return tmp_path
     def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
-        """Convert text to speech using Spark TTS"""
-        if not SPARK_AVAILABLE or self.spark_model is None:
             raise RuntimeError("Spark TTS not available")
         try:
-            combined_audio = []
-            sample_rate = 22050  # Default sample rate for Spark TTS
-            # Two different speaker configurations for variety
-            speakers = ["female_1", "male_1"]  # Adjust based on actual Spark TTS speaker options
             for i, turn in enumerate(conversation_json["conversation"]):
                 text = turn["text"]
                 if not text.strip():
                     continue
-                # Generate audio with Spark TTS
-                # Note: The exact API might differ, adjust based on actual Spark TTS documentation
                 try:
-                    with torch.no_grad():
-                        audio_output = self.spark_model.synthesize(
-                            text=text,
-                            voice=speakers[i % 2] if len(speakers) > 1 else speakers[0],
-                            speed=1.0,
-                            # Add other parameters as needed
-                        )
-                    # Convert to numpy array if needed
-                    if isinstance(audio_output, torch.Tensor):
-                        audio_output = audio_output.cpu().numpy()
-                    combined_audio.append(audio_output)
                 except Exception as e:
-                    print(f"Error generating audio for turn {i}: {e}")
-                    # Generate silence as fallback
-                    silence = np.zeros(int(sample_rate * 0.5))  # 0.5 second silence
-                    combined_audio.append(silence)
-            # Combine all audio segments
-            if combined_audio:
-                final_audio = np.concatenate(combined_audio)
-                # Save to file
-                output_path = "spark_podcast_output.wav"
-                sf.write(output_path, final_audio, sample_rate)
             else:
-                raise RuntimeError("No audio generated")
             # Generate conversation text
             conversation_text = "\n".join(
@@ -329,7 +367,7 @@ class UnifiedAudioConverter:
                 for i, turn in enumerate(conversation_json["conversation"])
             )
-            return output_path, conversation_text
         except Exception as e:
             raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
@@ -386,15 +424,18 @@ class UnifiedAudioConverter:
         try:
             audio_segments = []
             for filename in filenames:
-                audio_segment = AudioSegment.from_file(filename)
-                audio_segments.append(audio_segment)
-            combined = sum(audio_segments)
-            combined.export(output_file, format="wav")
             # Clean up temporary files
             for filename in filenames:
-                os.remove(filename)
         except Exception as e:
             raise RuntimeError(f"Failed to combine audio files: {e}")
@@ -462,7 +503,7 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
             )
         elif tts_engine == "Spark-TTS":
             if not SPARK_AVAILABLE:
-                return "Spark TTS not available. Please install required dependencies.", None
             converter.initialize_spark_tts()
             output_file, _ = converter.text_to_speech_spark(conversation_json)
         else:  # MeloTTS
@@ -519,8 +560,8 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 gr.Markdown("""
                 **Recommended:**
-                - 🌟 **Edge-TTS**: Best quality, cloud-based
-                - 🤖 **Spark-TTS**: Local AI model, good quality
                 **Additional Option:**
                 - ⚡ **MeloTTS**: Local processing, GPU recommended
@@ -558,14 +599,24 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 visible=True
             )
-    # TTS 엔진별 설명 추가
     with gr.Row():
         gr.Markdown("""
         ### TTS Engine Details:
         - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
-        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters). Runs on your device, good for privacy.
         - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
         """)
     gr.Examples(

 import base64
 import numpy as np
 import soundfile as sf
+import subprocess
+import shutil
 from dataclasses import dataclass
 from typing import List, Tuple, Dict, Optional
 from pathlib import Path
 # Spark TTS imports
 try:
+    from huggingface_hub import snapshot_download
     SPARK_AVAILABLE = True
 except:
     SPARK_AVAILABLE = False
         self.local_model = None
         self.tokenizer = None
         self.melo_models = None
+        self.spark_model_dir = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def initialize_api_mode(self, api_key: str):
             )
     def initialize_spark_tts(self):
+        """Initialize Spark TTS model by downloading if needed"""
+        if not SPARK_AVAILABLE:
+            raise RuntimeError("Spark TTS dependencies not available")
+        model_dir = "pretrained_models/Spark-TTS-0.5B"
+        # Check if model exists, if not download it
+        if not os.path.exists(model_dir):
+            print("Downloading Spark-TTS model...")
             try:
+                os.makedirs("pretrained_models", exist_ok=True)
+                snapshot_download(
+                    "SparkAudio/Spark-TTS-0.5B",
+                    local_dir=model_dir
+                )
+                print("Spark-TTS model downloaded successfully")
             except Exception as e:
+                raise RuntimeError(f"Failed to download Spark-TTS model: {e}")
+        self.spark_model_dir = model_dir
+        # Check if we have the CLI inference script
+        if not os.path.exists("cli/inference.py"):
+            print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
     def initialize_melo_tts(self):
         """Initialize MeloTTS models"""
         if MELO_AVAILABLE and self.melo_models is None:
         return tmp_path
     def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
+        """Convert text to speech using Spark TTS CLI"""
+        if not SPARK_AVAILABLE or not self.spark_model_dir:
             raise RuntimeError("Spark TTS not available")
         try:
+            output_dir = self._create_output_directory()
+            audio_files = []
+            # Create different voice characteristics for different speakers
+            voice_configs = [
+                {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
+                {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
+            ]
             for i, turn in enumerate(conversation_json["conversation"]):
                 text = turn["text"]
                 if not text.strip():
                     continue
+                # Use different voice config for each speaker
+                voice_config = voice_configs[i % len(voice_configs)]
+                output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
+                # Run Spark TTS CLI inference
+                cmd = [
+                    "python", "-m", "cli.inference",
+                    "--text", text,
+                    "--device", "0" if torch.cuda.is_available() else "cpu",
+                    "--save_dir", output_dir,
+                    "--model_dir", self.spark_model_dir,
+                    "--prompt_text", voice_config["prompt_text"],
+                    "--output_name", f"spark_output_{i}.wav"
+                ]
                 try:
+                    # Run the command
+                    result = subprocess.run(
+                        cmd,
+                        capture_output=True,
+                        text=True,
+                        timeout=60,
+                        cwd="."  # Make sure we're in the right directory
+                    )
+                    if result.returncode == 0:
+                        audio_files.append(output_file)
+                    else:
+                        print(f"Spark TTS error for turn {i}: {result.stderr}")
+                        # Create a short silence as fallback
+                        silence = np.zeros(int(22050 * 1.0))  # 1 second of silence
+                        sf.write(output_file, silence, 22050)
+                        audio_files.append(output_file)
+                except subprocess.TimeoutExpired:
+                    print(f"Spark TTS timeout for turn {i}")
+                    # Create silence as fallback
+                    silence = np.zeros(int(22050 * 1.0))
+                    sf.write(output_file, silence, 22050)
+                    audio_files.append(output_file)
                 except Exception as e:
+                    print(f"Error running Spark TTS for turn {i}: {e}")
+                    # Create silence as fallback
+                    silence = np.zeros(int(22050 * 1.0))
+                    sf.write(output_file, silence, 22050)
+                    audio_files.append(output_file)
+            # Combine all audio files
+            if audio_files:
+                final_output = os.path.join(output_dir, "spark_combined.wav")
+                self._combine_audio_files(audio_files, final_output)
             else:
+                raise RuntimeError("No audio files generated")
             # Generate conversation text
             conversation_text = "\n".join(
                 for i, turn in enumerate(conversation_json["conversation"])
             )
+            return final_output, conversation_text
         except Exception as e:
             raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
         try:
             audio_segments = []
             for filename in filenames:
+                if os.path.exists(filename):
+                    audio_segment = AudioSegment.from_file(filename)
+                    audio_segments.append(audio_segment)
+            if audio_segments:
+                combined = sum(audio_segments)
+                combined.export(output_file, format="wav")
             # Clean up temporary files
             for filename in filenames:
+                if os.path.exists(filename):
+                    os.remove(filename)
         except Exception as e:
             raise RuntimeError(f"Failed to combine audio files: {e}")
             )
         elif tts_engine == "Spark-TTS":
             if not SPARK_AVAILABLE:
+                return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
             converter.initialize_spark_tts()
             output_file, _ = converter.text_to_speech_spark(conversation_json)
         else:  # MeloTTS
                 gr.Markdown("""
                 **Recommended:**
+                - 🌟 **Edge-TTS**: Best quality, cloud-based, instant setup
+                - 🤖 **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning
                 **Additional Option:**
                 - ⚡ **MeloTTS**: Local processing, GPU recommended
                 visible=True
             )
+    # TTS 엔진별 설명 및 설치 안내 추가
     with gr.Row():
         gr.Markdown("""
         ### TTS Engine Details:
         - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
+        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
+          - **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
+          - Features: Bilingual support (Chinese/English), controllable speech generation
+          - License: CC BY-NC-SA (Non-commercial use only)
         - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
+        ### Spark-TTS Setup Instructions:
+        ```bash
+        git clone https://github.com/SparkAudio/Spark-TTS.git
+        cd Spark-TTS
+        pip install -r requirements.txt
+        ```
         """)
     gr.Examples(