Spaces:

openfree
/

AI-Podcast

Running on Zero

App Files Files Community

openfree committed on May 25

Commit

24c8c4e

verified ·

1 Parent(s): 47a4f4f

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -11

app.py CHANGED Viewed

@@ -9,6 +9,8 @@ import httpx
 import tempfile
 import wave
 import base64
 from dataclasses import dataclass
 from typing import List, Tuple, Dict, Optional
 from pathlib import Path
@@ -30,6 +32,13 @@ from transformers import (
     BitsAndBytesConfig,
 )
 # MeloTTS imports (for local mode)
 try:
     os.system("python -m unidic download")
@@ -56,6 +65,8 @@ class UnifiedAudioConverter:
         self.local_model = None
         self.tokenizer = None
         self.melo_models = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def initialize_api_mode(self, api_key: str):
@@ -77,7 +88,23 @@ class UnifiedAudioConverter:
                 self.config.local_model_name,
                 revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
             )
         if MELO_AVAILABLE and self.melo_models is None:
             self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
@@ -246,6 +273,67 @@ class UnifiedAudioConverter:
         return tmp_path
     def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
         """Convert text to speech using MeloTTS"""
         if not MELO_AVAILABLE or not self.melo_models:
@@ -272,7 +360,7 @@ class UnifiedAudioConverter:
             combined_audio += audio_segment
         # Save final audio
-        final_audio_path = "final_podcast.mp3"
         combined_audio.export(final_audio_path, format="mp3")
         # Generate conversation text
@@ -372,9 +460,15 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
                 "en-US-AvaMultilingualNeural",
                 "en-US-AndrewMultilingualNeural"
             )
         else:  # MeloTTS
             if not MELO_AVAILABLE:
                 return "MeloTTS not available. Please install required dependencies.", None
             output_file, _ = converter.text_to_speech_melo(conversation_json)
         return "Audio generated successfully!", output_file
@@ -412,12 +506,25 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 label="Processing Mode",
                 info="API: Faster, requires API key | Local: Slower, runs on device"
             )
-            tts_selector = gr.Radio(
-                choices=["Edge-TTS", "MeloTTS"],
-                value="Edge-TTS",
-                label="TTS Engine",
-                info="Edge-TTS: More natural | MeloTTS: Requires GPU"
-            )
     convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
@@ -427,7 +534,7 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 label="Generated Conversation (Editable)",
                 lines=15,
                 max_lines=30,
-                interactive=True,  # changed to make the field editable
                 placeholder="Generated conversation will appear here. You can edit it before generating audio.",
                 info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
             )
@@ -451,13 +558,24 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 visible=True
             )
     gr.Examples(
         examples=[
             ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
-            ["https://www.bbc.com/news/technology-67988517", "API", "Edge-TTS"],
         ],
         inputs=[url_input, mode_selector, tts_selector],
-        outputs=[conversation_output, audio_output],
         fn=synthesize_sync,
         cache_examples=False,
     )

 import tempfile
 import wave
 import base64
+import numpy as np
+import soundfile as sf
 from dataclasses import dataclass
 from typing import List, Tuple, Dict, Optional
 from pathlib import Path
     BitsAndBytesConfig,
 )
+# Spark TTS imports
+try:
+    from transformers import AutoModel
+    SPARK_AVAILABLE = True
+except:
+    SPARK_AVAILABLE = False
 # MeloTTS imports (for local mode)
 try:
     os.system("python -m unidic download")
         self.local_model = None
         self.tokenizer = None
         self.melo_models = None
+        self.spark_model = None
+        self.spark_tokenizer = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def initialize_api_mode(self, api_key: str):
                 self.config.local_model_name,
                 revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
             )
+    def initialize_spark_tts(self):
+        """Initialize Spark TTS model"""
+        if SPARK_AVAILABLE and self.spark_model is None:
+            try:
+                self.spark_model = AutoModel.from_pretrained(
+                    "SparkAudio/Spark-TTS-0.5B",
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+                ).to(self.device)
+                # Spark TTS may not need a separate tokenizer
+                print("Spark TTS model loaded successfully")
+            except Exception as e:
+                print(f"Failed to load Spark TTS: {e}")
+    def initialize_melo_tts(self):
+        """Initialize MeloTTS models"""
         if MELO_AVAILABLE and self.melo_models is None:
             self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
         return tmp_path
+    def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
+        """Convert text to speech using Spark TTS"""
+        if not SPARK_AVAILABLE or self.spark_model is None:
+            raise RuntimeError("Spark TTS not available")
+        try:
+            combined_audio = []
+            sample_rate = 22050  # Default sample rate for Spark TTS
+            # Two different speaker configurations for variety
+            speakers = ["female_1", "male_1"]  # Adjust based on actual Spark TTS speaker options
+            for i, turn in enumerate(conversation_json["conversation"]):
+                text = turn["text"]
+                if not text.strip():
+                    continue
+                # Generate audio with Spark TTS
+                # Note: The exact API might differ, adjust based on actual Spark TTS documentation
+                try:
+                    with torch.no_grad():
+                        audio_output = self.spark_model.synthesize(
+                            text=text,
+                            voice=speakers[i % 2] if len(speakers) > 1 else speakers[0],
+                            speed=1.0,
+                            # Add other parameters as needed
+                        )
+                    # Convert to numpy array if needed
+                    if isinstance(audio_output, torch.Tensor):
+                        audio_output = audio_output.cpu().numpy()
+                    combined_audio.append(audio_output)
+                except Exception as e:
+                    print(f"Error generating audio for turn {i}: {e}")
+                    # Generate silence as fallback
+                    silence = np.zeros(int(sample_rate * 0.5))  # 0.5 second silence
+                    combined_audio.append(silence)
+            # Combine all audio segments
+            if combined_audio:
+                final_audio = np.concatenate(combined_audio)
+                # Save to file
+                output_path = "spark_podcast_output.wav"
+                sf.write(output_path, final_audio, sample_rate)
+            else:
+                raise RuntimeError("No audio generated")
+            # Generate conversation text
+            conversation_text = "\n".join(
+                f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+                for i, turn in enumerate(conversation_json["conversation"])
+            )
+            return output_path, conversation_text
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
     def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
         """Convert text to speech using MeloTTS"""
         if not MELO_AVAILABLE or not self.melo_models:
             combined_audio += audio_segment
         # Save final audio
+        final_audio_path = "melo_podcast.mp3"
         combined_audio.export(final_audio_path, format="mp3")
         # Generate conversation text
                 "en-US-AvaMultilingualNeural",
                 "en-US-AndrewMultilingualNeural"
             )
+        elif tts_engine == "Spark-TTS":
+            if not SPARK_AVAILABLE:
+                return "Spark TTS not available. Please install required dependencies.", None
+            converter.initialize_spark_tts()
+            output_file, _ = converter.text_to_speech_spark(conversation_json)
         else:  # MeloTTS
             if not MELO_AVAILABLE:
                 return "MeloTTS not available. Please install required dependencies.", None
+            converter.initialize_melo_tts()
             output_file, _ = converter.text_to_speech_melo(conversation_json)
         return "Audio generated successfully!", output_file
                 label="Processing Mode",
                 info="API: Faster, requires API key | Local: Slower, runs on device"
             )
+            # TTS engine selection - split into the two default engines and an additional option
+            with gr.Group():
+                gr.Markdown("### TTS Engine Selection")
+                tts_selector = gr.Radio(
+                    choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
+                    value="Edge-TTS",
+                    label="TTS Engine",
+                    info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
+                )
+                gr.Markdown("""
+                **Recommended:**
+                - 🌟 **Edge-TTS**: Best quality, cloud-based
+                - 🤖 **Spark-TTS**: Local AI model, good quality
+                **Additional Option:**
+                - ⚡ **MeloTTS**: Local processing, GPU recommended
+                """)
     convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
                 label="Generated Conversation (Editable)",
                 lines=15,
                 max_lines=30,
+                interactive=True,
                 placeholder="Generated conversation will appear here. You can edit it before generating audio.",
                 info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
             )
                 visible=True
             )
+    # Add a description for each TTS engine
+    with gr.Row():
+        gr.Markdown("""
+        ### TTS Engine Details:
+        - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
+        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters). Runs on your device, good for privacy.
+        - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
+        """)
     gr.Examples(
         examples=[
             ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
+            ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
+            ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
         ],
         inputs=[url_input, mode_selector, tts_selector],
+        outputs=[conversation_output, status_output],
         fn=synthesize_sync,
         cache_examples=False,
     )