Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,8 @@ import httpx
|
|
9 |
import tempfile
|
10 |
import wave
|
11 |
import base64
|
|
|
|
|
12 |
from dataclasses import dataclass
|
13 |
from typing import List, Tuple, Dict, Optional
|
14 |
from pathlib import Path
|
@@ -30,6 +32,13 @@ from transformers import (
|
|
30 |
BitsAndBytesConfig,
|
31 |
)
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
# MeloTTS imports (for local mode)
|
34 |
try:
|
35 |
os.system("python -m unidic download")
|
@@ -56,6 +65,8 @@ class UnifiedAudioConverter:
|
|
56 |
self.local_model = None
|
57 |
self.tokenizer = None
|
58 |
self.melo_models = None
|
|
|
|
|
59 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
60 |
|
61 |
def initialize_api_mode(self, api_key: str):
|
@@ -77,7 +88,23 @@ class UnifiedAudioConverter:
|
|
77 |
self.config.local_model_name,
|
78 |
revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
|
79 |
)
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
if MELO_AVAILABLE and self.melo_models is None:
|
82 |
self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
|
83 |
|
@@ -246,6 +273,67 @@ class UnifiedAudioConverter:
|
|
246 |
|
247 |
return tmp_path
|
248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
|
250 |
"""Convert text to speech using MeloTTS"""
|
251 |
if not MELO_AVAILABLE or not self.melo_models:
|
@@ -272,7 +360,7 @@ class UnifiedAudioConverter:
|
|
272 |
combined_audio += audio_segment
|
273 |
|
274 |
# Save final audio
|
275 |
-
final_audio_path = "
|
276 |
combined_audio.export(final_audio_path, format="mp3")
|
277 |
|
278 |
# Generate conversation text
|
@@ -372,9 +460,15 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
|
|
372 |
"en-US-AvaMultilingualNeural",
|
373 |
"en-US-AndrewMultilingualNeural"
|
374 |
)
|
|
|
|
|
|
|
|
|
|
|
375 |
else: # MeloTTS
|
376 |
if not MELO_AVAILABLE:
|
377 |
return "MeloTTS not available. Please install required dependencies.", None
|
|
|
378 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
379 |
|
380 |
return "Audio generated successfully!", output_file
|
@@ -412,12 +506,25 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
412 |
label="Processing Mode",
|
413 |
info="API: Faster, requires API key | Local: Slower, runs on device"
|
414 |
)
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
|
422 |
convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
|
423 |
|
@@ -427,7 +534,7 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
427 |
label="Generated Conversation (Editable)",
|
428 |
lines=15,
|
429 |
max_lines=30,
|
430 |
-
interactive=True,
|
431 |
placeholder="Generated conversation will appear here. You can edit it before generating audio.",
|
432 |
info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
|
433 |
)
|
@@ -451,13 +558,24 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
451 |
visible=True
|
452 |
)
|
453 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
gr.Examples(
|
455 |
examples=[
|
456 |
["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
|
457 |
-
["https://www.bbc.com/news/technology-67988517", "API", "
|
|
|
458 |
],
|
459 |
inputs=[url_input, mode_selector, tts_selector],
|
460 |
-
outputs=[conversation_output,
|
461 |
fn=synthesize_sync,
|
462 |
cache_examples=False,
|
463 |
)
|
|
|
9 |
import tempfile
|
10 |
import wave
|
11 |
import base64
|
12 |
+
import numpy as np
|
13 |
+
import soundfile as sf
|
14 |
from dataclasses import dataclass
|
15 |
from typing import List, Tuple, Dict, Optional
|
16 |
from pathlib import Path
|
|
|
32 |
BitsAndBytesConfig,
|
33 |
)
|
34 |
|
35 |
+
# Spark TTS imports
|
36 |
+
try:
|
37 |
+
from transformers import AutoModel
|
38 |
+
SPARK_AVAILABLE = True
|
39 |
+
except:
|
40 |
+
SPARK_AVAILABLE = False
|
41 |
+
|
42 |
# MeloTTS imports (for local mode)
|
43 |
try:
|
44 |
os.system("python -m unidic download")
|
|
|
65 |
self.local_model = None
|
66 |
self.tokenizer = None
|
67 |
self.melo_models = None
|
68 |
+
self.spark_model = None
|
69 |
+
self.spark_tokenizer = None
|
70 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
71 |
|
72 |
def initialize_api_mode(self, api_key: str):
|
|
|
88 |
self.config.local_model_name,
|
89 |
revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
|
90 |
)
|
91 |
+
|
92 |
+
def initialize_spark_tts(self):
|
93 |
+
"""Initialize Spark TTS model"""
|
94 |
+
if SPARK_AVAILABLE and self.spark_model is None:
|
95 |
+
try:
|
96 |
+
self.spark_model = AutoModel.from_pretrained(
|
97 |
+
"SparkAudio/Spark-TTS-0.5B",
|
98 |
+
trust_remote_code=True,
|
99 |
+
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
|
100 |
+
).to(self.device)
|
101 |
+
# Spark TTS๋ ๋ณ๋์ ํ ํฌ๋์ด์ ๊ฐ ํ์ํ์ง ์์ ์ ์์
|
102 |
+
print("Spark TTS model loaded successfully")
|
103 |
+
except Exception as e:
|
104 |
+
print(f"Failed to load Spark TTS: {e}")
|
105 |
+
|
106 |
+
def initialize_melo_tts(self):
|
107 |
+
"""Initialize MeloTTS models"""
|
108 |
if MELO_AVAILABLE and self.melo_models is None:
|
109 |
self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
|
110 |
|
|
|
273 |
|
274 |
return tmp_path
|
275 |
|
276 |
+
def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
|
277 |
+
"""Convert text to speech using Spark TTS"""
|
278 |
+
if not SPARK_AVAILABLE or self.spark_model is None:
|
279 |
+
raise RuntimeError("Spark TTS not available")
|
280 |
+
|
281 |
+
try:
|
282 |
+
combined_audio = []
|
283 |
+
sample_rate = 22050 # Default sample rate for Spark TTS
|
284 |
+
|
285 |
+
# Two different speaker configurations for variety
|
286 |
+
speakers = ["female_1", "male_1"] # Adjust based on actual Spark TTS speaker options
|
287 |
+
|
288 |
+
for i, turn in enumerate(conversation_json["conversation"]):
|
289 |
+
text = turn["text"]
|
290 |
+
if not text.strip():
|
291 |
+
continue
|
292 |
+
|
293 |
+
# Generate audio with Spark TTS
|
294 |
+
# Note: The exact API might differ, adjust based on actual Spark TTS documentation
|
295 |
+
try:
|
296 |
+
with torch.no_grad():
|
297 |
+
audio_output = self.spark_model.synthesize(
|
298 |
+
text=text,
|
299 |
+
voice=speakers[i % 2] if len(speakers) > 1 else speakers[0],
|
300 |
+
speed=1.0,
|
301 |
+
# Add other parameters as needed
|
302 |
+
)
|
303 |
+
|
304 |
+
# Convert to numpy array if needed
|
305 |
+
if isinstance(audio_output, torch.Tensor):
|
306 |
+
audio_output = audio_output.cpu().numpy()
|
307 |
+
|
308 |
+
combined_audio.append(audio_output)
|
309 |
+
|
310 |
+
except Exception as e:
|
311 |
+
print(f"Error generating audio for turn {i}: {e}")
|
312 |
+
# Generate silence as fallback
|
313 |
+
silence = np.zeros(int(sample_rate * 0.5)) # 0.5 second silence
|
314 |
+
combined_audio.append(silence)
|
315 |
+
|
316 |
+
# Combine all audio segments
|
317 |
+
if combined_audio:
|
318 |
+
final_audio = np.concatenate(combined_audio)
|
319 |
+
|
320 |
+
# Save to file
|
321 |
+
output_path = "spark_podcast_output.wav"
|
322 |
+
sf.write(output_path, final_audio, sample_rate)
|
323 |
+
else:
|
324 |
+
raise RuntimeError("No audio generated")
|
325 |
+
|
326 |
+
# Generate conversation text
|
327 |
+
conversation_text = "\n".join(
|
328 |
+
f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
|
329 |
+
for i, turn in enumerate(conversation_json["conversation"])
|
330 |
+
)
|
331 |
+
|
332 |
+
return output_path, conversation_text
|
333 |
+
|
334 |
+
except Exception as e:
|
335 |
+
raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
|
336 |
+
|
337 |
def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
|
338 |
"""Convert text to speech using MeloTTS"""
|
339 |
if not MELO_AVAILABLE or not self.melo_models:
|
|
|
360 |
combined_audio += audio_segment
|
361 |
|
362 |
# Save final audio
|
363 |
+
final_audio_path = "melo_podcast.mp3"
|
364 |
combined_audio.export(final_audio_path, format="mp3")
|
365 |
|
366 |
# Generate conversation text
|
|
|
460 |
"en-US-AvaMultilingualNeural",
|
461 |
"en-US-AndrewMultilingualNeural"
|
462 |
)
|
463 |
+
elif tts_engine == "Spark-TTS":
|
464 |
+
if not SPARK_AVAILABLE:
|
465 |
+
return "Spark TTS not available. Please install required dependencies.", None
|
466 |
+
converter.initialize_spark_tts()
|
467 |
+
output_file, _ = converter.text_to_speech_spark(conversation_json)
|
468 |
else: # MeloTTS
|
469 |
if not MELO_AVAILABLE:
|
470 |
return "MeloTTS not available. Please install required dependencies.", None
|
471 |
+
converter.initialize_melo_tts()
|
472 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
473 |
|
474 |
return "Audio generated successfully!", output_file
|
|
|
506 |
label="Processing Mode",
|
507 |
info="API: Faster, requires API key | Local: Slower, runs on device"
|
508 |
)
|
509 |
+
|
510 |
+
# TTS 엔진 선택 - 기본 2개와 추가 옵션으로 구분
|
511 |
+
with gr.Group():
|
512 |
+
gr.Markdown("### TTS Engine Selection")
|
513 |
+
tts_selector = gr.Radio(
|
514 |
+
choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
|
515 |
+
value="Edge-TTS",
|
516 |
+
label="TTS Engine",
|
517 |
+
info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
|
518 |
+
)
|
519 |
+
|
520 |
+
gr.Markdown("""
|
521 |
+
**Recommended:**
|
522 |
+
- ๐ **Edge-TTS**: Best quality, cloud-based
|
523 |
+
- 🤖 **Spark-TTS**: Local AI model, good quality
|
524 |
+
|
525 |
+
**Additional Option:**
|
526 |
+
- ⚡ **MeloTTS**: Local processing, GPU recommended
|
527 |
+
""")
|
528 |
|
529 |
convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
|
530 |
|
|
|
534 |
label="Generated Conversation (Editable)",
|
535 |
lines=15,
|
536 |
max_lines=30,
|
537 |
+
interactive=True,
|
538 |
placeholder="Generated conversation will appear here. You can edit it before generating audio.",
|
539 |
info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
|
540 |
)
|
|
|
558 |
visible=True
|
559 |
)
|
560 |
|
561 |
+
# TTS 엔진별 설명 추가
|
562 |
+
with gr.Row():
|
563 |
+
gr.Markdown("""
|
564 |
+
### TTS Engine Details:
|
565 |
+
|
566 |
+
- **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
|
567 |
+
- **Spark-TTS**: SparkAudio's local AI model (0.5B parameters). Runs on your device, good for privacy.
|
568 |
+
- **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
|
569 |
+
""")
|
570 |
+
|
571 |
gr.Examples(
|
572 |
examples=[
|
573 |
["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
|
574 |
+
["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
|
575 |
+
["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
|
576 |
],
|
577 |
inputs=[url_input, mode_selector, tts_selector],
|
578 |
+
outputs=[conversation_output, status_output],
|
579 |
fn=synthesize_sync,
|
580 |
cache_examples=False,
|
581 |
)
|