openfree committed (verified)
Commit 24c8c4e · Parent: 47a4f4f

Update app.py

Files changed (1):
  app.py  +129 -11
app.py CHANGED
@@ -9,6 +9,8 @@ import httpx
 import tempfile
 import wave
 import base64
+import numpy as np
+import soundfile as sf
 from dataclasses import dataclass
 from typing import List, Tuple, Dict, Optional
 from pathlib import Path
@@ -30,6 +32,13 @@ from transformers import (
     BitsAndBytesConfig,
 )
 
+# Spark TTS imports
+try:
+    from transformers import AutoModel
+    SPARK_AVAILABLE = True
+except:
+    SPARK_AVAILABLE = False
+
 # MeloTTS imports (for local mode)
 try:
     os.system("python -m unidic download")
@@ -56,6 +65,8 @@ class UnifiedAudioConverter:
         self.local_model = None
         self.tokenizer = None
         self.melo_models = None
+        self.spark_model = None
+        self.spark_tokenizer = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def initialize_api_mode(self, api_key: str):
@@ -77,7 +88,23 @@ class UnifiedAudioConverter:
             self.config.local_model_name,
             revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
         )
-
+
+    def initialize_spark_tts(self):
+        """Initialize Spark TTS model"""
+        if SPARK_AVAILABLE and self.spark_model is None:
+            try:
+                self.spark_model = AutoModel.from_pretrained(
+                    "SparkAudio/Spark-TTS-0.5B",
+                    trust_remote_code=True,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+                ).to(self.device)
+                # Spark TTS may not need a separate tokenizer
+                print("Spark TTS model loaded successfully")
+            except Exception as e:
+                print(f"Failed to load Spark TTS: {e}")
+
+    def initialize_melo_tts(self):
+        """Initialize MeloTTS models"""
         if MELO_AVAILABLE and self.melo_models is None:
             self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
 
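Both initializers use a lazy, guarded load: the model is fetched at most once, and later calls are cheap no-ops. A toy sketch of that pattern, with a stand-in loader rather than the actual Spark TTS call:

```python
class LazyEngine:
    """Toy version of the `is None` guard used by initialize_spark_tts()/initialize_melo_tts()."""

    def __init__(self):
        self.model = None

    def initialize(self):
        if self.model is None:       # only the first call pays the loading cost
            self.model = object()    # stand-in for AutoModel.from_pretrained(...)
        return self.model


engine = LazyEngine()
assert engine.initialize() is engine.initialize()   # later calls reuse the loaded model
```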
@@ -246,6 +273,67 @@ class UnifiedAudioConverter:
 
         return tmp_path
 
+    def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
+        """Convert text to speech using Spark TTS"""
+        if not SPARK_AVAILABLE or self.spark_model is None:
+            raise RuntimeError("Spark TTS not available")
+
+        try:
+            combined_audio = []
+            sample_rate = 22050  # Default sample rate for Spark TTS
+
+            # Two different speaker configurations for variety
+            speakers = ["female_1", "male_1"]  # Adjust based on actual Spark TTS speaker options
+
+            for i, turn in enumerate(conversation_json["conversation"]):
+                text = turn["text"]
+                if not text.strip():
+                    continue
+
+                # Generate audio with Spark TTS
+                # Note: The exact API might differ, adjust based on actual Spark TTS documentation
+                try:
+                    with torch.no_grad():
+                        audio_output = self.spark_model.synthesize(
+                            text=text,
+                            voice=speakers[i % 2] if len(speakers) > 1 else speakers[0],
+                            speed=1.0,
+                            # Add other parameters as needed
+                        )
+
+                    # Convert to numpy array if needed
+                    if isinstance(audio_output, torch.Tensor):
+                        audio_output = audio_output.cpu().numpy()
+
+                    combined_audio.append(audio_output)
+
+                except Exception as e:
+                    print(f"Error generating audio for turn {i}: {e}")
+                    # Generate silence as fallback
+                    silence = np.zeros(int(sample_rate * 0.5))  # 0.5 second silence
+                    combined_audio.append(silence)
+
+            # Combine all audio segments
+            if combined_audio:
+                final_audio = np.concatenate(combined_audio)
+
+                # Save to file
+                output_path = "spark_podcast_output.wav"
+                sf.write(output_path, final_audio, sample_rate)
+            else:
+                raise RuntimeError("No audio generated")
+
+            # Generate conversation text
+            conversation_text = "\n".join(
+                f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+                for i, turn in enumerate(conversation_json["conversation"])
+            )
+
+            return output_path, conversation_text
+
+        except Exception as e:
+            raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
+
     def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
         """Convert text to speech using MeloTTS"""
         if not MELO_AVAILABLE or not self.melo_models:
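The Spark-TTS path above collects per-turn waveforms as NumPy arrays, pads failed turns with silence, then concatenates everything and writes it with soundfile. A minimal self-contained sketch of that assembly step, with synthetic tones standing in for model output (only the file name and sample rate come from the code above):

```python
import numpy as np
import soundfile as sf

sample_rate = 22050                                   # default rate assumed above
t = np.arange(sample_rate) / sample_rate              # 1 second of sample times
segments = [
    0.1 * np.sin(2 * np.pi * 440 * t),                # stand-in for one synthesized turn
    np.zeros(int(sample_rate * 0.5)),                 # 0.5 s silence, the fallback used on errors
    0.1 * np.sin(2 * np.pi * 330 * t),                # stand-in for the next speaker's turn
]
final_audio = np.concatenate(segments)                # segments must share dtype and sample rate
sf.write("spark_podcast_output.wav", final_audio, sample_rate)
```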
@@ -272,7 +360,7 @@ class UnifiedAudioConverter:
             combined_audio += audio_segment
 
         # Save final audio
-        final_audio_path = "final_podcast.mp3"
+        final_audio_path = "melo_podcast.mp3"
         combined_audio.export(final_audio_path, format="mp3")
 
         # Generate conversation text
@@ -372,9 +460,15 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
                 "en-US-AvaMultilingualNeural",
                 "en-US-AndrewMultilingualNeural"
             )
+        elif tts_engine == "Spark-TTS":
+            if not SPARK_AVAILABLE:
+                return "Spark TTS not available. Please install required dependencies.", None
+            converter.initialize_spark_tts()
+            output_file, _ = converter.text_to_speech_spark(conversation_json)
         else:  # MeloTTS
             if not MELO_AVAILABLE:
                 return "MeloTTS not available. Please install required dependencies.", None
+            converter.initialize_melo_tts()
             output_file, _ = converter.text_to_speech_melo(conversation_json)
 
         return "Audio generated successfully!", output_file
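The Edge-TTS branch above relies on Microsoft's edge-tts package; the app's own Edge-TTS helper is not shown in this diff. A minimal sketch of the usual call pattern, reusing one of the voice names above (the output path is illustrative):

```python
import asyncio
import edge_tts

async def synthesize_line(text: str, voice: str = "en-US-AvaMultilingualNeural",
                          path: str = "edge_line.mp3") -> str:
    # Communicate streams the synthesized audio from the Edge TTS web service,
    # so this requires an internet connection.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(path)
    return path

if __name__ == "__main__":
    print(asyncio.run(synthesize_line("Hello and welcome to the podcast.")))
```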
@@ -412,12 +506,25 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                     label="Processing Mode",
                     info="API: Faster, requires API key | Local: Slower, runs on device"
                 )
-                tts_selector = gr.Radio(
-                    choices=["Edge-TTS", "MeloTTS"],
-                    value="Edge-TTS",
-                    label="TTS Engine",
-                    info="Edge-TTS: More natural | MeloTTS: Requires GPU"
-                )
+
+                # TTS engine selection - two primary engines plus an additional option
+                with gr.Group():
+                    gr.Markdown("### TTS Engine Selection")
+                    tts_selector = gr.Radio(
+                        choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
+                        value="Edge-TTS",
+                        label="TTS Engine",
+                        info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
+                    )
+
+                    gr.Markdown("""
+                    **Recommended:**
+                    - 🌟 **Edge-TTS**: Best quality, cloud-based
+                    - 🤖 **Spark-TTS**: Local AI model, good quality
+
+                    **Additional Option:**
+                    - ⚡ **MeloTTS**: Local processing, GPU recommended
+                    """)
 
                 convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
 
@@ -427,7 +534,7 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 label="Generated Conversation (Editable)",
                 lines=15,
                 max_lines=30,
-                interactive=True,  # changed to allow editing
+                interactive=True,
                 placeholder="Generated conversation will appear here. You can edit it before generating audio.",
                 info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
             )
@@ -451,13 +558,24 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
             visible=True
         )
 
+    # Per-engine TTS descriptions
+    with gr.Row():
+        gr.Markdown("""
+        ### TTS Engine Details:
+
+        - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
+        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters). Runs on your device, good for privacy.
+        - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
+        """)
+
     gr.Examples(
         examples=[
            ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
-            ["https://www.bbc.com/news/technology-67988517", "API", "Edge-TTS"],
+            ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
+            ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
        ],
        inputs=[url_input, mode_selector, tts_selector],
-        outputs=[conversation_output, audio_output],
+        outputs=[conversation_output, status_output],
        fn=synthesize_sync,
        cache_examples=False,
    )
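The engine list above is gated by optional imports (SPARK_AVAILABLE, MELO_AVAILABLE, and the edge-tts package being installed). A standalone sketch of an availability probe in the same spirit; the `melo` module name is an assumption, since MeloTTS's actual import is not shown in this diff:

```python
def probe_engines() -> dict:
    """Report which TTS backends can be imported, keyed by the radio-button names."""
    required = {
        "Edge-TTS": "edge_tts",       # cloud service client
        "Spark-TTS": "transformers",  # the diff loads Spark TTS via transformers.AutoModel
        "MeloTTS": "melo",            # assumed module name for the MeloTTS package
    }
    available = {}
    for name, module in required.items():
        try:
            __import__(module)
            available[name] = True
        except ImportError:
            available[name] = False
    return available

if __name__ == "__main__":
    print(probe_engines())   # e.g. {'Edge-TTS': True, 'Spark-TTS': True, 'MeloTTS': False}
```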
 