openfree committed on
Commit
e038f5e
·
verified ·
1 Parent(s): 44dadf2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -743
app.py CHANGED
@@ -1,754 +1,35 @@
1
- import gradio as gr
2
  import os
3
- import asyncio
4
- import torch
5
- import io
6
- import json
7
- import re
8
- import httpx
9
- import tempfile
10
- import wave
11
- import base64
12
- import numpy as np
13
- import soundfile as sf
14
- import subprocess
15
- import shutil
16
- from dataclasses import dataclass
17
- from typing import List, Tuple, Dict, Optional
18
- from pathlib import Path
19
- from threading import Thread
20
- from dotenv import load_dotenv
21
 
22
- # Edge TTS imports
23
- import edge_tts
24
- from pydub import AudioSegment
25
-
26
- # OpenAI imports
27
- from openai import OpenAI
28
-
29
- # Transformers imports (for local mode)
30
- from transformers import (
31
- AutoModelForCausalLM,
32
- AutoTokenizer,
33
- TextIteratorStreamer,
34
- BitsAndBytesConfig,
35
- )
36
-
37
- # Spark TTS imports
38
- try:
39
- from huggingface_hub import snapshot_download
40
- SPARK_AVAILABLE = True
41
- except:
42
- SPARK_AVAILABLE = False
43
-
44
- # MeloTTS imports (for local mode)
45
- try:
46
- os.system("python -m unidic download")
47
- from melo.api import TTS as MeloTTS
48
- MELO_AVAILABLE = True
49
- except:
50
- MELO_AVAILABLE = False
51
-
52
- load_dotenv()
53
-
54
-
55
@dataclass
class ConversationConfig:
    """Static settings used when turning an article into a podcast script."""

    # Articles longer than this many whitespace-separated words are truncated.
    max_words: int = 6000
    # Prepended to the article URL before it is fetched.
    prefix_url: str = "https://r.jina.ai/"
    # Model name passed to the chat-completions API in API mode.
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    # Model/tokenizer identifier loaded with transformers in local mode.
    local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
61
-
62
-
63
class UnifiedAudioConverter:
    """Generate a two-speaker podcast script from text and render it to audio.

    Script generation runs either through an OpenAI-compatible API client
    (``initialize_api_mode``) or through a locally loaded causal language
    model (``initialize_local_mode``).  Audio rendering is available through
    Edge TTS, the Spark-TTS command-line interface, or MeloTTS.
    """

    # Matches a JSON object containing at most one level of nested objects,
    # which is enough for the ``{"conversation": [{...}, {...}]}`` format.
    _JSON_PATTERN = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"

    # Response skeleton appended to every prompt.
    _JSON_TEMPLATE = """
{
    "conversation": [
        {"speaker": "", "text": ""},
        {"speaker": "", "text": ""}
    ]
}
"""

    # Sample rate used for the silent placeholder clips.
    _SILENCE_SAMPLE_RATE = 22050

    def __init__(self, config: "ConversationConfig"):
        """Store *config*; clients and models are created lazily on demand."""
        self.config = config
        self.llm_client = None
        self.local_model = None
        self.tokenizer = None
        self.melo_models = None
        self.spark_model_dir = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # ------------------------------------------------------------------
    # Initialisation
    # ------------------------------------------------------------------
    def initialize_api_mode(self, api_key: str):
        """Initialize API mode with Together API"""
        self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

    def initialize_local_mode(self):
        """Initialize local mode with Hugging Face model"""
        if self.local_model is None:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            self.local_model = AutoModelForCausalLM.from_pretrained(
                self.config.local_model_name,
                quantization_config=quantization_config
            )
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.local_model_name,
                revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
            )

    def initialize_spark_tts(self):
        """Initialize Spark TTS model by downloading if needed

        Raises:
            RuntimeError: if the optional dependency is missing or the
                model download fails.
        """
        if not SPARK_AVAILABLE:
            raise RuntimeError("Spark TTS dependencies not available")

        model_dir = "pretrained_models/Spark-TTS-0.5B"

        # Check if model exists, if not download it
        if not os.path.exists(model_dir):
            print("Downloading Spark-TTS model...")
            try:
                os.makedirs("pretrained_models", exist_ok=True)
                snapshot_download(
                    "SparkAudio/Spark-TTS-0.5B",
                    local_dir=model_dir
                )
                print("Spark-TTS model downloaded successfully")
            except Exception as e:
                raise RuntimeError(f"Failed to download Spark-TTS model: {e}") from e

        self.spark_model_dir = model_dir

        # Check if we have the CLI inference script
        if not os.path.exists("cli/inference.py"):
            print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")

    def initialize_melo_tts(self):
        """Initialize MeloTTS models"""
        if MELO_AVAILABLE and self.melo_models is None:
            self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}

    # ------------------------------------------------------------------
    # Text acquisition and prompt construction
    # ------------------------------------------------------------------
    def fetch_text(self, url: str) -> str:
        """Fetch text content from URL

        The configured ``prefix_url`` is prepended to *url* before the request.

        Raises:
            ValueError: if *url* is empty or is not an http(s) URL.
            RuntimeError: if the HTTP request fails.
        """
        if not url:
            raise ValueError("URL cannot be empty")

        if not url.startswith(("http://", "https://")):
            raise ValueError("URL must start with 'http://' or 'https://'")

        full_url = f"{self.config.prefix_url}{url}"
        try:
            response = httpx.get(full_url, timeout=60.0)
            response.raise_for_status()
            return response.text
        except httpx.HTTPError as e:
            raise RuntimeError(f"Failed to fetch URL: {e}") from e

    def _build_prompt(self, text: str, language: str = "English") -> str:
        """Build prompt for conversation generation"""
        template = self._JSON_TEMPLATE
        if language == "Korean":
            return (
                f"{text}\n\n제공된 텍스트를 두 명의 전문가 간의 짧고 유익하며 명확한 "
                f"팟캐스트 대화로 변환해주세요. 톤은 전문적이고 매력적이어야 합니다. "
                f"다음 형식을 준수하고 JSON만 반환해주세요:\n{template}"
            )
        return (
            f"{text}\n\nConvert the provided text into a short, informative and crisp "
            f"podcast conversation between two experts. The tone should be "
            f"professional and engaging. Please adhere to the following "
            f"format and return ONLY the JSON:\n{template}"
        )

    @staticmethod
    def _system_message(language: str) -> str:
        """Return the language-specific system message for the chat model."""
        if language == "Korean":
            return "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
        return "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

    @classmethod
    def _extract_json(cls, text: Optional[str]) -> Optional[Dict]:
        """Return the first conversation object found in *text*, else ``None``.

        ``None`` is returned when no JSON object is present, when the matched
        span is not valid JSON, or when it lacks a ``"conversation"`` list.
        """
        match = re.search(cls._JSON_PATTERN, text or "")
        if not match:
            return None
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return None
        if isinstance(data, dict) and isinstance(data.get("conversation"), list):
            return data
        return None

    @staticmethod
    def _default_conversation(language: str) -> Dict:
        """Return the canned two-line conversation used as a fallback."""
        if language == "Korean":
            return {
                "conversation": [
                    {"speaker": "진행자", "text": "안녕하세요, 팟캐스트에 오신 것을 환영합니다."},
                    {"speaker": "게스트", "text": "안녕하세요, 초대해 주셔서 감사합니다."}
                ]
            }
        return {
            "conversation": [
                {"speaker": "Host", "text": "Welcome to our podcast."},
                {"speaker": "Guest", "text": "Thank you for having me."}
            ]
        }

    @staticmethod
    def _format_conversation(conversation_json: Dict) -> str:
        """Render the conversation as ``Speaker: text`` lines."""
        return "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

    # ------------------------------------------------------------------
    # Conversation extraction
    # ------------------------------------------------------------------
    def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
        """Extract conversation using API

        Raises:
            RuntimeError: if API mode is not initialized, the request fails,
                or the response contains no usable JSON.
        """
        if not self.llm_client:
            raise RuntimeError("API mode not initialized")

        try:
            chat_completion = self.llm_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": self._system_message(language)},
                    {"role": "user", "content": self._build_prompt(text, language)}
                ],
                model=self.config.model_name,
            )

            conversation = self._extract_json(chat_completion.choices[0].message.content)
            if conversation is None:
                raise ValueError("No valid JSON found in response")
            return conversation
        except Exception as e:
            raise RuntimeError(f"Failed to extract conversation: {e}") from e

    def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
        """Extract conversation using local model

        Falls back to a canned two-line conversation when the generated text
        contains no usable JSON.

        Raises:
            RuntimeError: if local mode is not initialized.
        """
        if not self.local_model or not self.tokenizer:
            raise RuntimeError("Local mode not initialized")

        chat = [
            {"role": "system", "content": self._system_message(language)},
            {"role": "user", "content": self._build_prompt(text, language)}
        ]

        terminators = [
            self.tokenizer.eos_token_id,
            self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        messages = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.tokenizer([messages], return_tensors="pt").to(self.device)

        streamer = TextIteratorStreamer(
            self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )

        generate_kwargs = dict(
            model_inputs,
            streamer=streamer,
            max_new_tokens=4000,
            do_sample=True,
            temperature=0.9,
            eos_token_id=terminators,
        )

        # Generation runs in a worker thread while this thread drains the streamer.
        worker = Thread(target=self.local_model.generate, kwargs=generate_kwargs)
        worker.start()
        generated_text = "".join(streamer)
        worker.join()

        conversation = self._extract_json(generated_text)
        if conversation is not None:
            return conversation
        # Malformed or missing JSON: use the default template for the language.
        return self._default_conversation(language)

    def parse_conversation_text(self, conversation_text: str) -> Dict:
        """Parse conversation text back to JSON format

        Each ``Speaker: text`` line becomes one turn; the split happens at the
        first colon.  Lines without a colon and lines whose text part is empty
        are skipped, because an empty turn cannot be synthesized.
        """
        conversation_data = {"conversation": []}

        for line in conversation_text.strip().split('\n'):
            if ':' not in line:
                continue
            speaker, text = line.split(':', 1)
            text = text.strip()
            if not text:
                continue
            conversation_data["conversation"].append({
                "speaker": speaker.strip(),
                "text": text
            })

        return conversation_data

    # ------------------------------------------------------------------
    # Text to speech
    # ------------------------------------------------------------------
    async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
        """Convert text to speech using Edge TTS

        Returns:
            ``(path_to_combined_audio, conversation_text)``.

        Raises:
            RuntimeError: if any step of the conversion fails.
        """
        output_dir = Path(self._create_output_directory())
        filenames = []

        try:
            # Voices alternate between turns.
            if language == "Korean":
                voices = [
                    "ko-KR-SunHiNeural",  # female voice
                    "ko-KR-HyunsuNeural"  # male voice
                ]
            else:
                voices = [
                    "en-US-AvaMultilingualNeural",  # female voice
                    "en-US-AndrewMultilingualNeural"  # male voice
                ]

            for i, turn in enumerate(conversation_json["conversation"]):
                filename = output_dir / f"output_{i}.wav"
                voice = voices[i % len(voices)]

                tmp_path = await self._generate_audio_edge(turn["text"], voice)
                # shutil.move also works when the temp dir is on another
                # filesystem, where os.rename raises OSError.
                shutil.move(tmp_path, str(filename))
                filenames.append(str(filename))

            # Combine audio files
            final_output = os.path.join(output_dir, "combined_output.wav")
            self._combine_audio_files(filenames, final_output)

            return final_output, self._format_conversation(conversation_json)
        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech: {e}") from e

    async def _generate_audio_edge(self, text: str, voice: str) -> str:
        """Generate audio using Edge TTS

        Returns the path of a temporary file the caller must move or delete.

        Raises:
            ValueError: if *text* is blank.
        """
        if not text.strip():
            raise ValueError("Text cannot be empty")

        voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
        communicate = edge_tts.Communicate(text, voice_short_name)

        # Reserve a unique path, then close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        try:
            await communicate.save(tmp_path)
        except Exception:
            # Do not leave the reserved file behind when synthesis fails.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return tmp_path

    def _write_silence(self, path: str, seconds: float = 1.0) -> None:
        """Write *seconds* of silence to *path* as a placeholder clip."""
        rate = self._SILENCE_SAMPLE_RATE
        sf.write(path, np.zeros(int(rate * seconds)), rate)

    def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
        """Convert text to speech using Spark TTS CLI

        Each non-blank turn is rendered by a ``cli.inference`` subprocess; a
        turn that fails or times out is replaced by one second of silence.

        Returns:
            ``(path_to_combined_audio, conversation_text)``.

        Raises:
            RuntimeError: if Spark TTS is unavailable or the conversion fails.
        """
        if not SPARK_AVAILABLE or not self.spark_model_dir:
            raise RuntimeError("Spark TTS not available")

        import sys  # local import: only needed to locate the interpreter

        try:
            output_dir = self._create_output_directory()
            audio_files = []

            # Create different voice characteristics for different speakers.
            # NOTE(review): "gender" is stored but never passed to the CLI.
            if language == "Korean":
                voice_configs = [
                    {"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 진행자입니다.", "gender": "female"},
                    {"prompt_text": "안녕하세요, 오늘 게스트로 참여하게 되어 기쁩니다.", "gender": "male"}
                ]
            else:
                voice_configs = [
                    {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
                    {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
                ]

            device_arg = "0" if torch.cuda.is_available() else "cpu"

            for i, turn in enumerate(conversation_json["conversation"]):
                text = turn["text"]
                if not text.strip():
                    continue

                # Use different voice config for each speaker
                voice_config = voice_configs[i % len(voice_configs)]

                output_name = f"spark_output_{i}.wav"
                output_file = os.path.join(output_dir, output_name)

                # Run the CLI with the interpreter running this process so the
                # same environment (and installed packages) is used.
                cmd = [
                    sys.executable or "python", "-m", "cli.inference",
                    "--text", text,
                    "--device", device_arg,
                    "--save_dir", output_dir,
                    "--model_dir", self.spark_model_dir,
                    "--prompt_text", voice_config["prompt_text"],
                    "--output_name", output_name
                ]

                succeeded = False
                try:
                    result = subprocess.run(
                        cmd,
                        capture_output=True,
                        text=True,
                        timeout=60,
                        cwd="."  # Make sure we're in the right directory
                    )
                    # A zero exit code without the expected file is a failure too.
                    if result.returncode == 0 and os.path.exists(output_file):
                        succeeded = True
                    else:
                        print(f"Spark TTS error for turn {i}: {result.stderr}")
                except subprocess.TimeoutExpired:
                    print(f"Spark TTS timeout for turn {i}")
                except Exception as e:
                    print(f"Error running Spark TTS for turn {i}: {e}")

                if not succeeded:
                    # Create a short silence as fallback
                    self._write_silence(output_file)
                audio_files.append(output_file)

            # Combine all audio files
            if not audio_files:
                raise RuntimeError("No audio files generated")
            final_output = os.path.join(output_dir, "spark_combined.wav")
            self._combine_audio_files(audio_files, final_output)

            return final_output, self._format_conversation(conversation_json)

        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}") from e

    def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
        """Convert text to speech using MeloTTS

        Returns:
            ``(path_to_mp3, conversation_text)``.

        Raises:
            RuntimeError: if MeloTTS is unavailable or not initialized.
        """
        if not MELO_AVAILABLE or not self.melo_models:
            raise RuntimeError("MeloTTS not available")

        speakers = ["EN-Default", "EN-US"]
        combined_audio = AudioSegment.empty()
        model = self.melo_models["EN"]

        for i, turn in enumerate(conversation_json["conversation"]):
            bio = io.BytesIO()
            speaker = speakers[i % 2]
            speaker_id = model.hps.data.spk2id[speaker]

            # Generate audio
            model.tts_to_file(
                turn["text"], speaker_id, bio, speed=1.0,
                pbar=progress.tqdm if progress else None,
                format="wav"
            )

            bio.seek(0)
            combined_audio += AudioSegment.from_file(bio, format="wav")

        # Save into a per-call directory so concurrent requests do not
        # overwrite each other's output.
        final_audio_path = os.path.join(self._create_output_directory(), "melo_podcast.mp3")
        combined_audio.export(final_audio_path, format="mp3")

        return final_audio_path, self._format_conversation(conversation_json)

    # ------------------------------------------------------------------
    # File helpers
    # ------------------------------------------------------------------
    def _create_output_directory(self) -> str:
        """Create a unique output directory

        The name is 16 hexadecimal characters, so it never starts with ``-``
        and is safe to pass as a command-line argument value.
        """
        folder_name = os.urandom(8).hex()
        os.makedirs(folder_name, exist_ok=True)
        return folder_name

    def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
        """Combine multiple audio files into one

        Input files are deleted after the combined file has been written.

        Raises:
            ValueError: if *filenames* is empty.
            RuntimeError: if none of the files exist or combining fails.
        """
        if not filenames:
            raise ValueError("No input files provided")

        try:
            audio_segments = [
                AudioSegment.from_file(filename)
                for filename in filenames
                if os.path.exists(filename)
            ]

            if not audio_segments:
                # Otherwise callers would receive a path that was never written.
                raise RuntimeError("none of the input files exist")

            combined = sum(audio_segments[1:], audio_segments[0])
            combined.export(output_file, format="wav")

            # Clean up temporary files
            for filename in filenames:
                if os.path.exists(filename):
                    os.remove(filename)

        except Exception as e:
            raise RuntimeError(f"Failed to combine audio files: {e}") from e
498
-
499
-
500
# Global converter instance shared by the request handlers below; it is
# constructed with the default ConversationConfig values.
converter = UnifiedAudioConverter(ConversationConfig())
502
-
503
-
504
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Fetch an article and turn it into an editable conversation script.

    Returns a ``(text, None)`` pair: the text is the generated conversation
    on success, otherwise a human-readable error message.  ``tts_engine`` is
    accepted for interface symmetry but is not used here.
    """
    if not article_url:
        return "Please provide a valid URL.", None

    try:
        # Download the article and cap its length at the configured word limit.
        source_text = converter.fetch_text(article_url)
        tokens = source_text.split()
        if len(tokens) > converter.config.max_words:
            source_text = " ".join(tokens[:converter.config.max_words])

        # Build the dialogue with whichever backend was requested.
        if mode != "API":
            converter.initialize_local_mode()
            dialogue = converter.extract_conversation_local(source_text, language)
        else:
            api_key = os.environ.get("TOGETHER_API_KEY")
            if not api_key:
                return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
            converter.initialize_api_mode(api_key)
            dialogue = converter.extract_conversation_api(source_text, language)

        # Render one "Speaker: text" line per turn.
        rendered_lines = []
        for i, turn in enumerate(dialogue["conversation"]):
            speaker = turn.get('speaker', f'Speaker {i+1}')
            rendered_lines.append(f"{speaker}: {turn['text']}")

        return "\n".join(rendered_lines), None

    except Exception as e:
        return f"Error: {str(e)}", None
539
-
540
-
541
async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Regenerate audio from edited conversation text

    Returns a ``(status_message, audio_path)`` pair; ``audio_path`` is
    ``None`` whenever no audio was produced.  Missing (``None``) or blank
    text is reported as a status message instead of raising.
    """
    if not conversation_text or not conversation_text.strip():
        return "Please provide conversation text.", None

    try:
        # Parse the conversation text back to JSON format
        conversation_json = converter.parse_conversation_text(conversation_text)

        if not conversation_json["conversation"]:
            return "No valid conversation found in the text.", None

        # Korean is only handled by Edge-TTS; every other engine is rejected
        # here, so the branches below never see Korean with another engine.
        if language == "Korean" and tts_engine != "Edge-TTS":
            return "한국어는 Edge-TTS만 지원됩니다. TTS 엔진이 자동으로 Edge-TTS로 변경됩니다.", None

        # Generate audio based on TTS engine
        if tts_engine == "Edge-TTS":
            output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
        elif tts_engine == "Spark-TTS":
            if not SPARK_AVAILABLE:
                return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
            converter.initialize_spark_tts()
            output_file, _ = converter.text_to_speech_spark(conversation_json, language)
        else:  # MeloTTS
            if not MELO_AVAILABLE:
                return "MeloTTS not available. Please install required dependencies.", None
            converter.initialize_melo_tts()
            output_file, _ = converter.text_to_speech_melo(conversation_json)

        return "Audio generated successfully!", output_file

    except Exception as e:
        return f"Error generating audio: {str(e)}", None
577
-
578
-
579
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Run the ``synthesize`` coroutine to completion and return its result."""
    pending = synthesize(article_url, mode, tts_engine, language)
    return asyncio.run(pending)
582
-
583
-
584
def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Run the ``regenerate_audio`` coroutine to completion and return its result."""
    pending = regenerate_audio(conversation_text, tts_engine, language)
    return asyncio.run(pending)
587
-
588
-
589
def update_tts_engine_for_korean(language):
    """Return the TTS engine selector that matches the chosen language.

    For Korean the selector offers only Edge-TTS and is locked; for any
    other language all three engines are offered and the selector is
    editable.  Edge-TTS is pre-selected in both cases.
    """
    korean_selected = language == "Korean"

    if korean_selected:
        engine_choices = ["Edge-TTS"]
        help_text = "한국어는 Edge-TTS만 지원됩니다"
    else:
        engine_choices = ["Edge-TTS", "Spark-TTS", "MeloTTS"]
        help_text = "Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"

    return gr.Radio(
        choices=engine_choices,
        value="Edge-TTS",
        label="TTS Engine",
        info=help_text,
        interactive=not korean_selected
    )
607
-
608
-
609
# Gradio Interface
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a diff view that had lost its indentation — confirm against the
# rendered page before relying on it.
with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
    gr.Markdown("# 🎙️ URL to Podcast Converter")
    gr.Markdown("Convert any article, blog, or news into an engaging podcast conversation!")

    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="Article URL",
                placeholder="Enter the article URL here...",
                value=""
            )
        with gr.Column(scale=1):
            # Output language selection
            language_selector = gr.Radio(
                choices=["English", "Korean"],
                value="English",
                label="Language / 언어",
                info="Select output language / 출력 언어를 선택하세요"
            )

            mode_selector = gr.Radio(
                choices=["API", "Local"],
                value="API",
                label="Processing Mode",
                info="API: Faster, requires API key | Local: Slower, runs on device"
            )

            # TTS engine selection
            with gr.Group():
                gr.Markdown("### TTS Engine Selection")
                tts_selector = gr.Radio(
                    choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
                    value="Edge-TTS",
                    label="TTS Engine",
                    info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
                )

                gr.Markdown("""
                **Recommended:**
                - 🌟 **Edge-TTS**: Best quality, cloud-based, instant setup
                - 🤖 **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning

                **Additional Option:**
                - ⚡ **MeloTTS**: Local processing, GPU recommended

                **한국어 지원:**
                - 🇰🇷 한국어 선택 시 Edge-TTS만 사용 가능합니다
                """)

    convert_btn = gr.Button("🎯 Generate Conversation / 대화 생성", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            conversation_output = gr.Textbox(
                label="Generated Conversation (Editable) / 생성된 대화 (편집 가능)",
                lines=15,
                max_lines=30,
                interactive=True,
                placeholder="Generated conversation will appear here. You can edit it before generating audio.\n생성된 대화가 여기에 표시됩니다. 오디오 생성 전에 편집할 수 있습니다.",
                info="Edit the conversation as needed. Format: 'Speaker Name: Text' / 필요에 따라 대화를 편집하세요. 형식: '화자 이름: 텍스트'"
            )

            # Button that renders the (possibly edited) script to audio
            with gr.Row():
                generate_audio_btn = gr.Button("🎙️ Generate Audio from Text / 텍스트에서 오디오 생성", variant="secondary", size="lg")
                gr.Markdown("*Edit the conversation above, then click to generate audio / 위의 대화를 편집한 후 클릭하여 오디오를 생성하세요*")

        with gr.Column():
            audio_output = gr.Audio(
                label="Podcast Audio / 팟캐스트 오디오",
                type="filepath",
                interactive=False
            )

            # Status message
            status_output = gr.Textbox(
                label="Status / 상태",
                interactive=False,
                visible=True
            )

    # Per-engine description and setup instructions.  The Korean male voice
    # is listed as Hyunsu to match the voice used for Edge TTS synthesis
    # ("ko-KR-HyunsuNeural").
    with gr.Row():
        gr.Markdown("""
        ### TTS Engine Details / TTS 엔진 상세정보:

        - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
          - 🇰🇷 **한국어 지원**: 자연스러운 한국어 음성 (여성: SunHi, 남성: Hyunsu)
        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
          - **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
          - Features: Bilingual support (Chinese/English), controllable speech generation
          - License: CC BY-NC-SA (Non-commercial use only)
          - ⚠️ **한국어 미지원**
        - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
          - ⚠️ **한국어 미지원**

        ### Spark-TTS Setup Instructions:
        ```bash
        git clone https://github.com/SparkAudio/Spark-TTS.git
        cd Spark-TTS
        pip install -r requirements.txt
        ```
        """)

    gr.Examples(
        examples=[
            ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS", "English"],
            ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS", "English"],
            ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS", "Korean"],
        ],
        inputs=[url_input, mode_selector, tts_selector, language_selector],
        outputs=[conversation_output, status_output],
        fn=synthesize_sync,
        cache_examples=False,
    )

    # Update the TTS engine options whenever the language changes
    language_selector.change(
        fn=update_tts_engine_for_korean,
        inputs=[language_selector],
        outputs=[tts_selector]
    )

    # Event wiring
    convert_btn.click(
        fn=synthesize_sync,
        inputs=[url_input, mode_selector, tts_selector, language_selector],
        outputs=[conversation_output, status_output]
    )

    generate_audio_btn.click(
        fn=regenerate_audio_sync,
        inputs=[conversation_output, tts_selector, language_selector],
        outputs=[status_output, audio_output]
    )
745
-
746
 
747
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    # Configure the request queue first, then launch on all interfaces.
    queued_demo = demo.queue(api_open=True, default_concurrency_limit=10)
    queued_demo.launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
 
 
1
  import os
2
+ import sys
3
+ import streamlit as st
4
+ from tempfile import NamedTemporaryFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
def main():
    """Load the application source from the ``MAIN_CODE`` variable and run it.

    The source is read from the ``MAIN_CODE`` environment variable, written
    to a temporary ``.py`` file whose path is used as the filename for
    ``compile``, and executed in this module's global namespace.  A missing
    variable or any error raised while loading or executing the code is
    reported through Streamlit instead of propagating.  The temporary file
    is removed on every exit path, including when the executed code raises.
    """
    tmp_path = None
    try:
        # Get the code from secrets
        code = os.environ.get("MAIN_CODE")

        if not code:
            st.error("⚠️ The application code wasn't found in secrets. Please add the MAIN_CODE secret.")
            return

        # Create a temporary Python file.  UTF-8 is requested explicitly so
        # non-ASCII source does not depend on the platform default encoding.
        with NamedTemporaryFile(suffix='.py', delete=False, mode='w', encoding='utf-8') as tmp:
            tmp_path = tmp.name
            tmp.write(code)

        # Execute the code.
        # SECURITY: this runs arbitrary source taken from the environment with
        # full access to this module's globals; MAIN_CODE must only ever be
        # set by a trusted deployer.
        exec(compile(code, tmp_path, 'exec'), globals())

    except Exception as e:
        st.error(f"⚠️ Error loading or executing the application: {str(e)}")
        import traceback
        st.code(traceback.format_exc())

    finally:
        # Clean up the temporary file.  Doing this in ``finally`` (after the
        # error report above) removes it even when the executed code raised.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best effort: the file may already be gone.
                pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
# Run the loader only when this file is executed directly as a script.
if __name__ == "__main__":
    main()