openfree committed on
Commit
49e7687
·
verified ·
1 Parent(s): 897d75d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +754 -0
app.py ADDED
@@ -0,0 +1,754 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import torch
5
+ import io
6
+ import json
7
+ import re
8
+ import httpx
9
+ import tempfile
10
+ import wave
11
+ import base64
12
+ import numpy as np
13
+ import soundfile as sf
14
+ import subprocess
15
+ import shutil
16
+ from dataclasses import dataclass
17
+ from typing import List, Tuple, Dict, Optional
18
+ from pathlib import Path
19
+ from threading import Thread
20
+ from dotenv import load_dotenv
21
+
22
+ # Edge TTS imports
23
+ import edge_tts
24
+ from pydub import AudioSegment
25
+
26
+ # OpenAI imports
27
+ from openai import OpenAI
28
+
29
+ # Transformers imports (for local mode)
30
+ from transformers import (
31
+ AutoModelForCausalLM,
32
+ AutoTokenizer,
33
+ TextIteratorStreamer,
34
+ BitsAndBytesConfig,
35
+ )
36
+
37
# Spark TTS imports. The flag records whether snapshot_download could be
# imported; other code checks it before using Spark TTS.
try:
    from huggingface_hub import snapshot_download
    SPARK_AVAILABLE = True
except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
    SPARK_AVAILABLE = False

# MeloTTS imports (for local mode). The unidic download command is run
# before melo.api is imported; any failure marks MeloTTS as unavailable.
try:
    os.system("python -m unidic download")
    from melo.api import TTS as MeloTTS
    MELO_AVAILABLE = True
except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
    MELO_AVAILABLE = False

load_dotenv()
53
+
54
+
55
@dataclass
class ConversationConfig:
    """Default settings used by the converter."""

    # Word cap applied to the fetched article text before prompting.
    max_words: int = 6000
    # Prefix prepended to the article URL when fetching its text.
    prefix_url: str = "https://r.jina.ai/"
    # Model identifier sent to the chat-completions API.
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    # Model identifier loaded when running in local mode.
    local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
61
+
62
+
63
+ class UnifiedAudioConverter:
64
def __init__(self, config: "ConversationConfig"):
    """Keep the configuration and start with every backend unloaded."""
    self.config = config
    # Use the GPU when torch reports one, otherwise the CPU.
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    # Clients and models are filled in later by the initialize_* methods.
    self.llm_client = None
    self.local_model = self.tokenizer = None
    self.melo_models = None
    self.spark_model_dir = None
72
+
73
def initialize_api_mode(self, api_key: str):
    """Create the OpenAI client pointed at the Together API endpoint."""
    together_endpoint = "https://api.together.xyz/v1"
    self.llm_client = OpenAI(base_url=together_endpoint, api_key=api_key)
76
+
77
def initialize_local_mode(self):
    """Load the local model and its tokenizer if they are not loaded yet.

    The model and the tokenizer are checked independently. Previously the
    tokenizer was only loaded together with the model, so a tokenizer
    failure after a successful model load left the converter permanently
    half-initialised; now a later call loads whichever part is missing.
    """
    if self.local_model is None:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
        self.local_model = AutoModelForCausalLM.from_pretrained(
            self.config.local_model_name,
            quantization_config=quantization_config,
        )

    if self.tokenizer is None:
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.local_model_name,
            revision='8ab73a6800796d84448bc936db9bac5ad9f984ae',
        )
92
+
93
def initialize_spark_tts(self):
    """Make the Spark-TTS model directory available, downloading it if absent.

    Raises:
        RuntimeError: if the Spark dependencies are unavailable or the
            download fails.
    """
    if not SPARK_AVAILABLE:
        raise RuntimeError("Spark TTS dependencies not available")

    weights_dir = "pretrained_models/Spark-TTS-0.5B"
    already_present = os.path.exists(weights_dir)

    # Download only when the directory is not there yet.
    if not already_present:
        print("Downloading Spark-TTS model...")
        try:
            os.makedirs("pretrained_models", exist_ok=True)
            snapshot_download("SparkAudio/Spark-TTS-0.5B", local_dir=weights_dir)
            print("Spark-TTS model downloaded successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to download Spark-TTS model: {e}")

    self.spark_model_dir = weights_dir

    # A missing CLI script is only reported; it does not abort initialisation.
    cli_script_found = os.path.exists("cli/inference.py")
    if not cli_script_found:
        print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
118
+
119
def initialize_melo_tts(self):
    """Create the English MeloTTS model once, when MeloTTS is importable."""
    if not MELO_AVAILABLE or self.melo_models is not None:
        return
    english_model = MeloTTS(language="EN", device=self.device)
    self.melo_models = {"EN": english_model}
123
+
124
def fetch_text(self, url: str) -> str:
    """Return the text served for ``url`` behind the configured prefix URL.

    Raises:
        ValueError: if ``url`` is empty or does not start with http(s)://.
        RuntimeError: if the HTTP request fails.
    """
    if not url:
        raise ValueError("URL cannot be empty")
    if not url.startswith(("http://", "https://")):
        raise ValueError("URL must start with 'http://' or 'https://'")

    # The article URL is appended to the prefix configured in self.config.
    target = f"{self.config.prefix_url}{url}"
    try:
        reply = httpx.get(target, timeout=60.0)
        reply.raise_for_status()
        return reply.text
    except httpx.HTTPError as e:
        raise RuntimeError(f"Failed to fetch URL: {e}")
139
+
140
+ def _build_prompt(self, text: str, language: str = "English") -> str:
141
+ """Build prompt for conversation generation"""
142
+ if language == "Korean":
143
+ template = """
144
+ {
145
+ "conversation": [
146
+ {"speaker": "", "text": ""},
147
+ {"speaker": "", "text": ""}
148
+ ]
149
+ }
150
+ """
151
+ return (
152
+ f"{text}\n\n์ œ๊ณต๋œ ํ…์ŠคํŠธ๋ฅผ ๋‘ ๋ช…์˜ ์ „๋ฌธ๊ฐ€ ๊ฐ„์˜ ์งง๊ณ  ์œ ์ตํ•˜๋ฉฐ ๋ช…ํ™•ํ•œ "
153
+ f"ํŒŸ์บ์ŠคํŠธ ๋Œ€ํ™”๋กœ ๋ณ€ํ™˜ํ•ด๏ฟฝ๏ฟฝ์„ธ์š”. ํ†ค์€ ์ „๋ฌธ์ ์ด๊ณ  ๋งค๋ ฅ์ ์ด์–ด์•ผ ํ•ฉ๋‹ˆ๋‹ค. "
154
+ f"๋‹ค์Œ ํ˜•์‹์„ ์ค€์ˆ˜ํ•˜๊ณ  JSON๋งŒ ๋ฐ˜ํ™˜ํ•ด์ฃผ์„ธ์š”:\n{template}"
155
+ )
156
+ else:
157
+ template = """
158
+ {
159
+ "conversation": [
160
+ {"speaker": "", "text": ""},
161
+ {"speaker": "", "text": ""}
162
+ ]
163
+ }
164
+ """
165
+ return (
166
+ f"{text}\n\nConvert the provided text into a short, informative and crisp "
167
+ f"podcast conversation between two experts. The tone should be "
168
+ f"professional and engaging. Please adhere to the following "
169
+ f"format and return ONLY the JSON:\n{template}"
170
+ )
171
+
172
def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
    """Ask the API model for a podcast conversation and return it as a dict.

    Args:
        text: Article text to convert.
        language: "Korean" selects the Korean system prompt; any other value
            selects the English one.

    Returns:
        The first JSON object in the reply that has a "conversation" list.

    Raises:
        RuntimeError: if API mode is not initialised, the request fails, or
            the reply contains no usable JSON object.
    """
    if not self.llm_client:
        raise RuntimeError("API mode not initialized")

    # Language-specific system prompt (the Korean text is restored from a
    # mis-encoded literal).
    if language == "Korean":
        system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
    else:
        system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

    try:
        chat_completion = self.llm_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": self._build_prompt(text, language)},
            ],
            model=self.config.model_name,
        )
        # The message content can be None; treat that as an empty reply.
        content = chat_completion.choices[0].message.content or ""

        # Parse from each "{" with a real JSON decoder. The previous regex
        # only balanced two brace levels and counted braces inside string
        # values, so dialogue containing "{" or "}" broke extraction.
        decoder = json.JSONDecoder()
        start = content.find("{")
        while start != -1:
            try:
                payload, _ = decoder.raw_decode(content, start)
            except json.JSONDecodeError:
                payload = None
            if isinstance(payload, dict) and isinstance(payload.get("conversation"), list):
                return payload
            start = content.find("{", start + 1)

        raise ValueError("No valid JSON found in response")
    except Exception as e:
        raise RuntimeError(f"Failed to extract conversation: {e}") from e
201
+
202
def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
    """Generate a podcast conversation with the local model.

    Args:
        text: Article text to convert.
        language: "Korean" selects the Korean system prompt and fallback;
            any other value selects the English ones.
        progress: Accepted for interface compatibility; not used here.

    Returns:
        The first JSON object in the generated text that has a
        "conversation" list, or a two-turn default conversation when the
        output contains none.

    Raises:
        RuntimeError: if the local model or tokenizer is not loaded.
    """
    if not self.local_model or not self.tokenizer:
        raise RuntimeError("Local mode not initialized")

    # Language-specific system message (Korean text restored from a
    # mis-encoded literal).
    if language == "Korean":
        system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
    else:
        system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

    chat = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": self._build_prompt(text, language)},
    ]

    # Drop ids that resolve to None so generate() only receives real token ids.
    # NOTE(review): whether the configured tokenizer defines "<|eot_id|>" is
    # not visible here - confirm.
    candidate_ids = [
        self.tokenizer.eos_token_id,
        self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    terminators = [token_id for token_id in candidate_ids if token_id is not None]

    messages = self.tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    model_inputs = self.tokenizer([messages], return_tensors="pt").to(self.device)

    streamer = TextIteratorStreamer(
        self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )

    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.9,
        eos_token_id=terminators or None,
    )

    # Generation runs in a worker thread while this thread drains the streamer.
    t = Thread(target=self.local_model.generate, kwargs=generate_kwargs)
    t.start()
    partial_text = "".join(streamer)
    t.join()

    # Parse from each "{" with a real JSON decoder. Malformed output now
    # falls through to the default conversation instead of raising.
    decoder = json.JSONDecoder()
    start = partial_text.find("{")
    while start != -1:
        try:
            payload, _ = decoder.raw_decode(partial_text, start)
        except json.JSONDecodeError:
            payload = None
        if isinstance(payload, dict) and isinstance(payload.get("conversation"), list):
            return payload
        start = partial_text.find("{", start + 1)

    # No usable JSON: return a default template based on language.
    if language == "Korean":
        return {
            "conversation": [
                {"speaker": "진행자", "text": "안녕하세요, 팟캐스트에 오신 것을 환영합니다."},
                {"speaker": "게스트", "text": "안녕하세요, 초대해 주셔서 감사합니다."},
            ]
        }
    return {
        "conversation": [
            {"speaker": "Host", "text": "Welcome to our podcast."},
            {"speaker": "Guest", "text": "Thank you for having me."},
        ]
    }
269
+
270
def parse_conversation_text(self, conversation_text: str) -> Dict:
    """Parse "Speaker: text" lines back into the conversation dict format.

    A line containing ":" starts a new turn, split at the first colon. A
    non-empty line without a colon is appended to the previous turn, so
    multi-line text is no longer silently dropped. Turns whose text ends up
    empty are omitted, because the TTS backends cannot synthesise them.

    Args:
        conversation_text: Conversation text, one turn per line.

    Returns:
        ``{"conversation": [{"speaker": ..., "text": ...}, ...]}``.
    """
    turns = []
    for raw_line in conversation_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        speaker, colon, spoken = line.partition(':')
        if colon:
            turns.append({"speaker": speaker.strip(), "text": spoken.strip()})
        elif turns:
            # Continuation of the previous speaker's text.
            turns[-1]["text"] = f"{turns[-1]['text']} {line}".strip()

    return {"conversation": [turn for turn in turns if turn["text"]]}
284
+
285
async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
    """Convert a conversation to one audio file using Edge TTS.

    Args:
        conversation_json: Dict with a "conversation" list of turns; each
            turn needs a "text" entry and may carry a "speaker" entry.
        language: "Korean" selects the Korean voices; any other value
            selects the English ones.

    Returns:
        ``(path_to_combined_audio, conversation_text)``.

    Raises:
        RuntimeError: if any step of the conversion fails.
    """
    output_dir = Path(self._create_output_directory())
    filenames = []

    try:
        # Voice pair per language; turns alternate between the two.
        if language == "Korean":
            voices = [
                "ko-KR-SunHiNeural",   # female voice
                "ko-KR-InJoonNeural",  # male voice
            ]
        else:
            voices = [
                "en-US-AvaMultilingualNeural",     # female voice
                "en-US-AndrewMultilingualNeural",  # male voice
            ]

        for i, turn in enumerate(conversation_json["conversation"]):
            filename = output_dir / f"output_{i}.wav"
            voice = voices[i % len(voices)]

            tmp_path = await self._generate_audio_edge(turn["text"], voice)
            # shutil.move instead of os.rename: the temp file may live on a
            # different filesystem from the output directory, where a plain
            # rename fails.
            shutil.move(tmp_path, str(filename))
            filenames.append(str(filename))

        # Combine audio files
        final_output = os.path.join(output_dir, "combined_output.wav")
        self._combine_audio_files(filenames, final_output)

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return final_output, conversation_text
    except Exception as e:
        raise RuntimeError(f"Failed to convert text to speech: {e}") from e
324
+
325
async def _generate_audio_edge(self, text: str, voice: str) -> str:
    """Synthesise ``text`` with Edge TTS and return the temp file path.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        voice: Voice name; anything after " - " is ignored.

    Returns:
        Path of a temporary file holding the audio. The caller owns it.

    Raises:
        ValueError: if ``text`` is empty or whitespace only.
    """
    if not text.strip():
        raise ValueError("Text cannot be empty")

    voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
    communicate = edge_tts.Communicate(text, voice_short_name)

    # Reserve a unique path, then close the handle before Edge TTS writes to
    # it, so the file is never written while still open here.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # Do not leave the temp file behind when synthesis fails or is
        # cancelled; the original error is re-raised unchanged.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    return tmp_path
338
+
339
def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
    """Convert a conversation to one audio file using the Spark TTS CLI.

    Each non-empty turn is synthesised by running ``cli.inference`` in a
    subprocess. A turn that fails, times out, or produces no file is
    replaced by one second of silence so the remaining turns still play.

    Args:
        conversation_json: Dict with a "conversation" list of turns; each
            turn needs a "text" entry and may carry a "speaker" entry.
        language: "Korean" selects the Korean prompt texts; any other value
            selects the English ones.
        progress: Accepted for interface compatibility; not used here.

    Returns:
        ``(path_to_combined_audio, conversation_text)``.

    Raises:
        RuntimeError: if Spark TTS is unavailable or conversion fails.
    """
    import sys  # local import: used only to locate the running interpreter

    if not SPARK_AVAILABLE or not self.spark_model_dir:
        raise RuntimeError("Spark TTS not available")

    def write_silence(path):
        """Write one second of silence at 22050 Hz as a fallback clip."""
        sf.write(path, np.zeros(int(22050 * 1.0)), 22050)

    try:
        output_dir = self._create_output_directory()
        audio_files = []

        # Alternating voice prompts for the two speakers. The "gender" value
        # is kept in the config but is not passed to the CLI by this method.
        if language == "Korean":
            voice_configs = [
                {"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 진행자입니다.", "gender": "female"},
                {"prompt_text": "안녕하세요, 오늘 게스트로 참여하게 되어 기쁩니다.", "gender": "male"},
            ]
        else:
            voice_configs = [
                {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
                {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"},
            ]

        for i, turn in enumerate(conversation_json["conversation"]):
            text = turn["text"]
            if not text.strip():
                continue

            voice_config = voice_configs[i % len(voice_configs)]
            output_name = f"spark_output_{i}.wav"
            output_file = os.path.join(output_dir, output_name)

            # Run the CLI with the interpreter running this app, so it sees
            # the same installed packages; fall back to "python" on PATH.
            cmd = [
                sys.executable or "python", "-m", "cli.inference",
                "--text", text,
                "--device", "0" if torch.cuda.is_available() else "cpu",
                "--save_dir", output_dir,
                "--model_dir", self.spark_model_dir,
                "--prompt_text", voice_config["prompt_text"],
                "--output_name", output_name,
            ]

            try:
                result = subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    cwd=".",  # the cli package is looked up from the working directory
                )
            except subprocess.TimeoutExpired:
                print(f"Spark TTS timeout for turn {i}")
                succeeded = False
            except Exception as e:
                print(f"Error running Spark TTS for turn {i}: {e}")
                succeeded = False
            else:
                succeeded = result.returncode == 0
                if not succeeded:
                    print(f"Spark TTS error for turn {i}: {result.stderr}")

            # A zero exit code does not prove the expected file was written,
            # so a missing file also gets the silence fallback.
            if not succeeded or not os.path.exists(output_file):
                write_silence(output_file)
            audio_files.append(output_file)

        # Combine all audio files
        if not audio_files:
            raise RuntimeError("No audio files generated")
        final_output = os.path.join(output_dir, "spark_combined.wav")
        self._combine_audio_files(audio_files, final_output)

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return final_output, conversation_text

    except Exception as e:
        raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}") from e
430
+
431
def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
    """Convert a conversation to one MP3 file using MeloTTS.

    Args:
        conversation_json: Dict with a "conversation" list of turns; each
            turn needs a "text" entry and may carry a "speaker" entry.
        progress: Optional object exposing ``tqdm``; when given it is passed
            to MeloTTS as the progress bar.

    Returns:
        ``(path_to_mp3, conversation_text)``.

    Raises:
        RuntimeError: if MeloTTS is unavailable or not initialised.
    """
    if not MELO_AVAILABLE or not self.melo_models:
        raise RuntimeError("MeloTTS not available")

    speakers = ["EN-Default", "EN-US"]
    model = self.melo_models["EN"]
    combined_audio = AudioSegment.empty()

    for i, turn in enumerate(conversation_json["conversation"]):
        bio = io.BytesIO()
        # Turns alternate between the two speaker ids.
        speaker_id = model.hps.data.spk2id[speakers[i % len(speakers)]]

        # Generate audio into the in-memory buffer
        model.tts_to_file(
            turn["text"], speaker_id, bio, speed=1.0,
            pbar=progress.tqdm if progress else None,
            format="wav",
        )

        bio.seek(0)
        combined_audio += AudioSegment.from_file(bio, format="wav")

    # Save into a per-request directory. The previous fixed path
    # "melo_podcast.mp3" was shared by all requests, so concurrent users
    # overwrote each other's audio.
    final_audio_path = os.path.join(self._create_output_directory(), "melo_podcast.mp3")
    combined_audio.export(final_audio_path, format="mp3")

    # Generate conversation text
    conversation_text = "\n".join(
        f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
        for i, turn in enumerate(conversation_json["conversation"])
    )

    return final_audio_path, conversation_text
467
+
468
def _create_output_directory(self) -> str:
    """Create a uniquely named output directory and return its relative path.

    The name is "podcast_" plus 16 random hex digits. The previous
    URL-safe base64 names could start with "-" and always ended in "=";
    a leading "-" is read as an option when the path is passed to a
    command line (as done for ``--save_dir``).
    """
    folder_name = f"podcast_{os.urandom(8).hex()}"
    os.makedirs(folder_name, exist_ok=True)
    return folder_name
474
+
475
def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
    """Concatenate the given audio files into ``output_file`` as WAV.

    Input files that exist are removed after a successful export.

    Args:
        filenames: Paths of the clips, in playback order.
        output_file: Destination path for the combined audio.

    Raises:
        ValueError: if ``filenames`` is empty.
        RuntimeError: if none of the inputs exist, or loading/exporting fails.
    """
    if not filenames:
        raise ValueError("No input files provided")

    try:
        existing = [name for name in filenames if os.path.exists(name)]
        if not existing:
            # Previously this case returned silently without writing any
            # output, and callers handed back a path to a missing file.
            raise FileNotFoundError("none of the input audio files exist")

        combined = sum(AudioSegment.from_file(name) for name in existing)
        combined.export(output_file, format="wav")

        # Clean up the per-turn files now that the combined file exists.
        for name in existing:
            os.remove(name)

    except Exception as e:
        raise RuntimeError(f"Failed to combine audio files: {e}") from e
498
+
499
+
500
+ # Global converter instance
501
+ converter = UnifiedAudioConverter(ConversationConfig())
502
+
503
+
504
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Fetch an article and turn it into an editable conversation script.

    Args:
        article_url: URL of the article to convert.
        mode: "API" uses the Together API; any other value uses the local model.
        tts_engine: Accepted for interface compatibility; not used here.
        language: Language passed to the conversation generator.

    Returns:
        ``(conversation_text, status_message)``. On success the status is
        ``None``. On failure the conversation is an empty string and the
        status carries the message, so an error can never be mistaken for a
        "Speaker: text" line and sent to speech synthesis.
    """
    if not article_url:
        return "", "Please provide a valid URL."

    try:
        # Fetch text from URL
        text = converter.fetch_text(article_url)

        # Limit text to max words
        words = text.split()
        word_cap = converter.config.max_words
        if len(words) > word_cap:
            text = " ".join(words[:word_cap])

        # Extract conversation based on mode
        if mode == "API":
            api_key = os.environ.get("TOGETHER_API_KEY")
            if not api_key:
                return "", "API key not found. Please set TOGETHER_API_KEY environment variable."
            converter.initialize_api_mode(api_key)
            conversation_json = converter.extract_conversation_api(text, language)
        else:  # Local mode
            converter.initialize_local_mode()
            conversation_json = converter.extract_conversation_local(text, language)

        # Render one "Speaker: text" line per turn
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return conversation_text, None

    except Exception as e:
        return "", f"Error: {str(e)}"
539
+
540
+
541
+ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
542
+ """Regenerate audio from edited conversation text"""
543
+ if not conversation_text.strip():
544
+ return "Please provide conversation text.", None
545
+
546
+ try:
547
+ # Parse the conversation text back to JSON format
548
+ conversation_json = converter.parse_conversation_text(conversation_text)
549
+
550
+ if not conversation_json["conversation"]:
551
+ return "No valid conversation found in the text.", None
552
+
553
+ # ํ•œ๊ตญ์–ด์ธ ๊ฒฝ์šฐ Edge-TTS๋งŒ ์‚ฌ์šฉ (๋‹ค๋ฅธ TTS๋Š” ํ•œ๊ตญ์–ด ์ง€์›์ด ์ œํ•œ์ )
554
+ if language == "Korean" and tts_engine != "Edge-TTS":
555
+ return "ํ•œ๊ตญ์–ด๋Š” Edge-TTS๋งŒ ์ง€์›๋ฉ๋‹ˆ๋‹ค. TTS ์—”์ง„์ด ์ž๋™์œผ๋กœ Edge-TTS๋กœ ๋ณ€๊ฒฝ๋ฉ๋‹ˆ๋‹ค.", None
556
+
557
+ # Generate audio based on TTS engine
558
+ if tts_engine == "Edge-TTS":
559
+ output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
560
+ elif tts_engine == "Spark-TTS":
561
+ if not SPARK_AVAILABLE:
562
+ return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
563
+ converter.initialize_spark_tts()
564
+ output_file, _ = converter.text_to_speech_spark(conversation_json, language)
565
+ else: # MeloTTS
566
+ if not MELO_AVAILABLE:
567
+ return "MeloTTS not available. Please install required dependencies.", None
568
+ if language == "Korean":
569
+ return "MeloTTS does not support Korean. Please use Edge-TTS for Korean.", None
570
+ converter.initialize_melo_tts()
571
+ output_file, _ = converter.text_to_speech_melo(conversation_json)
572
+
573
+ return "Audio generated successfully!", output_file
574
+
575
+ except Exception as e:
576
+ return f"Error generating audio: {str(e)}", None
577
+
578
+
579
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Blocking entry point: run the ``synthesize`` coroutine to completion."""
    pending = synthesize(article_url, mode, tts_engine, language)
    return asyncio.run(pending)
582
+
583
+
584
def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Blocking entry point: run the ``regenerate_audio`` coroutine to completion."""
    pending = regenerate_audio(conversation_text, tts_engine, language)
    return asyncio.run(pending)
587
+
588
+
589
+ def update_tts_engine_for_korean(language):
590
+ """ํ•œ๊ตญ์–ด ์„ ํƒ ์‹œ TTS ์—”์ง„ ์˜ต์…˜ ์—…๋ฐ์ดํŠธ"""
591
+ if language == "Korean":
592
+ return gr.Radio(
593
+ choices=["Edge-TTS"],
594
+ value="Edge-TTS",
595
+ label="TTS Engine",
596
+ info="ํ•œ๊ตญ์–ด๋Š” Edge-TTS๋งŒ ์ง€์›๋ฉ๋‹ˆ๋‹ค",
597
+ interactive=False
598
+ )
599
+ else:
600
+ return gr.Radio(
601
+ choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
602
+ value="Edge-TTS",
603
+ label="TTS Engine",
604
+ info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
605
+ interactive=True
606
+ )
607
+
608
+
609
+ # Gradio Interface
610
+ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
611
+ gr.Markdown("# ๐ŸŽ™๏ธ URL to Podcast Converter")
612
+ gr.Markdown("Convert any article, blog, or news into an engaging podcast conversation!")
613
+
614
+ with gr.Row():
615
+ with gr.Column(scale=3):
616
+ url_input = gr.Textbox(
617
+ label="Article URL",
618
+ placeholder="Enter the article URL here...",
619
+ value=""
620
+ )
621
+ with gr.Column(scale=1):
622
+ # ์–ธ์–ด ์„ ํƒ ์ถ”๊ฐ€
623
+ language_selector = gr.Radio(
624
+ choices=["English", "Korean"],
625
+ value="English",
626
+ label="Language / ์–ธ์–ด",
627
+ info="Select output language / ์ถœ๋ ฅ ์–ธ์–ด๋ฅผ ์„ ํƒํ•˜์„ธ์š”"
628
+ )
629
+
630
+ mode_selector = gr.Radio(
631
+ choices=["API", "Local"],
632
+ value="API",
633
+ label="Processing Mode",
634
+ info="API: Faster, requires API key | Local: Slower, runs on device"
635
+ )
636
+
637
+ # TTS ์—”์ง„ ์„ ํƒ
638
+ with gr.Group():
639
+ gr.Markdown("### TTS Engine Selection")
640
+ tts_selector = gr.Radio(
641
+ choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
642
+ value="Edge-TTS",
643
+ label="TTS Engine",
644
+ info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
645
+ )
646
+
647
+ gr.Markdown("""
648
+ **Recommended:**
649
+ - ๐ŸŒŸ **Edge-TTS**: Best quality, cloud-based, instant setup
650
+ - ๐Ÿค– **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning
651
+
652
+ **Additional Option:**
653
+ - โšก **MeloTTS**: Local processing, GPU recommended
654
+
655
+ **ํ•œ๊ตญ์–ด ์ง€์›:**
656
+ - ๐Ÿ‡ฐ๐Ÿ‡ท ํ•œ๊ตญ์–ด ์„ ํƒ ์‹œ Edge-TTS๋งŒ ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค
657
+ """)
658
+
659
+ convert_btn = gr.Button("๐ŸŽฏ Generate Conversation / ๋Œ€ํ™” ์ƒ์„ฑ", variant="primary", size="lg")
660
+
661
+ with gr.Row():
662
+ with gr.Column():
663
+ conversation_output = gr.Textbox(
664
+ label="Generated Conversation (Editable) / ์ƒ์„ฑ๋œ ๋Œ€ํ™” (ํŽธ์ง‘ ๊ฐ€๋Šฅ)",
665
+ lines=15,
666
+ max_lines=30,
667
+ interactive=True,
668
+ placeholder="Generated conversation will appear here. You can edit it before generating audio.\n์ƒ์„ฑ๋œ ๋Œ€ํ™”๊ฐ€ ์—ฌ๊ธฐ์— ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค. ์˜ค๋””์˜ค ์ƒ์„ฑ ์ „์— ํŽธ์ง‘ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.",
669
+ info="Edit the conversation as needed. Format: 'Speaker Name: Text' / ํ•„์š”์— ๋”ฐ๋ผ ๋Œ€ํ™”๋ฅผ ํŽธ์ง‘ํ•˜์„ธ์š”. ํ˜•์‹: 'ํ™”์ž ์ด๋ฆ„: ํ…์ŠคํŠธ'"
670
+ )
671
+
672
+ # ์˜ค๋””์˜ค ์ƒ์„ฑ ๋ฒ„ํŠผ ์ถ”๊ฐ€
673
+ with gr.Row():
674
+ generate_audio_btn = gr.Button("๐ŸŽ™๏ธ Generate Audio from Text / ํ…์ŠคํŠธ์—์„œ ์˜ค๋””์˜ค ์ƒ์„ฑ", variant="secondary", size="lg")
675
+ gr.Markdown("*Edit the conversation above, then click to generate audio / ์œ„์˜ ๋Œ€ํ™”๋ฅผ ํŽธ์ง‘ํ•œ ํ›„ ํด๋ฆญํ•˜์—ฌ ์˜ค๋””์˜ค๋ฅผ ์ƒ์„ฑํ•˜์„ธ์š”*")
676
+
677
+ with gr.Column():
678
+ audio_output = gr.Audio(
679
+ label="Podcast Audio / ํŒŸ์บ์ŠคํŠธ ์˜ค๋””์˜ค",
680
+ type="filepath",
681
+ interactive=False
682
+ )
683
+
684
+ # ์ƒํƒœ ๋ฉ”์‹œ์ง€ ์ถ”๊ฐ€
685
+ status_output = gr.Textbox(
686
+ label="Status / ์ƒํƒœ",
687
+ interactive=False,
688
+ visible=True
689
+ )
690
+
691
+ # TTS ์—”์ง„๋ณ„ ์„ค๋ช… ๋ฐ ์„ค์น˜ ์•ˆ๋‚ด ์ถ”๊ฐ€
692
+ with gr.Row():
693
+ gr.Markdown("""
694
+ ### TTS Engine Details / TTS ์—”์ง„ ์ƒ์„ธ์ •๋ณด:
695
+
696
+ - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
697
+ - ๐Ÿ‡ฐ๐Ÿ‡ท **ํ•œ๊ตญ์–ด ์ง€์›**: ์ž์—ฐ์Šค๋Ÿฌ์šด ํ•œ๊ตญ์–ด ์Œ์„ฑ (์—ฌ์„ฑ: SunHi, ๋‚จ์„ฑ: InJoon)
698
+ - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
699
+ - **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
700
+ - Features: Bilingual support (Chinese/English), controllable speech generation
701
+ - License: CC BY-NC-SA (Non-commercial use only)
702
+ - โš ๏ธ **ํ•œ๊ตญ์–ด ๋ฏธ์ง€์›**
703
+ - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
704
+ - โš ๏ธ **ํ•œ๊ตญ์–ด ๋ฏธ์ง€์›**
705
+
706
+ ### Spark-TTS Setup Instructions:
707
+ ```bash
708
+ git clone https://github.com/SparkAudio/Spark-TTS.git
709
+ cd Spark-TTS
710
+ pip install -r requirements.txt
711
+ ```
712
+ """)
713
+
714
+ gr.Examples(
715
+ examples=[
716
+ ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS", "English"],
717
+ ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS", "English"],
718
+ ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS", "Korean"],
719
+ ],
720
+ inputs=[url_input, mode_selector, tts_selector, language_selector],
721
+ outputs=[conversation_output, status_output],
722
+ fn=synthesize_sync,
723
+ cache_examples=False,
724
+ )
725
+
726
+ # ์–ธ์–ด ๋ณ€๊ฒฝ ์‹œ TTS ์—”์ง„ ์˜ต์…˜ ์—…๋ฐ์ดํŠธ
727
+ language_selector.change(
728
+ fn=update_tts_engine_for_korean,
729
+ inputs=[language_selector],
730
+ outputs=[tts_selector]
731
+ )
732
+
733
+ # ์ด๋ฒคํŠธ ์—ฐ๊ฒฐ
734
+ convert_btn.click(
735
+ fn=synthesize_sync,
736
+ inputs=[url_input, mode_selector, tts_selector, language_selector],
737
+ outputs=[conversation_output, status_output]
738
+ )
739
+
740
+ generate_audio_btn.click(
741
+ fn=regenerate_audio_sync,
742
+ inputs=[conversation_output, tts_selector, language_selector],
743
+ outputs=[status_output, audio_output]
744
+ )
745
+
746
+
747
+ # Launch the app
748
if __name__ == "__main__":
    # Enable the request queue (API open, default concurrency limit 10),
    # then serve on all interfaces at port 7860 without a share link.
    queued_app = demo.queue(api_open=True, default_concurrency_limit=10)
    queued_app.launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
    )