Spaces:
Running
on
Zero
Running
on
Zero
Update app-backup.py
Browse files- app-backup.py +165 -66
app-backup.py
CHANGED
@@ -137,31 +137,55 @@ class UnifiedAudioConverter:
|
|
137 |
except httpx.HTTPError as e:
|
138 |
raise RuntimeError(f"Failed to fetch URL: {e}")
|
139 |
|
140 |
-
def _build_prompt(self, text: str) -> str:
|
141 |
"""Build prompt for conversation generation"""
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
-
def extract_conversation_api(self, text: str) -> Dict:
|
158 |
"""Extract conversation using API"""
|
159 |
if not self.llm_client:
|
160 |
raise RuntimeError("API mode not initialized")
|
161 |
|
162 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
chat_completion = self.llm_client.chat.completions.create(
|
164 |
-
messages=[
|
|
|
|
|
|
|
165 |
model=self.config.model_name,
|
166 |
)
|
167 |
|
@@ -175,15 +199,21 @@ class UnifiedAudioConverter:
|
|
175 |
except Exception as e:
|
176 |
raise RuntimeError(f"Failed to extract conversation: {e}")
|
177 |
|
178 |
-
def extract_conversation_local(self, text: str, progress=None) -> Dict:
|
179 |
"""Extract conversation using local model"""
|
180 |
if not self.local_model or not self.tokenizer:
|
181 |
raise RuntimeError("Local mode not initialized")
|
182 |
|
183 |
-
|
184 |
-
|
185 |
-
"
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
|
188 |
terminators = [
|
189 |
self.tokenizer.eos_token_id,
|
@@ -221,13 +251,21 @@ class UnifiedAudioConverter:
|
|
221 |
if json_match:
|
222 |
return json.loads(json_match.group())
|
223 |
else:
|
224 |
-
# Return a default template
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
def parse_conversation_text(self, conversation_text: str) -> Dict:
|
233 |
"""Parse conversation text back to JSON format"""
|
@@ -244,15 +282,27 @@ class UnifiedAudioConverter:
|
|
244 |
|
245 |
return conversation_data
|
246 |
|
247 |
-
async def text_to_speech_edge(self, conversation_json: Dict,
|
248 |
"""Convert text to speech using Edge TTS"""
|
249 |
output_dir = Path(self._create_output_directory())
|
250 |
filenames = []
|
251 |
|
252 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
253 |
for i, turn in enumerate(conversation_json["conversation"]):
|
254 |
filename = output_dir / f"output_{i}.wav"
|
255 |
-
voice =
|
256 |
|
257 |
tmp_path = await self._generate_audio_edge(turn["text"], voice)
|
258 |
os.rename(tmp_path, filename)
|
@@ -286,7 +336,7 @@ class UnifiedAudioConverter:
|
|
286 |
|
287 |
return tmp_path
|
288 |
|
289 |
-
def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
|
290 |
"""Convert text to speech using Spark TTS CLI"""
|
291 |
if not SPARK_AVAILABLE or not self.spark_model_dir:
|
292 |
raise RuntimeError("Spark TTS not available")
|
@@ -296,10 +346,16 @@ class UnifiedAudioConverter:
|
|
296 |
audio_files = []
|
297 |
|
298 |
# Create different voice characteristics for different speakers
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
303 |
|
304 |
for i, turn in enumerate(conversation_json["conversation"]):
|
305 |
text = turn["text"]
|
@@ -445,7 +501,7 @@ class UnifiedAudioConverter:
|
|
445 |
converter = UnifiedAudioConverter(ConversationConfig())
|
446 |
|
447 |
|
448 |
-
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
|
449 |
"""Main synthesis function"""
|
450 |
if not article_url:
|
451 |
return "Please provide a valid URL.", None
|
@@ -465,10 +521,10 @@ async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edg
|
|
465 |
if not api_key:
|
466 |
return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
|
467 |
converter.initialize_api_mode(api_key)
|
468 |
-
conversation_json = converter.extract_conversation_api(text)
|
469 |
else: # Local mode
|
470 |
converter.initialize_local_mode()
|
471 |
-
conversation_json = converter.extract_conversation_local(text)
|
472 |
|
473 |
# Generate conversation text
|
474 |
conversation_text = "\n".join(
|
@@ -482,7 +538,7 @@ async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edg
|
|
482 |
return f"Error: {str(e)}", None
|
483 |
|
484 |
|
485 |
-
async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS"):
|
486 |
"""Regenerate audio from edited conversation text"""
|
487 |
if not conversation_text.strip():
|
488 |
return "Please provide conversation text.", None
|
@@ -494,21 +550,23 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
|
|
494 |
if not conversation_json["conversation"]:
|
495 |
return "No valid conversation found in the text.", None
|
496 |
|
|
|
|
|
|
|
|
|
497 |
# Generate audio based on TTS engine
|
498 |
if tts_engine == "Edge-TTS":
|
499 |
-
output_file, _ = await converter.text_to_speech_edge(
|
500 |
-
conversation_json,
|
501 |
-
"en-US-AvaMultilingualNeural",
|
502 |
-
"en-US-AndrewMultilingualNeural"
|
503 |
-
)
|
504 |
elif tts_engine == "Spark-TTS":
|
505 |
if not SPARK_AVAILABLE:
|
506 |
return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
|
507 |
converter.initialize_spark_tts()
|
508 |
-
output_file, _ = converter.text_to_speech_spark(conversation_json)
|
509 |
else: # MeloTTS
|
510 |
if not MELO_AVAILABLE:
|
511 |
return "MeloTTS not available. Please install required dependencies.", None
|
|
|
|
|
512 |
converter.initialize_melo_tts()
|
513 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
514 |
|
@@ -518,14 +576,34 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
|
|
518 |
return f"Error generating audio: {str(e)}", None
|
519 |
|
520 |
|
521 |
-
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
    """Blocking wrapper that runs the async ``synthesize`` coroutine.

    Forwards all three arguments unchanged and returns whatever
    ``synthesize`` returns once ``asyncio.run`` has driven it to completion.
    """
    pending = synthesize(article_url, mode, tts_engine)
    return asyncio.run(pending)
|
524 |
|
525 |
|
526 |
-
def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS"):
    """Blocking wrapper that runs the async ``regenerate_audio`` coroutine.

    Forwards both arguments unchanged and returns whatever
    ``regenerate_audio`` returns once ``asyncio.run`` has driven it to
    completion.
    """
    pending = regenerate_audio(conversation_text, tts_engine)
    return asyncio.run(pending)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
529 |
|
530 |
|
531 |
# Gradio Interface
|
@@ -541,6 +619,14 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
541 |
value=""
|
542 |
)
|
543 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
544 |
mode_selector = gr.Radio(
|
545 |
choices=["API", "Local"],
|
546 |
value="API",
|
@@ -548,7 +634,7 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
548 |
info="API: Faster, requires API key | Local: Slower, runs on device"
|
549 |
)
|
550 |
|
551 |
-
# TTS engine selection
|
552 |
with gr.Group():
|
553 |
gr.Markdown("### TTS Engine Selection")
|
554 |
tts_selector = gr.Radio(
|
@@ -565,36 +651,39 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
565 |
|
566 |
**Additional Option:**
|
567 |
- โก **MeloTTS**: Local processing, GPU recommended
|
|
|
|
|
|
|
568 |
""")
|
569 |
|
570 |
-
convert_btn = gr.Button("๐ฏ Generate Conversation", variant="primary", size="lg")
|
571 |
|
572 |
with gr.Row():
|
573 |
with gr.Column():
|
574 |
conversation_output = gr.Textbox(
|
575 |
-
label="Generated Conversation (Editable)",
|
576 |
lines=15,
|
577 |
max_lines=30,
|
578 |
interactive=True,
|
579 |
-
placeholder="Generated conversation will appear here. You can edit it before generating audio
|
580 |
-
info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
|
581 |
)
|
582 |
|
583 |
# Add the audio generation button
|
584 |
with gr.Row():
|
585 |
-
generate_audio_btn = gr.Button("๐๏ธ Generate Audio from Text", variant="secondary", size="lg")
|
586 |
-
gr.Markdown("*Edit the conversation above, then click to generate audio
|
587 |
|
588 |
with gr.Column():
|
589 |
audio_output = gr.Audio(
|
590 |
-
label="Podcast Audio",
|
591 |
type="filepath",
|
592 |
interactive=False
|
593 |
)
|
594 |
|
595 |
# Add the status message box
|
596 |
status_output = gr.Textbox(
|
597 |
-
label="Status",
|
598 |
interactive=False,
|
599 |
visible=True
|
600 |
)
|
@@ -602,14 +691,17 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
602 |
# Add per-engine TTS descriptions and installation guide
|
603 |
with gr.Row():
|
604 |
gr.Markdown("""
|
605 |
-
### TTS Engine Details
|
606 |
|
607 |
- **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
|
|
|
608 |
- **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
|
609 |
- **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
|
610 |
- Features: Bilingual support (Chinese/English), controllable speech generation
|
611 |
- License: CC BY-NC-SA (Non-commercial use only)
|
|
|
612 |
- **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
|
|
|
613 |
|
614 |
### Spark-TTS Setup Instructions:
|
615 |
```bash
|
@@ -621,26 +713,33 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
|
|
621 |
|
622 |
gr.Examples(
|
623 |
examples=[
|
624 |
-
["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
|
625 |
-
["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
|
626 |
-
["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
|
627 |
],
|
628 |
-
inputs=[url_input, mode_selector, tts_selector],
|
629 |
outputs=[conversation_output, status_output],
|
630 |
fn=synthesize_sync,
|
631 |
cache_examples=False,
|
632 |
)
|
633 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
634 |
# Wire up events
|
635 |
convert_btn.click(
|
636 |
fn=synthesize_sync,
|
637 |
-
inputs=[url_input, mode_selector, tts_selector],
|
638 |
outputs=[conversation_output, status_output]
|
639 |
)
|
640 |
|
641 |
generate_audio_btn.click(
|
642 |
fn=regenerate_audio_sync,
|
643 |
-
inputs=[conversation_output, tts_selector],
|
644 |
outputs=[status_output, audio_output]
|
645 |
)
|
646 |
|
|
|
137 |
except httpx.HTTPError as e:
|
138 |
raise RuntimeError(f"Failed to fetch URL: {e}")
|
139 |
|
140 |
+
def _build_prompt(self, text: str, language: str = "English") -> str:
|
141 |
"""Build prompt for conversation generation"""
|
142 |
+
if language == "Korean":
|
143 |
+
template = """
|
144 |
+
{
|
145 |
+
"conversation": [
|
146 |
+
{"speaker": "", "text": ""},
|
147 |
+
{"speaker": "", "text": ""}
|
148 |
+
]
|
149 |
+
}
|
150 |
+
"""
|
151 |
+
return (
|
152 |
+
f"{text}\n\n์ ๊ณต๋ ํ
์คํธ๋ฅผ ๋ ๋ช
์ ์ ๋ฌธ๊ฐ ๊ฐ์ ์งง๊ณ ์ ์ตํ๋ฉฐ ๋ช
ํํ "
|
153 |
+
f"ํ์บ์คํธ ๋ํ๋ก ๋ณํํด์ฃผ์ธ์. ํค์ ์ ๋ฌธ์ ์ด๊ณ ๋งค๋ ฅ์ ์ด์ด์ผ ํฉ๋๋ค. "
|
154 |
+
f"๋ค์ ํ์์ ์ค์ํ๊ณ JSON๋ง ๋ฐํํด์ฃผ์ธ์:\n{template}"
|
155 |
+
)
|
156 |
+
else:
|
157 |
+
template = """
|
158 |
+
{
|
159 |
+
"conversation": [
|
160 |
+
{"speaker": "", "text": ""},
|
161 |
+
{"speaker": "", "text": ""}
|
162 |
+
]
|
163 |
+
}
|
164 |
+
"""
|
165 |
+
return (
|
166 |
+
f"{text}\n\nConvert the provided text into a short, informative and crisp "
|
167 |
+
f"podcast conversation between two experts. The tone should be "
|
168 |
+
f"professional and engaging. Please adhere to the following "
|
169 |
+
f"format and return ONLY the JSON:\n{template}"
|
170 |
+
)
|
171 |
|
172 |
+
def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
|
173 |
"""Extract conversation using API"""
|
174 |
if not self.llm_client:
|
175 |
raise RuntimeError("API mode not initialized")
|
176 |
|
177 |
try:
|
178 |
+
# Build the language-specific prompt
|
179 |
+
if language == "Korean":
|
180 |
+
system_message = "๋น์ ์ ํ๊ตญ์ด๋ก ํ์บ์คํธ ๋ํ๋ฅผ ์์ฑํ๋ ์ ๋ฌธ๊ฐ์
๋๋ค. ์์ฐ์ค๋ฝ๊ณ ์ ์ตํ ํ๊ตญ์ด ๋ํ๋ฅผ ๋ง๋ค์ด์ฃผ์ธ์."
|
181 |
+
else:
|
182 |
+
system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."
|
183 |
+
|
184 |
chat_completion = self.llm_client.chat.completions.create(
|
185 |
+
messages=[
|
186 |
+
{"role": "system", "content": system_message},
|
187 |
+
{"role": "user", "content": self._build_prompt(text, language)}
|
188 |
+
],
|
189 |
model=self.config.model_name,
|
190 |
)
|
191 |
|
|
|
199 |
except Exception as e:
|
200 |
raise RuntimeError(f"Failed to extract conversation: {e}")
|
201 |
|
202 |
+
def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
|
203 |
"""Extract conversation using local model"""
|
204 |
if not self.local_model or not self.tokenizer:
|
205 |
raise RuntimeError("Local mode not initialized")
|
206 |
|
207 |
+
# Language-specific system message
|
208 |
+
if language == "Korean":
|
209 |
+
system_message = "๋น์ ์ ํ๊ตญ์ด๋ก ํ์บ์คํธ ๋ํ๋ฅผ ์์ฑํ๋ ์ ๋ฌธ๊ฐ์
๋๋ค. ์์ฐ์ค๋ฝ๊ณ ์ ์ตํ ํ๊ตญ์ด ๋ํ๋ฅผ ๋ง๋ค์ด์ฃผ์ธ์."
|
210 |
+
else:
|
211 |
+
system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."
|
212 |
+
|
213 |
+
chat = [
|
214 |
+
{"role": "system", "content": system_message},
|
215 |
+
{"role": "user", "content": self._build_prompt(text, language)}
|
216 |
+
]
|
217 |
|
218 |
terminators = [
|
219 |
self.tokenizer.eos_token_id,
|
|
|
251 |
if json_match:
|
252 |
return json.loads(json_match.group())
|
253 |
else:
|
254 |
+
# Return a default template based on language
|
255 |
+
if language == "Korean":
|
256 |
+
return {
|
257 |
+
"conversation": [
|
258 |
+
{"speaker": "์งํ์", "text": "์๋
ํ์ธ์, ํ์บ์คํธ์ ์ค์ ๊ฒ์ ํ์ํฉ๋๋ค."},
|
259 |
+
{"speaker": "๊ฒ์คํธ", "text": "์๋
ํ์ธ์, ์ด๋ํด ์ฃผ์
์ ๊ฐ์ฌํฉ๋๋ค."}
|
260 |
+
]
|
261 |
+
}
|
262 |
+
else:
|
263 |
+
return {
|
264 |
+
"conversation": [
|
265 |
+
{"speaker": "Host", "text": "Welcome to our podcast."},
|
266 |
+
{"speaker": "Guest", "text": "Thank you for having me."}
|
267 |
+
]
|
268 |
+
}
|
269 |
|
270 |
def parse_conversation_text(self, conversation_text: str) -> Dict:
|
271 |
"""Parse conversation text back to JSON format"""
|
|
|
282 |
|
283 |
return conversation_data
|
284 |
|
285 |
+
async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
|
286 |
"""Convert text to speech using Edge TTS"""
|
287 |
output_dir = Path(self._create_output_directory())
|
288 |
filenames = []
|
289 |
|
290 |
try:
|
291 |
+
# Language-specific voice settings
|
292 |
+
if language == "Korean":
|
293 |
+
voices = [
|
294 |
+
"ko-KR-SunHiNeural", # ์ฌ์ฑ ์์ฑ (์์ฐ์ค๋ฌ์ด ํ๊ตญ์ด)
|
295 |
+
"ko-KR-InJoonNeural" # ๋จ์ฑ ์์ฑ (์์ฐ์ค๋ฌ์ด ํ๊ตญ์ด)
|
296 |
+
]
|
297 |
+
else:
|
298 |
+
voices = [
|
299 |
+
"en-US-AvaMultilingualNeural", # ์ฌ์ฑ ์์ฑ
|
300 |
+
"en-US-AndrewMultilingualNeural" # ๋จ์ฑ ์์ฑ
|
301 |
+
]
|
302 |
+
|
303 |
for i, turn in enumerate(conversation_json["conversation"]):
|
304 |
filename = output_dir / f"output_{i}.wav"
|
305 |
+
voice = voices[i % len(voices)]
|
306 |
|
307 |
tmp_path = await self._generate_audio_edge(turn["text"], voice)
|
308 |
os.rename(tmp_path, filename)
|
|
|
336 |
|
337 |
return tmp_path
|
338 |
|
339 |
+
def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
|
340 |
"""Convert text to speech using Spark TTS CLI"""
|
341 |
if not SPARK_AVAILABLE or not self.spark_model_dir:
|
342 |
raise RuntimeError("Spark TTS not available")
|
|
|
346 |
audio_files = []
|
347 |
|
348 |
# Create different voice characteristics for different speakers
|
349 |
+
if language == "Korean":
|
350 |
+
voice_configs = [
|
351 |
+
{"prompt_text": "์๋
ํ์ธ์, ์ค๋ ํ์บ์คํธ ์งํ์ ๋งก์ ์งํ์์
๋๋ค.", "gender": "female"},
|
352 |
+
{"prompt_text": "์๋
ํ์ธ์, ์ค๋ ๊ฒ์คํธ๋ก ์ฐธ์ฌํ๊ฒ ๋์ด ๊ธฐ์ฉ๋๋ค.", "gender": "male"}
|
353 |
+
]
|
354 |
+
else:
|
355 |
+
voice_configs = [
|
356 |
+
{"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
|
357 |
+
{"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
|
358 |
+
]
|
359 |
|
360 |
for i, turn in enumerate(conversation_json["conversation"]):
|
361 |
text = turn["text"]
|
|
|
501 |
converter = UnifiedAudioConverter(ConversationConfig())
|
502 |
|
503 |
|
504 |
+
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
|
505 |
"""Main synthesis function"""
|
506 |
if not article_url:
|
507 |
return "Please provide a valid URL.", None
|
|
|
521 |
if not api_key:
|
522 |
return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
|
523 |
converter.initialize_api_mode(api_key)
|
524 |
+
conversation_json = converter.extract_conversation_api(text, language)
|
525 |
else: # Local mode
|
526 |
converter.initialize_local_mode()
|
527 |
+
conversation_json = converter.extract_conversation_local(text, language)
|
528 |
|
529 |
# Generate conversation text
|
530 |
conversation_text = "\n".join(
|
|
|
538 |
return f"Error: {str(e)}", None
|
539 |
|
540 |
|
541 |
+
async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
|
542 |
"""Regenerate audio from edited conversation text"""
|
543 |
if not conversation_text.strip():
|
544 |
return "Please provide conversation text.", None
|
|
|
550 |
if not conversation_json["conversation"]:
|
551 |
return "No valid conversation found in the text.", None
|
552 |
|
553 |
+
# For Korean, use Edge-TTS only (other TTS engines have limited Korean support)
|
554 |
+
if language == "Korean" and tts_engine != "Edge-TTS":
|
555 |
+
return "ํ๊ตญ์ด๋ Edge-TTS๋ง ์ง์๋ฉ๋๋ค. TTS ์์ง์ด ์๋์ผ๋ก Edge-TTS๋ก ๋ณ๊ฒฝ๋ฉ๋๋ค.", None
|
556 |
+
|
557 |
# Generate audio based on TTS engine
|
558 |
if tts_engine == "Edge-TTS":
|
559 |
+
output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
|
|
|
|
|
|
|
|
|
560 |
elif tts_engine == "Spark-TTS":
|
561 |
if not SPARK_AVAILABLE:
|
562 |
return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
|
563 |
converter.initialize_spark_tts()
|
564 |
+
output_file, _ = converter.text_to_speech_spark(conversation_json, language)
|
565 |
else: # MeloTTS
|
566 |
if not MELO_AVAILABLE:
|
567 |
return "MeloTTS not available. Please install required dependencies.", None
|
568 |
+
if language == "Korean":
|
569 |
+
return "MeloTTS does not support Korean. Please use Edge-TTS for Korean.", None
|
570 |
converter.initialize_melo_tts()
|
571 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
572 |
|
|
|
576 |
return f"Error generating audio: {str(e)}", None
|
577 |
|
578 |
|
579 |
+
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Blocking wrapper that runs the async ``synthesize`` coroutine.

    Forwards all four arguments unchanged and returns whatever
    ``synthesize`` returns once ``asyncio.run`` has driven it to completion.
    """
    pending = synthesize(article_url, mode, tts_engine, language)
    return asyncio.run(pending)
|
582 |
|
583 |
|
584 |
+
def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Blocking wrapper that runs the async ``regenerate_audio`` coroutine.

    Forwards all three arguments unchanged and returns whatever
    ``regenerate_audio`` returns once ``asyncio.run`` has driven it to
    completion.
    """
    pending = regenerate_audio(conversation_text, tts_engine, language)
    return asyncio.run(pending)
|
587 |
+
|
588 |
+
|
589 |
+
def update_tts_engine_for_korean(language):
|
590 |
+
"""ํ๊ตญ์ด ์ ํ ์ TTS ์์ง ์ต์
์
๋ฐ์ดํธ"""
|
591 |
+
if language == "Korean":
|
592 |
+
return gr.Radio(
|
593 |
+
choices=["Edge-TTS"],
|
594 |
+
value="Edge-TTS",
|
595 |
+
label="TTS Engine",
|
596 |
+
info="ํ๊ตญ์ด๋ Edge-TTS๋ง ์ง์๋ฉ๋๋ค",
|
597 |
+
interactive=False
|
598 |
+
)
|
599 |
+
else:
|
600 |
+
return gr.Radio(
|
601 |
+
choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
|
602 |
+
value="Edge-TTS",
|
603 |
+
label="TTS Engine",
|
604 |
+
info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
|
605 |
+
interactive=True
|
606 |
+
)
|
607 |
|
608 |
|
609 |
# Gradio Interface
|
|
|
619 |
value=""
|
620 |
)
|
621 |
with gr.Column(scale=1):
|
622 |
+
# Add language selection
|
623 |
+
language_selector = gr.Radio(
|
624 |
+
choices=["English", "Korean"],
|
625 |
+
value="English",
|
626 |
+
label="Language / ์ธ์ด",
|
627 |
+
info="Select output language / ์ถ๋ ฅ ์ธ์ด๋ฅผ ์ ํํ์ธ์"
|
628 |
+
)
|
629 |
+
|
630 |
mode_selector = gr.Radio(
|
631 |
choices=["API", "Local"],
|
632 |
value="API",
|
|
|
634 |
info="API: Faster, requires API key | Local: Slower, runs on device"
|
635 |
)
|
636 |
|
637 |
+
# TTS engine selection
|
638 |
with gr.Group():
|
639 |
gr.Markdown("### TTS Engine Selection")
|
640 |
tts_selector = gr.Radio(
|
|
|
651 |
|
652 |
**Additional Option:**
|
653 |
- โก **MeloTTS**: Local processing, GPU recommended
|
654 |
+
|
655 |
+
**ํ๊ตญ์ด ์ง์:**
|
656 |
+
- ๐ฐ๐ท ํ๊ตญ์ด ์ ํ ์ Edge-TTS๋ง ๏ฟฝ๏ฟฝ๏ฟฝ์ฉ ๊ฐ๋ฅํฉ๋๋ค
|
657 |
""")
|
658 |
|
659 |
+
convert_btn = gr.Button("๐ฏ Generate Conversation / ๋ํ ์์ฑ", variant="primary", size="lg")
|
660 |
|
661 |
with gr.Row():
|
662 |
with gr.Column():
|
663 |
conversation_output = gr.Textbox(
|
664 |
+
label="Generated Conversation (Editable) / ์์ฑ๋ ๋ํ (ํธ์ง ๊ฐ๋ฅ)",
|
665 |
lines=15,
|
666 |
max_lines=30,
|
667 |
interactive=True,
|
668 |
+
placeholder="Generated conversation will appear here. You can edit it before generating audio.\n์์ฑ๋ ๋ํ๊ฐ ์ฌ๊ธฐ์ ํ์๋ฉ๋๋ค. ์ค๋์ค ์์ฑ ์ ์ ํธ์งํ ์ ์์ต๋๋ค.",
|
669 |
+
info="Edit the conversation as needed. Format: 'Speaker Name: Text' / ํ์์ ๋ฐ๋ผ ๋ํ๋ฅผ ํธ์งํ์ธ์. ํ์: 'ํ์ ์ด๋ฆ: ํ
์คํธ'"
|
670 |
)
|
671 |
|
672 |
# Add the audio generation button
|
673 |
with gr.Row():
|
674 |
+
generate_audio_btn = gr.Button("๐๏ธ Generate Audio from Text / ํ
์คํธ์์ ์ค๋์ค ์์ฑ", variant="secondary", size="lg")
|
675 |
+
gr.Markdown("*Edit the conversation above, then click to generate audio / ์์ ๋ํ๋ฅผ ํธ์งํ ํ ํด๋ฆญํ์ฌ ์ค๋์ค๋ฅผ ์์ฑํ์ธ์*")
|
676 |
|
677 |
with gr.Column():
|
678 |
audio_output = gr.Audio(
|
679 |
+
label="Podcast Audio / ํ์บ์คํธ ์ค๋์ค",
|
680 |
type="filepath",
|
681 |
interactive=False
|
682 |
)
|
683 |
|
684 |
# Add the status message box
|
685 |
status_output = gr.Textbox(
|
686 |
+
label="Status / ์ํ",
|
687 |
interactive=False,
|
688 |
visible=True
|
689 |
)
|
|
|
691 |
# Add per-engine TTS descriptions and installation guide
|
692 |
with gr.Row():
|
693 |
gr.Markdown("""
|
694 |
+
### TTS Engine Details / TTS ์์ง ์์ธ์ ๋ณด:
|
695 |
|
696 |
- **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
|
697 |
+
- ๐ฐ๐ท **ํ๊ตญ์ด ์ง์**: ์์ฐ์ค๋ฌ์ด ํ๊ตญ์ด ์์ฑ (์ฌ์ฑ: SunHi, ๋จ์ฑ: InJoon)
|
698 |
- **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
|
699 |
- **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
|
700 |
- Features: Bilingual support (Chinese/English), controllable speech generation
|
701 |
- License: CC BY-NC-SA (Non-commercial use only)
|
702 |
+
- โ ๏ธ **ํ๊ตญ์ด ๋ฏธ์ง์**
|
703 |
- **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
|
704 |
+
- โ ๏ธ **ํ๊ตญ์ด ๋ฏธ์ง์**
|
705 |
|
706 |
### Spark-TTS Setup Instructions:
|
707 |
```bash
|
|
|
713 |
|
714 |
gr.Examples(
|
715 |
examples=[
|
716 |
+
["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS", "English"],
|
717 |
+
["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS", "English"],
|
718 |
+
["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS", "Korean"],
|
719 |
],
|
720 |
+
inputs=[url_input, mode_selector, tts_selector, language_selector],
|
721 |
outputs=[conversation_output, status_output],
|
722 |
fn=synthesize_sync,
|
723 |
cache_examples=False,
|
724 |
)
|
725 |
|
726 |
+
# Update the TTS engine options when the language changes
|
727 |
+
language_selector.change(
|
728 |
+
fn=update_tts_engine_for_korean,
|
729 |
+
inputs=[language_selector],
|
730 |
+
outputs=[tts_selector]
|
731 |
+
)
|
732 |
+
|
733 |
# Wire up events
|
734 |
convert_btn.click(
|
735 |
fn=synthesize_sync,
|
736 |
+
inputs=[url_input, mode_selector, tts_selector, language_selector],
|
737 |
outputs=[conversation_output, status_output]
|
738 |
)
|
739 |
|
740 |
generate_audio_btn.click(
|
741 |
fn=regenerate_audio_sync,
|
742 |
+
inputs=[conversation_output, tts_selector, language_selector],
|
743 |
outputs=[status_output, audio_output]
|
744 |
)
|
745 |
|