Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -79,6 +79,118 @@ load_dotenv()
|
|
79 |
BRAVE_KEY = os.getenv("BSEARCH_API")
|
80 |
BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
|
81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
@dataclass
|
83 |
class ConversationConfig:
|
84 |
max_words: int = 8000 # 4000에서 6000으로 증가 (1.5배)
|
@@ -286,126 +398,6 @@ These factors are making the future of {keyword} increasingly promising.
|
|
286 |
return intro + compiled
|
287 |
|
288 |
|
289 |
-
def _build_prompt(self, text: str, language: str = "English", search_context: str = "") -> str:
|
290 |
-
"""Build prompt for conversation generation with enhanced radio talk show style"""
|
291 |
-
# 텍스트 길이 제한
|
292 |
-
max_text_length = 4500 if search_context else 6000
|
293 |
-
if len(text) > max_text_length:
|
294 |
-
text = text[:max_text_length] + "..."
|
295 |
-
|
296 |
-
if language == "Korean":
|
297 |
-
# 대화 템플릿을 더 많은 턴으로 확장 (15-20회)
|
298 |
-
template = """
|
299 |
-
{
|
300 |
-
"conversation": [
|
301 |
-
{"speaker": "준수", "text": ""},
|
302 |
-
{"speaker": "민호", "text": ""},
|
303 |
-
{"speaker": "준수", "text": ""},
|
304 |
-
{"speaker": "민호", "text": ""},
|
305 |
-
{"speaker": "준수", "text": ""},
|
306 |
-
{"speaker": "민호", "text": ""},
|
307 |
-
{"speaker": "준수", "text": ""},
|
308 |
-
{"speaker": "민호", "text": ""},
|
309 |
-
{"speaker": "준수", "text": ""},
|
310 |
-
{"speaker": "민호", "text": ""},
|
311 |
-
{"speaker": "준수", "text": ""},
|
312 |
-
{"speaker": "민호", "text": ""},
|
313 |
-
{"speaker": "준수", "text": ""},
|
314 |
-
{"speaker": "민호", "text": ""},
|
315 |
-
{"speaker": "준수", "text": ""},
|
316 |
-
{"speaker": "민호", "text": ""},
|
317 |
-
{"speaker": "준수", "text": ""},
|
318 |
-
{"speaker": "민호", "text": ""}
|
319 |
-
]
|
320 |
-
}
|
321 |
-
"""
|
322 |
-
|
323 |
-
context_part = ""
|
324 |
-
if search_context:
|
325 |
-
context_part = f"# 최신 관련 정보:\n{search_context}\n"
|
326 |
-
|
327 |
-
base_prompt = (
|
328 |
-
f"# 원본 콘텐츠:\n{text}\n\n"
|
329 |
-
f"{context_part}"
|
330 |
-
f"위 내용으로 전문적이고 심층적인 라디오 팟캐스트 대담 프로그램 대본을 작성해주세요.\n\n"
|
331 |
-
f"## 필수 요구사항:\n"
|
332 |
-
f"1. **최소 18회 이상의 대화 교환** (준수 9회, 민호 9회 이상)\n"
|
333 |
-
f"2. **대화 스타일**: 전문적이고 깊이 있는 팟캐스트 대담\n"
|
334 |
-
f"3. **화자 역할**:\n"
|
335 |
-
f" - 준수: 진행자 (통찰력 있는 질문, 핵심 포인트 정리, 청취자 관점 대변)\n"
|
336 |
-
f" - 민호: 전문가 (상세하고 전문적인 설명, 구체적 예시, 데이터 기반 분석)\n"
|
337 |
-
f"4. **답변 규칙**:\n"
|
338 |
-
f" - 준수: 1-2문장의 명확한 질문이나 요약\n"
|
339 |
-
f" - 민호: **반드시 2-4문장으로 충실하게 답변** (핵심 개념 설명 + 부연 설명 + 예시/근거)\n"
|
340 |
-
f" - 전문 용어는 쉽게 풀어서 설명\n"
|
341 |
-
f" - 구체적인 수치, 사례, 연구 결과 인용\n"
|
342 |
-
f"5. **내용 구성**:\n"
|
343 |
-
f" - 도입부 (2-3회): 주제의 중요성과 배경 설명\n"
|
344 |
-
f" - 전개부 (12-14회): 핵심 내용을 다각도로 심층 분석\n"
|
345 |
-
f" - 마무리 (2-3회): 핵심 요약과 미래 전망\n"
|
346 |
-
f"6. **전문성**: 학술적 근거와 실무적 통찰을 균형있게 포함\n"
|
347 |
-
f"7. **필수**: 서로 존댓말 사용, 청취자가 전문 지식을 얻을 수 있도록 상세히 설명\n\n"
|
348 |
-
f"반드시 위 JSON 형식으로 18회 이상의 전문적인 대화를 작성하세요:\n{template}"
|
349 |
-
)
|
350 |
-
|
351 |
-
return base_prompt
|
352 |
-
|
353 |
-
else:
|
354 |
-
# 영어 템플릿도 확장
|
355 |
-
template = """
|
356 |
-
{
|
357 |
-
"conversation": [
|
358 |
-
{"speaker": "Alex", "text": ""},
|
359 |
-
{"speaker": "Jordan", "text": ""},
|
360 |
-
{"speaker": "Alex", "text": ""},
|
361 |
-
{"speaker": "Jordan", "text": ""},
|
362 |
-
{"speaker": "Alex", "text": ""},
|
363 |
-
{"speaker": "Jordan", "text": ""},
|
364 |
-
{"speaker": "Alex", "text": ""},
|
365 |
-
{"speaker": "Jordan", "text": ""},
|
366 |
-
{"speaker": "Alex", "text": ""},
|
367 |
-
{"speaker": "Jordan", "text": ""},
|
368 |
-
{"speaker": "Alex", "text": ""},
|
369 |
-
{"speaker": "Jordan", "text": ""},
|
370 |
-
{"speaker": "Alex", "text": ""},
|
371 |
-
{"speaker": "Jordan", "text": ""},
|
372 |
-
{"speaker": "Alex", "text": ""},
|
373 |
-
{"speaker": "Jordan", "text": ""},
|
374 |
-
{"speaker": "Alex", "text": ""},
|
375 |
-
{"speaker": "Jordan", "text": ""}
|
376 |
-
]
|
377 |
-
}
|
378 |
-
"""
|
379 |
-
|
380 |
-
context_part = ""
|
381 |
-
if search_context:
|
382 |
-
context_part = f"# Latest Information:\n{search_context}\n"
|
383 |
-
|
384 |
-
base_prompt = (
|
385 |
-
f"# Content:\n{text}\n\n"
|
386 |
-
f"{context_part}"
|
387 |
-
f"Create a professional and in-depth podcast conversation.\n\n"
|
388 |
-
f"## Requirements:\n"
|
389 |
-
f"1. **Minimum 18 conversation exchanges** (Alex 9+, Jordan 9+)\n"
|
390 |
-
f"2. **Style**: Professional, insightful podcast discussion\n"
|
391 |
-
f"3. **Roles**:\n"
|
392 |
-
f" - Alex: Host (insightful questions, key point summaries, audience perspective)\n"
|
393 |
-
f" - Jordan: Expert (detailed explanations, concrete examples, data-driven analysis)\n"
|
394 |
-
f"4. **Response Rules**:\n"
|
395 |
-
f" - Alex: 1-2 sentence clear questions or summaries\n"
|
396 |
-
f" - Jordan: **Must answer in 2-4 sentences** (core concept + elaboration + example/evidence)\n"
|
397 |
-
f" - Explain technical terms clearly\n"
|
398 |
-
f" - Include specific data, cases, research findings\n"
|
399 |
-
f"5. **Structure**:\n"
|
400 |
-
f" - Introduction (2-3 exchanges): Topic importance and context\n"
|
401 |
-
f" - Main content (12-14 exchanges): Multi-angle deep analysis\n"
|
402 |
-
f" - Conclusion (2-3 exchanges): Key takeaways and future outlook\n"
|
403 |
-
f"6. **Expertise**: Balance academic rigor with practical insights\n\n"
|
404 |
-
f"Create exactly 18+ professional exchanges in this JSON format:\n{template}"
|
405 |
-
)
|
406 |
-
|
407 |
-
return base_prompt
|
408 |
-
|
409 |
class UnifiedAudioConverter:
|
410 |
def __init__(self, config: ConversationConfig):
|
411 |
self.config = config
|
@@ -557,7 +549,6 @@ class UnifiedAudioConverter:
|
|
557 |
else:
|
558 |
return MessagesFormatterType.LLAMA_3
|
559 |
|
560 |
-
|
561 |
def _build_prompt(self, text: str, language: str = "English", search_context: str = "") -> str:
|
562 |
"""Build prompt for conversation generation with enhanced professional podcast style"""
|
563 |
# 텍스트 길이 제한
|
@@ -565,31 +556,59 @@ class UnifiedAudioConverter:
|
|
565 |
if len(text) > max_text_length:
|
566 |
text = text[:max_text_length] + "..."
|
567 |
|
|
|
568 |
if language == "Korean":
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
"""
|
588 |
-
|
589 |
-
|
590 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
context_part = f"# 최신 관련 정보:\n{search_context}\n"
|
|
|
|
|
592 |
|
|
|
593 |
base_prompt = (
|
594 |
f"# 원본 콘텐츠:\n{text}\n\n"
|
595 |
f"{context_part}"
|
@@ -597,99 +616,48 @@ class UnifiedAudioConverter:
|
|
597 |
f"## 핵심 지침:\n"
|
598 |
f"1. **대화 스타일**: 전문적이면서도 이해하기 쉬운 팟캐스트 대담\n"
|
599 |
f"2. **화자 역할**:\n"
|
600 |
-
f" -
|
601 |
-
f" -
|
602 |
f"3. **중요한 답변 규칙**:\n"
|
603 |
-
f" -
|
604 |
-
f" -
|
605 |
-
f"
|
606 |
-
f"
|
607 |
-
f" - 통계나 연구 결과 인용\n"
|
608 |
-
f" - 실제 사례와 케이스 스터디\n"
|
609 |
-
f" - 전문 용어를 쉽게 풀어서 설명\n"
|
610 |
-
f" - 다양한 관점과 시각 제시\n"
|
611 |
-
f"5. **필수 규칙**: 서로 존댓말 사용, 12-15회 대화 교환\n\n"
|
612 |
f"JSON 형식으로만 반환:\n{template}"
|
613 |
)
|
614 |
-
|
615 |
-
return base_prompt
|
616 |
-
|
617 |
else:
|
618 |
-
# 영어 템플릿도 확장
|
619 |
-
template = """
|
620 |
-
{
|
621 |
-
"conversation": [
|
622 |
-
{"speaker": "Alex", "text": ""},
|
623 |
-
{"speaker": "Jordan", "text": ""},
|
624 |
-
{"speaker": "Alex", "text": ""},
|
625 |
-
{"speaker": "Jordan", "text": ""},
|
626 |
-
{"speaker": "Alex", "text": ""},
|
627 |
-
{"speaker": "Jordan", "text": ""},
|
628 |
-
{"speaker": "Alex", "text": ""},
|
629 |
-
{"speaker": "Jordan", "text": ""},
|
630 |
-
{"speaker": "Alex", "text": ""},
|
631 |
-
{"speaker": "Jordan", "text": ""},
|
632 |
-
{"speaker": "Alex", "text": ""},
|
633 |
-
{"speaker": "Jordan", "text": ""}
|
634 |
-
]
|
635 |
-
}
|
636 |
-
"""
|
637 |
-
|
638 |
-
context_part = ""
|
639 |
-
if search_context:
|
640 |
-
context_part = f"# Latest Information:\n{search_context}\n"
|
641 |
-
|
642 |
base_prompt = (
|
643 |
f"# Content:\n{text}\n\n"
|
644 |
f"{context_part}"
|
645 |
-
f"Create a professional and insightful podcast conversation.\n\n"
|
646 |
f"## Key Guidelines:\n"
|
647 |
f"1. **Style**: Professional yet accessible podcast discussion\n"
|
648 |
f"2. **Roles**:\n"
|
649 |
-
f" -
|
650 |
-
f" -
|
651 |
f"3. **Critical Response Rules**:\n"
|
652 |
-
f" -
|
653 |
-
f" -
|
654 |
-
f"
|
655 |
-
f"
|
656 |
-
f" - Cite statistics and research\n"
|
657 |
-
f" - Real cases and case studies\n"
|
658 |
-
f" - Explain technical terms clearly\n"
|
659 |
-
f" - Present multiple perspectives\n"
|
660 |
-
f"5. **Length**: 12-15 exchanges total\n\n"
|
661 |
f"Return JSON only:\n{template}"
|
662 |
)
|
663 |
-
|
664 |
-
|
665 |
-
|
666 |
-
|
667 |
|
668 |
def _build_messages_for_local(self, text: str, language: str = "English", search_context: str = "") -> List[Dict]:
|
669 |
"""Build messages for local LLM with enhanced professional podcast style"""
|
670 |
if language == "Korean":
|
671 |
system_message = (
|
672 |
"당신은 한국 최고의 전문 팟캐스트 작가입니다. "
|
673 |
-
"청취자들이 전문 지식을 쉽게 이해할 수 있는 고품질 대담을
|
674 |
-
"
|
675 |
-
"1. 진행자(준수)는 핵심을 짚는 통찰력 있는 질문으로 대화를 이끌어갑니다\n"
|
676 |
-
"2. 전문가(민호)는 반드시 2-4문장으로 깊이 있게 답변합니다 (개념+설명+예시)\n"
|
677 |
-
"3. 구체적인 데이터, 연구 결과, 실제 사례를 포함합니다\n"
|
678 |
-
"4. 전문 용어는 쉽게 풀어서 설명하되, 정확성을 유지합니다\n"
|
679 |
-
"5. 다양한 관점을 제시하여 균형잡힌 시각을 제공합니다\n"
|
680 |
-
"6. 반드시 서로 존댓말을 사용하며, 전문적이면서도 친근한 톤을 유지합니다"
|
681 |
)
|
682 |
else:
|
683 |
system_message = (
|
684 |
-
"You are an expert podcast scriptwriter
|
685 |
-
"professional discussions
|
686 |
-
"
|
687 |
-
"1. The host (Alex) asks insightful questions that drive the conversation\n"
|
688 |
-
"2. The expert (Jordan) MUST answer in 2-4 sentences (concept+explanation+example)\n"
|
689 |
-
"3. Include specific data, research findings, and real cases\n"
|
690 |
-
"4. Explain technical terms clearly while maintaining accuracy\n"
|
691 |
-
"5. Present multiple perspectives for balanced views\n"
|
692 |
-
"6. Maintain a professional yet approachable tone"
|
693 |
)
|
694 |
|
695 |
return [
|
@@ -719,36 +687,19 @@ class UnifiedAudioConverter:
|
|
719 |
chat_template = self._get_messages_formatter_type(self.config.local_model_name)
|
720 |
provider = LlamaCppPythonProvider(self.local_llm)
|
721 |
|
722 |
-
#
|
723 |
if language == "Korean":
|
724 |
system_message = (
|
725 |
"당신은 한국의 유명 팟캐스트 전문 작가입니다. "
|
726 |
-
"청취자들이 깊이 있는 전문 지식을 얻을 수 있는 고품질 대담을
|
727 |
-
"
|
728 |
-
"
|
729 |
-
"2. 전문가(민호)는 반드시 2-4문장으로 충실히 답변합니다:\n"
|
730 |
-
" - 첫 문장: 핵심 개념 설명\n"
|
731 |
-
" - 둘째 문장: 구체적인 설명이나 맥락\n"
|
732 |
-
" - 셋째-넷째 문장: 실제 예시, 데이터, 함의\n"
|
733 |
-
"3. 통계, 연구 결과, 실제 사례를 적극 활용하세요\n"
|
734 |
-
"4. 전문성을 유지하면서도 이해하기 쉽게 설명하세요\n"
|
735 |
-
"5. 12-15회의 대화 교환으로 구성하세요\n"
|
736 |
-
"6. JSON 형식으로만 응답하세요"
|
737 |
)
|
738 |
else:
|
739 |
system_message = (
|
740 |
-
"You are a professional podcast scriptwriter creating high-quality, "
|
741 |
-
"insightful discussions
|
742 |
-
"
|
743 |
-
"1. Host (Alex) asks focused 1-2 sentence questions\n"
|
744 |
-
"2. Expert (Jordan) MUST answer in 2-4 substantial sentences:\n"
|
745 |
-
" - First sentence: Core concept explanation\n"
|
746 |
-
" - Second sentence: Specific details or context\n"
|
747 |
-
" - Third-fourth sentences: Real examples, data, implications\n"
|
748 |
-
"3. Actively use statistics, research findings, real cases\n"
|
749 |
-
"4. Maintain expertise while keeping explanations accessible\n"
|
750 |
-
"5. Create 12-15 conversation exchanges\n"
|
751 |
-
"6. Respond only in JSON format"
|
752 |
)
|
753 |
|
754 |
agent = LlamaCppAgent(
|
@@ -759,10 +710,10 @@ class UnifiedAudioConverter:
|
|
759 |
)
|
760 |
|
761 |
settings = provider.get_provider_default_settings()
|
762 |
-
settings.temperature = 0.75
|
763 |
settings.top_k = 40
|
764 |
settings.top_p = 0.95
|
765 |
-
settings.max_tokens = self.config.max_tokens
|
766 |
settings.repeat_penalty = 1.1
|
767 |
settings.stream = False
|
768 |
|
@@ -783,10 +734,6 @@ class UnifiedAudioConverter:
|
|
783 |
|
784 |
if json_match:
|
785 |
conversation_data = json.loads(json_match.group())
|
786 |
-
# 대화 길이 확인 및 조정
|
787 |
-
if len(conversation_data["conversation"]) < self.config.min_conversation_turns:
|
788 |
-
print(f"Conversation too short ({len(conversation_data['conversation'])} turns), regenerating...")
|
789 |
-
# 재시도 로직 추가 가능
|
790 |
return conversation_data
|
791 |
else:
|
792 |
raise ValueError("No valid JSON found in local LLM response")
|
@@ -797,24 +744,20 @@ class UnifiedAudioConverter:
|
|
797 |
|
798 |
@spaces.GPU(duration=120)
|
799 |
def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
|
800 |
-
"""Extract conversation using legacy local model
|
801 |
try:
|
802 |
self.initialize_legacy_local_mode()
|
803 |
|
804 |
-
#
|
805 |
if language == "Korean":
|
806 |
system_message = (
|
807 |
"당신은 전문 팟캐스트 작가입니다. "
|
808 |
-
"
|
809 |
-
"구체적인 데이터와 사례를 포함하여 전문적이면서도 이해하기 쉽게 설명하세요. "
|
810 |
-
"12-15회 대화 교환으로 구성하세요."
|
811 |
)
|
812 |
else:
|
813 |
system_message = (
|
814 |
-
"You are a professional podcast scriptwriter. "
|
815 |
-
"Create
|
816 |
-
"and the expert (Jordan) gives detailed 2-4 sentence answers. "
|
817 |
-
"Include specific data and examples. Create 12-15 exchanges."
|
818 |
)
|
819 |
|
820 |
chat = [
|
@@ -839,7 +782,7 @@ class UnifiedAudioConverter:
|
|
839 |
generate_kwargs = dict(
|
840 |
model_inputs,
|
841 |
streamer=streamer,
|
842 |
-
max_new_tokens=self.config.max_new_tokens,
|
843 |
do_sample=True,
|
844 |
temperature=0.75,
|
845 |
eos_token_id=terminators,
|
@@ -862,14 +805,17 @@ class UnifiedAudioConverter:
|
|
862 |
|
863 |
except Exception as e:
|
864 |
print(f"Legacy local model also failed: {e}")
|
865 |
-
|
866 |
-
|
867 |
-
|
868 |
-
|
869 |
-
|
|
|
|
|
|
|
870 |
|
871 |
def _get_default_korean_conversation(self) -> Dict:
|
872 |
-
"""
|
873 |
return {
|
874 |
"conversation": [
|
875 |
{"speaker": "준수", "text": "안녕하세요, 여러분! 오늘은 정말 중요하고 흥미로운 주제를 다뤄보려고 합니다. 민호 박사님, 먼저 이 주제가 왜 지금 이렇게 주목받고 있는지 설명해주시겠어요?"},
|
@@ -885,12 +831,12 @@ class UnifiedAudioConverter:
|
|
885 |
{"speaker": "준수", "text": "실용적인 조언 감사합니다. 마지막으로 이 분야의 미래 전망은 어떻게 보시나요?"},
|
886 |
{"speaker": "민호", "text": "향후 10년은 인류 역사상 가장 급격한 기술 발전을 경험하는 시기가 될 것입니다. 가트너의 하이프 사이클 분석에 따르면, 현재 우리는 이 기술의 초기 단계에 불과합니다. 2030년까지는 지금으로서는 상상하기 어려운 수준의 혁신이 일어날 것으로 예상됩니다. 중요한 것은 이런 변화를 두려워하기보다는 기회로 삼아 더 나은 미래를 만들어가는 것이라고 생각합니다."},
|
887 |
{"speaker": "준수", "text": "정말 통찰력 있는 말씀이네요. 오늘 너무나 유익한 시간이었습니다. 청취자 여러분도 오늘 논의된 내용을 바탕으로 미래를 준비하시길 바랍니다. 민호 박사님, 귀중한 시간 내주셔서 감사합니다!"},
|
888 |
-
{"speaker": "민호", "text": "감사합니다. 청취자 여러분들이 이 변화의 시대를 현명하게 헤쳐나가시길 바랍니다. 기술은 도구일 뿐이고, 그것을 어떻게 활용하는지는 우리에게 달려있다는 점을 기억해주세요.
|
889 |
]
|
890 |
}
|
891 |
|
892 |
def _get_default_english_conversation(self) -> Dict:
|
893 |
-
"""
|
894 |
return {
|
895 |
"conversation": [
|
896 |
{"speaker": "Alex", "text": "Welcome everyone to our podcast! Today we're diving into a topic that's reshaping our world. Dr. Jordan, could you start by explaining why this subject has become so critical right now?"},
|
@@ -898,24 +844,22 @@ class UnifiedAudioConverter:
|
|
898 |
{"speaker": "Alex", "text": "400% acceleration is staggering! What does this mean for everyday people who might not be tech-savvy?"},
|
899 |
{"speaker": "Jordan", "text": "The impact will be profound yet accessible. Think about how smartphones revolutionized communication - this will be similar but across every aspect of life. McKinsey's latest report projects that by 2026, these technologies will create $4.4 trillion in annual value globally. For individuals, this translates to personalized healthcare that can predict illnesses years in advance, educational systems that adapt to each student's learning style, and financial tools that democratize wealth-building strategies previously available only to the ultra-wealthy."},
|
900 |
{"speaker": "Alex", "text": "Those applications sound transformative. Can you give us a concrete example of how this is already being implemented?"},
|
901 |
-
{"speaker": "Jordan", "text": "Absolutely. Let me share a compelling case from Johns Hopkins Hospital. They've deployed an AI system that analyzes patient data in real-time, reducing diagnostic errors by 85% and cutting average diagnosis time from days to hours. In one documented case, the system identified a rare genetic disorder in a child that had been misdiagnosed for three years. The accuracy comes from analyzing patterns across millions of cases - something impossible for even the most experienced doctors to do manually.
|
902 |
{"speaker": "Alex", "text": "That's truly life-changing technology. But I imagine there are significant challenges and risks we need to consider?"},
|
903 |
-
{"speaker": "Jordan", "text": "You're absolutely right to raise this. The challenges are as significant as the opportunities. The World Economic Forum identifies three critical risks:
|
904 |
{"speaker": "Alex", "text": "How should individuals and organizations prepare for these changes?"},
|
905 |
-
{"speaker": "Jordan", "text": "Preparation requires a multi-faceted approach. For individuals, I recommend focusing on skills that complement rather than compete with AI: critical thinking, emotional intelligence, and creative problem-solving. MIT's recent study shows that professionals who combine domain expertise with AI literacy see salary increases of 40% on average. Organizations need to invest in continuous learning programs - Amazon's $700 million worker retraining initiative is a good model. Most importantly, we need to cultivate an adaptive mindset.
|
906 |
{"speaker": "Alex", "text": "That's practical advice. What about the ethical considerations? How do we ensure this technology benefits humanity as a whole?"},
|
907 |
-
{"speaker": "Jordan", "text": "Ethics must be at the forefront of development. The EU's AI Act and similar regulations worldwide are establishing important guardrails. We need transparent AI systems where decisions can be explained and audited. Companies like IBM and Google have established AI ethics boards, but we need industry-wide standards. Additionally, we must address the digital divide - UNESCO reports that 37% of the global population still lacks internet access. Without inclusive development, these technologies could exacerbate global inequality
|
908 |
{"speaker": "Alex", "text": "Looking ahead, what's your vision for how this technology will shape the next decade?"},
|
909 |
-
{"speaker": "Jordan", "text": "The next decade will be transformative beyond our current imagination.
|
910 |
-
{"speaker": "Alex", "text": "That's both exciting and sobering. Any final thoughts for our listeners?"},
|
911 |
-
{"speaker": "Jordan", "text": "I'd encourage everyone to view this as humanity's next great adventure. Yes, there are risks and challenges, but we're also on the cusp of solving problems that have plagued us for millennia - disease, poverty, environmental degradation. The key is engaged participation rather than passive observation. Stay informed through reliable sources, experiment with new technologies, and most importantly, contribute to the conversation about what kind of future we want to build. The decisions we make in the next five years will reverberate for generations."},
|
912 |
{"speaker": "Alex", "text": "Dr. Jordan, this has been an incredibly enlightening discussion. Thank you for sharing your expertise and insights with us today."},
|
913 |
-
{"speaker": "Jordan", "text": "Thank you, Alex.
|
914 |
]
|
915 |
}
|
916 |
|
917 |
def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
|
918 |
-
"""Extract conversation using API
|
919 |
if not self.llm_client:
|
920 |
raise RuntimeError("API mode not initialized")
|
921 |
|
@@ -932,26 +876,18 @@ class UnifiedAudioConverter:
|
|
932 |
except Exception as e:
|
933 |
print(f"Search failed, continuing without context: {e}")
|
934 |
|
935 |
-
#
|
936 |
if language == "Korean":
|
937 |
system_message = (
|
938 |
"당신은 한국의 최고 전문 팟캐스트 작가입니다. "
|
939 |
-
"
|
940 |
-
"
|
941 |
-
"민호(전문가)는 반드시 2-4문장으로 상세히 답변합니다. "
|
942 |
-
"구체적인 데이터, 연구 결과, 실제 사례를 포함하세요. "
|
943 |
-
"전문 용어는 쉽게 설명하고, 반드시 서로 존댓말을 사용하세요. "
|
944 |
-
"12-15회의 깊이 있는 대화 교환으로 구성하세요."
|
945 |
)
|
946 |
else:
|
947 |
system_message = (
|
948 |
-
"You are a top professional podcast scriptwriter. "
|
949 |
-
"Create high-quality discussions
|
950 |
-
"
|
951 |
-
"while Jordan (expert) MUST answer in 2-4 detailed sentences. "
|
952 |
-
"Include specific data, research findings, and real cases. "
|
953 |
-
"Explain technical terms clearly. "
|
954 |
-
"Create 12-15 insightful conversation exchanges."
|
955 |
)
|
956 |
|
957 |
chat_completion = self.llm_client.chat.completions.create(
|
@@ -994,17 +930,8 @@ class UnifiedAudioConverter:
|
|
994 |
filenames = []
|
995 |
|
996 |
try:
|
997 |
-
# 언어별 음성 설정
|
998 |
-
|
999 |
-
voices = [
|
1000 |
-
"ko-KR-HyunsuNeural", # 남성 음성 1 (차분하고 신뢰감 있는)
|
1001 |
-
"ko-KR-InJoonNeural" # 남성 음성 2 (활기차고 친근한)
|
1002 |
-
]
|
1003 |
-
else:
|
1004 |
-
voices = [
|
1005 |
-
"en-US-AndrewMultilingualNeural", # 남성 음성 1
|
1006 |
-
"en-US-BrianMultilingualNeural" # 남성 음성 2
|
1007 |
-
]
|
1008 |
|
1009 |
for i, turn in enumerate(conversation_json["conversation"]):
|
1010 |
filename = output_dir / f"output_{i}.wav"
|
@@ -1055,13 +982,13 @@ class UnifiedAudioConverter:
|
|
1055 |
# Create different voice characteristics for different speakers
|
1056 |
if language == "Korean":
|
1057 |
voice_configs = [
|
1058 |
-
{"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 준수입니다.
|
1059 |
-
{"prompt_text": "안녕하세요, 저는 오늘 이 주제에 대해 설명드릴 민호입니다.
|
1060 |
]
|
1061 |
else:
|
1062 |
voice_configs = [
|
1063 |
-
{"prompt_text": "Hello everyone, I'm Alex, your host for today's podcast.
|
1064 |
-
{"prompt_text": "Hi, I'm Jordan. I'm excited to share my insights
|
1065 |
]
|
1066 |
|
1067 |
for i, turn in enumerate(conversation_json["conversation"]):
|
@@ -1069,12 +996,9 @@ class UnifiedAudioConverter:
|
|
1069 |
if not text.strip():
|
1070 |
continue
|
1071 |
|
1072 |
-
# Use different voice config for each speaker
|
1073 |
voice_config = voice_configs[i % len(voice_configs)]
|
1074 |
-
|
1075 |
output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
|
1076 |
|
1077 |
-
# Run Spark TTS CLI inference
|
1078 |
cmd = [
|
1079 |
"python", "-m", "cli.inference",
|
1080 |
"--text", text,
|
@@ -1086,33 +1010,29 @@ class UnifiedAudioConverter:
|
|
1086 |
]
|
1087 |
|
1088 |
try:
|
1089 |
-
# Run the command
|
1090 |
result = subprocess.run(
|
1091 |
cmd,
|
1092 |
capture_output=True,
|
1093 |
text=True,
|
1094 |
timeout=60,
|
1095 |
-
cwd="."
|
1096 |
)
|
1097 |
|
1098 |
if result.returncode == 0:
|
1099 |
audio_files.append(output_file)
|
1100 |
else:
|
1101 |
print(f"Spark TTS error for turn {i}: {result.stderr}")
|
1102 |
-
|
1103 |
-
silence = np.zeros(int(22050 * 1.0)) # 1 second of silence
|
1104 |
sf.write(output_file, silence, 22050)
|
1105 |
audio_files.append(output_file)
|
1106 |
|
1107 |
except subprocess.TimeoutExpired:
|
1108 |
print(f"Spark TTS timeout for turn {i}")
|
1109 |
-
# Create silence as fallback
|
1110 |
silence = np.zeros(int(22050 * 1.0))
|
1111 |
sf.write(output_file, silence, 22050)
|
1112 |
audio_files.append(output_file)
|
1113 |
except Exception as e:
|
1114 |
print(f"Error running Spark TTS for turn {i}: {e}")
|
1115 |
-
# Create silence as fallback
|
1116 |
silence = np.zeros(int(22050 * 1.0))
|
1117 |
sf.write(output_file, silence, 22050)
|
1118 |
audio_files.append(output_file)
|
@@ -1124,7 +1044,6 @@ class UnifiedAudioConverter:
|
|
1124 |
else:
|
1125 |
raise RuntimeError("No audio files generated")
|
1126 |
|
1127 |
-
# Generate conversation text
|
1128 |
conversation_text = "\n".join(
|
1129 |
f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
|
1130 |
for i, turn in enumerate(conversation_json["conversation"])
|
@@ -1150,7 +1069,6 @@ class UnifiedAudioConverter:
|
|
1150 |
speaker = speakers[i % 2]
|
1151 |
speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
|
1152 |
|
1153 |
-
# Generate audio
|
1154 |
self.melo_models["EN"].tts_to_file(
|
1155 |
text, speaker_id, bio, speed=1.0,
|
1156 |
pbar=progress.tqdm if progress else None,
|
@@ -1161,11 +1079,9 @@ class UnifiedAudioConverter:
|
|
1161 |
audio_segment = AudioSegment.from_file(bio, format="wav")
|
1162 |
combined_audio += audio_segment
|
1163 |
|
1164 |
-
# Save final audio
|
1165 |
final_audio_path = "melo_podcast.mp3"
|
1166 |
combined_audio.export(final_audio_path, format="mp3")
|
1167 |
|
1168 |
-
# Generate conversation text
|
1169 |
conversation_text = "\n".join(
|
1170 |
f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
|
1171 |
for i, turn in enumerate(conversation_json["conversation"])
|
@@ -1224,10 +1140,9 @@ async def synthesize(article_input, input_type: str = "URL", mode: str = "Local"
|
|
1224 |
else: # Keyword
|
1225 |
if not article_input or not isinstance(article_input, str):
|
1226 |
return "Please provide a keyword or topic.", None
|
1227 |
-
# 키워드로 검색하여 콘텐츠 생성
|
1228 |
text = search_and_compile_content(article_input, language)
|
1229 |
-
text = f"Keyword-based content:\n{text}"
|
1230 |
-
|
1231 |
# Limit text to max words
|
1232 |
words = text.split()
|
1233 |
if len(words) > converter.config.max_words:
|
@@ -1235,19 +1150,17 @@ async def synthesize(article_input, input_type: str = "URL", mode: str = "Local"
|
|
1235 |
|
1236 |
# Extract conversation based on mode
|
1237 |
if mode == "Local":
|
1238 |
-
# 로컬 모드가 기본 (새로운 Local LLM 사용)
|
1239 |
try:
|
1240 |
conversation_json = converter.extract_conversation_local(text, language)
|
1241 |
except Exception as e:
|
1242 |
print(f"Local mode failed: {e}, trying API fallback")
|
1243 |
-
# API 폴백
|
1244 |
api_key = os.environ.get("TOGETHER_API_KEY")
|
1245 |
if api_key:
|
1246 |
converter.initialize_api_mode(api_key)
|
1247 |
conversation_json = converter.extract_conversation_api(text, language)
|
1248 |
else:
|
1249 |
raise RuntimeError("Local mode failed and no API key available for fallback")
|
1250 |
-
else: # API mode
|
1251 |
api_key = os.environ.get("TOGETHER_API_KEY")
|
1252 |
if not api_key:
|
1253 |
print("API key not found, falling back to local mode")
|
@@ -1278,15 +1191,14 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS",
|
|
1278 |
return "Please provide conversation text.", None
|
1279 |
|
1280 |
try:
|
1281 |
-
# Parse the conversation text back to JSON format
|
1282 |
conversation_json = converter.parse_conversation_text(conversation_text)
|
1283 |
|
1284 |
if not conversation_json["conversation"]:
|
1285 |
return "No valid conversation found in the text.", None
|
1286 |
|
1287 |
-
#
|
1288 |
-
if language
|
1289 |
-
tts_engine = "Edge-TTS"
|
1290 |
|
1291 |
# Generate audio based on TTS engine
|
1292 |
if tts_engine == "Edge-TTS":
|
@@ -1299,8 +1211,8 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS",
|
|
1299 |
else: # MeloTTS
|
1300 |
if not MELO_AVAILABLE:
|
1301 |
return "MeloTTS not available. Please install required dependencies.", None
|
1302 |
-
if language
|
1303 |
-
return "MeloTTS does not support
|
1304 |
converter.initialize_melo_tts()
|
1305 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
1306 |
|
@@ -1320,14 +1232,34 @@ def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS",
|
|
1320 |
return asyncio.run(regenerate_audio(conversation_text, tts_engine, language))
|
1321 |
|
1322 |
|
1323 |
-
def
|
1324 |
-
"""
|
1325 |
-
if language
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1326 |
return gr.Radio(
|
1327 |
choices=["Edge-TTS"],
|
1328 |
value="Edge-TTS",
|
1329 |
label="TTS Engine",
|
1330 |
-
info=
|
1331 |
interactive=False
|
1332 |
)
|
1333 |
else:
|
@@ -1363,7 +1295,7 @@ if LLAMA_CPP_AVAILABLE:
|
|
1363 |
print(f"Failed to download model at startup: {e}")
|
1364 |
|
1365 |
|
1366 |
-
# Gradio Interface - 개선된 레이아웃
|
1367 |
with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
1368 |
.container {max-width: 1200px; margin: auto; padding: 20px;}
|
1369 |
.header-text {text-align: center; margin-bottom: 30px;}
|
@@ -1375,8 +1307,8 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1375 |
# 헤더
|
1376 |
with gr.Row(elem_classes="header-text"):
|
1377 |
gr.Markdown("""
|
1378 |
-
# 🎙️ AI Podcast Generator - Professional Edition
|
1379 |
-
### Convert any article, blog, PDF document, or topic into an engaging professional podcast conversation
|
1380 |
""")
|
1381 |
|
1382 |
with gr.Row(elem_classes="discord-badge"):
|
@@ -1388,8 +1320,6 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1388 |
</p>
|
1389 |
""")
|
1390 |
|
1391 |
-
|
1392 |
-
|
1393 |
# 상태 표시 섹션
|
1394 |
with gr.Row():
|
1395 |
with gr.Column(scale=1):
|
@@ -1402,11 +1332,11 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1402 |
""")
|
1403 |
with gr.Column(scale=1):
|
1404 |
gr.Markdown("""
|
1405 |
-
####
|
1406 |
-
- **
|
1407 |
-
- **
|
1408 |
-
- **
|
1409 |
-
- **
|
1410 |
""")
|
1411 |
|
1412 |
# 메인 입력 섹션
|
@@ -1441,7 +1371,7 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1441 |
# 키워드 입력
|
1442 |
keyword_input = gr.Textbox(
|
1443 |
label="🔍 Topic/Keyword",
|
1444 |
-
placeholder="Enter a topic (e.g., 'AI trends 2024', '인공지능
|
1445 |
value="",
|
1446 |
visible=False,
|
1447 |
info="System will search and compile latest information",
|
@@ -1452,10 +1382,16 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1452 |
with gr.Column(scale=1):
|
1453 |
# 언어 선택
|
1454 |
language_selector = gr.Radio(
|
1455 |
-
choices=[
|
|
|
|
|
|
|
|
|
|
|
|
|
1456 |
value="English",
|
1457 |
-
label="🌐 Language / 언어",
|
1458 |
-
info="
|
1459 |
)
|
1460 |
|
1461 |
# 처리 모드
|
@@ -1493,7 +1429,7 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1493 |
lines=25,
|
1494 |
max_lines=50,
|
1495 |
interactive=True,
|
1496 |
-
placeholder="Professional podcast conversation will appear here...\n전문 팟캐스트 대화가 여기에
|
1497 |
info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
|
1498 |
)
|
1499 |
|
@@ -1524,20 +1460,24 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1524 |
gr.Markdown("""
|
1525 |
#### 💡 Quick Tips:
|
1526 |
- **URL**: Paste any article link
|
1527 |
-
- **PDF**: Upload documents directly
|
1528 |
- **Keyword**: Enter topics for AI research
|
|
|
1529 |
- Edit conversation before audio generation
|
1530 |
-
-
|
1531 |
""")
|
1532 |
|
1533 |
# 예제 섹션
|
1534 |
-
with gr.Accordion("📚 Examples", open=False):
|
1535 |
gr.Examples(
|
1536 |
examples=[
|
1537 |
-
["https://huggingface.co/blog/
|
1538 |
["quantum computing breakthroughs", "Keyword", "Local", "Edge-TTS", "English"],
|
1539 |
-
["https://huggingface.co/papers/2505.14810", "URL", "Local", "Edge-TTS", "Korean"],
|
1540 |
["인공지능 윤리와 규제", "Keyword", "Local", "Edge-TTS", "Korean"],
|
|
|
|
|
|
|
|
|
1541 |
],
|
1542 |
inputs=[url_input, input_type_selector, mode_selector, tts_selector, language_selector],
|
1543 |
outputs=[conversation_output, status_output],
|
@@ -1554,7 +1494,7 @@ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
|
1554 |
|
1555 |
# 언어 변경 시 TTS 엔진 옵션 업데이트
|
1556 |
language_selector.change(
|
1557 |
-
fn=
|
1558 |
inputs=[language_selector],
|
1559 |
outputs=[tts_selector]
|
1560 |
)
|
@@ -1591,4 +1531,4 @@ if __name__ == "__main__":
|
|
1591 |
share=False,
|
1592 |
server_name="0.0.0.0",
|
1593 |
server_port=7860
|
1594 |
-
)
|
|
|
79 |
BRAVE_KEY = os.getenv("BSEARCH_API")
|
80 |
BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
|
81 |
|
82 |
+
# Languages that are served exclusively by Edge-TTS: every supported language
# except English. Order is preserved from the original definition; callers
# only use this for membership checks (`language in EDGE_TTS_ONLY_LANGUAGES`).
EDGE_TTS_ONLY_LANGUAGES = [
    "Korean", "Japanese", "French", "German", "Spanish", "Italian",
    "Portuguese", "Dutch", "Thai", "Vietnamese", "Arabic", "Hebrew",
    "Indonesian", "Hindi", "Russian", "Chinese", "Norwegian", "Swedish",
    "Finnish", "Danish", "Polish", "Turkish", "Greek", "Czech",
]
|
89 |
+
|
90 |
+
# Edge-TTS voice names per language. Each entry lists exactly two voices.
# The Edge-TTS path looks a language up with
# `EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["English"])`, so the
# "English" key must always be present as the fallback.
EDGE_TTS_VOICES = {
    "English": [
        "en-US-AndrewMultilingualNeural",  # male voice 1
        "en-US-BrianMultilingualNeural",   # male voice 2
    ],
    "Korean": [
        "ko-KR-HyunsuNeural",  # male voice 1 (calm, trustworthy)
        "ko-KR-InJoonNeural",  # male voice 2 (lively, friendly)
    ],
    "Japanese": [
        "ja-JP-KeitaNeural",   # male voice 1
        "ja-JP-NanamiNeural",  # female voice (backup)
    ],
    "French": [
        "fr-FR-HenriNeural",   # male voice 1
        "fr-FR-DeniseNeural",  # female voice (backup)
    ],
    "German": [
        "de-DE-ConradNeural",   # male voice 1
        "de-DE-KillianNeural",  # male voice 2
    ],
    "Spanish": [
        "es-ES-AlvaroNeural",  # male voice 1
        "es-ES-ElviraNeural",  # female voice (backup)
    ],
    "Italian": [
        "it-IT-DiegoNeural",     # male voice 1
        "it-IT-IsabellaNeural",  # female voice (backup)
    ],
    "Portuguese": [
        "pt-BR-AntonioNeural",    # male voice 1
        "pt-BR-FranciscaNeural",  # female voice (backup)
    ],
    "Dutch": [
        "nl-NL-MaartenNeural",  # male voice 1
        "nl-NL-ColetteNeural",  # female voice (backup)
    ],
    "Thai": [
        "th-TH-NiwatNeural",      # male voice 1
        "th-TH-PremwadeeNeural",  # female voice (backup)
    ],
    "Vietnamese": [
        "vi-VN-NamMinhNeural",  # male voice 1
        "vi-VN-HoaiMyNeural",   # female voice (backup)
    ],
    "Arabic": [
        "ar-SA-HamedNeural",    # male voice 1
        "ar-SA-ZariyahNeural",  # female voice (backup)
    ],
    "Hebrew": [
        "he-IL-AvriNeural",  # male voice 1
        "he-IL-HilaNeural",  # female voice (backup)
    ],
    "Indonesian": [
        "id-ID-ArdiNeural",   # male voice 1
        "id-ID-GadisNeural",  # female voice (backup)
    ],
    "Hindi": [
        "hi-IN-MadhurNeural",  # male voice 1
        "hi-IN-SwaraNeural",   # female voice (backup)
    ],
    "Russian": [
        "ru-RU-DmitryNeural",    # male voice 1
        "ru-RU-SvetlanaNeural",  # female voice (backup)
    ],
    "Chinese": [
        "zh-CN-YunxiNeural",     # male voice 1
        "zh-CN-XiaoxiaoNeural",  # female voice (backup)
    ],
    "Norwegian": [
        "nb-NO-FinnNeural",      # male voice 1
        "nb-NO-PernilleNeural",  # female voice (backup)
    ],
    "Swedish": [
        "sv-SE-MattiasNeural",  # male voice 1
        "sv-SE-SofieNeural",    # female voice (backup)
    ],
    "Finnish": [
        "fi-FI-HarriNeural",  # male voice 1
        "fi-FI-NooraNeural",  # female voice (backup)
    ],
    "Danish": [
        "da-DK-JeppeNeural",     # male voice 1
        "da-DK-ChristelNeural",  # female voice (backup)
    ],
    "Polish": [
        "pl-PL-MarekNeural",  # male voice 1
        "pl-PL-ZofiaNeural",  # female voice (backup)
    ],
    "Turkish": [
        "tr-TR-AhmetNeural",  # male voice 1
        "tr-TR-EmelNeural",   # female voice (backup)
    ],
    "Greek": [
        "el-GR-NestorasNeural",  # male voice 1
        "el-GR-AthinaNeural",    # female voice (backup)
    ],
    "Czech": [
        "cs-CZ-AntoninNeural",  # male voice 1
        "cs-CZ-VlastaNeural",   # female voice (backup)
    ],
}
|
193 |
+
|
194 |
@dataclass
|
195 |
class ConversationConfig:
|
196 |
max_words: int = 8000 # 4000에서 6000으로 증가 (1.5배)
|
|
|
398 |
return intro + compiled
|
399 |
|
400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
class UnifiedAudioConverter:
|
402 |
def __init__(self, config: ConversationConfig):
|
403 |
self.config = config
|
|
|
549 |
else:
|
550 |
return MessagesFormatterType.LLAMA_3
|
551 |
|
|
|
552 |
def _build_prompt(self, text: str, language: str = "English", search_context: str = "") -> str:
|
553 |
"""Build prompt for conversation generation with enhanced professional podcast style"""
|
554 |
# 텍스트 길이 제한
|
|
|
556 |
if len(text) > max_text_length:
|
557 |
text = text[:max_text_length] + "..."
|
558 |
|
559 |
+
# 언어별 화자 이름 설정
|
560 |
if language == "Korean":
|
561 |
+
speaker1, speaker2 = "준수", "민호"
|
562 |
+
elif language == "Japanese":
|
563 |
+
speaker1, speaker2 = "Hiroshi", "Takeshi"
|
564 |
+
elif language == "French":
|
565 |
+
speaker1, speaker2 = "Pierre", "Marc"
|
566 |
+
elif language == "German":
|
567 |
+
speaker1, speaker2 = "Klaus", "Stefan"
|
568 |
+
elif language == "Spanish":
|
569 |
+
speaker1, speaker2 = "Carlos", "Miguel"
|
570 |
+
elif language == "Italian":
|
571 |
+
speaker1, speaker2 = "Marco", "Giuseppe"
|
572 |
+
elif language == "Portuguese":
|
573 |
+
speaker1, speaker2 = "João", "Pedro"
|
574 |
+
elif language == "Dutch":
|
575 |
+
speaker1, speaker2 = "Jan", "Pieter"
|
576 |
+
elif language == "Thai":
|
577 |
+
speaker1, speaker2 = "Somchai", "Prasert"
|
578 |
+
elif language == "Vietnamese":
|
579 |
+
speaker1, speaker2 = "Minh", "Duc"
|
580 |
+
elif language == "Arabic":
|
581 |
+
speaker1, speaker2 = "Ahmed", "Mohammed"
|
582 |
+
elif language == "Hebrew":
|
583 |
+
speaker1, speaker2 = "David", "Michael"
|
584 |
+
elif language == "Indonesian":
|
585 |
+
speaker1, speaker2 = "Budi", "Andi"
|
586 |
+
elif language == "Hindi":
|
587 |
+
speaker1, speaker2 = "Raj", "Amit"
|
588 |
+
elif language == "Russian":
|
589 |
+
speaker1, speaker2 = "Alexei", "Dmitri"
|
590 |
+
elif language == "Chinese":
|
591 |
+
speaker1, speaker2 = "Wei", "Jun"
|
592 |
+
else: # English and others
|
593 |
+
speaker1, speaker2 = "Alex", "Jordan"
|
594 |
+
|
595 |
+
# 대화 템플릿 생성
|
596 |
+
template = "{\n \"conversation\": [\n"
|
597 |
+
for i in range(12): # 12번의 교환
|
598 |
+
template += f" {{\"speaker\": \"{speaker1 if i % 2 == 0 else speaker2}\", \"text\": \"\"}}"
|
599 |
+
if i < 11:
|
600 |
+
template += ","
|
601 |
+
template += "\n"
|
602 |
+
template += " ]\n}"
|
603 |
+
|
604 |
+
context_part = ""
|
605 |
+
if search_context:
|
606 |
+
if language == "Korean":
|
607 |
context_part = f"# 최신 관련 정보:\n{search_context}\n"
|
608 |
+
else:
|
609 |
+
context_part = f"# Latest Information:\n{search_context}\n"
|
610 |
|
611 |
+
if language == "Korean":
|
612 |
base_prompt = (
|
613 |
f"# 원본 콘텐츠:\n{text}\n\n"
|
614 |
f"{context_part}"
|
|
|
616 |
f"## 핵심 지침:\n"
|
617 |
f"1. **대화 스타일**: 전문적이면서도 이해하기 쉬운 팟캐스트 대담\n"
|
618 |
f"2. **화자 역할**:\n"
|
619 |
+
f" - {speaker1}: 진행자/호스트 (핵심을 짚는 질문, 청취자 관점에서 궁금한 점 질문)\n"
|
620 |
+
f" - {speaker2}: 전문가 (깊이 있는 설명, 구체적 사례와 데이터 제시)\n"
|
621 |
f"3. **중요한 답변 규칙**:\n"
|
622 |
+
f" - {speaker1}: 1-2문장의 명확한 질문\n"
|
623 |
+
f" - {speaker2}: **반드시 2-4문장으로 충실히 답변** (개념 설명 + 구체적 설명 + 예시나 함의)\n"
|
624 |
+
f"4. **전문성 요소**: 통계나 연구 결과 인용, 실제 사례와 케이스 스터디, 전문 용어를 쉽게 풀어서 설명\n"
|
625 |
+
f"5. **필수 규칙**: 서로 존댓말 사용, 12회 대화 교환\n\n"
|
|
|
|
|
|
|
|
|
|
|
626 |
f"JSON 형식으로만 반환:\n{template}"
|
627 |
)
|
|
|
|
|
|
|
628 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
629 |
base_prompt = (
|
630 |
f"# Content:\n{text}\n\n"
|
631 |
f"{context_part}"
|
632 |
+
f"Create a professional and insightful podcast conversation in {language}.\n\n"
|
633 |
f"## Key Guidelines:\n"
|
634 |
f"1. **Style**: Professional yet accessible podcast discussion\n"
|
635 |
f"2. **Roles**:\n"
|
636 |
+
f" - {speaker1}: Host (insightful questions, audience perspective)\n"
|
637 |
+
f" - {speaker2}: Expert (in-depth explanations, concrete examples and data)\n"
|
638 |
f"3. **Critical Response Rules**:\n"
|
639 |
+
f" - {speaker1}: 1-2 sentence clear questions\n"
|
640 |
+
f" - {speaker2}: **Must answer in 2-4 sentences** (concept + detailed explanation + example/implication)\n"
|
641 |
+
f"4. **Professional Elements**: Cite statistics and research, real cases and case studies, explain technical terms clearly\n"
|
642 |
+
f"5. **Length**: 12 exchanges total\n\n"
|
|
|
|
|
|
|
|
|
|
|
643 |
f"Return JSON only:\n{template}"
|
644 |
)
|
645 |
+
|
646 |
+
return base_prompt
|
|
|
|
|
647 |
|
648 |
def _build_messages_for_local(self, text: str, language: str = "English", search_context: str = "") -> List[Dict]:
|
649 |
"""Build messages for local LLM with enhanced professional podcast style"""
|
650 |
if language == "Korean":
|
651 |
system_message = (
|
652 |
"당신은 한국 최고의 전문 팟캐스트 작가입니다. "
|
653 |
+
"청취자들이 전문 지식을 쉽게 이해할 수 있는 고품질 대담을 만들어냅니다. "
|
654 |
+
"반드시 서로 존댓말을 사용하며, 전문적이면서도 친근한 톤을 유지합니다."
|
|
|
|
|
|
|
|
|
|
|
|
|
655 |
)
|
656 |
else:
|
657 |
system_message = (
|
658 |
+
f"You are an expert podcast scriptwriter creating high-quality "
|
659 |
+
f"professional discussions in {language}. Make complex topics accessible "
|
660 |
+
f"while maintaining expertise and a professional yet approachable tone."
|
|
|
|
|
|
|
|
|
|
|
|
|
661 |
)
|
662 |
|
663 |
return [
|
|
|
687 |
chat_template = self._get_messages_formatter_type(self.config.local_model_name)
|
688 |
provider = LlamaCppPythonProvider(self.local_llm)
|
689 |
|
690 |
+
# 언어별 시스템 메시지
|
691 |
if language == "Korean":
|
692 |
system_message = (
|
693 |
"당신은 한국의 유명 팟캐스트 전문 작가입니다. "
|
694 |
+
"청취자들이 깊이 있는 전문 지식을 얻을 수 있는 고품질 대담을 만듭니다. "
|
695 |
+
"반드시 서로 존댓말을 사용하며, 12회의 대화 교환으로 구성하세요. "
|
696 |
+
"JSON 형식으로만 응답하세요."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
697 |
)
|
698 |
else:
|
699 |
system_message = (
|
700 |
+
f"You are a professional podcast scriptwriter creating high-quality, "
|
701 |
+
f"insightful discussions in {language}. Create exactly 12 conversation exchanges "
|
702 |
+
f"with professional expertise. Respond only in JSON format."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
703 |
)
|
704 |
|
705 |
agent = LlamaCppAgent(
|
|
|
710 |
)
|
711 |
|
712 |
settings = provider.get_provider_default_settings()
|
713 |
+
settings.temperature = 0.75
|
714 |
settings.top_k = 40
|
715 |
settings.top_p = 0.95
|
716 |
+
settings.max_tokens = self.config.max_tokens
|
717 |
settings.repeat_penalty = 1.1
|
718 |
settings.stream = False
|
719 |
|
|
|
734 |
|
735 |
if json_match:
|
736 |
conversation_data = json.loads(json_match.group())
|
|
|
|
|
|
|
|
|
737 |
return conversation_data
|
738 |
else:
|
739 |
raise ValueError("No valid JSON found in local LLM response")
|
|
|
744 |
|
745 |
@spaces.GPU(duration=120)
|
746 |
def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
|
747 |
+
"""Extract conversation using legacy local model"""
|
748 |
try:
|
749 |
self.initialize_legacy_local_mode()
|
750 |
|
751 |
+
# 언어별 시스템 메시지
|
752 |
if language == "Korean":
|
753 |
system_message = (
|
754 |
"당신은 전문 팟캐스트 작가입니다. "
|
755 |
+
"12회의 대화 교환으로 구성된 전문적인 대담을 만드세요."
|
|
|
|
|
756 |
)
|
757 |
else:
|
758 |
system_message = (
|
759 |
+
f"You are a professional podcast scriptwriter. "
|
760 |
+
f"Create a professional dialogue in {language} with 12 exchanges."
|
|
|
|
|
761 |
)
|
762 |
|
763 |
chat = [
|
|
|
782 |
generate_kwargs = dict(
|
783 |
model_inputs,
|
784 |
streamer=streamer,
|
785 |
+
max_new_tokens=self.config.max_new_tokens,
|
786 |
do_sample=True,
|
787 |
temperature=0.75,
|
788 |
eos_token_id=terminators,
|
|
|
805 |
|
806 |
except Exception as e:
|
807 |
print(f"Legacy local model also failed: {e}")
|
808 |
+
return self._get_default_conversation(language)
|
809 |
+
|
810 |
+
def _get_default_conversation(self, language: str) -> Dict:
    """Return the canned fallback conversation for ``language``.

    Args:
        language: Podcast language name as used elsewhere in the app
            (e.g. ``"Korean"``, ``"English"``, ``"Japanese"``).

    Returns:
        The dict produced by ``_get_default_korean_conversation()`` when
        ``language`` is exactly ``"Korean"``; otherwise the dict produced by
        ``_get_default_english_conversation()``.

    Note:
        Only a Korean and an English template are referenced here, so every
        other language (Japanese, French, ...) receives the English script.
    """
    if language == "Korean":
        return self._get_default_korean_conversation()
    # All remaining languages share the English template.
    return self._get_default_english_conversation()
|
816 |
|
817 |
def _get_default_korean_conversation(self) -> Dict:
|
818 |
+
"""기본 한국어 대화 템플릿"""
|
819 |
return {
|
820 |
"conversation": [
|
821 |
{"speaker": "준수", "text": "안녕하세요, 여러분! 오늘은 정말 중요하고 흥미로운 주제를 다뤄보려고 합니다. 민호 박사님, 먼저 이 주제가 왜 지금 이렇게 주목받고 있는지 설명해주시겠어요?"},
|
|
|
831 |
{"speaker": "준수", "text": "실용적인 조언 감사합니다. 마지막으로 이 분야의 미래 전망은 어떻게 보시나요?"},
|
832 |
{"speaker": "민호", "text": "향후 10년은 인류 역사상 가장 급격한 기술 발전을 경험하는 시기가 될 것입니다. 가트너의 하이프 사이클 분석에 따르면, 현재 우리는 이 기술의 초기 단계에 불과합니다. 2030년까지는 지금으로서는 상상하기 어려운 수준의 혁신이 일어날 것으로 예상됩니다. 중요한 것은 이런 변화를 두려워하기보다는 기회로 삼아 더 나은 미래를 만들어가는 것이라고 생각합니다."},
|
833 |
{"speaker": "준수", "text": "정말 통찰력 있는 말씀이네요. 오늘 너무나 유익한 시간이었습니다. 청취자 여러분도 오늘 논의된 내용을 바탕으로 미래를 준비하시길 바랍니다. 민호 박사님, 귀중한 시간 내주셔서 감사합니다!"},
|
834 |
+
{"speaker": "민호", "text": "감사합니다. 청취자 여러분들이 이 변화의 시대를 현명하게 헤쳐나가시길 바랍니다. 기술은 도구일 뿐이고, 그것을 어떻게 활용하는지는 우리에게 달려있다는 점을 기억해주세요."}
|
835 |
]
|
836 |
}
|
837 |
|
838 |
def _get_default_english_conversation(self) -> Dict:
|
839 |
+
"""기본 영어 대화 템플릿"""
|
840 |
return {
|
841 |
"conversation": [
|
842 |
{"speaker": "Alex", "text": "Welcome everyone to our podcast! Today we're diving into a topic that's reshaping our world. Dr. Jordan, could you start by explaining why this subject has become so critical right now?"},
|
|
|
844 |
{"speaker": "Alex", "text": "400% acceleration is staggering! What does this mean for everyday people who might not be tech-savvy?"},
|
845 |
{"speaker": "Jordan", "text": "The impact will be profound yet accessible. Think about how smartphones revolutionized communication - this will be similar but across every aspect of life. McKinsey's latest report projects that by 2026, these technologies will create $4.4 trillion in annual value globally. For individuals, this translates to personalized healthcare that can predict illnesses years in advance, educational systems that adapt to each student's learning style, and financial tools that democratize wealth-building strategies previously available only to the ultra-wealthy."},
|
846 |
{"speaker": "Alex", "text": "Those applications sound transformative. Can you give us a concrete example of how this is already being implemented?"},
|
847 |
+
{"speaker": "Jordan", "text": "Absolutely. Let me share a compelling case from Johns Hopkins Hospital. They've deployed an AI system that analyzes patient data in real-time, reducing diagnostic errors by 85% and cutting average diagnosis time from days to hours. In one documented case, the system identified a rare genetic disorder in a child that had been misdiagnosed for three years. The accuracy comes from analyzing patterns across millions of cases - something impossible for even the most experienced doctors to do manually."},
|
848 |
{"speaker": "Alex", "text": "That's truly life-changing technology. But I imagine there are significant challenges and risks we need to consider?"},
|
849 |
+
{"speaker": "Jordan", "text": "You're absolutely right to raise this. The challenges are as significant as the opportunities. The World Economic Forum identifies three critical risks: algorithmic bias could perpetuate existing inequalities, cybersecurity threats become exponentially more dangerous, and there's the socioeconomic disruption with PwC estimating that 30% of jobs could be automated by 2030. However, history shows us that technological revolutions create new opportunities even as they displace old ones. The key is proactive adaptation and responsible development."},
|
850 |
{"speaker": "Alex", "text": "How should individuals and organizations prepare for these changes?"},
|
851 |
+
{"speaker": "Jordan", "text": "Preparation requires a multi-faceted approach. For individuals, I recommend focusing on skills that complement rather than compete with AI: critical thinking, emotional intelligence, and creative problem-solving. MIT's recent study shows that professionals who combine domain expertise with AI literacy see salary increases of 40% on average. Organizations need to invest in continuous learning programs - Amazon's $700 million worker retraining initiative is a good model. Most importantly, we need to cultivate an adaptive mindset."},
|
852 |
{"speaker": "Alex", "text": "That's practical advice. What about the ethical considerations? How do we ensure this technology benefits humanity as a whole?"},
|
853 |
+
{"speaker": "Jordan", "text": "Ethics must be at the forefront of development. The EU's AI Act and similar regulations worldwide are establishing important guardrails. We need transparent AI systems where decisions can be explained and audited. Companies like IBM and Google have established AI ethics boards, but we need industry-wide standards. Additionally, we must address the digital divide - UNESCO reports that 37% of the global population still lacks internet access. Without inclusive development, these technologies could exacerbate global inequality."},
|
854 |
{"speaker": "Alex", "text": "Looking ahead, what's your vision for how this technology will shape the next decade?"},
|
855 |
+
{"speaker": "Jordan", "text": "The next decade will be transformative beyond our current imagination. By 2035, I expect we'll see autonomous systems managing entire cities, personalized medicine extending human lifespan by 20-30 years, and educational AI that makes world-class education universally accessible. The convergence of AI with quantum computing, biotechnology, and nanotechnology will unlock possibilities we can barely conceive of today. However, the future isn't predetermined - it's shaped by the choices we make now about development priorities and ethical frameworks."},
|
|
|
|
|
856 |
{"speaker": "Alex", "text": "Dr. Jordan, this has been an incredibly enlightening discussion. Thank you for sharing your expertise and insights with us today."},
|
857 |
+
{"speaker": "Jordan", "text": "Thank you, Alex. For listeners wanting to dive deeper, I've compiled additional resources on my website. Remember, the future isn't something that happens to us - it's something we create together. I look forward to seeing how each of you contributes to shaping this exciting new era."}
|
858 |
]
|
859 |
}
|
860 |
|
861 |
def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
|
862 |
+
"""Extract conversation using API"""
|
863 |
if not self.llm_client:
|
864 |
raise RuntimeError("API mode not initialized")
|
865 |
|
|
|
876 |
except Exception as e:
|
877 |
print(f"Search failed, continuing without context: {e}")
|
878 |
|
879 |
+
# 언어별 시스템 메시지
|
880 |
if language == "Korean":
|
881 |
system_message = (
|
882 |
"당신은 한국의 최고 전문 팟캐스트 작가입니다. "
|
883 |
+
"12회의 깊이 있는 대화 교환으로 구성된 고품질 대담을 만드세요. "
|
884 |
+
"반드시 서로 존댓말을 사용하세요."
|
|
|
|
|
|
|
|
|
885 |
)
|
886 |
else:
|
887 |
system_message = (
|
888 |
+
f"You are a top professional podcast scriptwriter. "
|
889 |
+
f"Create high-quality discussions in {language} with exactly 12 exchanges. "
|
890 |
+
f"Include specific data, research findings, and real cases."
|
|
|
|
|
|
|
|
|
891 |
)
|
892 |
|
893 |
chat_completion = self.llm_client.chat.completions.create(
|
|
|
930 |
filenames = []
|
931 |
|
932 |
try:
|
933 |
+
# 언어별 음성 설정
|
934 |
+
voices = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["English"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
935 |
|
936 |
for i, turn in enumerate(conversation_json["conversation"]):
|
937 |
filename = output_dir / f"output_{i}.wav"
|
|
|
982 |
# Create different voice characteristics for different speakers
|
983 |
if language == "Korean":
|
984 |
voice_configs = [
|
985 |
+
{"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 준수입니다.", "gender": "male"},
|
986 |
+
{"prompt_text": "안녕하세요, 저는 오늘 이 주제에 대해 설명드릴 민호입니다.", "gender": "male"}
|
987 |
]
|
988 |
else:
|
989 |
voice_configs = [
|
990 |
+
{"prompt_text": "Hello everyone, I'm Alex, your host for today's podcast.", "gender": "male"},
|
991 |
+
{"prompt_text": "Hi, I'm Jordan. I'm excited to share my insights with you.", "gender": "male"}
|
992 |
]
|
993 |
|
994 |
for i, turn in enumerate(conversation_json["conversation"]):
|
|
|
996 |
if not text.strip():
|
997 |
continue
|
998 |
|
|
|
999 |
voice_config = voice_configs[i % len(voice_configs)]
|
|
|
1000 |
output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
|
1001 |
|
|
|
1002 |
cmd = [
|
1003 |
"python", "-m", "cli.inference",
|
1004 |
"--text", text,
|
|
|
1010 |
]
|
1011 |
|
1012 |
try:
|
|
|
1013 |
result = subprocess.run(
|
1014 |
cmd,
|
1015 |
capture_output=True,
|
1016 |
text=True,
|
1017 |
timeout=60,
|
1018 |
+
cwd="."
|
1019 |
)
|
1020 |
|
1021 |
if result.returncode == 0:
|
1022 |
audio_files.append(output_file)
|
1023 |
else:
|
1024 |
print(f"Spark TTS error for turn {i}: {result.stderr}")
|
1025 |
+
silence = np.zeros(int(22050 * 1.0))
|
|
|
1026 |
sf.write(output_file, silence, 22050)
|
1027 |
audio_files.append(output_file)
|
1028 |
|
1029 |
except subprocess.TimeoutExpired:
|
1030 |
print(f"Spark TTS timeout for turn {i}")
|
|
|
1031 |
silence = np.zeros(int(22050 * 1.0))
|
1032 |
sf.write(output_file, silence, 22050)
|
1033 |
audio_files.append(output_file)
|
1034 |
except Exception as e:
|
1035 |
print(f"Error running Spark TTS for turn {i}: {e}")
|
|
|
1036 |
silence = np.zeros(int(22050 * 1.0))
|
1037 |
sf.write(output_file, silence, 22050)
|
1038 |
audio_files.append(output_file)
|
|
|
1044 |
else:
|
1045 |
raise RuntimeError("No audio files generated")
|
1046 |
|
|
|
1047 |
conversation_text = "\n".join(
|
1048 |
f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
|
1049 |
for i, turn in enumerate(conversation_json["conversation"])
|
|
|
1069 |
speaker = speakers[i % 2]
|
1070 |
speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
|
1071 |
|
|
|
1072 |
self.melo_models["EN"].tts_to_file(
|
1073 |
text, speaker_id, bio, speed=1.0,
|
1074 |
pbar=progress.tqdm if progress else None,
|
|
|
1079 |
audio_segment = AudioSegment.from_file(bio, format="wav")
|
1080 |
combined_audio += audio_segment
|
1081 |
|
|
|
1082 |
final_audio_path = "melo_podcast.mp3"
|
1083 |
combined_audio.export(final_audio_path, format="mp3")
|
1084 |
|
|
|
1085 |
conversation_text = "\n".join(
|
1086 |
f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
|
1087 |
for i, turn in enumerate(conversation_json["conversation"])
|
|
|
1140 |
else: # Keyword
|
1141 |
if not article_input or not isinstance(article_input, str):
|
1142 |
return "Please provide a keyword or topic.", None
|
|
|
1143 |
text = search_and_compile_content(article_input, language)
|
1144 |
+
text = f"Keyword-based content:\n{text}"
|
1145 |
+
|
1146 |
# Limit text to max words
|
1147 |
words = text.split()
|
1148 |
if len(words) > converter.config.max_words:
|
|
|
1150 |
|
1151 |
# Extract conversation based on mode
|
1152 |
if mode == "Local":
|
|
|
1153 |
try:
|
1154 |
conversation_json = converter.extract_conversation_local(text, language)
|
1155 |
except Exception as e:
|
1156 |
print(f"Local mode failed: {e}, trying API fallback")
|
|
|
1157 |
api_key = os.environ.get("TOGETHER_API_KEY")
|
1158 |
if api_key:
|
1159 |
converter.initialize_api_mode(api_key)
|
1160 |
conversation_json = converter.extract_conversation_api(text, language)
|
1161 |
else:
|
1162 |
raise RuntimeError("Local mode failed and no API key available for fallback")
|
1163 |
+
else: # API mode
|
1164 |
api_key = os.environ.get("TOGETHER_API_KEY")
|
1165 |
if not api_key:
|
1166 |
print("API key not found, falling back to local mode")
|
|
|
1191 |
return "Please provide conversation text.", None
|
1192 |
|
1193 |
try:
|
|
|
1194 |
conversation_json = converter.parse_conversation_text(conversation_text)
|
1195 |
|
1196 |
if not conversation_json["conversation"]:
|
1197 |
return "No valid conversation found in the text.", None
|
1198 |
|
1199 |
+
# Edge TTS 전용 언어는 자동으로 Edge-TTS 사용
|
1200 |
+
if language in EDGE_TTS_ONLY_LANGUAGES and tts_engine != "Edge-TTS":
|
1201 |
+
tts_engine = "Edge-TTS"
|
1202 |
|
1203 |
# Generate audio based on TTS engine
|
1204 |
if tts_engine == "Edge-TTS":
|
|
|
1211 |
else: # MeloTTS
|
1212 |
if not MELO_AVAILABLE:
|
1213 |
return "MeloTTS not available. Please install required dependencies.", None
|
1214 |
+
if language in EDGE_TTS_ONLY_LANGUAGES:
|
1215 |
+
return f"MeloTTS does not support {language}. Please use Edge-TTS for this language.", None
|
1216 |
converter.initialize_melo_tts()
|
1217 |
output_file, _ = converter.text_to_speech_melo(conversation_json)
|
1218 |
|
|
|
1232 |
return asyncio.run(regenerate_audio(conversation_text, tts_engine, language))
|
1233 |
|
1234 |
|
1235 |
+
def update_tts_engine_for_language(language):
|
1236 |
+
"""언어별 TTS 엔진 옵션 업데이트"""
|
1237 |
+
if language in EDGE_TTS_ONLY_LANGUAGES:
|
1238 |
+
language_info = {
|
1239 |
+
"Korean": "한국어는 Edge-TTS만 지원됩니다",
|
1240 |
+
"Japanese": "日本語はEdge-TTSのみサポートされています",
|
1241 |
+
"French": "Le français n'est pris en charge que par Edge-TTS",
|
1242 |
+
"German": "Deutsch wird nur von Edge-TTS unterstützt",
|
1243 |
+
"Spanish": "El español solo es compatible con Edge-TTS",
|
1244 |
+
"Italian": "L'italiano è supportato solo da Edge-TTS",
|
1245 |
+
"Portuguese": "O português é suportado apenas pelo Edge-TTS",
|
1246 |
+
"Dutch": "Nederlands wordt alleen ondersteund door Edge-TTS",
|
1247 |
+
"Thai": "ภาษาไทยรองรับเฉพาะ Edge-TTS เท่านั้น",
|
1248 |
+
"Vietnamese": "Tiếng Việt chỉ được hỗ trợ bởi Edge-TTS",
|
1249 |
+
"Arabic": "العربية مدعومة فقط من Edge-TTS",
|
1250 |
+
"Hebrew": "עברית נתמכת רק על ידי Edge-TTS",
|
1251 |
+
"Indonesian": "Bahasa Indonesia hanya didukung oleh Edge-TTS",
|
1252 |
+
"Hindi": "हिंदी केवल Edge-TTS द्वारा समर्थित है",
|
1253 |
+
"Russian": "Русский поддерживается только Edge-TTS",
|
1254 |
+
"Chinese": "中文仅支持Edge-TTS"
|
1255 |
+
}
|
1256 |
+
info_text = language_info.get(language, f"{language} is only supported by Edge-TTS")
|
1257 |
+
|
1258 |
return gr.Radio(
|
1259 |
choices=["Edge-TTS"],
|
1260 |
value="Edge-TTS",
|
1261 |
label="TTS Engine",
|
1262 |
+
info=info_text,
|
1263 |
interactive=False
|
1264 |
)
|
1265 |
else:
|
|
|
1295 |
print(f"Failed to download model at startup: {e}")
|
1296 |
|
1297 |
|
1298 |
+
# Gradio Interface - 개선된 다국어 레이아웃
|
1299 |
with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
|
1300 |
.container {max-width: 1200px; margin: auto; padding: 20px;}
|
1301 |
.header-text {text-align: center; margin-bottom: 30px;}
|
|
|
1307 |
# 헤더
|
1308 |
with gr.Row(elem_classes="header-text"):
|
1309 |
gr.Markdown("""
|
1310 |
+
# 🎙️ AI Podcast Generator - Professional Multi-Language Edition
|
1311 |
+
### Convert any article, blog, PDF document, or topic into an engaging professional podcast conversation in 24+ languages!
|
1312 |
""")
|
1313 |
|
1314 |
with gr.Row(elem_classes="discord-badge"):
|
|
|
1320 |
</p>
|
1321 |
""")
|
1322 |
|
|
|
|
|
1323 |
# 상태 표시 섹션
|
1324 |
with gr.Row():
|
1325 |
with gr.Column(scale=1):
|
|
|
1332 |
""")
|
1333 |
with gr.Column(scale=1):
|
1334 |
gr.Markdown("""
|
1335 |
+
#### 🌍 Multi-Language Support
|
1336 |
+
- **24+ Languages**: Korean, Japanese, French, German, Spanish, Italian, etc.
|
1337 |
+
- **Native Voices**: Optimized for each language
|
1338 |
+
- **Professional Style**: Expert discussions with data & insights
|
1339 |
+
- **Auto-TTS Selection**: Best engine per language
|
1340 |
""")
|
1341 |
|
1342 |
# 메인 입력 섹션
|
|
|
1371 |
# 키워드 입력
|
1372 |
keyword_input = gr.Textbox(
|
1373 |
label="🔍 Topic/Keyword",
|
1374 |
+
placeholder="Enter a topic (e.g., 'AI trends 2024', '인공지능', 'IA tendances', 'KI Trends')",
|
1375 |
value="",
|
1376 |
visible=False,
|
1377 |
info="System will search and compile latest information",
|
|
|
1382 |
with gr.Column(scale=1):
|
1383 |
# 언어 선택
|
1384 |
language_selector = gr.Radio(
|
1385 |
+
choices=[
|
1386 |
+
"English", "Korean", "Japanese", "French", "German",
|
1387 |
+
"Spanish", "Italian", "Portuguese", "Dutch", "Thai",
|
1388 |
+
"Vietnamese", "Arabic", "Hebrew", "Indonesian", "Hindi",
|
1389 |
+
"Russian", "Chinese", "Norwegian", "Swedish", "Finnish",
|
1390 |
+
"Danish", "Polish", "Turkish", "Greek", "Czech"
|
1391 |
+
],
|
1392 |
value="English",
|
1393 |
+
label="🌐 Language / 언어 / 语言",
|
1394 |
+
info="Select podcast language"
|
1395 |
)
|
1396 |
|
1397 |
# 처리 모드
|
|
|
1429 |
lines=25,
|
1430 |
max_lines=50,
|
1431 |
interactive=True,
|
1432 |
+
placeholder="Professional podcast conversation will appear here...\n전문 팟캐스트 대화가 여기에 표시됩니다...\nLa conversation professionnelle du podcast apparaîtra ici...",
|
1433 |
info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
|
1434 |
)
|
1435 |
|
|
|
1460 |
gr.Markdown("""
|
1461 |
#### 💡 Quick Tips:
|
1462 |
- **URL**: Paste any article link
|
1463 |
+
- **PDF**: Upload documents directly
|
1464 |
- **Keyword**: Enter topics for AI research
|
1465 |
+
- **24+ Languages** fully supported
|
1466 |
- Edit conversation before audio generation
|
1467 |
+
- Auto TTS engine selection per language
|
1468 |
""")
|
1469 |
|
1470 |
# 예제 섹션
|
1471 |
+
with gr.Accordion("📚 Multi-Language Examples", open=False):
|
1472 |
gr.Examples(
|
1473 |
examples=[
|
1474 |
+
["https://huggingface.co/blog/openfreeai/cycle-navigator", "URL", "Local", "Edge-TTS", "English"],
|
1475 |
["quantum computing breakthroughs", "Keyword", "Local", "Edge-TTS", "English"],
|
|
|
1476 |
["인공지능 윤리와 규제", "Keyword", "Local", "Edge-TTS", "Korean"],
|
1477 |
+
["https://huggingface.co/papers/2505.14810", "URL", "Local", "Edge-TTS", "Japanese"],
|
1478 |
+
["intelligence artificielle tendances", "Keyword", "Local", "Edge-TTS", "French"],
|
1479 |
+
["künstliche intelligenz entwicklung", "Keyword", "Local", "Edge-TTS", "German"],
|
1480 |
+
["inteligencia artificial avances", "Keyword", "Local", "Edge-TTS", "Spanish"],
|
1481 |
],
|
1482 |
inputs=[url_input, input_type_selector, mode_selector, tts_selector, language_selector],
|
1483 |
outputs=[conversation_output, status_output],
|
|
|
1494 |
|
1495 |
# 언어 변경 시 TTS 엔진 옵션 업데이트
|
1496 |
language_selector.change(
|
1497 |
+
fn=update_tts_engine_for_language,
|
1498 |
inputs=[language_selector],
|
1499 |
outputs=[tts_selector]
|
1500 |
)
|
|
|
1531 |
share=False,
|
1532 |
server_name="0.0.0.0",
|
1533 |
server_port=7860
|
1534 |
+
)
|