Spaces:

openfree
/

AI-Podcast

Running on Zero

App Files Files Community

openfree committed on May 25

Commit

b16cf9f

verified ·

1 Parent(s): 9cdb5d5

Update app-backup.py

Browse files

Files changed (1) hide show

app-backup.py +165 -66

app-backup.py CHANGED Viewed

@@ -137,31 +137,55 @@ class UnifiedAudioConverter:
         except httpx.HTTPError as e:
             raise RuntimeError(f"Failed to fetch URL: {e}")
-    def _build_prompt(self, text: str) -> str:
         """Build prompt for conversation generation"""
-        template = """
-        {
-            "conversation": [
-                {"speaker": "", "text": ""},
-                {"speaker": "", "text": ""}
-            ]
-        }
-        """
-        return (
-            f"{text}\n\nConvert the provided text into a short, informative and crisp "
-            f"podcast conversation between two experts. The tone should be "
-            f"professional and engaging. Please adhere to the following "
-            f"format and return ONLY the JSON:\n{template}"
-        )
-    def extract_conversation_api(self, text: str) -> Dict:
         """Extract conversation using API"""
         if not self.llm_client:
             raise RuntimeError("API mode not initialized")
         try:
             chat_completion = self.llm_client.chat.completions.create(
-                messages=[{"role": "user", "content": self._build_prompt(text)}],
                 model=self.config.model_name,
             )
@@ -175,15 +199,21 @@ class UnifiedAudioConverter:
         except Exception as e:
             raise RuntimeError(f"Failed to extract conversation: {e}")
-    def extract_conversation_local(self, text: str, progress=None) -> Dict:
         """Extract conversation using local model"""
         if not self.local_model or not self.tokenizer:
             raise RuntimeError("Local mode not initialized")
-        chat = [{
-            "role": "user",
-            "content": self._build_prompt(text)
-        }]
         terminators = [
             self.tokenizer.eos_token_id,
@@ -221,13 +251,21 @@ class UnifiedAudioConverter:
         if json_match:
             return json.loads(json_match.group())
         else:
-            # Return a default template if no valid JSON found
-            return {
-                "conversation": [
-                    {"speaker": "Host", "text": "Welcome to our podcast."},
-                    {"speaker": "Guest", "text": "Thank you for having me."}
-                ]
-            }
     def parse_conversation_text(self, conversation_text: str) -> Dict:
         """Parse conversation text back to JSON format"""
@@ -244,15 +282,27 @@ class UnifiedAudioConverter:
         return conversation_data
-    async def text_to_speech_edge(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[str, str]:
         """Convert text to speech using Edge TTS"""
         output_dir = Path(self._create_output_directory())
         filenames = []
         try:
             for i, turn in enumerate(conversation_json["conversation"]):
                 filename = output_dir / f"output_{i}.wav"
-                voice = voice_1 if i % 2 == 0 else voice_2
                 tmp_path = await self._generate_audio_edge(turn["text"], voice)
                 os.rename(tmp_path, filename)
@@ -286,7 +336,7 @@ class UnifiedAudioConverter:
         return tmp_path
-    def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
         """Convert text to speech using Spark TTS CLI"""
         if not SPARK_AVAILABLE or not self.spark_model_dir:
             raise RuntimeError("Spark TTS not available")
@@ -296,10 +346,16 @@ class UnifiedAudioConverter:
             audio_files = []
             # Create different voice characteristics for different speakers
-            voice_configs = [
-                {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
-                {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
-            ]
             for i, turn in enumerate(conversation_json["conversation"]):
                 text = turn["text"]
@@ -445,7 +501,7 @@ class UnifiedAudioConverter:
 converter = UnifiedAudioConverter(ConversationConfig())
-async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
     """Main synthesis function"""
     if not article_url:
         return "Please provide a valid URL.", None
@@ -465,10 +521,10 @@ async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edg
             if not api_key:
                 return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
             converter.initialize_api_mode(api_key)
-            conversation_json = converter.extract_conversation_api(text)
         else:  # Local mode
             converter.initialize_local_mode()
-            conversation_json = converter.extract_conversation_local(text)
         # Generate conversation text
         conversation_text = "\n".join(
@@ -482,7 +538,7 @@ async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edg
         return f"Error: {str(e)}", None
-async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS"):
     """Regenerate audio from edited conversation text"""
     if not conversation_text.strip():
         return "Please provide conversation text.", None
@@ -494,21 +550,23 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
         if not conversation_json["conversation"]:
             return "No valid conversation found in the text.", None
         # Generate audio based on TTS engine
         if tts_engine == "Edge-TTS":
-            output_file, _ = await converter.text_to_speech_edge(
-                conversation_json,
-                "en-US-AvaMultilingualNeural",
-                "en-US-AndrewMultilingualNeural"
-            )
         elif tts_engine == "Spark-TTS":
             if not SPARK_AVAILABLE:
                 return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
             converter.initialize_spark_tts()
-            output_file, _ = converter.text_to_speech_spark(conversation_json)
         else:  # MeloTTS
             if not MELO_AVAILABLE:
                 return "MeloTTS not available. Please install required dependencies.", None
             converter.initialize_melo_tts()
             output_file, _ = converter.text_to_speech_melo(conversation_json)
@@ -518,14 +576,34 @@ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS")
         return f"Error generating audio: {str(e)}", None
-def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
     """Synchronous wrapper for async synthesis"""
-    return asyncio.run(synthesize(article_url, mode, tts_engine))
-def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS"):
     """Synchronous wrapper for async audio regeneration"""
-    return asyncio.run(regenerate_audio(conversation_text, tts_engine))
 # Gradio Interface
@@ -541,6 +619,14 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 value=""
             )
         with gr.Column(scale=1):
             mode_selector = gr.Radio(
                 choices=["API", "Local"],
                 value="API",
@@ -548,7 +634,7 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 info="API: Faster, requires API key | Local: Slower, runs on device"
             )
-            # TTS 엔진 선택 - 기본 2개와 추가 옵션으로 구분
             with gr.Group():
                 gr.Markdown("### TTS Engine Selection")
                 tts_selector = gr.Radio(
@@ -565,36 +651,39 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
                 **Additional Option:**
                 - ⚡ **MeloTTS**: Local processing, GPU recommended
                 """)
-    convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
     with gr.Row():
         with gr.Column():
             conversation_output = gr.Textbox(
-                label="Generated Conversation (Editable)",
                 lines=15,
                 max_lines=30,
                 interactive=True,
-                placeholder="Generated conversation will appear here. You can edit it before generating audio.",
-                info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
             )
             # 오디오 생성 버튼 추가
             with gr.Row():
-                generate_audio_btn = gr.Button("🎙️ Generate Audio from Text", variant="secondary", size="lg")
-                gr.Markdown("*Edit the conversation above, then click to generate audio*")
         with gr.Column():
             audio_output = gr.Audio(
-                label="Podcast Audio",
                 type="filepath",
                 interactive=False
             )
             # 상태 메시지 추가
             status_output = gr.Textbox(
-                label="Status",
                 interactive=False,
                 visible=True
             )
@@ -602,14 +691,17 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
     # TTS 엔진별 설명 및 설치 안내 추가
     with gr.Row():
         gr.Markdown("""
-        ### TTS Engine Details:
         - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
         - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
           - **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
           - Features: Bilingual support (Chinese/English), controllable speech generation
           - License: CC BY-NC-SA (Non-commercial use only)
         - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
         ### Spark-TTS Setup Instructions:
         ```bash
@@ -621,26 +713,33 @@ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
     gr.Examples(
         examples=[
-            ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
-            ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
-            ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
         ],
-        inputs=[url_input, mode_selector, tts_selector],
         outputs=[conversation_output, status_output],
         fn=synthesize_sync,
         cache_examples=False,
     )
     # 이벤트 연결
     convert_btn.click(
         fn=synthesize_sync,
-        inputs=[url_input, mode_selector, tts_selector],
         outputs=[conversation_output, status_output]
     )
     generate_audio_btn.click(
         fn=regenerate_audio_sync,
-        inputs=[conversation_output, tts_selector],
         outputs=[status_output, audio_output]
     )

         except httpx.HTTPError as e:
             raise RuntimeError(f"Failed to fetch URL: {e}")
+    def _build_prompt(self, text: str, language: str = "English") -> str:
         """Build prompt for conversation generation"""
+        if language == "Korean":
+            template = """
+            {
+                "conversation": [
+                    {"speaker": "", "text": ""},
+                    {"speaker": "", "text": ""}
+                ]
+            }
+            """
+            return (
+                f"{text}\n\n제공된 텍스트를 두 명의 전문가 간의 짧고 유익하며 명확한 "
+                f"팟캐스트 대화로 변환해주세요. 톤은 전문적이고 매력적이어야 합니다. "
+                f"다음 형식을 준수하고 JSON만 반환해주세요:\n{template}"
+            )
+        else:
+            template = """
+            {
+                "conversation": [
+                    {"speaker": "", "text": ""},
+                    {"speaker": "", "text": ""}
+                ]
+            }
+            """
+            return (
+                f"{text}\n\nConvert the provided text into a short, informative and crisp "
+                f"podcast conversation between two experts. The tone should be "
+                f"professional and engaging. Please adhere to the following "
+                f"format and return ONLY the JSON:\n{template}"
+            )
+    def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
         """Extract conversation using API"""
         if not self.llm_client:
             raise RuntimeError("API mode not initialized")
         try:
+            # 언어별 프롬프트 구성
+            if language == "Korean":
+                system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
+            else:
+                system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."
             chat_completion = self.llm_client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": self._build_prompt(text, language)}
+                ],
                 model=self.config.model_name,
             )
         except Exception as e:
             raise RuntimeError(f"Failed to extract conversation: {e}")
+    def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
         """Extract conversation using local model"""
         if not self.local_model or not self.tokenizer:
             raise RuntimeError("Local mode not initialized")
+        # 언어별 시스템 메시지
+        if language == "Korean":
+            system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
+        else:
+            system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."
+        chat = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": self._build_prompt(text, language)}
+        ]
         terminators = [
             self.tokenizer.eos_token_id,
         if json_match:
             return json.loads(json_match.group())
         else:
+            # Return a default template based on language
+            if language == "Korean":
+                return {
+                    "conversation": [
+                        {"speaker": "진행자", "text": "안녕하세요, 팟캐스트에 오신 것을 환영합니다."},
+                        {"speaker": "게스트", "text": "안녕하세요, 초대해 주셔서 감사합니다."}
+                    ]
+                }
+            else:
+                return {
+                    "conversation": [
+                        {"speaker": "Host", "text": "Welcome to our podcast."},
+                        {"speaker": "Guest", "text": "Thank you for having me."}
+                    ]
+                }
     def parse_conversation_text(self, conversation_text: str) -> Dict:
         """Parse conversation text back to JSON format"""
         return conversation_data
+    async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
         """Convert text to speech using Edge TTS"""
         output_dir = Path(self._create_output_directory())
         filenames = []
         try:
+            # 언어별 음성 설정
+            if language == "Korean":
+                voices = [
+                    "ko-KR-SunHiNeural",  # 여성 음성 (자연스러운 한국어)
+                    "ko-KR-InJoonNeural"  # 남성 음성 (자연스러운 한국어)
+                ]
+            else:
+                voices = [
+                    "en-US-AvaMultilingualNeural",    # 여성 음성
+                    "en-US-AndrewMultilingualNeural"  # 남성 음성
+                ]
             for i, turn in enumerate(conversation_json["conversation"]):
                 filename = output_dir / f"output_{i}.wav"
+                voice = voices[i % len(voices)]
                 tmp_path = await self._generate_audio_edge(turn["text"], voice)
                 os.rename(tmp_path, filename)
         return tmp_path
+    def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
         """Convert text to speech using Spark TTS CLI"""
         if not SPARK_AVAILABLE or not self.spark_model_dir:
             raise RuntimeError("Spark TTS not available")
             audio_files = []
             # Create different voice characteristics for different speakers
+            if language == "Korean":
+                voice_configs = [
+                    {"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 진행자입니다.", "gender": "female"},
+                    {"prompt_text": "안녕하세요, 오늘 게스트로 참여하게 되어 기쁩니다.", "gender": "male"}
+                ]
+            else:
+                voice_configs = [
+                    {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
+                    {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
+                ]
             for i, turn in enumerate(conversation_json["conversation"]):
                 text = turn["text"]
 converter = UnifiedAudioConverter(ConversationConfig())
+async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
     """Main synthesis function"""
     if not article_url:
         return "Please provide a valid URL.", None
             if not api_key:
                 return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
             converter.initialize_api_mode(api_key)
+            conversation_json = converter.extract_conversation_api(text, language)
         else:  # Local mode
             converter.initialize_local_mode()
+            conversation_json = converter.extract_conversation_local(text, language)
         # Generate conversation text
         conversation_text = "\n".join(
         return f"Error: {str(e)}", None
+async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
     """Regenerate audio from edited conversation text"""
     if not conversation_text.strip():
         return "Please provide conversation text.", None
         if not conversation_json["conversation"]:
             return "No valid conversation found in the text.", None
+        # 한국어인 경우 Edge-TTS만 사용 (다른 TTS는 한국어 지원이 제한적)
+        if language == "Korean" and tts_engine != "Edge-TTS":
+            return "한국어는 Edge-TTS만 지원됩니다. TTS 엔진이 자동으로 Edge-TTS로 변경됩니다.", None
         # Generate audio based on TTS engine
         if tts_engine == "Edge-TTS":
+            output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
         elif tts_engine == "Spark-TTS":
             if not SPARK_AVAILABLE:
                 return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
             converter.initialize_spark_tts()
+            output_file, _ = converter.text_to_speech_spark(conversation_json, language)
         else:  # MeloTTS
             if not MELO_AVAILABLE:
                 return "MeloTTS not available. Please install required dependencies.", None
+            if language == "Korean":
+                return "MeloTTS does not support Korean. Please use Edge-TTS for Korean.", None
             converter.initialize_melo_tts()
             output_file, _ = converter.text_to_speech_melo(conversation_json)
         return f"Error generating audio: {str(e)}", None
+def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS", language: str = "English"):
     """Synchronous wrapper for async synthesis"""
+    return asyncio.run(synthesize(article_url, mode, tts_engine, language))
+def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
     """Synchronous wrapper for async audio regeneration"""
+    return asyncio.run(regenerate_audio(conversation_text, tts_engine, language))
+def update_tts_engine_for_korean(language):
+    """한국어 선택 시 TTS 엔진 옵션 업데이트"""
+    if language == "Korean":
+        return gr.Radio(
+            choices=["Edge-TTS"],
+            value="Edge-TTS",
+            label="TTS Engine",
+            info="한국어는 Edge-TTS만 지원됩니다",
+            interactive=False
+        )
+    else:
+        return gr.Radio(
+            choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
+            value="Edge-TTS",
+            label="TTS Engine",
+            info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
+            interactive=True
+        )
 # Gradio Interface
                 value=""
             )
         with gr.Column(scale=1):
+            # 언어 선택 추가
+            language_selector = gr.Radio(
+                choices=["English", "Korean"],
+                value="English",
+                label="Language / 언어",
+                info="Select output language / 출력 언어를 선택하세요"
+            )
             mode_selector = gr.Radio(
                 choices=["API", "Local"],
                 value="API",
                 info="API: Faster, requires API key | Local: Slower, runs on device"
             )
+            # TTS 엔진 선택
             with gr.Group():
                 gr.Markdown("### TTS Engine Selection")
                 tts_selector = gr.Radio(
                 **Additional Option:**
                 - ⚡ **MeloTTS**: Local processing, GPU recommended
+                **한국어 지원:**
+                - 🇰🇷 한국어 선택 시 Edge-TTS만 사용 가능합니다
                 """)
+    convert_btn = gr.Button("🎯 Generate Conversation / 대화 생성", variant="primary", size="lg")
     with gr.Row():
         with gr.Column():
             conversation_output = gr.Textbox(
+                label="Generated Conversation (Editable) / 생성된 대화 (편집 가능)",
                 lines=15,
                 max_lines=30,
                 interactive=True,
+                placeholder="Generated conversation will appear here. You can edit it before generating audio.\n생성된 대화가 여기에 표시됩니다. 오디오 생성 전에 편집할 수 있습니다.",
+                info="Edit the conversation as needed. Format: 'Speaker Name: Text' / 필요에 따라 대화를 편집하세요. 형식: '화자 이름: 텍스트'"
             )
             # 오디오 생성 버튼 추가
             with gr.Row():
+                generate_audio_btn = gr.Button("🎙️ Generate Audio from Text / 텍스트에서 오디오 생성", variant="secondary", size="lg")
+                gr.Markdown("*Edit the conversation above, then click to generate audio / 위의 대화를 편집한 후 클릭하여 오디오를 생성하세요*")
         with gr.Column():
             audio_output = gr.Audio(
+                label="Podcast Audio / 팟캐스트 오디오",
                 type="filepath",
                 interactive=False
             )
             # 상태 메시지 추가
             status_output = gr.Textbox(
+                label="Status / 상태",
                 interactive=False,
                 visible=True
             )
     # TTS 엔진별 설명 및 설치 안내 추가
     with gr.Row():
         gr.Markdown("""
+        ### TTS Engine Details / TTS 엔진 상세정보:
         - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires internet connection.
+          - 🇰🇷 **한국어 지원**: 자연스러운 한국어 음성 (여성: SunHi, 남성: InJoon)
         - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
           - **Setup required**: Clone [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) in current directory
           - Features: Bilingual support (Chinese/English), controllable speech generation
           - License: CC BY-NC-SA (Non-commercial use only)
+          - ⚠️ **한국어 미지원**
         - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
+          - ⚠️ **한국어 미지원**
         ### Spark-TTS Setup Instructions:
         ```bash
     gr.Examples(
         examples=[
+            ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS", "English"],
+            ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS", "English"],
+            ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS", "Korean"],
         ],
+        inputs=[url_input, mode_selector, tts_selector, language_selector],
         outputs=[conversation_output, status_output],
         fn=synthesize_sync,
         cache_examples=False,
     )
+    # 언어 변경 시 TTS 엔진 옵션 업데이트
+    language_selector.change(
+        fn=update_tts_engine_for_korean,
+        inputs=[language_selector],
+        outputs=[tts_selector]
+    )
     # 이벤트 연결
     convert_btn.click(
         fn=synthesize_sync,
+        inputs=[url_input, mode_selector, tts_selector, language_selector],
         outputs=[conversation_output, status_output]
     )
     generate_audio_btn.click(
         fn=regenerate_audio_sync,
+        inputs=[conversation_output, tts_selector, language_selector],
         outputs=[status_output, audio_output]
     )