Voice-Clone-Podcast

Running on Zero

App Files Files Community

seawolf2357 committed on May 30

Commit

c012459

verified ·

1 Parent(s): bc8f404

Update app.py

Browse files

Files changed (1) hide show

app.py +980 -177

app.py CHANGED Viewed

@@ -1,14 +1,78 @@
-import random
-import numpy as np
-import torch
-from chatterbox.src.chatterbox.tts import ChatterboxTTS
-import gradio as gr
 import spaces
 import re
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
 def set_seed(seed: int):
     """Sets the random seed for reproducibility across torch, numpy, and random."""
     torch.manual_seed(seed)
@@ -18,6 +82,7 @@ def set_seed(seed: int):
     random.seed(seed)
     np.random.seed(seed)
 def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
     """
     텍스트를 문장 단위로 나누되, 각 청크가 max_chars를 넘지 않도록 합니다.
@@ -65,215 +130,953 @@ def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
     return chunks
-@spaces.GPU(duration=120)  # GPU 사용 시간을 충분히 설정
-def generate_tts_audio_gpu(
-    text_input: str,
-    audio_prompt_path_input: str,
-    exaggeration_input: float,
-    temperature_input: float,
-    seed_num_input: int,
-    cfgw_input: float,
-    chunk_size_input: int
-) -> tuple[int, np.ndarray]:
-    """
-    GPU에서 TTS 오디오를 생성합니다.
-    """
-    # GPU 함수 내에서 모델 로드
-    model = ChatterboxTTS.from_pretrained(DEVICE)
-    if seed_num_input != 0:
-        set_seed(int(seed_num_input))
-    # 텍스트가 짧으면 단일 생성
-    if len(text_input) <= 300:
-        print(f"단일 텍스트 생성: '{text_input[:50]}...'")
-        wav = model.generate(
-            text_input,
-            audio_prompt_path=audio_prompt_path_input,
-            exaggeration=exaggeration_input,
-            temperature=temperature_input,
-            cfg_weight=cfgw_input,
-        )
-        return (model.sr, wav.squeeze(0).numpy())
-    # 긴 텍스트는 청크로 분할
-    chunks = split_text_into_chunks(text_input, max_chars=chunk_size_input)
-    total_chunks = len(chunks)
-    print(f"텍스트를 {total_chunks}개의 청크로 분할했습니다.")
-    audio_segments = []
-    for i, chunk in enumerate(chunks):
-        print(f"청크 {i + 1}/{total_chunks} 생성 중: '{chunk[:50]}...'")
-        try:
-            wav = model.generate(
-                chunk,
-                audio_prompt_path=audio_prompt_path_input,
-                exaggeration=exaggeration_input,
-                temperature=temperature_input,
-                cfg_weight=cfgw_input,
-            )
-            wav_chunk = wav.squeeze(0).numpy()
-            audio_segments.append(wav_chunk)
-        except Exception as e:
-            print(f"청크 {i + 1} 생성 중 오류 발생: {e}")
-            continue
-    if not audio_segments:
-        raise RuntimeError("오디오 생성에 실패했습니다.")
-    # 오디오 세그먼트 연결
-    silence_duration = int(0.2 * model.sr)  # 0.2초 무음
-    silence = np.zeros(silence_duration)
-    final_audio = []
-    for i, segment in enumerate(audio_segments):
-        final_audio.append(segment)
-        if i < len(audio_segments) - 1:
-            final_audio.append(silence)
-    concatenated_audio = np.concatenate(final_audio)
-    print(f"오디오 생성 완료. 총 길이: {len(concatenated_audio) / model.sr:.2f}초")
-    return (model.sr, concatenated_audio)
-# Gradio 인터페이스
-with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        # Chatterbox TTS Demo - 무제한 길이 버전
-        긴 텍스트도 청크로 나누어 처리하여 제한 없이 음성을 생성합니다.
-        ⚠️ **주의**: 긴 텍스트 처리 시 시간이 오래 걸릴 수 있습니다.
-        """
-    )
-    with gr.Row():
-        with gr.Column():
-            text = gr.Textbox(
-                value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
-                label="텍스트 입력 (길이 제한 없음)",
-                lines=10,
-                max_lines=30
-            )
-            ref_wav = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Reference Audio File (Optional)",
-                value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
-            )
-            with gr.Row():
-                exaggeration = gr.Slider(
-                    0.25, 2, step=.05,
-                    label="Exaggeration (Neutral = 0.5)",
-                    value=.5
                 )
-                cfg_weight = gr.Slider(
-                    0.2, 1, step=.05,
-                    label="CFG/Pace",
-                    value=0.5
                 )
-            chunk_size = gr.Slider(
-                100, 300, step=50,
-                label="청크 크기 (문자 수)",
-                value=250,
-                info="텍스트를 나눌 청크의 최대 크기입니다."
             )
-            with gr.Accordion("고급 옵션", open=False):
-                seed_num = gr.Number(value=0, label="Random seed (0 for random)")
-                temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
-            run_btn = gr.Button("🎤 음성 생성", variant="primary")
-        with gr.Column():
-            audio_output = gr.Audio(label="생성된 음성")
-            # 텍스트 길이 표시
-            char_count = gr.Textbox(
-                label="텍스트 정보",
-                value="0 문자",
-                interactive=False
             )
-            status = gr.Textbox(
-                label="상태",
-                value="대기 중...",
-                interactive=False
             )
-    # 텍스트 입력 시 문자 수 업데이트
-    def update_char_count(text, chunk_size):
-        char_len = len(text)
-        if char_len <= 300:
-            return f"{char_len} 문자 (단일 생성)"
-        else:
-            chunks = split_text_into_chunks(text, max_chars=chunk_size)
-            chunk_count = len(chunks)
-            estimated_time = chunk_count * 3  # 청크당 약 3초 예상
-            return f"{char_len} 문자, {chunk_count}개 청크 (예상 시간: 약 {estimated_time}초)"
-    text.change(
-        fn=update_char_count,
-        inputs=[text, chunk_size],
-        outputs=[char_count]
-    )
-    chunk_size.change(
-        fn=update_char_count,
-        inputs=[text, chunk_size],
-        outputs=[char_count]
-    )
-    # 생성 함수 래퍼 (상태 업데이트 포함)
-    def generate_with_status(text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size):
         try:
-            yield gr.update(value="처리 중... GPU를 할당받는 중입니다."), None
-            # GPU 함수 호출
-            sr, audio = generate_tts_audio_gpu(
-                text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size
             )
-            yield gr.update(value="✅ 생성 완료!"), (sr, audio)
         except Exception as e:
-            yield gr.update(value=f"❌ 오류 발생: {str(e)}"), None
-    run_btn.click(
-        fn=generate_with_status,
-        inputs=[
-            text,
-            ref_wav,
             exaggeration,
-            temp,
             seed_num,
             cfg_weight,
             chunk_size
-        ],
-        outputs=[status, audio_output],
     )
-    gr.Markdown(
-        """
-        ### 💡 사용 팁:
-        - **300자 이하**: 빠른 단일 생성
-        - **300자 초과**: 자동으로 청크로 분할하여 처리
-        - 청크 크기가 작을수록 자연스럽지만 처리 시간이 증가합니다
-        - GPU 할당을 기다리는 시간이 있을 수 있습니다
-        ### ⏱️ 예상 처리 시간:
-        - 300자 이하: 약 5-10초
-        - 1000자: 약 15-30초
-        - 5000자: 약 1-2분
-        """
     )
-# 앱 실행 시 모델 로드 제거 (GPU 함수 내에서만 로드)
-print("앱이 시작되었습니다. 모델은 첫 생성 시 로드됩니다.")
-demo.queue().launch()

 import spaces
+import gradio as gr
+import os
+import asyncio
+import torch
+import io
+import json
 import re
+import httpx
+import tempfile
+import wave
+import base64
+import numpy as np
+import soundfile as sf
+import subprocess
+import shutil
+import requests
+import logging
+import random
+from datetime import datetime, timedelta
+from typing import List, Tuple, Dict, Optional
+from pathlib import Path
+from threading import Thread
+from dotenv import load_dotenv
+# PDF processing imports
+from langchain_community.document_loaders import PyPDFLoader
+# OpenAI imports
+from openai import OpenAI
+# Transformers imports (for legacy local mode)
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    BitsAndBytesConfig,
+)
+# Llama CPP imports (for new local mode)
+try:
+    from llama_cpp import Llama
+    from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+    from llama_cpp_agent.providers import LlamaCppPythonProvider
+    from llama_cpp_agent.chat_history import BasicChatHistory
+    from llama_cpp_agent.chat_history.messages import Roles
+    from huggingface_hub import hf_hub_download
+    LLAMA_CPP_AVAILABLE = True
+except ImportError:
+    LLAMA_CPP_AVAILABLE = False
+# Chatterbox TTS imports
+try:
+    from chatterbox.src.chatterbox.tts import ChatterboxTTS
+    CHATTERBOX_AVAILABLE = True
+except ImportError:
+    CHATTERBOX_AVAILABLE = False
+# Import config and prompts
+from config_prompts import (
+    ConversationConfig,
+    PromptBuilder,
+    DefaultConversations,
+)
+load_dotenv()
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"🚀 Running on device: {DEVICE}")
+# Brave Search API 설정
+BRAVE_KEY = os.getenv("BSEARCH_API")
+BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
 def set_seed(seed: int):
     """Sets the random seed for reproducibility across torch, numpy, and random."""
     torch.manual_seed(seed)
     random.seed(seed)
     np.random.seed(seed)
 def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
     """
     텍스트를 문장 단위로 나누되, 각 청크가 max_chars를 넘지 않도록 합니다.
     return chunks
+def brave_search(query: str, count: int = 8, freshness_days: int | None = None):
+    """Query the Brave web-search endpoint and return normalized results.
+
+    Args:
+        query: Search terms sent as the ``q`` parameter.
+        count: Number of results requested; also caps the returned list.
+        freshness_days: When truthy, only ask for results from the last
+            ``freshness_days`` days.
+
+    Returns:
+        A list of dicts with ``title``, ``url``, ``snippet`` and ``host``
+        string values. Empty when ``BRAVE_KEY`` is unset, when the request
+        or decoding fails (the error is logged), or when there are no
+        web results.
+    """
+    if not BRAVE_KEY:
+        return []
+    params = {"q": query, "count": str(count)}
+    if freshness_days:
+        # Brave's ``freshness`` filter takes pd/pw/pm/py or an explicit
+        # ``YYYY-MM-DDtoYYYY-MM-DD`` range, so send both ends of the window.
+        window_end = datetime.utcnow()
+        window_start = window_end - timedelta(days=freshness_days)
+        params["freshness"] = f"{window_start:%Y-%m-%d}to{window_end:%Y-%m-%d}"
+    try:
+        response = requests.get(
+            BRAVE_ENDPOINT,
+            headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_KEY},
+            params=params,
+            timeout=15
+        )
+        # Surface HTTP errors (bad key, rate limit) in the log instead of
+        # silently parsing an error payload as "no results".
+        response.raise_for_status()
+        raw = response.json().get("web", {}).get("results") or []
+        results = []
+        for item in raw[:count]:
+            # ``or`` also covers keys that are present but null.
+            url = item.get("url") or item.get("link") or ""
+            results.append({
+                "title": item.get("title") or "",
+                "url": url,
+                "snippet": item.get("description") or item.get("text") or "",
+                "host": re.sub(r"https?://(www\.)?", "", url).split("/")[0],
+            })
+        return results
+    except Exception as e:
+        logging.error(f"Brave search error: {e}")
+        return []
+def format_search_results(query: str, for_keyword: bool = False) -> str:
+    """Run a Brave search and render the top hits as a text block.
+
+    Keyword mode requests 5 results, keeps up to 4, truncates snippets at
+    200 characters and applies no freshness filter. Normal mode requests 3,
+    keeps up to 2, truncates at 100 characters and limits results to the
+    last 7 days. Returns an empty string when the search yields nothing.
+    """
+    if for_keyword:
+        fetch_count, keep, snippet_cap, freshness = 5, 4, 200, None
+    else:
+        fetch_count, keep, snippet_cap, freshness = 3, 2, 100, 7
+    hits = brave_search(query, fetch_count, freshness_days=freshness)
+    if not hits:
+        return ""
+    formatted = []
+    for hit in hits[:keep]:
+        body = hit['snippet']
+        # Mark truncated snippets with an ellipsis.
+        if len(body) > snippet_cap:
+            body = body[:snippet_cap] + "..."
+        if for_keyword:
+            # Keyword mode uses the richer layout with a source line.
+            formatted.append(f"**{hit['title']}**\n{body}\nSource: {hit['host']}")
+        else:
+            formatted.append(f"- {hit['title']}: {body}")
+    return "\n\n".join(formatted) + "\n"
+def extract_keywords_for_search(text: str, language: str = "English") -> List[str]:
+    """Pick one capitalized keyword from the start of ``text`` for searching.
+
+    Only the first 500 characters are inspected. Tokens are stripped of
+    surrounding punctuation first, then kept when they are longer than four
+    characters and start with an uppercase letter.
+
+    Args:
+        text: Source text to scan.
+        language: Accepted for interface compatibility; not used.
+
+    Returns:
+        A one-element list holding the longest qualifying token (the first
+        one on ties), or an empty list when none qualifies.
+    """
+    # Cap the amount of text scanned.
+    text_sample = text[:500]
+    # Strip before filtering so trailing punctuation does not inflate the
+    # length and a leading quote/bracket does not hide the capital letter.
+    tokens = (word.strip('.,!?;:"\'()[]') for word in text_sample.split())
+    keywords = [token for token in tokens
+                if len(token) > 4 and token[0].isupper()]
+    if keywords:
+        return [max(keywords, key=len)]
+    return []
+def search_and_compile_content(keyword: str, language: str = "English") -> str:
+    """Compile search results about ``keyword`` into one block of text.
+
+    Args:
+        keyword: Topic to search for.
+        language: Accepted for interface compatibility; not used.
+
+    Returns:
+        Without a Brave API key, a generic template paragraph about the
+        keyword. Otherwise a heading followed by up to three results from
+        each of six queries, padded with generic filler when the collected
+        snippets total fewer than 1000 characters.
+    """
+    if not BRAVE_KEY:
+        # Still produce baseline content when no API key is available.
+        return f"""
+Comprehensive information about '{keyword}':
+{keyword} is a significant topic in modern society.
+This subject impacts our lives in various ways and has been
+gaining increasing attention recently.
+Key aspects:
+1. Technological advancement and innovation
+2. Social impact and changes
+3. Future prospects and possibilities
+4. Practical applications
+5. Global trends and developments
+Experts predict that {keyword} will become even more important,
+and it's crucial to develop a deep understanding of this topic.
+"""
+    # Use the current year so the "latest news" query does not go stale.
+    current_year = datetime.now().year
+    queries = [
+        f"{keyword} latest news {current_year}",
+        f"{keyword} explained comprehensive",
+        f"{keyword} trends forecast",
+        f"{keyword} advantages disadvantages",
+        f"{keyword} how to use",
+        f"{keyword} expert opinions"
+    ]
+    all_content = []
+    total_content_length = 0
+    for query in queries:
+        results = brave_search(query, count=5)  # request more than we keep
+        for r in results[:3]:  # top 3 per query
+            content = f"**{r['title']}**\n{r['snippet']}\nSource: {r['host']}\n"
+            all_content.append(content)
+            total_content_length += len(r['snippet'])
+    # Pad with generic text when the snippets total under 1000 characters.
+    if total_content_length < 1000:
+        additional_content = f"""
+Additional insights:
+Recent developments in {keyword} show rapid advancement in this field.
+Many experts are actively researching this topic, and its practical
+applications continue to expand.
+Key points to note:
+- Accelerating technological innovation
+- Improving user experience
+- Enhanced accessibility
+- Increased cost efficiency
+- Growing global market
+These factors are making the future of {keyword} increasingly promising.
+"""
+        all_content.append(additional_content)
+    compiled = "\n\n".join(all_content)
+    # Prefix the compiled material with a keyword-based heading.
+    intro = f"### Comprehensive information and latest trends about '{keyword}':\n\n"
+    return intro + compiled
+class UnifiedAudioConverter:
+    def __init__(self, config: ConversationConfig):
+        """Store the configuration and start with every backend unloaded."""
+        self.config = config
+        self.prompt_builder = PromptBuilder()
+        # Prefer the GPU whenever torch reports one.
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Remote OpenAI-compatible client; stays None until API mode is set up.
+        self.llm_client = None
+        # llama.cpp model and the model file name it was loaded from.
+        self.local_llm = None
+        self.local_llm_model = None
+        # Legacy transformers model/tokenizer pair.
+        self.legacy_local_model = None
+        self.legacy_tokenizer = None
+    def initialize_api_mode(self, api_key: str):
+        """Create the OpenAI-compatible client pointed at the Together API."""
+        together_endpoint = "https://api.together.xyz/v1"
+        self.llm_client = OpenAI(base_url=together_endpoint, api_key=api_key)
+    @spaces.GPU(duration=120)
+    def initialize_local_mode(self):
+        """Load the llama.cpp model named in the config, once per model file.
+
+        Does nothing when the configured model is already loaded.
+
+        Raises:
+            RuntimeError: If the llama.cpp dependencies are missing, or if
+                downloading or loading the model fails (the original
+                exception is chained as the cause).
+        """
+        if not LLAMA_CPP_AVAILABLE:
+            raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")
+        if self.local_llm is not None and self.local_llm_model == self.config.local_model_name:
+            return
+        try:
+            # Fetch the model file into ./models; the path is rebuilt below.
+            hf_hub_download(
+                repo_id=self.config.local_model_repo,
+                filename=self.config.local_model_name,
+                local_dir="./models"
+            )
+            model_path_local = os.path.join("./models", self.config.local_model_name)
+            if not os.path.exists(model_path_local):
+                raise RuntimeError(f"Model file not found at {model_path_local}")
+            self.local_llm = Llama(
+                model_path=model_path_local,
+                flash_attn=True,
+                # Offload layers to the GPU only when CUDA is present.
+                n_gpu_layers=81 if torch.cuda.is_available() else 0,
+                n_batch=1024,
+                n_ctx=16384,
+            )
+            self.local_llm_model = self.config.local_model_name
+            print(f"Local LLM initialized: {model_path_local}")
+        except Exception as e:
+            print(f"Failed to initialize local LLM: {e}")
+            # Chain the cause so the original traceback is preserved.
+            raise RuntimeError(f"Failed to initialize local LLM: {e}") from e
+    @spaces.GPU(duration=60)
+    def initialize_legacy_local_mode(self):
+        """Load the 4-bit quantized Hugging Face fallback model and tokenizer.
+
+        Skipped when ``self.legacy_local_model`` is already set.
+        """
+        if self.legacy_local_model is not None:
+            return
+        model_name = self.config.legacy_local_model_name
+        four_bit = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16
+        )
+        self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=four_bit
+        )
+        # The tokenizer is pinned to a fixed repository revision.
+        self.legacy_tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
+        )
+    def fetch_text(self, url: str) -> str:
+        """Download the text of ``url`` through the configured prefix URL.
+
+        Args:
+            url: Absolute ``http://`` or ``https://`` address.
+
+        Returns:
+            The response body as text.
+
+        Raises:
+            ValueError: If ``url`` is empty or has another scheme.
+            RuntimeError: If the HTTP request fails; the ``httpx`` error is
+                chained as the cause.
+        """
+        if not url:
+            raise ValueError("URL cannot be empty")
+        if not url.startswith(("http://", "https://")):
+            raise ValueError("URL must start with 'http://' or 'https://'")
+        # The target is appended to the prefix from the config; what that
+        # prefix service does is not visible here.
+        full_url = f"{self.config.prefix_url}{url}"
+        try:
+            response = httpx.get(full_url, timeout=60.0)
+            response.raise_for_status()
+            return response.text
+        except httpx.HTTPError as e:
+            raise RuntimeError(f"Failed to fetch URL: {e}") from e
+    def extract_text_from_pdf(self, pdf_file) -> str:
+        """Extract the text of every page of a PDF.
+
+        Args:
+            pdf_file: Either a filesystem path (``str``) or an object with a
+                ``read()`` method returning the PDF bytes.
+
+        Returns:
+            The page texts joined with newlines.
+
+        Raises:
+            RuntimeError: If reading or parsing fails; the original
+                exception is chained as the cause.
+        """
+        temp_path = None
+        try:
+            if isinstance(pdf_file, str):
+                # Gradio hands over a file path, not a file object.
+                pdf_path = pdf_file
+            else:
+                # File-like input: spill it to a temporary file for the loader.
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+                    # Record the name first so cleanup runs even if the
+                    # write below fails.
+                    temp_path = tmp_file.name
+                    tmp_file.write(pdf_file.read())
+                pdf_path = temp_path
+            pages = PyPDFLoader(pdf_path).load()
+            return "\n".join(page.page_content for page in pages)
+        except Exception as e:
+            raise RuntimeError(f"Failed to extract text from PDF: {e}") from e
+        finally:
+            # Remove the temporary copy on success and on failure alike.
+            if temp_path is not None and os.path.exists(temp_path):
+                os.unlink(temp_path)
+    def _get_messages_formatter_type(self, model_name):
+        """Choose the chat formatter: CHATML for Mistral/BitSix names, else LLAMA_3."""
+        uses_chatml = any(tag in model_name for tag in ("Mistral", "BitSix"))
+        if uses_chatml:
+            return MessagesFormatterType.CHATML
+        return MessagesFormatterType.LLAMA_3
+    @spaces.GPU(duration=120)
+    def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
+        """Generate the podcast conversation with the llama.cpp local model.
+
+        Args:
+            text: Source material for the conversation.
+            language: Passed to keyword extraction and the prompt builder.
+            progress: Not used here; only forwarded to the legacy fallback.
+
+        Returns:
+            The dict parsed from the first JSON object found in the model
+            response. On any exception in this method, the result of
+            ``extract_conversation_legacy_local`` is returned instead.
+        """
+        try:
+            # Build optional search context. Skipped when no Brave key is
+            # configured or when the text is already keyword-based content.
+            search_context = ""
+            if BRAVE_KEY and not text.startswith("Keyword-based content:"):
+                try:
+                    keywords = extract_keywords_for_search(text, language)
+                    if keywords:
+                        search_query = f"{keywords[0]} latest news"
+                        search_context = format_search_results(search_query)
+                        print(f"Search context added for: {search_query}")
+                except Exception as e:
+                    # Search is best-effort; continue with an empty context.
+                    print(f"Search failed, continuing without context: {e}")
+            # Try the new local LLM first.
+            self.initialize_local_mode()
+            chat_template = self._get_messages_formatter_type(self.config.local_model_name)
+            provider = LlamaCppPythonProvider(self.local_llm)
+            # English-only system message.
+            system_message = (
+                f"You are a professional podcast scriptwriter creating high-quality, "
+                f"insightful discussions in English. Create exactly 12 conversation exchanges "
+                f"with professional expertise. All dialogue must be in English. "
+                f"Respond only in JSON format."
             )
+            agent = LlamaCppAgent(
+                provider,
+                system_prompt=system_message,
+                predefined_messages_formatter_type=chat_template,
+                debug_output=False
+            )
+            # Start from the provider defaults and override sampling options.
+            settings = provider.get_provider_default_settings()
+            settings.temperature = 0.75
+            settings.top_k = 40
+            settings.top_p = 0.95
+            settings.max_tokens = self.config.max_tokens
+            settings.repeat_penalty = 1.1
+            settings.stream = False
+            messages = BasicChatHistory()
+            prompt = self.prompt_builder.build_prompt(text, language, search_context)
+            response = agent.get_chat_response(
+                prompt,
+                llm_sampling_settings=settings,
+                chat_history=messages,
+                returns_streaming_generator=False,
+                print_output=False
             )
+            # Parse JSON: the pattern matches a {...} span that contains at
+            # most one level of nested braces.
+            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
+            json_match = re.search(pattern, response)
+            if json_match:
+                conversation_data = json.loads(json_match.group())
+                return conversation_data
+            else:
+                raise ValueError("No valid JSON found in local LLM response")
+        except Exception as e:
+            # Any failure above (init, generation, parsing) triggers the
+            # legacy fallback, reusing whatever search context was gathered.
+            print(f"Local LLM failed: {e}, falling back to legacy local method")
+            return self.extract_conversation_legacy_local(text, language, progress, search_context)
+    @spaces.GPU(duration=120)
+    def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
+        """Generate the conversation with the legacy transformers model.
+
+        Args:
+            text: Source material for the conversation.
+            language: Passed to the prompt builder.
+            progress: Accepted but not used in this method.
+            search_context: Extra context passed to the prompt builder.
+
+        Returns:
+            The dict parsed from the first JSON object in the generated
+            text. On any exception, the default conversation for "English"
+            is returned (regardless of ``language``).
+        """
         try:
+            self.initialize_legacy_local_mode()
+            # English-only messages.
+            messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
+            # Stop on the tokenizer's EOS token or on "<|eot_id|>".
+            terminators = [
+                self.legacy_tokenizer.eos_token_id,
+                self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
+            ]
+            chat_messages = self.legacy_tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
             )
+            model_inputs = self.legacy_tokenizer([chat_messages], return_tensors="pt").to(self.device)
+            streamer = TextIteratorStreamer(
+                self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+            )
+            generate_kwargs = dict(
+                model_inputs,
+                streamer=streamer,
+                max_new_tokens=self.config.max_new_tokens,
+                do_sample=True,
+                temperature=0.75,
+                eos_token_id=terminators,
+            )
+            # Generation runs in a background thread while this thread
+            # drains the streamer below; the thread is not joined explicitly.
+            t = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
+            t.start()
+            partial_text = ""
+            for new_text in streamer:
+                partial_text += new_text
+            # Matches a {...} span with at most one level of nested braces.
+            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
+            json_match = re.search(pattern, partial_text)
+            if json_match:
+                return json.loads(json_match.group())
+            else:
+                raise ValueError("No valid JSON found in legacy local response")
+        except Exception as e:
+            # Last resort: fall back to the canned default conversation.
+            print(f"Legacy local model also failed: {e}")
+            return DefaultConversations.get_conversation("English")
+    def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
+        """Generate the conversation through the remote chat-completions client.
+
+        Args:
+            text: Source material for the conversation.
+            language: Passed to keyword extraction and the prompt builder.
+
+        Returns:
+            The dict parsed from the first JSON object in the reply.
+
+        Raises:
+            RuntimeError: If ``self.llm_client`` is not set, or if the
+                request or JSON extraction fails.
+        """
+        if not self.llm_client:
+            raise RuntimeError("API mode not initialized")
+        try:
+            # Build optional search context. Skipped when no Brave key is
+            # configured or when the text is already keyword-based content.
+            search_context = ""
+            if BRAVE_KEY and not text.startswith("Keyword-based content:"):
+                try:
+                    keywords = extract_keywords_for_search(text, language)
+                    if keywords:
+                        search_query = f"{keywords[0]} latest news"
+                        search_context = format_search_results(search_query)
+                        print(f"Search context added for: {search_query}")
+                except Exception as e:
+                    # Search is best-effort; continue with an empty context.
+                    print(f"Search failed, continuing without context: {e}")
+            # Build the messages (same builder call the legacy local path uses).
+            messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
+            chat_completion = self.llm_client.chat.completions.create(
+                messages=messages,
+                model=self.config.api_model_name,
+                temperature=0.75,
+            )
+            # Matches a {...} span with at most one level of nested braces.
+            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
+            json_match = re.search(pattern, chat_completion.choices[0].message.content)
+            if not json_match:
+                raise ValueError("No valid JSON found in response")
+            return json.loads(json_match.group())
         except Exception as e:
+            # Every failure, including the ValueError above, is re-raised
+            # as RuntimeError.
+            raise RuntimeError(f"Failed to extract conversation: {e}")
+    def parse_conversation_text(self, conversation_text: str) -> Dict:
+        """Turn ``Speaker: text`` lines back into the conversation dict.
+
+        Each line is split at its first colon; lines without a colon are
+        skipped. Returns ``{"conversation": [{"speaker": ..., "text": ...}]}``
+        with both values stripped of surrounding whitespace.
+        """
+        turns = []
+        for raw_line in conversation_text.strip().split('\n'):
+            name, colon, spoken = raw_line.partition(':')
+            if not colon:
+                continue
+            turns.append({"speaker": name.strip(), "text": spoken.strip()})
+        return {"conversation": turns}
+    @spaces.GPU(duration=120)
+    def generate_tts_audio_gpu(
+        self,
+        conversation_json: Dict,
+        audio_prompt_path_input: str,
+        exaggeration_input: float = 0.5,
+        temperature_input: float = 0.8,
+        seed_num_input: int = 0,
+        cfgw_input: float = 0.5,
+        chunk_size_input: int = 250
+    ) -> tuple[int, np.ndarray]:
+        """
+        Convert a conversation to speech with Chatterbox TTS.
+
+        Args:
+            conversation_json: Dict whose "conversation" list holds turns
+                with a "text" entry.
+            audio_prompt_path_input: Passed to the model as
+                ``audio_prompt_path`` for every turn.
+            exaggeration_input: Passed to the model as ``exaggeration``.
+            temperature_input: Passed to the model as ``temperature``.
+            seed_num_input: Seed applied via ``set_seed`` when non-zero.
+            cfgw_input: Passed to the model as ``cfg_weight``.
+            chunk_size_input: Maximum chunk size used when a turn longer
+                than 300 characters is split.
+
+        Returns:
+            ``(sample_rate, audio)`` where audio is all generated turns
+            joined with 0.5 s of silence between them.
+
+        Raises:
+            RuntimeError: If Chatterbox is unavailable or no turn produced
+                any audio.
+        """
+        if not CHATTERBOX_AVAILABLE:
+            raise RuntimeError("Chatterbox TTS not available")
+        # Load the model inside the GPU function (done on every call).
+        model = ChatterboxTTS.from_pretrained(DEVICE)
+        if seed_num_input != 0:
+            set_seed(int(seed_num_input))
+        audio_segments = []
+        for i, turn in enumerate(conversation_json["conversation"]):
+            text = turn["text"]
+            if not text.strip():
+                continue
+            # NOTE: the "Speaker N" label is the turn index, not a speaker id.
+            print(f"생성 중: Speaker {i+1} - '{text[:50]}...'")
+            try:
+                # Short text is generated in a single call.
+                if len(text) <= 300:
+                    wav = model.generate(
+                        text,
+                        audio_prompt_path=audio_prompt_path_input,
+                        exaggeration=exaggeration_input,
+                        temperature=temperature_input,
+                        cfg_weight=cfgw_input,
+                    )
+                    # presumably a CPU tensor with a leading batch axis — TODO confirm
+                    wav_chunk = wav.squeeze(0).numpy()
+                    audio_segments.append(wav_chunk)
+                else:
+                    # Long text is split into chunks.
+                    chunks = split_text_into_chunks(text, max_chars=chunk_size_input)
+                    chunk_audio_segments = []
+                    for chunk in chunks:
+                        wav = model.generate(
+                            chunk,
+                            audio_prompt_path=audio_prompt_path_input,
+                            exaggeration=exaggeration_input,
+                            temperature=temperature_input,
+                            cfg_weight=cfgw_input,
+                        )
+                        wav_chunk = wav.squeeze(0).numpy()
+                        chunk_audio_segments.append(wav_chunk)
+                    # Join the chunks of this turn.
+                    if chunk_audio_segments:
+                        silence_duration = int(0.1 * model.sr)  # 0.1 s of silence
+                        silence = np.zeros(silence_duration)
+                        turn_audio = []
+                        for j, segment in enumerate(chunk_audio_segments):
+                            turn_audio.append(segment)
+                            if j < len(chunk_audio_segments) - 1:
+                                turn_audio.append(silence)
+                        concatenated_turn = np.concatenate(turn_audio)
+                        audio_segments.append(concatenated_turn)
+            except Exception as e:
+                # A failure anywhere in a turn (including one chunk of a
+                # long turn) drops that whole turn and moves on.
+                print(f"Speaker {i+1} 생성 중 오류 발생: {e}")
+                continue
+        if not audio_segments:
+            raise RuntimeError("오디오 생성에 실패했습니다.")
+        # Join the audio of all turns.
+        speaker_silence_duration = int(0.5 * model.sr)  # 0.5 s of silence between turns
+        speaker_silence = np.zeros(speaker_silence_duration)
+        final_audio = []
+        for i, segment in enumerate(audio_segments):
+            final_audio.append(segment)
+            if i < len(audio_segments) - 1:
+                final_audio.append(speaker_silence)
+        concatenated_audio = np.concatenate(final_audio)
+        print(f"오디오 생성 완료. 총 길이: {len(concatenated_audio) / model.sr:.2f}초")
+        return (model.sr, concatenated_audio)
+    def _create_output_directory(self) -> str:
+        """Make a randomly named directory (relative path) and return its name."""
+        # 8 random bytes, URL-safe base64 encoded, serve as the folder name.
+        token = base64.urlsafe_b64encode(os.urandom(8)).decode("utf-8")
+        os.makedirs(token, exist_ok=True)
+        return token
+# Global converter instance
+converter = UnifiedAudioConverter(ConversationConfig())
+async def synthesize(article_input, input_type: str = "URL", mode: str = "Local"):
+    """Main synthesis function - handles URL, PDF, and Keyword inputs.
+
+    Args:
+        article_input: A URL string, a PDF file (as accepted by
+            ``extract_text_from_pdf``), or a keyword string.
+        input_type: "URL" or "PDF"; any other value is treated as keyword.
+        mode: "Local" tries the local model first; any other value tries
+            the API first.
+
+    Returns:
+        A ``(message, None)`` tuple. ``message`` is the conversation as
+        ``speaker: text`` lines, a validation prompt, or ``"Error: ..."``
+        when an exception was caught. The second element is always None.
+    """
+    try:
+        # Extract text based on input type
+        if input_type == "URL":
+            if not article_input or not isinstance(article_input, str):
+                return "Please provide a valid URL.", None
+            text = converter.fetch_text(article_input)
+        elif input_type == "PDF":
+            if not article_input:
+                return "Please upload a PDF file.", None
+            text = converter.extract_text_from_pdf(article_input)
+        else:  # Keyword
+            if not article_input or not isinstance(article_input, str):
+                return "Please provide a keyword or topic.", None
+            text = search_and_compile_content(article_input, "English")
+            # This prefix is what the extractors check to skip a second search.
+            text = f"Keyword-based content:\n{text}"
+        # Limit text to max words
+        words = text.split()
+        if len(words) > converter.config.max_words:
+            text = " ".join(words[:converter.config.max_words])
+        # Extract conversation based on mode
+        if mode == "Local":
+            try:
+                conversation_json = converter.extract_conversation_local(text, "English")
+            except Exception as e:
+                # Local failed: fall back to the API when a key is configured.
+                print(f"Local mode failed: {e}, trying API fallback")
+                api_key = os.environ.get("TOGETHER_API_KEY")
+                if api_key:
+                    converter.initialize_api_mode(api_key)
+                    conversation_json = converter.extract_conversation_api(text, "English")
+                else:
+                    raise RuntimeError("Local mode failed and no API key available for fallback")
+        else:  # API mode
+            api_key = os.environ.get("TOGETHER_API_KEY")
+            if not api_key:
+                print("API key not found, falling back to local mode")
+                conversation_json = converter.extract_conversation_local(text, "English")
+            else:
+                try:
+                    converter.initialize_api_mode(api_key)
+                    conversation_json = converter.extract_conversation_api(text, "English")
+                except Exception as e:
+                    print(f"API mode failed: {e}, falling back to local mode")
+                    conversation_json = converter.extract_conversation_local(text, "English")
+        # Generate conversation text; turns without a "speaker" key get a
+        # positional "Speaker N" label.
+        conversation_text = "\n".join(
+            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+            for i, turn in enumerate(conversation_json["conversation"])
+        )
+        return conversation_text, None
+    except Exception as e:
+        # Errors are reported through the text output instead of raising.
+        return f"Error: {str(e)}", None
async def regenerate_audio(
    conversation_text: str,
    ref_audio_path: str,
    exaggeration: float = 0.5,
    temperature: float = 0.8,
    seed_num: int = 0,
    cfg_weight: float = 0.5,
    chunk_size: int = 250
):
    """Regenerate audio from edited conversation text using Chatterbox TTS.

    Args:
        conversation_text: Conversation in "Speaker Name: Text" lines.
        ref_audio_path: Path of the reference audio used for voice cloning.
        exaggeration: Forwarded unchanged to the TTS call.
        temperature: Forwarded unchanged to the TTS call.
        seed_num: Random seed; coerced to ``int`` (``None`` becomes 0).
        cfg_weight: Forwarded unchanged to the TTS call.
        chunk_size: Chunk size in characters; coerced to ``int``.

    Returns:
        ``(status_message, output_file)``; ``output_file`` is ``None`` whenever
        no audio was written. Exceptions are reported in the message, never raised.
    """
    # Guard against None as well as blank text: this check runs outside the try.
    if not conversation_text or not conversation_text.strip():
        return "Please provide conversation text.", None
    try:
        conversation_json = converter.parse_conversation_text(conversation_text)
        if not conversation_json["conversation"]:
            return "No valid conversation found in the text.", None
        # Generate audio using Chatterbox TTS.
        # UI number/slider widgets may deliver floats (or None for a cleared
        # field); normalize the two integer parameters before passing them on.
        sr, audio = converter.generate_tts_audio_gpu(
            conversation_json,
            ref_audio_path,
            exaggeration,
            temperature,
            int(seed_num or 0),
            cfg_weight,
            int(chunk_size),
        )
        # Save audio to file
        output_dir = converter._create_output_directory()
        output_file = os.path.join(output_dir, "podcast_audio.wav")
        sf.write(output_file, audio, sr)
        return "Audio generated successfully!", output_file
    except Exception as e:
        # UI boundary: report the failure in the status box.
        return f"Error generating audio: {str(e)}", None
def synthesize_sync(article_input, input_type: str = "URL", mode: str = "Local"):
    """Blocking entry point: drive the ``synthesize`` coroutine to completion and return its result."""
    pending = synthesize(article_input, input_type, mode)
    return asyncio.run(pending)
def regenerate_audio_sync(conversation_text: str, ref_audio_path: str, exaggeration: float, temperature: float, seed_num: int, cfg_weight: float, chunk_size: int):
    """Blocking entry point: drive the ``regenerate_audio`` coroutine to completion and return its result."""
    pending = regenerate_audio(
        conversation_text,
        ref_audio_path,
        exaggeration,
        temperature,
        seed_num,
        cfg_weight,
        chunk_size,
    )
    return asyncio.run(pending)
def toggle_input_visibility(input_type):
    """Return visibility updates for the URL box, PDF upload and keyword box, in that order.

    Exactly one of the three is shown; any value other than "URL" or "PDF"
    selects the keyword box.
    """
    if input_type == "URL":
        shown = 0
    elif input_type == "PDF":
        shown = 1
    else:  # Keyword
        shown = 2
    return tuple(gr.update(visible=(slot == shown)) for slot in range(3))
def update_char_count(text, chunk_size):
    """Update the text length information.

    Args:
        text: Current conversation text; ``None`` is treated as empty.
        chunk_size: Maximum characters per chunk, passed to
            ``split_text_into_chunks`` after coercion to ``int``.

    Returns:
        A summary string. Text of up to 300 characters is reported as a single
        generation; longer text reports its chunk count and a time estimate.
    """
    text = text or ""  # a cleared textbox may arrive as None
    char_len = len(text)
    if char_len <= 300:
        return f"{char_len} characters (single generation)"

    # Slider values can arrive as floats; the splitter is annotated with an int limit.
    chunks = split_text_into_chunks(text, max_chars=int(chunk_size))
    chunk_count = len(chunks)
    estimated_time = chunk_count * 3  # roughly 3 seconds expected per chunk
    return f"{char_len} characters, {chunk_count} chunks (estimated time: ~{estimated_time}s)"
# Model initialization (at app startup).
# Best effort: a failed download is only logged, so the app still starts.
if LLAMA_CPP_AVAILABLE:
    try:
        model_path = hf_hub_download(
            repo_id=converter.config.local_model_repo,
            filename=converter.config.local_model_name,
            local_dir="./models",
        )
        print(f"Model downloaded to: {model_path}")
    except Exception as download_error:
        print(f"Failed to download model at startup: {download_error}")
# Gradio Interface
with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
    .container {max-width: 1200px; margin: auto; padding: 20px;}
    .header-text {text-align: center; margin-bottom: 30px;}
    .input-group {background: #f7f7f7; padding: 20px; border-radius: 10px; margin-bottom: 20px;}
    .output-group {background: #f0f0f0; padding: 20px; border-radius: 10px;}
    .status-box {background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 10px;}
""") as demo:
    def get_article_input(input_type, url_input, pdf_input, keyword_input):
        """Get the appropriate input based on input type"""
        if input_type == "URL":
            return url_input
        elif input_type == "PDF":
            return pdf_input
        else:  # Keyword
            return keyword_input

    def run_example(url_value, keyword_value, input_type, mode):
        """Run synthesis for one example row.

        Example rows keep the URL and the keyword in separate columns so that
        selecting a row fills the input box that matches its input type.
        """
        return synthesize_sync(
            get_article_input(input_type, url_value, None, keyword_value), input_type, mode
        )

    with gr.Column(elem_classes="container"):
        # Header
        with gr.Row(elem_classes="header-text"):
            gr.Markdown("""
            # 🎙️ LIVE Podcast Generator with Chatterbox TTS
            ### Convert any article, blog, PDF document, or topic into an engaging professional podcast conversation!
            """)
        with gr.Row(elem_classes="discord-badge"):
            gr.HTML("""
            <p style="text-align: center;">
                <a href="https://discord.gg/openfreeai" target="_blank" style="display: inline-block; margin-right: 10px;">
                    <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="badge">
                </a>
                <a href="https://open.spotify.com/show/36GtIP7iqJxCwp7FfXmTYK?si=KsIsUJq7SJiiudPTaMsXAA" target="_blank" style="display: inline-block;">
                    <img src="https://img.shields.io/static/v1?label=Spotify&message=Podcast&color=%230000ff&labelColor=%23000080&logo=Spotify&logoColor=white&style=for-the-badge" alt="badge">
                </a>
                <a href="https://huggingface.co/spaces/openfree/AI-Podcast" target="_blank" style="display: inline-block;">
                    <img src="https://img.shields.io/static/v1?label=Huggingface&message=AI%20Podcast&color=%230000ff&labelColor=%23ffa500&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
                </a>
            </p>
            """)
        # Status display section
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(f"""
                #### 🤖 System Status
                - **LLM**: {converter.config.local_model_name.split('.')[0]}
                - **Fallback**: {converter.config.api_model_name.split('/')[-1]}
                - **Llama CPP**: {"✅ Ready" if LLAMA_CPP_AVAILABLE else "❌ Not Available"}
                - **Chatterbox TTS**: {"✅ Ready" if CHATTERBOX_AVAILABLE else "❌ Not Available"}
                - **Search**: {"✅ Brave API" if BRAVE_KEY else "❌ No API"}
                """)
            with gr.Column(scale=1):
                gr.Markdown("""
                #### 🎙️ Chatterbox TTS Features
                - **High Quality**: Neural voice synthesis
                - **Voice Cloning**: Upload your reference audio
                - **Unlimited Length**: Automatic text chunking
                - **Professional Style**: Expert podcast discussions
                """)
        # Main input section
        with gr.Group(elem_classes="input-group"):
            with gr.Row():
                # Left: input options
                with gr.Column(scale=2):
                    # Input type selection
                    input_type_selector = gr.Radio(
                        choices=["URL", "PDF", "Keyword"],
                        value="URL",
                        label="📥 Input Type",
                        info="Choose your content source"
                    )
                    # URL input
                    url_input = gr.Textbox(
                        label="🔗 Article URL",
                        placeholder="Enter the article URL here...",
                        value="",
                        visible=True,
                        lines=2
                    )
                    # PDF upload
                    pdf_input = gr.File(
                        label="📄 Upload PDF",
                        file_types=[".pdf"],
                        visible=False
                    )
                    # Keyword input
                    keyword_input = gr.Textbox(
                        label="🔍 Topic/Keyword",
                        placeholder="Enter a topic (e.g., 'AI trends 2024', 'quantum computing')",
                        value="",
                        visible=False,
                        info="System will search and compile latest information",
                        lines=2
                    )
                # Right: settings
                with gr.Column(scale=1):
                    # Processing mode
                    mode_selector = gr.Radio(
                        choices=["Local", "API"],
                        value="Local",
                        label="⚙️ Processing Mode",
                        info="Local: On-device | API: Cloud"
                    )
            # Generate button
            with gr.Row():
                convert_btn = gr.Button(
                    "🎯 Generate Professional Conversation",
                    variant="primary",
                    size="lg",
                    scale=1
                )
        # TTS settings section
        with gr.Group(elem_classes="input-group"):
            gr.Markdown("### 🎙️ Chatterbox TTS Settings")
            with gr.Row():
                with gr.Column(scale=2):
                    ref_audio = gr.Audio(
                        sources=["upload", "microphone"],
                        type="filepath",
                        label="Reference Audio File (Upload your voice)",
                        value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
                        info="Upload your voice sample for voice cloning"
                    )
                with gr.Column(scale=1):
                    exaggeration = gr.Slider(
                        0.25, 2, step=.05,
                        label="Exaggeration (Neutral = 0.5)",
                        value=.5
                    )
                    cfg_weight = gr.Slider(
                        0.2, 1, step=.05,
                        label="CFG/Pace",
                        value=0.5
                    )
                    chunk_size = gr.Slider(
                        100, 300, step=50,
                        label="Chunk Size (characters)",
                        value=250,
                        info="Text chunking for long conversations"
                    )
                    with gr.Accordion("Advanced Options", open=False):
                        seed_num = gr.Number(value=0, label="Random seed (0 for random)")
                        temperature = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
        # Output section
        with gr.Group(elem_classes="output-group"):
            with gr.Row():
                # Left: conversation text
                with gr.Column(scale=3):
                    conversation_output = gr.Textbox(
                        label="💬 Generated Professional Conversation (Editable)",
                        lines=25,
                        max_lines=50,
                        interactive=True,
                        placeholder="Professional podcast conversation will appear here...",
                        info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
                    )
                    # Text length display
                    char_count = gr.Textbox(
                        label="Text Information",
                        value="0 characters",
                        interactive=False
                    )
                    # Audio generation button
                    with gr.Row():
                        generate_audio_btn = gr.Button(
                            "🎙️ Generate Audio with Chatterbox TTS",
                            variant="secondary",
                            size="lg"
                        )
                # Right: audio output and status
                with gr.Column(scale=2):
                    audio_output = gr.Audio(
                        label="🎧 Professional Podcast Audio",
                        type="filepath",
                        interactive=False
                    )
                    status_output = gr.Textbox(
                        label="📊 Status",
                        interactive=False,
                        lines=3,
                        elem_classes="status-box"
                    )
                    # Help
                    gr.Markdown("""
                    #### 💡 Quick Tips:
                    - **URL**: Paste any article link
                    - **PDF**: Upload documents directly
                    - **Keyword**: Enter topics for AI research
                    - **Voice Cloning**: Upload reference audio
                    - Edit conversation before audio generation
                    - Longer text automatically chunked
                    """)
        # Examples section
        with gr.Accordion("📚 Examples", open=False):
            # Each row is [url, keyword, input type, mode]. Keyword examples must
            # populate keyword_input: that is the box shown, and the value read
            # by the Generate button, once the selector is set to "Keyword".
            gr.Examples(
                examples=[
                    ["https://huggingface.co/blog/openfreeai/cycle-navigator", "", "URL", "Local"],
                    ["", "quantum computing breakthroughs", "Keyword", "Local"],
                    ["https://huggingface.co/papers/2505.14810", "", "URL", "Local"],
                    ["", "artificial intelligence ethics", "Keyword", "Local"],
                ],
                inputs=[url_input, keyword_input, input_type_selector, mode_selector],
                outputs=[conversation_output, status_output],
                fn=run_example,
                cache_examples=False,
            )
    # Input type change handler
    input_type_selector.change(
        fn=toggle_input_visibility,
        inputs=[input_type_selector],
        outputs=[url_input, pdf_input, keyword_input]
    )
    # Refresh the character count whenever the text or the chunk size changes
    conversation_output.change(
        fn=update_char_count,
        inputs=[conversation_output, chunk_size],
        outputs=[char_count]
    )
    chunk_size.change(
        fn=update_char_count,
        inputs=[conversation_output, chunk_size],
        outputs=[char_count]
    )
    # Event wiring
    # NOTE(review): left as a lambda on purpose. Gradio appears to derive the
    # default API endpoint name from the callable's name, so a named function
    # here would rename the public endpoint; confirm before changing.
    convert_btn.click(
        fn=lambda input_type, url_input, pdf_input, keyword_input, mode: synthesize_sync(
            get_article_input(input_type, url_input, pdf_input, keyword_input), input_type, mode
        ),
        inputs=[input_type_selector, url_input, pdf_input, keyword_input, mode_selector],
        outputs=[conversation_output, status_output]
    )
    generate_audio_btn.click(
        fn=regenerate_audio_sync,
        inputs=[conversation_output, ref_audio, exaggeration, temperature, seed_num, cfg_weight, chunk_size],
        outputs=[status_output, audio_output]
    )
# Launch the app: enable the request queue first, then start the server
if __name__ == "__main__":
    queued_app = demo.queue(api_open=True, default_concurrency_limit=10)
    queued_app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True,
    )