seawolf2357 committed on
Commit
c012459
·
verified ·
1 Parent(s): bc8f404

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +980 -177
app.py CHANGED
@@ -1,14 +1,78 @@
1
- import random
2
- import numpy as np
3
- import torch
4
- from chatterbox.src.chatterbox.tts import ChatterboxTTS
5
- import gradio as gr
6
  import spaces
 
 
 
 
 
 
7
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
  print(f"๐Ÿš€ Running on device: {DEVICE}")
11
 
 
 
 
 
 
12
  def set_seed(seed: int):
13
  """Sets the random seed for reproducibility across torch, numpy, and random."""
14
  torch.manual_seed(seed)
@@ -18,6 +82,7 @@ def set_seed(seed: int):
18
  random.seed(seed)
19
  np.random.seed(seed)
20
 
 
21
  def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
22
  """
23
  ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์žฅ ๋‹จ์œ„๋กœ ๋‚˜๋ˆ„๋˜, ๊ฐ ์ฒญํฌ๊ฐ€ max_chars๋ฅผ ๋„˜์ง€ ์•Š๋„๋ก ํ•ฉ๋‹ˆ๋‹ค.
@@ -65,215 +130,953 @@ def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
65
 
66
  return chunks
67
 
68
- @spaces.GPU(duration=120) # GPU ์‚ฌ์šฉ ์‹œ๊ฐ„์„ ์ถฉ๋ถ„ํžˆ ์„ค์ •
69
- def generate_tts_audio_gpu(
70
- text_input: str,
71
- audio_prompt_path_input: str,
72
- exaggeration_input: float,
73
- temperature_input: float,
74
- seed_num_input: int,
75
- cfgw_input: float,
76
- chunk_size_input: int
77
- ) -> tuple[int, np.ndarray]:
78
- """
79
- GPU์—์„œ TTS ์˜ค๋””์˜ค๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
80
- """
81
- # GPU ํ•จ์ˆ˜ ๋‚ด์—์„œ ๋ชจ๋ธ ๋กœ๋“œ
82
- model = ChatterboxTTS.from_pretrained(DEVICE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- if seed_num_input != 0:
85
- set_seed(int(seed_num_input))
 
 
 
 
 
 
 
 
 
 
86
 
87
- # ํ…์ŠคํŠธ๊ฐ€ ์งง์œผ๋ฉด ๋‹จ์ผ ์ƒ์„ฑ
88
- if len(text_input) <= 300:
89
- print(f"๋‹จ์ผ ํ…์ŠคํŠธ ์ƒ์„ฑ: '{text_input[:50]}...'")
90
- wav = model.generate(
91
- text_input,
92
- audio_prompt_path=audio_prompt_path_input,
93
- exaggeration=exaggeration_input,
94
- temperature=temperature_input,
95
- cfg_weight=cfgw_input,
96
- )
97
- return (model.sr, wav.squeeze(0).numpy())
98
 
99
- # ๊ธด ํ…์ŠคํŠธ๋Š” ์ฒญํฌ๋กœ ๋ถ„ํ• 
100
- chunks = split_text_into_chunks(text_input, max_chars=chunk_size_input)
101
- total_chunks = len(chunks)
102
- print(f"ํ…์ŠคํŠธ๋ฅผ {total_chunks}๊ฐœ์˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ํ–ˆ์Šต๋‹ˆ๋‹ค.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- audio_segments = []
 
 
 
 
 
 
 
 
105
 
106
- for i, chunk in enumerate(chunks):
107
- print(f"์ฒญํฌ {i + 1}/{total_chunks} ์ƒ์„ฑ ์ค‘: '{chunk[:50]}...'")
108
-
109
- try:
110
- wav = model.generate(
111
- chunk,
112
- audio_prompt_path=audio_prompt_path_input,
113
- exaggeration=exaggeration_input,
114
- temperature=temperature_input,
115
- cfg_weight=cfgw_input,
116
- )
117
- wav_chunk = wav.squeeze(0).numpy()
118
- audio_segments.append(wav_chunk)
119
- except Exception as e:
120
- print(f"์ฒญํฌ {i + 1} ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
121
- continue
122
 
123
- if not audio_segments:
124
- raise RuntimeError("์˜ค๋””์˜ค ์ƒ์„ฑ์— ์‹คํŒจํ–ˆ์Šต๋‹ˆ๋‹ค.")
 
 
 
 
125
 
126
- # ์˜ค๋””์˜ค ์„ธ๊ทธ๋จผํŠธ ์—ฐ๊ฒฐ
127
- silence_duration = int(0.2 * model.sr) # 0.2์ดˆ ๋ฌด์Œ
128
- silence = np.zeros(silence_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
- final_audio = []
131
- for i, segment in enumerate(audio_segments):
132
- final_audio.append(segment)
133
- if i < len(audio_segments) - 1:
134
- final_audio.append(silence)
135
 
136
- concatenated_audio = np.concatenate(final_audio)
 
137
 
138
- print(f"์˜ค๋””์˜ค ์ƒ์„ฑ ์™„๋ฃŒ. ์ด ๊ธธ์ด: {len(concatenated_audio) / model.sr:.2f}์ดˆ")
139
- return (model.sr, concatenated_audio)
140
 
141
- # Gradio ์ธํ„ฐํŽ˜์ด์Šค
142
- with gr.Blocks() as demo:
143
- gr.Markdown(
144
- """
145
- # Chatterbox TTS Demo - ๋ฌด์ œํ•œ ๊ธธ์ด ๋ฒ„์ „
146
- ๊ธด ํ…์ŠคํŠธ๋„ ์ฒญํฌ๋กœ ๋‚˜๋ˆ„์–ด ์ฒ˜๋ฆฌํ•˜์—ฌ ์ œํ•œ ์—†์ด ์Œ์„ฑ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
 
 
 
 
 
 
 
147
 
148
- โš ๏ธ **์ฃผ์˜**: ๊ธด ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ์‹œ ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
149
- """
150
- )
151
-
152
- with gr.Row():
153
- with gr.Column():
154
- text = gr.Textbox(
155
- value="Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.",
156
- label="ํ…์ŠคํŠธ ์ž…๋ ฅ (๊ธธ์ด ์ œํ•œ ์—†์Œ)",
157
- lines=10,
158
- max_lines=30
159
- )
160
-
161
- ref_wav = gr.Audio(
162
- sources=["upload", "microphone"],
163
- type="filepath",
164
- label="Reference Audio File (Optional)",
165
- value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac"
166
- )
167
-
168
- with gr.Row():
169
- exaggeration = gr.Slider(
170
- 0.25, 2, step=.05,
171
- label="Exaggeration (Neutral = 0.5)",
172
- value=.5
173
  )
174
- cfg_weight = gr.Slider(
175
- 0.2, 1, step=.05,
176
- label="CFG/Pace",
177
- value=0.5
 
 
 
 
 
 
 
 
 
178
  )
179
-
180
- chunk_size = gr.Slider(
181
- 100, 300, step=50,
182
- label="์ฒญํฌ ํฌ๊ธฐ (๋ฌธ์ž ์ˆ˜)",
183
- value=250,
184
- info="ํ…์ŠคํŠธ๋ฅผ ๋‚˜๋ˆŒ ์ฒญํฌ์˜ ์ตœ๋Œ€ ํฌ๊ธฐ์ž…๋‹ˆ๋‹ค."
 
 
 
 
 
 
 
 
185
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
- with gr.Accordion("๊ณ ๊ธ‰ ์˜ต์…˜", open=False):
188
- seed_num = gr.Number(value=0, label="Random seed (0 for random)")
189
- temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- run_btn = gr.Button("๐ŸŽค ์Œ์„ฑ ์ƒ์„ฑ", variant="primary")
 
 
 
 
 
192
 
193
- with gr.Column():
194
- audio_output = gr.Audio(label="์ƒ์„ฑ๋œ ์Œ์„ฑ")
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
- # ํ…์ŠคํŠธ ๊ธธ์ด ํ‘œ์‹œ
197
- char_count = gr.Textbox(
198
- label="ํ…์ŠคํŠธ ์ •๋ณด",
199
- value="0 ๋ฌธ์ž",
200
- interactive=False
 
 
 
 
 
 
 
201
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- status = gr.Textbox(
204
- label="์ƒํƒœ",
205
- value="๋Œ€๊ธฐ ์ค‘...",
206
- interactive=False
 
 
 
207
  )
208
 
209
- # ํ…์ŠคํŠธ ์ž…๋ ฅ ์‹œ ๋ฌธ์ž ์ˆ˜ ์—…๋ฐ์ดํŠธ
210
- def update_char_count(text, chunk_size):
211
- char_len = len(text)
212
- if char_len <= 300:
213
- return f"{char_len} ๋ฌธ์ž (๋‹จ์ผ ์ƒ์„ฑ)"
214
- else:
215
- chunks = split_text_into_chunks(text, max_chars=chunk_size)
216
- chunk_count = len(chunks)
217
- estimated_time = chunk_count * 3 # ์ฒญํฌ๋‹น ์•ฝ 3์ดˆ ์˜ˆ์ƒ
218
- return f"{char_len} ๋ฌธ์ž, {chunk_count}๊ฐœ ์ฒญํฌ (์˜ˆ์ƒ ์‹œ๊ฐ„: ์•ฝ {estimated_time}์ดˆ)"
219
-
220
- text.change(
221
- fn=update_char_count,
222
- inputs=[text, chunk_size],
223
- outputs=[char_count]
224
- )
225
-
226
- chunk_size.change(
227
- fn=update_char_count,
228
- inputs=[text, chunk_size],
229
- outputs=[char_count]
230
- )
231
 
232
- # ์ƒ์„ฑ ํ•จ์ˆ˜ ๋ž˜ํผ (์ƒํƒœ ์—…๋ฐ์ดํŠธ ํฌํ•จ)
233
- def generate_with_status(text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size):
 
234
  try:
235
- yield gr.update(value="์ฒ˜๋ฆฌ ์ค‘... GPU๋ฅผ ํ• ๋‹น๋ฐ›๋Š” ์ค‘์ž…๋‹ˆ๋‹ค."), None
236
 
237
- # GPU ํ•จ์ˆ˜ ํ˜ธ์ถœ
238
- sr, audio = generate_tts_audio_gpu(
239
- text, ref_wav, exaggeration, temp, seed_num, cfg_weight, chunk_size
 
 
 
 
 
 
 
240
  )
 
241
 
242
- yield gr.update(value="โœ… ์ƒ์„ฑ ์™„๋ฃŒ!"), (sr, audio)
 
 
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  except Exception as e:
245
- yield gr.update(value=f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"), None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
- run_btn.click(
248
- fn=generate_with_status,
249
- inputs=[
250
- text,
251
- ref_wav,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  exaggeration,
253
- temp,
254
  seed_num,
255
  cfg_weight,
256
  chunk_size
257
- ],
258
- outputs=[status, audio_output],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  )
260
 
261
- gr.Markdown(
262
- """
263
- ### ๐Ÿ’ก ์‚ฌ์šฉ ํŒ:
264
- - **300์ž ์ดํ•˜**: ๋น ๋ฅธ ๋‹จ์ผ ์ƒ์„ฑ
265
- - **300์ž ์ดˆ๊ณผ**: ์ž๋™์œผ๋กœ ์ฒญํฌ๋กœ ๋ถ„ํ• ํ•˜์—ฌ ์ฒ˜๋ฆฌ
266
- - ์ฒญํฌ ํฌ๊ธฐ๊ฐ€ ์ž‘์„์ˆ˜๋ก ์ž์—ฐ์Šค๋Ÿฝ์ง€๋งŒ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„์ด ์ฆ๊ฐ€ํ•ฉ๋‹ˆ๋‹ค
267
- - GPU ํ• ๋‹น์„ ๊ธฐ๋‹ค๋ฆฌ๋Š” ์‹œ๊ฐ„์ด ์žˆ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค
268
-
269
- ### โฑ๏ธ ์˜ˆ์ƒ ์ฒ˜๋ฆฌ ์‹œ๊ฐ„:
270
- - 300์ž ์ดํ•˜: ์•ฝ 5-10์ดˆ
271
- - 1000์ž: ์•ฝ 15-30์ดˆ
272
- - 5000์ž: ์•ฝ 1-2๋ถ„
273
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  )
275
 
276
- # ์•ฑ ์‹คํ–‰ ์‹œ ๋ชจ๋ธ ๋กœ๋“œ ์ œ๊ฑฐ (GPU ํ•จ์ˆ˜ ๋‚ด์—์„œ๋งŒ ๋กœ๋“œ)
277
- print("์•ฑ์ด ์‹œ์ž‘๋˜์—ˆ์Šต๋‹ˆ๋‹ค. ๋ชจ๋ธ์€ ์ฒซ ์ƒ์„ฑ ์‹œ ๋กœ๋“œ๋ฉ๋‹ˆ๋‹ค.")
278
 
279
- demo.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import spaces
2
+ import gradio as gr
3
+ import os
4
+ import asyncio
5
+ import torch
6
+ import io
7
+ import json
8
  import re
9
+ import httpx
10
+ import tempfile
11
+ import wave
12
+ import base64
13
+ import numpy as np
14
+ import soundfile as sf
15
+ import subprocess
16
+ import shutil
17
+ import requests
18
+ import logging
19
+ import random
20
+ from datetime import datetime, timedelta
21
+ from typing import List, Tuple, Dict, Optional
22
+ from pathlib import Path
23
+ from threading import Thread
24
+ from dotenv import load_dotenv
25
+
26
+ # PDF processing imports
27
+ from langchain_community.document_loaders import PyPDFLoader
28
+
29
+ # OpenAI imports
30
+ from openai import OpenAI
31
+
32
+ # Transformers imports (for legacy local mode)
33
+ from transformers import (
34
+ AutoModelForCausalLM,
35
+ AutoTokenizer,
36
+ TextIteratorStreamer,
37
+ BitsAndBytesConfig,
38
+ )
39
+
40
+ # Llama CPP imports (for new local mode)
41
+ try:
42
+ from llama_cpp import Llama
43
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
44
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
45
+ from llama_cpp_agent.chat_history import BasicChatHistory
46
+ from llama_cpp_agent.chat_history.messages import Roles
47
+ from huggingface_hub import hf_hub_download
48
+ LLAMA_CPP_AVAILABLE = True
49
+ except ImportError:
50
+ LLAMA_CPP_AVAILABLE = False
51
+
52
+ # Chatterbox TTS imports
53
+ try:
54
+ from chatterbox.src.chatterbox.tts import ChatterboxTTS
55
+ CHATTERBOX_AVAILABLE = True
56
+ except ImportError:
57
+ CHATTERBOX_AVAILABLE = False
58
+
59
+ # Import config and prompts
60
+ from config_prompts import (
61
+ ConversationConfig,
62
+ PromptBuilder,
63
+ DefaultConversations,
64
+ )
65
+
66
# Load variables from a local .env file (if any) before reading os.environ.
load_dotenv()

# Torch device used by the TTS model: GPU when CUDA is visible, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Running on device: {DEVICE}")

# Brave Search API settings. BRAVE_KEY is None when BSEARCH_API is unset,
# in which case the search helpers below return empty results.
BRAVE_KEY = os.getenv("BSEARCH_API")
BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
74
+
75
+
76
  def set_seed(seed: int):
77
  """Sets the random seed for reproducibility across torch, numpy, and random."""
78
  torch.manual_seed(seed)
 
82
  random.seed(seed)
83
  np.random.seed(seed)
84
 
85
+
86
  def split_text_into_chunks(text: str, max_chars: int = 250) -> list[str]:
87
  """
88
  ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์žฅ ๋‹จ์œ„๋กœ ๋‚˜๋ˆ„๋˜, ๊ฐ ์ฒญํฌ๊ฐ€ max_chars๋ฅผ ๋„˜์ง€ ์•Š๋„๋ก ํ•ฉ๋‹ˆ๋‹ค.
 
130
 
131
  return chunks
132
 
133
+
134
def brave_search(query: str, count: int = 8, freshness_days: int | None = None):
    """Search the web through the Brave Search API.

    Args:
        query: Search terms.
        count: Maximum number of results to request and to return.
        freshness_days: When truthy, restrict results to the last N days.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``snippet`` and
        ``host``. An empty list is returned when ``BRAVE_KEY`` is not set or
        when the request or response parsing fails (the error is logged).
    """
    if not BRAVE_KEY:
        return []

    params = {"q": query, "count": str(count)}
    if freshness_days:
        # Local import: the file only imports `datetime` and `timedelta`.
        from datetime import timezone

        today = datetime.now(timezone.utc).date()
        since = today - timedelta(days=freshness_days)
        # Brave accepts a custom window only as "YYYY-MM-DDtoYYYY-MM-DD";
        # a lone start date is not a valid `freshness` value.
        params["freshness"] = f"{since.isoformat()}to{today.isoformat()}"

    try:
        response = requests.get(
            BRAVE_ENDPOINT,
            headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_KEY},
            params=params,
            timeout=15,
        )
        # Surface 4xx/5xx (bad key, rate limit) in the log instead of
        # silently treating the error payload as "no results".
        response.raise_for_status()
        payload = response.json()
        raw = (payload.get("web") or {}).get("results") or []
        return [
            {
                "title": item.get("title", ""),
                "url": item.get("url", item.get("link", "")),
                "snippet": item.get("description", item.get("text", "")),
                # Host = URL without scheme / "www." and without the path.
                "host": re.sub(r"https?://(www\.)?", "", item.get("url", "")).split("/")[0],
            }
            for item in raw[:count]
        ]
    except Exception as e:
        # Best-effort helper: callers treat an empty list as "no context".
        logging.error(f"Brave search error: {e}")
        return []
159
+
160
+
161
def format_search_results(query: str, for_keyword: bool = False) -> str:
    """Run a Brave search and render the top hits as prompt-ready text.

    Keyword mode requests 5 results with no freshness filter and renders up
    to 4 of them with snippets capped at 200 characters plus a source line.
    Default mode requests 3 results from the last 7 days and renders up to 2
    as short bullet lines with snippets capped at 100 characters.

    Returns:
        The rendered entries separated by blank lines and followed by a
        trailing newline, or an empty string when the search returned nothing.
    """
    if for_keyword:
        requested, shown, snippet_cap, freshness = 5, 4, 200, None
    else:
        requested, shown, snippet_cap, freshness = 3, 2, 100, 7

    hits = brave_search(query, requested, freshness_days=freshness)
    if not hits:
        return ""

    entries = []
    for hit in hits[:shown]:
        body = hit['snippet']
        if len(body) > snippet_cap:
            body = body[:snippet_cap] + "..."

        if for_keyword:
            entries.append(f"**{hit['title']}**\n{body}\nSource: {hit['host']}")
        else:
            entries.append(f"- {hit['title']}: {body}")

    return "\n\n".join(entries) + "\n"
183
+
184
+
185
def extract_keywords_for_search(text: str, language: str = "English") -> List[str]:
    """Pick a single search keyword from the beginning of ``text``.

    Only the first 500 characters are inspected. A candidate is a
    whitespace-separated token that, after surrounding punctuation, quotes
    and brackets are stripped, is longer than four characters and starts with
    an uppercase letter. The longest candidate wins (the first one on ties).

    Args:
        text: Source text to scan.
        language: Accepted for interface compatibility; not used.

    Returns:
        A one-element list with the chosen keyword, or an empty list when no
        token qualifies.
    """
    sample = text[:500]

    candidates = []
    for token in sample.split():
        # Strip first, then test: otherwise trailing punctuation inflates the
        # length check and a leading quote/bracket hides the capital letter.
        word = token.strip(".,!?;:\"'()[]{}")
        if len(word) > 4 and word[0].isupper():
            candidates.append(word)

    if not candidates:
        return []
    return [max(candidates, key=len)]
197
+
198
+
199
def search_and_compile_content(keyword: str, language: str = "English") -> str:
    """Build source text about ``keyword`` from several Brave searches.

    Without ``BRAVE_KEY`` a generic template about the keyword is returned.
    Otherwise six English queries are issued, the top three hits of each are
    collected (each URL only once), and a generic "Additional insights"
    paragraph is appended when the collected snippets total fewer than 1000
    characters.

    Args:
        keyword: Topic to research.
        language: Accepted for interface compatibility; not used.

    Returns:
        A markdown-style text block starting with a heading that names the
        keyword (or the generic template when no API key is configured).
    """
    if not BRAVE_KEY:
        # No API key: still return generic placeholder content so the
        # downstream prompt has something to work with.
        return f"""
Comprehensive information about '{keyword}':

{keyword} is a significant topic in modern society.
This subject impacts our lives in various ways and has been
gaining increasing attention recently.

Key aspects:
1. Technological advancement and innovation
2. Social impact and changes
3. Future prospects and possibilities
4. Practical applications
5. Global trends and developments

Experts predict that {keyword} will become even more important,
and it's crucial to develop a deep understanding of this topic.
"""

    # Use the current year so the "latest news" query does not go stale.
    current_year = datetime.now().year
    queries = [
        f"{keyword} latest news {current_year}",
        f"{keyword} explained comprehensive",
        f"{keyword} trends forecast",
        f"{keyword} advantages disadvantages",
        f"{keyword} how to use",
        f"{keyword} expert opinions",
    ]

    all_content = []
    total_content_length = 0
    seen_urls = set()

    for query in queries:
        results = brave_search(query, count=5)
        for r in results[:3]:  # top three hits per query
            url = r.get("url", "")
            # Different queries frequently return the same page; keep it once.
            if url and url in seen_urls:
                continue
            seen_urls.add(url)
            all_content.append(f"**{r['title']}**\n{r['snippet']}\nSource: {r['host']}\n")
            total_content_length += len(r['snippet'])

    # Pad with generic text when the searches produced too little material.
    if total_content_length < 1000:
        additional_content = f"""
Additional insights:
Recent developments in {keyword} show rapid advancement in this field.
Many experts are actively researching this topic, and its practical
applications continue to expand.

Key points to note:
- Accelerating technological innovation
- Improving user experience
- Enhanced accessibility
- Increased cost efficiency
- Growing global market

These factors are making the future of {keyword} increasingly promising.
"""
        all_content.append(additional_content)

    compiled = "\n\n".join(all_content)
    intro = f"### Comprehensive information and latest trends about '{keyword}':\n\n"
    return intro + compiled
 
267
 
268
+
269
class UnifiedAudioConverter:
    """Turn source text into a conversation script and render it as speech.

    The script can be produced by a llama.cpp model, by a Transformers model
    (fallback), or by an OpenAI-compatible API; audio is rendered with
    Chatterbox TTS. Models are created lazily by the ``initialize_*`` methods.
    """

    # Matches a JSON object that contains at most one level of nested
    # objects, which is the shape {"conversation": [{...}, ...]} expected
    # from the LLM. Text values containing braces will defeat it.
    _JSON_OBJECT_PATTERN = re.compile(r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}")

    def __init__(self, config: ConversationConfig):
        """Store the configuration; no model is loaded here."""
        self.config = config
        self.llm_client = None
        self.legacy_local_model = None
        self.legacy_tokenizer = None
        # llama.cpp model and the file name it was loaded from
        self.local_llm = None
        self.local_llm_model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.prompt_builder = PromptBuilder()

    @classmethod
    def _parse_json_object(cls, raw: str, error_message: str) -> Dict:
        """Return the first JSON object found in ``raw``.

        Raises:
            ValueError: with ``error_message`` when no object-shaped span is
                found (``json.JSONDecodeError`` is a ``ValueError`` too).
        """
        match = cls._JSON_OBJECT_PATTERN.search(raw)
        if not match:
            raise ValueError(error_message)
        return json.loads(match.group())

    def _build_search_context(self, text: str, language: str) -> str:
        """Return web-search context for ``text``, or "" when unavailable.

        Skipped when no Brave key is configured or when ``text`` was itself
        compiled from a keyword search. Never raises: search failures are
        printed and yield an empty context.
        """
        if not BRAVE_KEY or text.startswith("Keyword-based content:"):
            return ""
        try:
            keywords = extract_keywords_for_search(text, language)
            if not keywords:
                return ""
            search_query = f"{keywords[0]} latest news"
            search_context = format_search_results(search_query)
            print(f"Search context added for: {search_query}")
            return search_context
        except Exception as e:
            print(f"Search failed, continuing without context: {e}")
            return ""

    def initialize_api_mode(self, api_key: str):
        """Create the OpenAI-compatible client pointed at the Together API."""
        self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

    @spaces.GPU(duration=120)
    def initialize_local_mode(self):
        """Download (if needed) and load the llama.cpp model named in config.

        The model is reloaded only when none is loaded yet or when
        ``config.local_model_name`` differs from the loaded one.

        Raises:
            RuntimeError: when the llama.cpp packages are missing or the
                download / load fails.
        """
        if not LLAMA_CPP_AVAILABLE:
            raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")

        if self.local_llm is not None and self.local_llm_model == self.config.local_model_name:
            return

        try:
            # hf_hub_download returns the local path of the fetched file.
            model_path = hf_hub_download(
                repo_id=self.config.local_model_repo,
                filename=self.config.local_model_name,
                local_dir="./models"
            )

            if not os.path.exists(model_path):
                raise RuntimeError(f"Model file not found at {model_path}")

            self.local_llm = Llama(
                model_path=model_path,
                flash_attn=True,
                # Offload up to 81 layers when CUDA is available, else CPU only.
                n_gpu_layers=81 if torch.cuda.is_available() else 0,
                n_batch=1024,
                n_ctx=16384,
            )
            self.local_llm_model = self.config.local_model_name
            print(f"Local LLM initialized: {model_path}")

        except Exception as e:
            print(f"Failed to initialize local LLM: {e}")
            raise RuntimeError(f"Failed to initialize local LLM: {e}") from e

    @spaces.GPU(duration=60)
    def initialize_legacy_local_mode(self):
        """Load the 4-bit quantized Transformers model and tokenizer once."""
        if self.legacy_local_model is None:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
                self.config.legacy_local_model_name,
                quantization_config=quantization_config
            )
            # NOTE(review): the tokenizer revision is pinned but the model's
            # is not — confirm this asymmetry is intended.
            self.legacy_tokenizer = AutoTokenizer.from_pretrained(
                self.config.legacy_local_model_name,
                revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
            )

    def fetch_text(self, url: str) -> str:
        """Fetch ``url`` through ``config.prefix_url`` and return the body.

        Raises:
            ValueError: for an empty URL or one that is not http(s).
            RuntimeError: when the HTTP request fails.
        """
        if not url:
            raise ValueError("URL cannot be empty")

        if not url.startswith(("http://", "https://")):
            raise ValueError("URL must start with 'http://' or 'https://'")

        # prefix_url is prepended verbatim (presumably a reader/proxy
        # service — verify against config_prompts).
        full_url = f"{self.config.prefix_url}{url}"
        try:
            response = httpx.get(full_url, timeout=60.0)
            response.raise_for_status()
            return response.text
        except httpx.HTTPError as e:
            raise RuntimeError(f"Failed to fetch URL: {e}") from e

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Args:
            pdf_file: A filesystem path, or a readable binary file object
                (which is spooled to a temporary file first).

        Raises:
            RuntimeError: when the PDF cannot be read or parsed.
        """
        tmp_path = None
        try:
            if isinstance(pdf_file, str):
                pdf_path = pdf_file
            else:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
                    tmp_path = tmp_file.name
                    tmp_file.write(pdf_file.read())
                pdf_path = tmp_path

            pages = PyPDFLoader(pdf_path).load()
            return "\n".join(page.page_content for page in pages)
        except Exception as e:
            raise RuntimeError(f"Failed to extract text from PDF: {e}") from e
        finally:
            # Remove the spooled copy even when parsing failed.
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    def _get_messages_formatter_type(self, model_name):
        """Pick the chat template: ChatML for Mistral/BitSix names, else Llama 3."""
        if "Mistral" in model_name or "BitSix" in model_name:
            return MessagesFormatterType.CHATML
        return MessagesFormatterType.LLAMA_3

    @spaces.GPU(duration=120)
    def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
        """Generate the conversation JSON with the llama.cpp model.

        On any failure the legacy Transformers path is used instead, so this
        method returns whatever that fallback returns rather than raising.

        Args:
            text: Source material for the conversation.
            language: Passed to the keyword extractor and prompt builder.
            progress: Unused here; forwarded to the fallback.

        Returns:
            The parsed JSON object from the model response.
        """
        search_context = self._build_search_context(text, language)

        try:
            self.initialize_local_mode()

            chat_template = self._get_messages_formatter_type(self.config.local_model_name)
            provider = LlamaCppPythonProvider(self.local_llm)

            system_message = (
                f"You are a professional podcast scriptwriter creating high-quality, "
                f"insightful discussions in English. Create exactly 12 conversation exchanges "
                f"with professional expertise. All dialogue must be in English. "
                f"Respond only in JSON format."
            )

            agent = LlamaCppAgent(
                provider,
                system_prompt=system_message,
                predefined_messages_formatter_type=chat_template,
                debug_output=False
            )

            settings = provider.get_provider_default_settings()
            settings.temperature = 0.75
            settings.top_k = 40
            settings.top_p = 0.95
            settings.max_tokens = self.config.max_tokens
            settings.repeat_penalty = 1.1
            settings.stream = False

            messages = BasicChatHistory()

            prompt = self.prompt_builder.build_prompt(text, language, search_context)
            response = agent.get_chat_response(
                prompt,
                llm_sampling_settings=settings,
                chat_history=messages,
                returns_streaming_generator=False,
                print_output=False
            )

            return self._parse_json_object(response, "No valid JSON found in local LLM response")

        except Exception as e:
            print(f"Local LLM failed: {e}, falling back to legacy local method")
            return self.extract_conversation_legacy_local(text, language, progress, search_context)

    @spaces.GPU(duration=120)
    def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
        """Generate the conversation JSON with the Transformers model.

        Never raises: on any failure the built-in default English
        conversation is returned.
        """
        try:
            self.initialize_legacy_local_mode()

            messages = self.prompt_builder.build_messages_for_local(text, language, search_context)

            # Stop on either the tokenizer's EOS or the end-of-turn token.
            terminators = [
                self.legacy_tokenizer.eos_token_id,
                self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
            ]

            chat_messages = self.legacy_tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            model_inputs = self.legacy_tokenizer([chat_messages], return_tensors="pt").to(self.device)

            streamer = TextIteratorStreamer(
                self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
            )

            generate_kwargs = dict(
                model_inputs,
                streamer=streamer,
                max_new_tokens=self.config.max_new_tokens,
                do_sample=True,
                temperature=0.75,
                eos_token_id=terminators,
            )

            # generate() runs in a worker thread; the streamer is drained here.
            worker = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
            worker.start()
            generated = "".join(streamer)
            worker.join()

            return self._parse_json_object(generated, "No valid JSON found in legacy local response")

        except Exception as e:
            print(f"Legacy local model also failed: {e}")
            return DefaultConversations.get_conversation("English")

    def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
        """Generate the conversation JSON through the configured API client.

        Raises:
            RuntimeError: when API mode was not initialized, or when the
                request or the JSON extraction fails.
        """
        if not self.llm_client:
            raise RuntimeError("API mode not initialized")

        try:
            search_context = self._build_search_context(text, language)
            messages = self.prompt_builder.build_messages_for_local(text, language, search_context)

            chat_completion = self.llm_client.chat.completions.create(
                messages=messages,
                model=self.config.api_model_name,
                temperature=0.75,
            )

            return self._parse_json_object(
                chat_completion.choices[0].message.content,
                "No valid JSON found in response",
            )
        except Exception as e:
            raise RuntimeError(f"Failed to extract conversation: {e}") from e

    def parse_conversation_text(self, conversation_text: str) -> Dict:
        """Parse ``Speaker: text`` lines back into the conversation dict.

        Lines without a colon are ignored; only the first colon separates
        the speaker from the text.
        """
        turns = []
        for line in conversation_text.strip().split('\n'):
            if ':' not in line:
                continue
            speaker, text = line.split(':', 1)
            turns.append({"speaker": speaker.strip(), "text": text.strip()})
        return {"conversation": turns}

    @spaces.GPU(duration=120)
    def generate_tts_audio_gpu(
        self,
        conversation_json: Dict,
        audio_prompt_path_input: str,
        exaggeration_input: float = 0.5,
        temperature_input: float = 0.8,
        seed_num_input: int = 0,
        cfgw_input: float = 0.5,
        chunk_size_input: int = 250
    ) -> tuple[int, np.ndarray]:
        """Render every conversation turn with Chatterbox TTS.

        Turns longer than 300 characters are split into chunks of at most
        ``chunk_size_input`` characters joined by 0.1 s of silence; turns are
        joined by 0.5 s of silence. A turn that fails to render is logged and
        skipped; turns with missing or blank text are skipped silently.

        Args:
            conversation_json: Dict with a ``"conversation"`` list of turns.
            audio_prompt_path_input: Reference audio passed to the model.
            exaggeration_input: Forwarded to ``model.generate``.
            temperature_input: Forwarded to ``model.generate``.
            seed_num_input: Seed applied via ``set_seed`` unless it is 0.
            cfgw_input: Forwarded to ``model.generate`` as ``cfg_weight``.
            chunk_size_input: Maximum characters per chunk for long turns.

        Returns:
            ``(sample_rate, samples)`` for the whole conversation.

        Raises:
            RuntimeError: when Chatterbox is unavailable or no turn produced
                audio.
        """
        if not CHATTERBOX_AVAILABLE:
            raise RuntimeError("Chatterbox TTS not available")

        # The model is created inside the GPU-decorated call on every request.
        model = ChatterboxTTS.from_pretrained(DEVICE)

        if seed_num_input != 0:
            set_seed(int(seed_num_input))

        def render(fragment: str) -> np.ndarray:
            """Synthesize one fragment and return it as a 1-D numpy array."""
            wav = model.generate(
                fragment,
                audio_prompt_path=audio_prompt_path_input,
                exaggeration=exaggeration_input,
                temperature=temperature_input,
                cfg_weight=cfgw_input,
            )
            return wav.squeeze(0).numpy()

        def join_with_silence(segments, seconds: float) -> np.ndarray:
            """Concatenate segments with ``seconds`` of silence between them."""
            gap = np.zeros(int(seconds * model.sr))
            pieces = []
            for index, segment in enumerate(segments):
                if index:
                    pieces.append(gap)
                pieces.append(segment)
            return np.concatenate(pieces)

        audio_segments = []

        for i, turn in enumerate(conversation_json["conversation"]):
            # A turn without "text" is skipped instead of aborting the run.
            text = turn.get("text") or ""
            if not text.strip():
                continue

            print(f"생성 중: Speaker {i+1} - '{text[:50]}...'")

            try:
                if len(text) <= 300:
                    audio_segments.append(render(text))
                else:
                    chunks = split_text_into_chunks(text, max_chars=chunk_size_input)
                    chunk_audio = [render(chunk) for chunk in chunks]
                    if chunk_audio:
                        audio_segments.append(join_with_silence(chunk_audio, 0.1))
            except Exception as e:
                print(f"Speaker {i+1} 생성 중 오류 발생: {e}")
                continue

        if not audio_segments:
            raise RuntimeError("오디오 생성에 실패했습니다.")

        concatenated_audio = join_with_silence(audio_segments, 0.5)

        print(f"오디오 생성 완료. 총 길이: {len(concatenated_audio) / model.sr:.2f}초")
        return (model.sr, concatenated_audio)

    def _create_output_directory(self) -> str:
        """Create a directory with a random URL-safe base64 name and return it."""
        random_bytes = os.urandom(8)
        folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
        os.makedirs(folder_name, exist_ok=True)
        return folder_name
663
+
664
+
665
+ # Global converter instance
666
+ converter = UnifiedAudioConverter(ConversationConfig())
667
+
668
+
669
async def synthesize(article_input, input_type: str = "URL", mode: str = "Local"):
    """Produce an editable conversation script from a URL, PDF or keyword.

    Args:
        article_input: A URL string, a PDF path/file, or a keyword/topic,
            depending on ``input_type``.
        input_type: ``"URL"``, ``"PDF"``; any other value is treated as a
            keyword.
        mode: ``"Local"`` to try the local LLM first (API as fallback); any
            other value tries the API first (local as fallback).

    Returns:
        ``(text, None)`` where ``text`` is either ``Speaker: line`` rows, a
        prompt asking for valid input, or ``"Error: ..."``. Exceptions are
        not propagated.
    """
    try:
        # 1. Obtain the source text.
        if input_type == "URL":
            if not article_input or not isinstance(article_input, str):
                return "Please provide a valid URL.", None
            text = converter.fetch_text(article_input)
        elif input_type == "PDF":
            if not article_input:
                return "Please upload a PDF file.", None
            text = converter.extract_text_from_pdf(article_input)
        else:  # Keyword
            if not article_input or not isinstance(article_input, str):
                return "Please provide a keyword or topic.", None
            text = search_and_compile_content(article_input, "English")
            # The prefix tells the extractors not to run another web search.
            text = f"Keyword-based content:\n{text}"

        # 2. Cap the input length (this also collapses whitespace when it
        #    truncates, because the words are re-joined with single spaces).
        words = text.split()
        if len(words) > converter.config.max_words:
            text = " ".join(words[:converter.config.max_words])

        # 3. Generate the conversation, falling back to the other backend.
        api_key = os.environ.get("TOGETHER_API_KEY")
        if mode == "Local":
            try:
                conversation_json = converter.extract_conversation_local(text, "English")
            except Exception as e:
                print(f"Local mode failed: {e}, trying API fallback")
                if not api_key:
                    raise RuntimeError("Local mode failed and no API key available for fallback")
                converter.initialize_api_mode(api_key)
                conversation_json = converter.extract_conversation_api(text, "English")
        elif not api_key:
            print("API key not found, falling back to local mode")
            conversation_json = converter.extract_conversation_local(text, "English")
        else:
            try:
                converter.initialize_api_mode(api_key)
                conversation_json = converter.extract_conversation_api(text, "English")
            except Exception as e:
                print(f"API mode failed: {e}, falling back to local mode")
                conversation_json = converter.extract_conversation_local(text, "English")

        # 4. Render the turns. LLM output is not guaranteed to follow the
        #    schema, so report a readable error instead of a bare KeyError
        #    and skip turns that carry no text.
        turns = conversation_json.get("conversation") if isinstance(conversation_json, dict) else None
        if not turns:
            raise ValueError("LLM response contained no 'conversation' turns")

        lines = []
        for i, turn in enumerate(turns):
            spoken = str(turn.get("text") or "").strip()
            if not spoken:
                continue
            speaker = turn.get('speaker', f'Speaker {i+1}')
            lines.append(f"{speaker}: {turn['text']}")

        return "\n".join(lines), None

    except Exception as e:
        return f"Error: {str(e)}", None
727
+
728
+
729
async def regenerate_audio(
    conversation_text: str,
    ref_audio_path: str,
    exaggeration: float = 0.5,
    temperature: float = 0.8,
    seed_num: int = 0,
    cfg_weight: float = 0.5,
    chunk_size: int = 250
):
    """Generate a WAV file from (possibly edited) conversation text.

    Args:
        conversation_text: Conversation text to parse and voice.
        ref_audio_path: Reference audio path forwarded to the TTS call.
        exaggeration: Forwarded unchanged to the TTS call.
        temperature: Forwarded unchanged to the TTS call.
        seed_num: Forwarded unchanged to the TTS call.
        cfg_weight: Forwarded unchanged to the TTS call.
        chunk_size: Forwarded unchanged to the TTS call.

    Returns:
        ``(status_message, output_file)``. ``output_file`` is the path of the
        written ``podcast_audio.wav`` on success and ``None`` otherwise.
    """
    # Check for None as well as blank text: this runs outside the try block,
    # so calling .strip() on None would otherwise escape as AttributeError.
    if not conversation_text or not conversation_text.strip():
        return "Please provide conversation text.", None

    try:
        conversation_json = converter.parse_conversation_text(conversation_text)

        if not conversation_json["conversation"]:
            return "No valid conversation found in the text.", None

        # Generate audio using Chatterbox TTS
        sr, audio = converter.generate_tts_audio_gpu(
            conversation_json,
            ref_audio_path,
            exaggeration,
            temperature,
            seed_num,
            cfg_weight,
            chunk_size
        )

        # Save audio to a file inside a freshly created output directory
        output_dir = converter._create_output_directory()
        output_file = os.path.join(output_dir, "podcast_audio.wav")
        sf.write(output_file, audio, sr)

        return "Audio generated successfully!", output_file

    except Exception as e:
        # Top-level boundary for the UI handler: report instead of raising.
        return f"Error generating audio: {str(e)}", None
768
+
769
+
770
def synthesize_sync(article_input, input_type: str = "URL", mode: str = "Local"):
    """Blocking entry point that runs the ``synthesize`` coroutine to completion.

    Arguments are passed through unchanged; the coroutine's result is returned.
    """
    pending = synthesize(article_input, input_type, mode)
    return asyncio.run(pending)
773
+
774
+
775
def regenerate_audio_sync(
    conversation_text: str,
    ref_audio_path: str,
    exaggeration: float,
    temperature: float,
    seed_num: int,
    cfg_weight: float,
    chunk_size: int,
):
    """Blocking entry point that runs the ``regenerate_audio`` coroutine to completion.

    Arguments are passed through positionally and unchanged; the coroutine's
    result is returned.
    """
    pending = regenerate_audio(
        conversation_text,
        ref_audio_path,
        exaggeration,
        temperature,
        seed_num,
        cfg_weight,
        chunk_size,
    )
    return asyncio.run(pending)
778
+
779
+
780
def toggle_input_visibility(input_type):
    """Return visibility updates for the URL box, PDF uploader, and keyword box.

    Exactly one of the three is made visible: the URL box for "URL", the
    uploader for "PDF", and the keyword box for any other value. The updates
    are returned in that order.
    """
    show_url = bool(input_type == "URL")
    show_pdf = (not show_url) and bool(input_type == "PDF")
    show_keyword = not (show_url or show_pdf)
    return (
        gr.update(visible=show_url),
        gr.update(visible=show_pdf),
        gr.update(visible=show_keyword),
    )
788
+
789
+
790
def update_char_count(text, chunk_size):
    """Build the text-length summary string for the given conversation text.

    Args:
        text: Current conversation text; ``None`` is treated as empty.
        chunk_size: Maximum characters per chunk, forwarded to
            ``split_text_into_chunks`` when the text exceeds 300 characters.

    Returns:
        A summary with the character count and, for text longer than 300
        characters, the chunk count and a rough time estimate.
    """
    # A cleared textbox can be reported as None; len(None) would raise.
    text = text or ""
    char_len = len(text)
    if char_len <= 300:
        return f"{char_len} characters (single generation)"

    chunks = split_text_into_chunks(text, max_chars=chunk_size)
    chunk_count = len(chunks)
    estimated_time = chunk_count * 3  # rough estimate: about 3 seconds per chunk
    return f"{char_len} characters, {chunk_count} chunks (estimated time: ~{estimated_time}s)"
800
+
801
+
802
# Model initialization (at app startup): when the LLAMA_CPP_AVAILABLE flag is
# set, download the local model file up front.
if LLAMA_CPP_AVAILABLE:
    try:
        # Repo id and filename come from the shared converter's config; the
        # download is placed under ./models.
        model_path = hf_hub_download(
            repo_id=converter.config.local_model_repo,
            filename=converter.config.local_model_name,
            local_dir="./models"
        )
        print(f"Model downloaded to: {model_path}")
    except Exception as e:
        # Best-effort: a failed download is only logged, and module import
        # continues.
        print(f"Failed to download model at startup: {e}")
813
+
814
+
815
+ # Gradio Interface
816
+ with gr.Blocks(theme='soft', title="AI Podcast Generator", css="""
817
+ .container {max-width: 1200px; margin: auto; padding: 20px;}
818
+ .header-text {text-align: center; margin-bottom: 30px;}
819
+ .input-group {background: #f7f7f7; padding: 20px; border-radius: 10px; margin-bottom: 20px;}
820
+ .output-group {background: #f0f0f0; padding: 20px; border-radius: 10px;}
821
+ .status-box {background: #e8f4f8; padding: 15px; border-radius: 8px; margin-top: 10px;}
822
+ """) as demo:
823
+ with gr.Column(elem_classes="container"):
824
+ # Header
825
+ with gr.Row(elem_classes="header-text"):
826
+ gr.Markdown("""
827
+ # ๐ŸŽ™๏ธ LIVE Podcast Generator with Chatterbox TTS
828
+ ### Convert any article, blog, PDF document, or topic into an engaging professional podcast conversation!
829
+ """)
830
+
831
+ with gr.Row(elem_classes="discord-badge"):
832
+ gr.HTML("""
833
+ <p style="text-align: center;">
834
+ <a href="https://discord.gg/openfreeai" target="_blank" style="display: inline-block; margin-right: 10px;">
835
+ <img src="https://img.shields.io/static/v1?label=Discord&message=Openfree%20AI&color=%230000ff&labelColor=%23800080&logo=discord&logoColor=white&style=for-the-badge" alt="badge">
836
+ </a>
837
+ <a href="https://open.spotify.com/show/36GtIP7iqJxCwp7FfXmTYK?si=KsIsUJq7SJiiudPTaMsXAA" target="_blank" style="display: inline-block;">
838
+ <img src="https://img.shields.io/static/v1?label=Spotify&message=Podcast&color=%230000ff&labelColor=%23000080&logo=Spotify&logoColor=white&style=for-the-badge" alt="badge">
839
+ </a>
840
+ <a href="https://huggingface.co/spaces/openfree/AI-Podcast" target="_blank" style="display: inline-block;">
841
+ <img src="https://img.shields.io/static/v1?label=Huggingface&message=AI%20Podcast&color=%230000ff&labelColor=%23ffa500&logo=huggingface&logoColor=white&style=for-the-badge" alt="badge">
842
+ </a>
843
+ </p>
844
+ """)
845
+
846
+ # Status display section
847
+ with gr.Row():
848
+ with gr.Column(scale=1):
849
+ gr.Markdown(f"""
850
+ #### ๐Ÿค– System Status
851
+ - **LLM**: {converter.config.local_model_name.split('.')[0]}
852
+ - **Fallback**: {converter.config.api_model_name.split('/')[-1]}
853
+ - **Llama CPP**: {"โœ… Ready" if LLAMA_CPP_AVAILABLE else "โŒ Not Available"}
854
+ - **Chatterbox TTS**: {"โœ… Ready" if CHATTERBOX_AVAILABLE else "โŒ Not Available"}
855
+ - **Search**: {"โœ… Brave API" if BRAVE_KEY else "โŒ No API"}
856
+ """)
857
+ with gr.Column(scale=1):
858
+ gr.Markdown("""
859
+ #### ๐ŸŽ™๏ธ Chatterbox TTS Features
860
+ - **High Quality**: Neural voice synthesis
861
+ - **Voice Cloning**: Upload your reference audio
862
+ - **Unlimited Length**: Automatic text chunking
863
+ - **Professional Style**: Expert podcast discussions
864
+ """)
865
+
866
+ # Main input section
867
+ with gr.Group(elem_classes="input-group"):
868
+ with gr.Row():
869
+ # Left: input options
870
+ with gr.Column(scale=2):
871
+ # Input type selection
872
+ input_type_selector = gr.Radio(
873
+ choices=["URL", "PDF", "Keyword"],
874
+ value="URL",
875
+ label="๐Ÿ“ฅ Input Type",
876
+ info="Choose your content source"
877
+ )
878
+
879
+ # URL input
880
+ url_input = gr.Textbox(
881
+ label="๐Ÿ”— Article URL",
882
+ placeholder="Enter the article URL here...",
883
+ value="",
884
+ visible=True,
885
+ lines=2
886
+ )
887
+
888
+ # PDF upload
889
+ pdf_input = gr.File(
890
+ label="๐Ÿ“„ Upload PDF",
891
+ file_types=[".pdf"],
892
+ visible=False
893
+ )
894
+
895
+ # Keyword input
896
+ keyword_input = gr.Textbox(
897
+ label="๐Ÿ” Topic/Keyword",
898
+ placeholder="Enter a topic (e.g., 'AI trends 2024', 'quantum computing')",
899
+ value="",
900
+ visible=False,
901
+ info="System will search and compile latest information",
902
+ lines=2
903
+ )
904
+
905
+ # Right: settings options
906
+ with gr.Column(scale=1):
907
+ # ์ฒ˜๋ฆฌ ๋ชจ๋“œ
908
+ mode_selector = gr.Radio(
909
+ choices=["Local", "API"],
910
+ value="Local",
911
+ label="โš™๏ธ Processing Mode",
912
+ info="Local: On-device | API: Cloud"
913
+ )
914
+
915
+ # Generate button
916
+ with gr.Row():
917
+ convert_btn = gr.Button(
918
+ "๐ŸŽฏ Generate Professional Conversation",
919
+ variant="primary",
920
+ size="lg",
921
+ scale=1
922
+ )
923
+
924
+ # TTS settings section
925
+ with gr.Group(elem_classes="input-group"):
926
+ gr.Markdown("### ๐ŸŽ™๏ธ Chatterbox TTS Settings")
927
+
928
+ with gr.Row():
929
+ with gr.Column(scale=2):
930
+ ref_audio = gr.Audio(
931
+ sources=["upload", "microphone"],
932
+ type="filepath",
933
+ label="Reference Audio File (Upload your voice)",
934
+ value="https://storage.googleapis.com/chatterbox-demo-samples/prompts/female_shadowheart4.flac",
935
+ info="Upload your voice sample for voice cloning"
936
+ )
937
+
938
+ with gr.Column(scale=1):
939
+ exaggeration = gr.Slider(
940
+ 0.25, 2, step=.05,
941
+ label="Exaggeration (Neutral = 0.5)",
942
+ value=.5
943
+ )
944
+ cfg_weight = gr.Slider(
945
+ 0.2, 1, step=.05,
946
+ label="CFG/Pace",
947
+ value=0.5
948
+ )
949
+ chunk_size = gr.Slider(
950
+ 100, 300, step=50,
951
+ label="Chunk Size (characters)",
952
+ value=250,
953
+ info="Text chunking for long conversations"
954
+ )
955
+
956
+ with gr.Accordion("Advanced Options", open=False):
957
+ seed_num = gr.Number(value=0, label="Random seed (0 for random)")
958
+ temperature = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
959
+
960
+ # Output section
961
+ with gr.Group(elem_classes="output-group"):
962
+ with gr.Row():
963
+ # Left: conversation text
964
+ with gr.Column(scale=3):
965
+ conversation_output = gr.Textbox(
966
+ label="๐Ÿ’ฌ Generated Professional Conversation (Editable)",
967
+ lines=25,
968
+ max_lines=50,
969
+ interactive=True,
970
+ placeholder="Professional podcast conversation will appear here...",
971
+ info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
972
+ )
973
+
974
+ # Text length display
975
+ char_count = gr.Textbox(
976
+ label="Text Information",
977
+ value="0 characters",
978
+ interactive=False
979
+ )
980
+
981
+ # Audio generation button
982
+ with gr.Row():
983
+ generate_audio_btn = gr.Button(
984
+ "๐ŸŽ™๏ธ Generate Audio with Chatterbox TTS",
985
+ variant="secondary",
986
+ size="lg"
987
+ )
988
+
989
+ # Right: audio output and status
990
+ with gr.Column(scale=2):
991
+ audio_output = gr.Audio(
992
+ label="๐ŸŽง Professional Podcast Audio",
993
+ type="filepath",
994
+ interactive=False
995
+ )
996
+
997
+ status_output = gr.Textbox(
998
+ label="๐Ÿ“Š Status",
999
+ interactive=False,
1000
+ lines=3,
1001
+ elem_classes="status-box"
1002
+ )
1003
+
1004
+ # ๋„์›€๋ง
1005
+ gr.Markdown("""
1006
+ #### ๐Ÿ’ก Quick Tips:
1007
+ - **URL**: Paste any article link
1008
+ - **PDF**: Upload documents directly
1009
+ - **Keyword**: Enter topics for AI research
1010
+ - **Voice Cloning**: Upload reference audio
1011
+ - Edit conversation before audio generation
1012
+ - Longer text automatically chunked
1013
+ """)
1014
+
1015
+ # Examples section
1016
+ with gr.Accordion("๐Ÿ“š Examples", open=False):
1017
# Each example row supplies: input type, URL text, keyword text, mode.
# Keyword examples are loaded into ``keyword_input`` (not ``url_input``),
# because that is the widget the convert button reads in Keyword mode;
# loading them into the URL box left the keyword box empty and made
# generation fail with "Please provide a keyword or topic."
gr.Examples(
    examples=[
        ["URL", "https://huggingface.co/blog/openfreeai/cycle-navigator", "", "Local"],
        ["Keyword", "", "quantum computing breakthroughs", "Local"],
        ["URL", "https://huggingface.co/papers/2505.14810", "", "Local"],
        ["Keyword", "", "artificial intelligence ethics", "Local"],
    ],
    inputs=[input_type_selector, url_input, keyword_input, mode_selector],
    outputs=[conversation_output, status_output],
    # Adapter so the four example columns map onto synthesize_sync's
    # (article_input, input_type, mode) signature.
    fn=lambda input_type, url_text, keyword_text, mode: synthesize_sync(
        url_text if input_type == "URL" else keyword_text, input_type, mode
    ),
    cache_examples=False,
)
1029
+
1030
# Input type change handler: show only the input widget that matches the
# selected input type.
input_type_selector.change(
    fn=toggle_input_visibility,
    inputs=[input_type_selector],
    outputs=[url_input, pdf_input, keyword_input]
)

# Update the character count summary when the conversation text changes.
conversation_output.change(
    fn=update_char_count,
    inputs=[conversation_output, chunk_size],
    outputs=[char_count]
)

# Recompute the same summary when the chunk size changes, since it is passed
# to update_char_count as well.
chunk_size.change(
    fn=update_char_count,
    inputs=[conversation_output, chunk_size],
    outputs=[char_count]
)
1049
+
1050
+ # Event wiring
1051
def get_article_input(input_type, url_input, pdf_input, keyword_input):
    """Pick the widget value that corresponds to ``input_type``.

    "URL" selects ``url_input`` and "PDF" selects ``pdf_input``; any other
    value is treated as keyword mode and selects ``keyword_input``.
    """
    if input_type == "PDF":
        return pdf_input
    return url_input if input_type == "URL" else keyword_input
1059
+
1060
# Generate the conversation: pick the value of the widget that matches the
# selected input type, then run synthesis on it.
# NOTE(review): synthesize returns its validation/error text as the first
# tuple element, so failures appear in the conversation box rather than in
# the status box — confirm this is intended.
convert_btn.click(
    fn=lambda input_type, url_input, pdf_input, keyword_input, mode: synthesize_sync(
        get_article_input(input_type, url_input, pdf_input, keyword_input), input_type, mode
    ),
    inputs=[input_type_selector, url_input, pdf_input, keyword_input, mode_selector],
    outputs=[conversation_output, status_output]
)

# Generate audio from the current (editable) conversation text using the TTS
# settings widgets; the result is a status message plus an audio file path.
generate_audio_btn.click(
    fn=regenerate_audio_sync,
    inputs=[conversation_output, ref_audio, exaggeration, temperature, seed_num, cfg_weight, chunk_size],
    outputs=[status_output, audio_output]
)
1073
 
 
 
1074
 
1075
# Launch the app (only when this file is executed as a script).
if __name__ == "__main__":
    # Enable the request queue (default concurrency limit of 10, API access
    # left open), then serve on all network interfaces at port 7860 without
    # creating a public share link.
    demo.queue(api_open=True, default_concurrency_limit=10).launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )