openfree committed on
Commit
9e2a211
·
verified ·
1 Parent(s): e788f00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +422 -0
app.py CHANGED
@@ -1,2 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
 
1
+ import gradio as gr
2
+ import os
3
+ import asyncio
4
+ import torch
5
+ import io
6
+ import json
7
+ import re
8
+ import httpx
9
+ import tempfile
10
+ import wave
11
+ import base64
12
+ from dataclasses import dataclass
13
+ from typing import List, Tuple, Dict, Optional
14
+ from pathlib import Path
15
+ from threading import Thread
16
+ from dotenv import load_dotenv
17
 
18
+ # Edge TTS imports
19
+ import edge_tts
20
+ from pydub import AudioSegment
21
+
22
+ # OpenAI imports
23
+ from openai import OpenAI
24
+
25
+ # Transformers imports (for local mode)
26
+ from transformers import (
27
+ AutoModelForCausalLM,
28
+ AutoTokenizer,
29
+ TextIteratorStreamer,
30
+ BitsAndBytesConfig,
31
+ )
32
+
33
+ # MeloTTS imports (for local mode)
34
# Optional MeloTTS support. MELO_AVAILABLE records whether the import worked
# so the rest of the module can degrade gracefully when it did not.
try:
    # Download the unidic dictionary before importing melo
    # (presumably a MeloTTS runtime requirement -- TODO confirm).
    os.system("python -m unidic download")
    from melo.api import TTS as MeloTTS
    MELO_AVAILABLE = True
except Exception:
    # `except Exception` instead of a bare `except:` so that Ctrl-C and
    # SystemExit during start-up are not silently swallowed.
    MeloTTS = None  # keep the name defined for code that guards on MELO_AVAILABLE
    MELO_AVAILABLE = False

# Load variables from a local .env file into the process environment.
load_dotenv()
42
+
43
+
44
@dataclass
class ConversationConfig:
    """Tunable settings shared by the conversion pipeline."""

    # Upper bound on the number of whitespace-separated words sent to the LLM.
    max_words: int = 6000
    # Prefix prepended to the article URL before it is fetched.
    prefix_url: str = "https://r.jina.ai/"
    # Model identifier passed to the remote chat-completion endpoint.
    model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    # Hugging Face model identifier loaded for local generation.
    local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
50
+
51
+
52
class UnifiedAudioConverter:
    """Convert article text into a two-voice podcast script and audio file.

    The dialogue is produced either by a remote chat-completion endpoint
    (``initialize_api_mode``) or by a locally loaded causal language model
    (``initialize_local_mode``). Speech is synthesized with Edge TTS or,
    when the optional ``melo`` import succeeded, with MeloTTS.
    """

    def __init__(self, config: "ConversationConfig"):
        """Store the configuration; models and clients are created lazily."""
        self.config = config
        self.llm_client = None
        self.local_model = None
        self.tokenizer = None
        self.melo_models = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def initialize_api_mode(self, api_key: str):
        """Initialize API mode with Together API"""
        self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

    def initialize_local_mode(self):
        """Initialize local mode with Hugging Face model.

        Model and tokenizer are checked independently so that a failure while
        loading one of them is retried on the next call instead of leaving
        the converter permanently half-initialized.
        """
        if self.local_model is None:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            )
            self.local_model = AutoModelForCausalLM.from_pretrained(
                self.config.local_model_name,
                quantization_config=quantization_config,
            )
        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.local_model_name,
                revision='8ab73a6800796d84448bc936db9bac5ad9f984ae',
            )

        self._ensure_melo_models()

    def _ensure_melo_models(self) -> bool:
        """Create the MeloTTS model on first use; return False if unavailable."""
        if not MELO_AVAILABLE:
            return False
        if self.melo_models is None:
            self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
        return True

    def fetch_text(self, url: str) -> str:
        """Fetch text content from URL.

        Raises:
            ValueError: if ``url`` is empty or is not an http(s) URL.
            RuntimeError: if the HTTP request fails.
        """
        if not url:
            raise ValueError("URL cannot be empty")

        if not url.startswith(("http://", "https://")):
            raise ValueError("URL must start with 'http://' or 'https://'")

        full_url = f"{self.config.prefix_url}{url}"
        try:
            response = httpx.get(full_url, timeout=60.0)
            response.raise_for_status()
            return response.text
        except httpx.HTTPError as e:
            raise RuntimeError(f"Failed to fetch URL: {e}") from e

    def _build_prompt(self, text: str) -> str:
        """Build prompt for conversation generation"""
        template = """
        {
            "conversation": [
                {"speaker": "", "text": ""},
                {"speaker": "", "text": ""}
            ]
        }
        """
        return (
            f"{text}\n\nConvert the provided text into a short, informative and crisp "
            f"podcast conversation between two experts. The tone should be "
            f"professional and engaging. Please adhere to the following "
            f"format and return ONLY the JSON:\n{template}"
        )

    @staticmethod
    def _extract_conversation_json(raw: Optional[str]) -> Optional[Dict]:
        """Return the first JSON object in ``raw`` that has a ``conversation`` list.

        A real JSON decoder is tried at every ``{`` instead of a regular
        expression, so braces inside string values and arbitrary nesting are
        handled. Returns ``None`` when no such object can be decoded.
        """
        if not raw:
            return None

        decoder = json.JSONDecoder()
        start = raw.find("{")
        while start != -1:
            try:
                candidate, _ = decoder.raw_decode(raw, start)
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict) and isinstance(candidate.get("conversation"), list):
                return candidate
            start = raw.find("{", start + 1)
        return None

    @staticmethod
    def _format_transcript(conversation_json: Dict) -> str:
        """Render the conversation as one ``Speaker: text`` line per turn."""
        return "\n".join(
            # `or` (not a .get default) so an empty speaker string also
            # falls back to a generated label.
            f"{turn.get('speaker') or f'Speaker {i+1}'}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

    def extract_conversation_api(self, text: str) -> Dict:
        """Extract conversation using API.

        Raises:
            RuntimeError: if API mode is not initialized, the request fails,
                or the response contains no usable conversation JSON.
        """
        if not self.llm_client:
            raise RuntimeError("API mode not initialized")

        try:
            chat_completion = self.llm_client.chat.completions.create(
                messages=[{"role": "user", "content": self._build_prompt(text)}],
                model=self.config.model_name,
            )

            conversation = self._extract_conversation_json(
                chat_completion.choices[0].message.content
            )
            if conversation is None:
                raise ValueError("No valid JSON found in response")

            return conversation
        except Exception as e:
            raise RuntimeError(f"Failed to extract conversation: {e}") from e

    def extract_conversation_local(self, text: str, progress=None) -> Dict:
        """Extract conversation using local model.

        Falls back to a two-line placeholder conversation when the generated
        text contains no decodable conversation JSON. ``progress`` is accepted
        for interface compatibility and is not used.

        Raises:
            RuntimeError: if local mode is not initialized.
        """
        if not self.local_model or not self.tokenizer:
            raise RuntimeError("Local mode not initialized")

        chat = [{
            "role": "user",
            "content": self._build_prompt(text)
        }]

        # Drop ids the tokenizer could not resolve so generate() never
        # receives None as a stop token.
        terminators = [
            token_id
            for token_id in (
                self.tokenizer.eos_token_id,
                self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
            )
            if token_id is not None
        ]

        messages = self.tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.tokenizer([messages], return_tensors="pt").to(self.device)

        streamer = TextIteratorStreamer(
            self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
        )

        generate_kwargs = dict(
            model_inputs,
            streamer=streamer,
            max_new_tokens=4000,
            do_sample=True,
            temperature=0.9,
            eos_token_id=terminators,
        )

        # generate() runs in a worker thread and feeds the streamer, which
        # this thread drains below.
        worker = Thread(target=self.local_model.generate, kwargs=generate_kwargs)
        worker.start()

        generated_text = "".join(streamer)
        # The stream is exhausted, so generation is finishing; wait for the
        # worker instead of leaving it dangling.
        worker.join()

        conversation = self._extract_conversation_json(generated_text)
        if conversation is not None:
            return conversation

        # Return a default template if no valid JSON found
        return {
            "conversation": [
                {"speaker": "Host", "text": "Welcome to our podcast."},
                {"speaker": "Guest", "text": "Thank you for having me."}
            ]
        }

    async def text_to_speech_edge(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[str, str]:
        """Convert text to speech using Edge TTS.

        Turns alternate between ``voice_1`` (even indices) and ``voice_2``.

        Returns:
            ``(path_to_combined_audio, transcript_text)``.

        Raises:
            RuntimeError: if any step of the synthesis fails.
        """
        # shutil.move (unlike os.rename) also works when the temp directory
        # and the output directory live on different filesystems.
        import shutil

        output_dir = Path(self._create_output_directory())
        filenames = []

        try:
            for i, turn in enumerate(conversation_json["conversation"]):
                filename = output_dir / f"output_{i}.wav"
                voice = voice_1 if i % 2 == 0 else voice_2

                tmp_path = await self._generate_audio_edge(turn["text"], voice)
                shutil.move(tmp_path, str(filename))
                filenames.append(str(filename))

            # Combine audio files
            final_output = os.path.join(output_dir, "combined_output.wav")
            self._combine_audio_files(filenames, final_output)

            # Generate conversation text
            conversation_text = self._format_transcript(conversation_json)

            return final_output, conversation_text
        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech: {e}") from e

    async def _generate_audio_edge(self, text: str, voice: str) -> str:
        """Generate audio using Edge TTS and return the temp file path.

        Raises:
            ValueError: if ``text`` is empty or whitespace only.
        """
        if not text.strip():
            raise ValueError("Text cannot be empty")

        # Accept "ShortName - description" style labels as well as bare names.
        voice_short_name = voice.split(" - ")[0]
        communicate = edge_tts.Communicate(text, voice_short_name)

        # Only reserve a unique path here; the handle is closed before the
        # audio is written so the file is never open twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        try:
            await communicate.save(tmp_path)
        except BaseException:
            # Do not leak the temp file on failure or cancellation.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

        return tmp_path

    def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
        """Convert text to speech using MeloTTS.

        The MeloTTS model is created on first use, so this also works when
        only API mode was initialized.

        Returns:
            ``(path_to_mp3, transcript_text)``.

        Raises:
            RuntimeError: if MeloTTS is not available.
        """
        if not self._ensure_melo_models() or not self.melo_models:
            raise RuntimeError("MeloTTS not available")

        model = self.melo_models["EN"]
        speakers = ["EN-Default", "EN-US"]
        pbar = progress.tqdm if progress else None
        combined_audio = AudioSegment.empty()

        for i, turn in enumerate(conversation_json["conversation"]):
            bio = io.BytesIO()
            speaker_id = model.hps.data.spk2id[speakers[i % 2]]

            # Generate audio
            model.tts_to_file(
                turn["text"], speaker_id, bio, speed=1.0,
                pbar=pbar,
                format="wav"
            )

            bio.seek(0)
            combined_audio += AudioSegment.from_file(bio, format="wav")

        # Save final audio in a per-call directory so concurrent requests do
        # not overwrite each other's output.
        final_audio_path = os.path.join(self._create_output_directory(), "final_podcast.mp3")
        combined_audio.export(final_audio_path, format="mp3")

        # Generate conversation text
        conversation_text = self._format_transcript(conversation_json)

        return final_audio_path, conversation_text

    def _create_output_directory(self) -> str:
        """Create a unique output directory and return its (relative) name."""
        # Hex instead of base64: the name can never start with "-" or
        # contain "=", both of which are awkward in paths and CLI arguments.
        folder_name = os.urandom(8).hex()
        os.makedirs(folder_name, exist_ok=True)
        return folder_name

    def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
        """Combine multiple audio files into one WAV file.

        The input files are deleted afterwards, whether or not combining
        succeeded.

        Raises:
            ValueError: if ``filenames`` is empty.
            RuntimeError: if loading or exporting the audio fails.
        """
        if not filenames:
            raise ValueError("No input files provided")

        try:
            audio_segments = [AudioSegment.from_file(filename) for filename in filenames]
            combined = sum(audio_segments)
            combined.export(output_file, format="wav")
        except Exception as e:
            raise RuntimeError(f"Failed to combine audio files: {e}") from e
        finally:
            # Clean up temporary files
            for filename in filenames:
                try:
                    os.remove(filename)
                except OSError:
                    pass
298
+
299
+
300
# Module-wide converter shared by every request; it starts with the default
# configuration and loads clients/models lazily.
converter = UnifiedAudioConverter(config=ConversationConfig())
302
+
303
+
304
async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
    """Run the full URL -> conversation -> audio pipeline.

    Args:
        article_url: http(s) URL of the article to convert.
        mode: ``"API"`` for the remote LLM; any other value uses the local model.
        tts_engine: ``"Edge-TTS"``; any other value selects MeloTTS.

    Returns:
        ``(conversation_text, audio_path)`` on success, or
        ``(error_message, None)`` on failure. Errors are reported through the
        return value rather than raised so the UI can display them.
    """
    article_url = (article_url or "").strip()
    if not article_url:
        return "Please provide a valid URL.", None

    # Validate cheap preconditions first so no fetch or LLM call is wasted on
    # a request that cannot complete.
    use_edge_tts = tts_engine == "Edge-TTS"
    if not use_edge_tts and not MELO_AVAILABLE:
        return "MeloTTS not available. Please install required dependencies.", None

    api_key = None
    if mode == "API":
        api_key = os.environ.get("TOGETHER_API_KEY")
        if not api_key:
            return "API key not found. Please set TOGETHER_API_KEY environment variable.", None

    try:
        # Fetch text from URL
        text = converter.fetch_text(article_url)

        # Limit text to max words
        words = text.split()
        if len(words) > converter.config.max_words:
            text = " ".join(words[:converter.config.max_words])

        # Extract conversation based on mode
        if mode == "API":
            converter.initialize_api_mode(api_key)
            conversation_json = converter.extract_conversation_api(text)
        else:  # Local mode
            converter.initialize_local_mode()
            conversation_json = converter.extract_conversation_local(text)

        # Generate audio based on TTS engine
        if use_edge_tts:
            output_file, conversation_text = await converter.text_to_speech_edge(
                conversation_json,
                "en-US-AvaMultilingualNeural",
                "en-US-AndrewMultilingualNeural"
            )
        else:  # MeloTTS
            output_file, conversation_text = converter.text_to_speech_melo(
                conversation_json
            )

        return conversation_text, output_file

    except Exception as e:
        return f"Error: {str(e)}", None
347
+
348
+
349
def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
    """Blocking entry point: drive the async ``synthesize`` to completion."""
    pipeline = synthesize(article_url, mode, tts_engine)
    return asyncio.run(pipeline)
352
+
353
+
354
+ # Gradio Interface
355
with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
    # Page header.
    gr.Markdown("# 🎙️ URL to Podcast Converter")
    gr.Markdown("Convert any article, blog, or news into an engaging podcast conversation!")

    # Input row: the URL box on the left, the two option selectors on the right.
    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="Article URL",
                placeholder="Enter the article URL here...",
                value="",
            )
        with gr.Column(scale=1):
            mode_selector = gr.Radio(
                choices=["API", "Local"],
                value="API",
                label="Processing Mode",
                info="API: Faster, requires API key | Local: Slower, runs on device",
            )
            tts_selector = gr.Radio(
                choices=["Edge-TTS", "MeloTTS"],
                value="Edge-TTS",
                label="TTS Engine",
                info="Edge-TTS: More natural | MeloTTS: Requires GPU",
            )

    convert_btn = gr.Button("🎯 Convert to Podcast", variant="primary", size="lg")

    # Output row: the transcript next to the audio player.
    with gr.Row():
        with gr.Column():
            conversation_output = gr.Textbox(
                label="Generated Conversation",
                lines=15,
                max_lines=30,
                interactive=False,
            )
        with gr.Column():
            audio_output = gr.Audio(
                label="Podcast Audio",
                type="filepath",
                interactive=False,
            )

    # The examples and the button drive the same function with the same wiring.
    pipeline_inputs = [url_input, mode_selector, tts_selector]
    pipeline_outputs = [conversation_output, audio_output]
    example_rows = [
        ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
        ["https://www.bbc.com/news/technology-67988517", "API", "Edge-TTS"],
    ]

    gr.Examples(
        examples=example_rows,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
        fn=synthesize_sync,
        cache_examples=False,
    )

    convert_btn.click(
        fn=synthesize_sync,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
    )
413
+
414
+
415
+
416
+ # Launch the app
417
if __name__ == "__main__":
    # Enable the request queue, then serve on all interfaces at port 7860.
    queued_demo = demo.queue(api_open=True, default_concurrency_limit=10)
    queued_demo.launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860,
    )
424