Rausda6 committed on
Commit
4c6c365
·
verified ·
1 Parent(s): 132e1a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -38
app.py CHANGED
@@ -14,6 +14,7 @@ import mimetypes
14
  from typing import List
15
 
16
  from PyPDF2 import PdfReader
 
17
 
18
  # Define model name clearly
19
  MODEL_NAME = "unsloth/gemma-3-1b-pt"
@@ -39,7 +40,19 @@ class PodcastGenerator:
39
 
40
  async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None):
41
  example = """
42
- {...}
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
44
  if language == "Auto Detect":
45
  language_instruction = "- The podcast MUST be in the same language as the user input."
@@ -47,11 +60,18 @@ class PodcastGenerator:
47
  language_instruction = f"- The podcast MUST be in {language} language"
48
 
49
  system_prompt = f"""
50
- You are a professional podcast generator...
51
  {language_instruction}
 
 
 
 
 
 
52
  Follow this example structure:
53
  {example}
54
  """
 
55
  if prompt and file_obj:
56
  user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
57
  elif prompt:
@@ -59,10 +79,11 @@ Follow this example structure:
59
  else:
60
  user_prompt = "Please generate a podcast script based on the uploaded file."
61
 
 
62
  if file_obj:
63
  file_size = getattr(file_obj, 'size', os.path.getsize(file_obj.name))
64
  if file_size > MAX_FILE_SIZE_BYTES:
65
- raise Exception("File size exceeds limit.")
66
  ext = os.path.splitext(file_obj.name)[1].lower()
67
  if ext == '.pdf':
68
  reader = PdfReader(file_obj)
@@ -73,54 +94,147 @@ Follow this example structure:
73
  user_prompt += f"\n\n―― FILE CONTENT ――\n{text}"
74
 
75
  prompt_text = system_prompt + "\n" + user_prompt
76
-
77
  try:
78
- if progress: progress(0.3, "Generating podcast script...")
 
79
  def hf_generate(p):
80
  inputs = tokenizer(p, return_tensors="pt").to(model.device)
81
- outs = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=1.0)
 
 
 
 
 
82
  return tokenizer.decode(outs[0], skip_special_tokens=True)
83
- generated_text = await asyncio.wait_for(asyncio.to_thread(hf_generate, prompt_text), timeout=60)
 
 
 
84
  except asyncio.TimeoutError:
85
- raise Exception("Script generation timed out.")
86
  except Exception as e:
87
- raise Exception(f"Failed to generate script: {e}")
88
- if progress: progress(0.4, "Script generated successfully!")
89
- return json.loads(generated_text)
90
 
91
- # ... TTS and combine_audio_files methods unchanged ...
 
92
 
93
- async def process_input(input_text, input_file, language, speaker1, speaker2, api_key="", progress=None):
94
- # Implementation unchanged
95
- ...
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # Gradio UI
98
  with gr.Blocks(title="PodcastGen 🎙️") as demo:
99
  gr.Markdown("""
100
- # PodcastGen 🎙️
101
- Generate a 2-speaker podcast from text or PDF!
102
- """
103
- )
104
  with gr.Row():
105
  with gr.Column():
106
- input_text = gr.Textbox(...)
107
- input_file = gr.File(...)
108
  with gr.Column():
109
- language = gr.Dropdown(...)
110
- speaker1 = gr.Dropdown(...)
111
- speaker2 = gr.Dropdown(...)
112
- api_key = gr.Textbox(...)
113
-
114
  generate_btn = gr.Button("Generate Podcast 🎙️", variant="primary")
115
- output_audio = gr.Audio(...)
116
-
117
- # Bind async function directly
118
- generate_btn.click(
119
- fn=process_input,
120
- inputs=[input_text, input_file, language, speaker1, speaker2, api_key],
121
- outputs=output_audio,
122
- show_progress=True
123
- )
124
-
125
- demo.queue()
126
- demo.launch(server_name="0.0.0.0", share=True, debug=True)
 
14
  from typing import List
15
 
16
  from PyPDF2 import PdfReader
17
+ from pydub import AudioSegment
18
 
19
  # Define model name clearly
20
  MODEL_NAME = "unsloth/gemma-3-1b-pt"
 
40
 
41
  async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None):
42
  example = """
43
+ {
44
+ "topic": "AGI",
45
+ "podcast": [
46
+ {"speaker": 2, "line": "So, AGI, huh? Seems like everyone's talking about it these days."},
47
+ {"speaker": 1, "line": "Yeah, it's definitely having a moment, isn't it?"},
48
+ {"speaker": 2, "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"},
49
+ {"speaker": 1, "line": "It's easy to get lost in the noise, for sure."},
50
+ {"speaker": 2, "line": "Exactly. So how about we try to cut through some of that, shall we?"},
51
+ {"speaker": 1, "line": "Sounds like a plan."},
52
+ {"speaker": 2, "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."},
53
+ {"speaker": 1, "line": "Peace."}
54
+ ]
55
+ }
56
  """
57
  if language == "Auto Detect":
58
  language_instruction = "- The podcast MUST be in the same language as the user input."
 
60
  language_instruction = f"- The podcast MUST be in {language} language"
61
 
62
  system_prompt = f"""
63
+ You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
64
  {language_instruction}
65
+ - The podcast should have 2 speakers.
66
+ - The podcast should be long.
67
+ - Do not use names for the speakers.
68
+ - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
69
+ - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
70
+ - The script must be in JSON format.
71
  Follow this example structure:
72
  {example}
73
  """
74
+ # Build the user prompt
75
  if prompt and file_obj:
76
  user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
77
  elif prompt:
 
79
  else:
80
  user_prompt = "Please generate a podcast script based on the uploaded file."
81
 
82
+ # Handle file content
83
  if file_obj:
84
  file_size = getattr(file_obj, 'size', os.path.getsize(file_obj.name))
85
  if file_size > MAX_FILE_SIZE_BYTES:
86
+ raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")
87
  ext = os.path.splitext(file_obj.name)[1].lower()
88
  if ext == '.pdf':
89
  reader = PdfReader(file_obj)
 
94
  user_prompt += f"\n\n―― FILE CONTENT ――\n{text}"
95
 
96
  prompt_text = system_prompt + "\n" + user_prompt
 
97
  try:
98
+ if progress:
99
+ progress(0.3, "Generating podcast script...")
100
  def hf_generate(p):
101
  inputs = tokenizer(p, return_tensors="pt").to(model.device)
102
+ outs = model.generate(
103
+ **inputs,
104
+ max_new_tokens=1024,
105
+ do_sample=True,
106
+ temperature=1.0
107
+ )
108
  return tokenizer.decode(outs[0], skip_special_tokens=True)
109
+ generated_text = await asyncio.wait_for(
110
+ asyncio.to_thread(hf_generate, prompt_text),
111
+ timeout=60
112
+ )
113
  except asyncio.TimeoutError:
114
+ raise Exception("The script generation request timed out. Please try again later.")
115
  except Exception as e:
116
+ raise Exception(f"Failed to generate podcast script: {e}")
 
 
117
 
118
+ if progress:
119
+ progress(0.4, "Script generated successfully!")
120
 
121
+ return json.loads(generated_text)
 
 
122
 
123
async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
    """Synthesize one script line to a temporary audio file with edge-tts.

    Args:
        text: The line to speak.
        speaker: Speaker number from the script; 1 selects ``speaker1``,
            any other value selects ``speaker2``.
        speaker1: Voice passed to ``edge_tts.Communicate`` for speaker 1.
        speaker2: Voice passed to ``edge_tts.Communicate`` otherwise.

    Returns:
        Name of the temporary file that was written. The name has no
        directory component, so it is relative to the working directory.

    Raises:
        Exception: If saving the audio takes longer than 30 seconds.
        Any other error raised while saving is propagated unchanged.
    """
    voice = speaker1 if speaker == 1 else speaker2
    speech = edge_tts.Communicate(text, voice)
    # NOTE(review): the file is named ".wav", but edge-tts presumably writes
    # compressed audio; the combiner decodes it by content -- confirm.
    temp_filename = f"temp_{uuid.uuid4()}.wav"
    try:
        await asyncio.wait_for(speech.save(temp_filename), timeout=30)
    except BaseException as exc:
        # BaseException on purpose: asyncio.CancelledError (raised when an
        # outer timeout cancels this task) is not an Exception subclass, and
        # a partially written file must not be left behind in that case.
        if os.path.exists(temp_filename):
            os.remove(temp_filename)
        if isinstance(exc, asyncio.TimeoutError):
            raise Exception("Text-to-speech generation timed out. Please try with a shorter text.") from exc
        raise
    return temp_filename
136
+
137
async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
    """Concatenate the per-line audio files into one WAV file.

    Args:
        audio_files: Paths of the segment files, in playback order. Every
            listed file is deleted before this method returns or raises.
        progress: Optional callable taking ``(fraction, message)``.

    Returns:
        Name of the exported WAV file (``output_<uuid>.wav``).

    Raises:
        Whatever ``AudioSegment.from_file`` or ``export`` raises; the input
        segment files are removed even when decoding fails part-way through.
    """
    if progress:
        progress(0.9, "Combining audio files...")
    combined_audio = AudioSegment.empty()
    try:
        for audio_file in audio_files:
            combined_audio += AudioSegment.from_file(audio_file)
    finally:
        # Clean up every segment, not only the ones decoded so far; a failure
        # in the middle of the loop previously left the remaining files on disk.
        for audio_file in audio_files:
            if os.path.exists(audio_file):
                os.remove(audio_file)
    output_filename = f"output_{uuid.uuid4()}.wav"
    combined_audio.export(output_filename, format="wav")
    if progress:
        progress(1.0, "Podcast generated successfully!")
    return output_filename
147
+
148
async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
    """Run the whole script -> speech -> merge pipeline under a 10-minute cap.

    Args:
        input_text: User prompt or pasted source text.
        language: Language choice forwarded to script generation.
        speaker1: Voice used for speaker 1.
        speaker2: Voice used for the other speaker.
        api_key: Forwarded unchanged to the internal pipeline.
        file_obj: Optional uploaded file forwarded to script generation.
        progress: Optional callable taking ``(fraction, message)``.

    Returns:
        Whatever ``_generate_podcast_internal`` returns (the output file name).

    Raises:
        Exception: On timeout (600 s) or on any failure in the pipeline. The
            original error is attached as ``__cause__`` so the traceback
            still shows where it came from.
    """
    try:
        if progress:
            progress(0.1, "Starting podcast generation...")
        return await asyncio.wait_for(
            self._generate_podcast_internal(input_text, language, speaker1, speaker2, api_key, file_obj, progress),
            timeout=600,
        )
    except asyncio.TimeoutError as exc:
        raise Exception("The podcast generation process timed out. Please try with shorter text or try again later.") from exc
    except Exception as exc:
        raise Exception(f"Error generating podcast: {str(exc)}") from exc
159
+
160
+ async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
161
+ if progress: progress(0.2, "Generating podcast script...")
162
+ podcast_json = await self.generate_script(input_text, language, api_key, file_obj, progress)
163
+ if progress: progress(0.5, "Converting text to speech...")
164
+ audio_files = []
165
+ total_lines = len(podcast_json['podcast'])
166
+ batch_size = 10
167
+ for batch_start in range(0, total_lines, batch_size):
168
+ batch_end = min(batch_start + batch_size, total_lines)
169
+ batch = podcast_json['podcast'][batch_start:batch_end]
170
+ tts_tasks = [self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in batch]
171
+ try:
172
+ batch_results = await asyncio.gather(*tts_tasks, return_exceptions=True)
173
+ for result in batch_results:
174
+ if isinstance(result, Exception):
175
+ for file in audio_files:
176
+ if os.path.exists(file): os.remove(file)
177
+ raise Exception(f"Error generating speech: {str(result)}")
178
+ audio_files.append(result)
179
+ if progress:
180
+ progress(0.5 + (0.4 * (batch_end / total_lines)), f"Processed {batch_end}/{total_lines} speech segments...")
181
+ except Exception as e:
182
+ for file in audio_files:
183
+ if os.path.exists(file): os.remove(file)
184
+ raise Exception(f"Error in batch TTS generation: {str(e)}")
185
+ combined = await self.combine_audio_files(audio_files, progress)
186
+ return combined
187
+
188
async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "", progress=None) -> str:
    """Gradio click handler: turn the form values into a podcast audio file.

    Args:
        input_text: Text entered by the user (topic or pasted content).
        input_file: Optional uploaded file, passed through to the generator.
        language: Selected podcast language, passed through to the generator.
        speaker1: Display name of the voice for speaker 1; must be a key of
            the voice table below.
        speaker2: Display name of the voice for speaker 2; same constraint.
        api_key: Optional key from the form; a placeholder is substituted
            when it is empty.
        progress: Optional callable taking ``(fraction, message)``.

    Returns:
        The value returned by ``PodcastGenerator.generate_podcast``.

    Raises:
        KeyError: If either speaker display name is not in the voice table.
        Exception: For any failure during generation, with a message that
            distinguishes rate limiting and timeouts from other errors.
    """
    # NOTE(review): Gradio appears to inject a progress tracker only when the
    # default is a gr.Progress() instance; with None no tracker is passed -- confirm.
    start_time = time.time()
    voice_names = {
        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural",
    }
    speaker1 = voice_names[speaker1]
    speaker2 = voice_names[speaker2]
    try:
        if progress:
            progress(0.05, "Processing input...")
        if not api_key:
            # Placeholder kept from the original code. The former
            # "No API key provided" check that followed it could never fire
            # and has been removed.
            api_key = "saf"
        generator = PodcastGenerator()
        # The original split the identifier `language` across two lines
        # ("lan" / "guage"), which is a syntax error; the call is restored.
        output = await generator.generate_podcast(
            input_text, language, speaker1, speaker2, api_key, input_file, progress
        )
        print(f"Total podcast generation time: {time.time() - start_time:.2f} seconds")
        return output
    except Exception as e:
        msg = str(e)
        lowered = msg.lower()
        if "rate limit" in lowered:
            raise Exception("Rate limit exceeded. Please try again later or use your own API key.") from e
        # The app's own timeout errors say "timed out", which the original
        # "timeout" substring test never matched.
        if "timeout" in lowered or "timed out" in lowered:
            raise Exception("The request timed out... Please try with shorter text.") from e
        raise Exception(f"Error: {msg}") from e
221
  # Gradio UI: a two-column form. The first column holds the text box and the
  # PDF/TXT upload; the second holds the language, the two speaker voices, and
  # the optional API key. The button below runs process_input on those inputs.
222
  with gr.Blocks(title="PodcastGen 🎙️") as demo:
223
  gr.Markdown("""
224
+ # PodcastGen 🎙️
225
+ Generate a 2-speaker podcast from text or PDF!
226
+ """ )
 
227
  with gr.Row():
228
  with gr.Column():
229
+ input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Enter podcast topic or paste text here...", elem_id="input_text")
230
+ input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"] )
231
  with gr.Column():
232
+ language = gr.Dropdown(label="Podcast Language", choices=["Auto Detect","English","German","French","Spanish","Italian","Dutch","Portuguese","Russian","Chinese","Japanese","Korean","Other" ], value="Auto Detect")
233
  # The voice labels in both speaker dropdowns must match the keys of
  # voice_names in process_input, which indexes that table by the label.
+ speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=["Andrew - English (United States)","Ava - English (United States)","Brian - English (United States)","Emma - English (United States)","Florian - German (Germany)","Seraphina - German (Germany)","Remy - French (France)","Vivienne - French (France)" ], value="Andrew - English (United States)")
234
+ speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=["Andrew - English (United States)","Ava - English (United States)","Brian - English (United States)","Emma - English (United States)","Florian - German (Germany)","Seraphina - German (Germany)","Remy - French (France)","Vivienne - French (France)" ], value="Ava - English (United States)")
235
+ api_key = gr.Textbox(label="Gemini API Key (Optional)", type="password", placeholder="Needed only if you're getting rate limited.")
 
236
  generate_btn = gr.Button("Generate Podcast 🎙️", variant="primary")
237
+ output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav", elem_id="output_audio")
238
  # The six inputs are passed positionally, in this order, to process_input;
  # its return value is sent to the output_audio component.
+ generate_btn.click(fn=process_input, inputs=[input_text, input_file, language, speaker1, speaker2, api_key], outputs=output_audio, show_progress=True)
239
  # queue() is called before launch(); launch uses host 0.0.0.0 with share
  # and debug both enabled.
+ demo.queue()
240
+ demo.launch(server_name="0.0.0.0", share=True, debug=True)