bluenevus committed on
Commit
83886ae
·
verified ·
1 Parent(s): 88fa21c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -26
app.py CHANGED
@@ -58,7 +58,7 @@ def generate_podcast_script(api_key, prompt, uploaded_file, duration, num_hosts)
58
  Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
59
 
60
  Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
61
- {'Make sure the script is a monologue for one person.' if num_hosts == 1 else 'Ensure the dialogue alternates between two distinct voices.'}
62
  """
63
 
64
  response = model.generate_content(prompt)
@@ -131,35 +131,47 @@ def redistribute_codes(code_list, snac_model):
131
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
132
 
133
  @spaces.GPU()
134
- def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
135
  if not text.strip():
136
  return None
137
 
138
  try:
139
  progress(0.1, "Processing text...")
140
- input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
 
141
 
142
- progress(0.3, "Generating speech tokens...")
143
- with torch.no_grad():
144
- generated_ids = model.generate(
145
- input_ids,
146
- attention_mask=attention_mask,
147
- do_sample=True,
148
- temperature=temperature,
149
- top_p=top_p,
150
- repetition_penalty=repetition_penalty,
151
- max_new_tokens=max_new_tokens,
152
- num_return_sequences=1,
153
- eos_token_id=128258,
154
- )
155
-
156
- progress(0.6, "Processing speech tokens...")
157
- code_list = parse_output(generated_ids)
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- progress(0.8, "Converting to audio...")
160
- audio_samples = redistribute_codes(code_list, snac_model)
161
 
162
- return (24000, audio_samples)
163
  except Exception as e:
164
  print(f"Error generating speech: {e}")
165
  return None
@@ -176,11 +188,17 @@ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
176
 
177
  with gr.Column(scale=2):
178
  script_output = gr.Textbox(label="Generated Script", lines=10)
179
- voice = gr.Dropdown(
180
  choices=VOICES,
181
  value="tara",
182
- label="Voice",
183
- info="Select the voice for speech generation"
 
 
 
 
 
 
184
  )
185
 
186
  with gr.Accordion("Advanced Settings", open=False):
@@ -220,7 +238,7 @@ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
220
 
221
  submit_btn.click(
222
  fn=generate_speech,
223
- inputs=[script_output, voice, temperature, top_p, repetition_penalty, max_new_tokens],
224
  outputs=audio_output
225
  )
226
 
 
58
  Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
59
 
60
  Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
61
+ {'Make sure the script is a monologue for one person.' if num_hosts == 1 else 'Ensure the dialogue alternates between two distinct voices, with one speaking on odd-numbered lines and the other on even-numbered lines.'}
62
  """
63
 
64
  response = model.generate_content(prompt)
 
131
  return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
132
 
133
  @spaces.GPU()
134
def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
    """Synthesize a script one line at a time and return the joined audio.

    The script is split on newlines; blank lines are skipped. Each remaining
    line is tokenized with ``process_prompt``, run through ``model.generate``,
    decoded with ``parse_output`` / ``redistribute_codes``, and the per-line
    audio arrays are concatenated in order.

    Args:
        text: Script to speak, one utterance per line.
        voice1: Voice for the first spoken line (and every line when there is
            a single host).
        voice2: Voice for every second spoken line when there are two hosts.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.
        max_new_tokens: Per-line generation budget forwarded to
            ``model.generate``.
        num_hosts: Number of speakers; ``1`` keeps ``voice1`` for all lines.
        progress: Gradio progress callback.

    Returns:
        ``(24000, audio)`` where ``audio`` is the concatenated array (24000 is
        presumably the sample rate in Hz expected by the audio output --
        confirm against the model), or ``None`` if ``text`` is blank or
        generation fails.
    """
    if not text.strip():
        return None

    try:
        progress(0.1, "Processing text...")

        # Remember each line's 1-based position for the progress messages,
        # but drop blank lines so they cannot shift the speaker alternation.
        spoken = [
            (line_no, line)
            for line_no, line in enumerate(text.split('\n'), start=1)
            if line.strip()
        ]
        single_host = num_hosts == 1
        total = len(spoken)
        audio_samples = []

        for turn, (line_no, line) in enumerate(spoken):
            # Alternate on the count of spoken lines, not the raw line index;
            # a monologue never switches to the second voice.
            voice = voice1 if single_host or turn % 2 == 0 else voice2

            # Give every line an equal slice of the remaining 0.1..1.0 range
            # so the bar moves forward instead of resetting for each line.
            span = 0.9 / total
            base = 0.1 + span * turn

            input_ids, attention_mask = process_prompt(line, voice, tokenizer, device)

            progress(base + 0.3 * span, f"Generating speech tokens for line {line_no}...")
            with torch.no_grad():
                generated_ids = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    max_new_tokens=max_new_tokens,
                    num_return_sequences=1,
                    eos_token_id=128258,
                )

            progress(base + 0.6 * span, f"Processing speech tokens for line {line_no}...")
            code_list = parse_output(generated_ids)

            progress(base + 0.8 * span, f"Converting line {line_no} to audio...")
            line_audio = redistribute_codes(code_list, snac_model)
            audio_samples.append(line_audio)

        # Concatenate all audio samples
        final_audio = np.concatenate(audio_samples)

        return (24000, final_audio)
    except Exception as e:
        # UI boundary: report the failure and let the audio output stay empty.
        print(f"Error generating speech: {e}")
        return None
 
188
 
189
  with gr.Column(scale=2):
190
  script_output = gr.Textbox(label="Generated Script", lines=10)
191
+ voice1 = gr.Dropdown(
192
  choices=VOICES,
193
  value="tara",
194
+ label="Voice 1",
195
+ info="Select the first voice for speech generation"
196
+ )
197
+ voice2 = gr.Dropdown(
198
+ choices=VOICES,
199
+ value="dan",
200
+ label="Voice 2",
201
+ info="Select the second voice for speech generation"
202
  )
203
 
204
  with gr.Accordion("Advanced Settings", open=False):
 
238
 
239
  submit_btn.click(
240
  fn=generate_speech,
241
+ inputs=[script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts],
242
  outputs=audio_output
243
  )
244