podcast-generator

Sleeping

App Files Files Community

bluenevus committed on Apr 16

Commit

83886ae

verified ·

1 Parent(s): 88fa21c

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -26

app.py CHANGED Viewed

@@ -58,7 +58,7 @@ def generate_podcast_script(api_key, prompt, uploaded_file, duration, num_hosts)
         Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
         Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
-        {'Make sure the script is a monologue for one person.' if num_hosts == 1 else 'Ensure the dialogue alternates between two distinct voices.'}
         """
         response = model.generate_content(prompt)
@@ -131,35 +131,47 @@ def redistribute_codes(code_list, snac_model):
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 @spaces.GPU()
-def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
-        input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
-        progress(0.3, "Generating speech tokens...")
-        with torch.no_grad():
-            generated_ids = model.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                do_sample=True,
-                temperature=temperature,
-                top_p=top_p,
-                repetition_penalty=repetition_penalty,
-                max_new_tokens=max_new_tokens,
-                num_return_sequences=1,
-                eos_token_id=128258,
-            )
-        progress(0.6, "Processing speech tokens...")
-        code_list = parse_output(generated_ids)
-        progress(0.8, "Converting to audio...")
-        audio_samples = redistribute_codes(code_list, snac_model)
-        return (24000, audio_samples)
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
@@ -176,11 +188,17 @@ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
         with gr.Column(scale=2):
             script_output = gr.Textbox(label="Generated Script", lines=10)
-            voice = gr.Dropdown(
                 choices=VOICES,
                 value="tara",
-                label="Voice",
-                info="Select the voice for speech generation"
             )
             with gr.Accordion("Advanced Settings", open=False):
@@ -220,7 +238,7 @@ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     submit_btn.click(
         fn=generate_speech,
-        inputs=[script_output, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )

         Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
         Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
+        {'Make sure the script is a monologue for one person.' if num_hosts == 1 else 'Ensure the dialogue alternates between two distinct voices, with one speaking on odd-numbered lines and the other on even-numbered lines.'}
         """
         response = model.generate_content(prompt)
     return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
 @spaces.GPU()
+def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
+    """Synthesize a script line by line and return the joined audio.
+
+    Each non-blank line of ``text`` is converted to speech on its own and the
+    per-line clips are concatenated in script order.
+
+    Args:
+        text: Script to speak, one utterance per line; blank lines are ignored.
+        voice1: Voice for the 1st, 3rd, 5th, ... spoken line, and for every
+            line when ``num_hosts == 1``.
+        voice2: Voice for the 2nd, 4th, ... spoken line of a dialogue.
+        temperature: Sampling temperature forwarded to ``model.generate``.
+        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
+        repetition_penalty: Forwarded to ``model.generate``.
+        max_new_tokens: Per-line generation cap forwarded to ``model.generate``.
+        num_hosts: Number of speakers; ``1`` means a single-voice monologue.
+        progress: Gradio progress callback used to report status.
+
+    Returns:
+        ``(24000, samples)`` where ``samples`` is the concatenation of the
+        per-line arrays returned by ``redistribute_codes`` (24000 is
+        presumably the sample rate in Hz -- confirm against the SNAC model),
+        or ``None`` when ``text`` is blank or generation raises.
+    """
     if not text.strip():
         return None
     try:
         progress(0.1, "Processing text...")
+        # Drop blank lines *before* numbering them: if skipped blank lines were
+        # counted, a script with an empty line between utterances would put
+        # every spoken line on an even index and never switch voices.
+        spoken_lines = [line for line in text.split('\n') if line.strip()]
+        total = len(spoken_lines)  # >= 1: the guard above rejected blank text
+        audio_samples = []
+        for i, line in enumerate(spoken_lines):
+            # A monologue stays in voice1; a dialogue alternates per spoken line.
+            voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
+            input_ids, attention_mask = process_prompt(line, voice, tokenizer, device)
+            # Spread the remaining 90% of the bar across the lines so progress
+            # only moves forward instead of resetting on every line.
+            progress(0.1 + 0.9 * (i + 0.3) / total, f"Generating speech tokens for line {i+1}...")
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    input_ids,
+                    attention_mask=attention_mask,
+                    do_sample=True,
+                    temperature=temperature,
+                    top_p=top_p,
+                    repetition_penalty=repetition_penalty,
+                    max_new_tokens=max_new_tokens,
+                    num_return_sequences=1,
+                    eos_token_id=128258,
+                )
+            progress(0.1 + 0.9 * (i + 0.6) / total, f"Processing speech tokens for line {i+1}...")
+            code_list = parse_output(generated_ids)
+            progress(0.1 + 0.9 * (i + 0.8) / total, f"Converting line {i+1} to audio...")
+            line_audio = redistribute_codes(code_list, snac_model)
+            audio_samples.append(line_audio)
+        # Concatenate all audio samples
+        final_audio = np.concatenate(audio_samples)
+        return (24000, final_audio)
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
         with gr.Column(scale=2):
             script_output = gr.Textbox(label="Generated Script", lines=10)
+            voice1 = gr.Dropdown(
                 choices=VOICES,
                 value="tara",
+                label="Voice 1",
+                info="Select the first voice for speech generation"
+            )
+            voice2 = gr.Dropdown(
+                choices=VOICES,
+                value="dan",
+                label="Voice 2",
+                info="Select the second voice for speech generation"
             )
             with gr.Accordion("Advanced Settings", open=False):
     submit_btn.click(
         fn=generate_speech,
+        inputs=[script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts],
         outputs=audio_output
     )