PDF2Audio

Running

App Files Files Community

matsuap committed 7 days ago

Commit

73cd951

1 Parent(s): 63bca4f

Add speed adjustment feature to audio generation in app.py, including UI slider for playback speed control.

Browse files

Files changed (1) hide show

app.py +26 -2

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ from promptic import llm
 from pydantic import BaseModel, ValidationError
 from pypdf import PdfReader
 from tenacity import retry, retry_if_exception_type
 # Define multiple sets of instruction templates
 INSTRUCTION_TEMPLATES = {
@@ -576,6 +577,7 @@ def generate_audio(
     user_feedback: str = None,
     original_text: str = None,
     debug = False,
 ) -> tuple:
     # Validate API Key
     if not os.getenv("OPENAI_API_KEY") and not openai_api_key:
@@ -677,6 +679,17 @@ def generate_audio(
     temporary_file.write(audio)
     temporary_file.close()
     # Delete any files in the temp directory that end with .mp3 and are over a day old
     for file in glob.glob(f"{temporary_directory}*.mp3"):
         if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
@@ -782,6 +795,14 @@ with gr.Blocks(title="PDF to Audio", css="""
                 placeholder="カスタム/ローカルモデルを使う場合はAPIベースURLを入力してください...",
                 info="カスタムやローカルモデルを使う場合、ここにAPIベースURLを入力してください。例: http://localhost:8080/v1 (llama.cpp RESTサーバー用)",
             )
         with gr.Column(scale=3):
             template_dropdown = gr.Dropdown(
@@ -861,6 +882,7 @@ with gr.Blocks(title="PDF to Audio", css="""
             prelude_dialog, podcast_dialog_instructions,
             edited_transcript,  # placeholder for edited_transcript
             user_feedback,  # placeholder for user_feedback
         ],
         outputs=[audio_output, transcript_output, original_text_output, error_output]
     ).then(
@@ -880,7 +902,8 @@ with gr.Blocks(title="PDF to Audio", css="""
         fn=lambda use_edit, edit, *args: validate_and_generate_audio(
             *args[:12],  # All inputs up to podcast_dialog_instructions
             edit if use_edit else "",  # Use edited transcript if checkbox is checked, otherwise empty string
-            *args[12:]  # user_feedback and original_text_output
         ),
         inputs=[
             use_edited_transcript, edited_transcript,
@@ -888,7 +911,8 @@ with gr.Blocks(title="PDF to Audio", css="""
             speaker_1_voice, speaker_2_voice, api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
-            user_feedback, original_text_output
         ],
         outputs=[audio_output, transcript_output, original_text_output, error_output]
     ).then(

 from pydantic import BaseModel, ValidationError
 from pypdf import PdfReader
 from tenacity import retry, retry_if_exception_type
+from pydub import AudioSegment
 # Define multiple sets of instruction templates
 INSTRUCTION_TEMPLATES = {
     user_feedback: str = None,
     original_text: str = None,
     debug = False,
+    speed: float = 1.0,  # 追加
 ) -> tuple:
     # Validate API Key
     if not os.getenv("OPENAI_API_KEY") and not openai_api_key:
     temporary_file.write(audio)
     temporary_file.close()
+    # ここから再生速度変更処理
+    if speed != 1.0:
+        # pydubでmp3を読み込み、速度変更
+        sound = AudioSegment.from_file(temporary_file.name, format="mp3")
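+        # Change playback speed by reinterpreting the frame rate (NOTE: this resampling trick also shifts the pitch proportionally; it is not pitch-preserving)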
+        sound = sound._spawn(sound.raw_data, overrides={
+            "frame_rate": int(sound.frame_rate * speed)
+        }).set_frame_rate(sound.frame_rate)
+        # 上書き保存
+        sound.export(temporary_file.name, format="mp3")
     # Delete any files in the temp directory that end with .mp3 and are over a day old
     for file in glob.glob(f"{temporary_directory}*.mp3"):
         if os.path.isfile(file) and time.time() - os.path.getmtime(file) > 24 * 60 * 60:
                 placeholder="カスタム/ローカルモデルを使う場合はAPIベースURLを入力してください...",
                 info="カスタムやローカルモデルを使う場合、ここにAPIベースURLを入力してください。例: http://localhost:8080/v1 (llama.cpp RESTサーバー用)",
             )
+            speed_slider = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=1.0,
+                step=0.05,
+                label="再生速度 (0.5x～2.0x)",
+                info="音声の再生速度を調整できます。デフォルトは1.0（等倍）です。"
+            )
         with gr.Column(scale=3):
             template_dropdown = gr.Dropdown(
             prelude_dialog, podcast_dialog_instructions,
             edited_transcript,  # placeholder for edited_transcript
             user_feedback,  # placeholder for user_feedback
+            speed_slider,   # 追加
         ],
         outputs=[audio_output, transcript_output, original_text_output, error_output]
     ).then(
         fn=lambda use_edit, edit, *args: validate_and_generate_audio(
             *args[:12],  # All inputs up to podcast_dialog_instructions
             edit if use_edit else "",  # Use edited transcript if checkbox is checked, otherwise empty string
+            *args[12:],
+            speed_slider,
         ),
         inputs=[
             use_edited_transcript, edited_transcript,
             speaker_1_voice, speaker_2_voice, api_base,
             intro_instructions, text_instructions, scratch_pad_instructions,
             prelude_dialog, podcast_dialog_instructions,
+            user_feedback, original_text_output,
+            speed_slider,
         ],
         outputs=[audio_output, transcript_output, original_text_output, error_output]
     ).then(