Plat committed on
Commit 3a09141 · 1 Parent(s): 0a5847d
Files changed (14)
  1. .gitignore +210 -0
  2. .python-version +1 -0
  3. LICENSE +165 -0
  4. README.md +1 -1
  5. app.py +451 -0
  6. assets/engine_manifest.json +79 -0
  7. pyproject.toml +27 -0
  8. requirements.txt +13 -0
  9. src/agent.py +162 -0
  10. src/aivis.py +174 -0
  11. src/fetcher.py +83 -0
  12. src/podcast.py +107 -0
  13. src/voicevox.py +125 -0
  14. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,210 @@
1
+ # Created by https://www.toptal.com/developers/gitignore/api/macos,python,dotenv
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,dotenv
3
+
4
+ ### dotenv ###
5
+ .env
6
+
7
+ ### macOS ###
8
+ # General
9
+ .DS_Store
10
+ .AppleDouble
11
+ .LSOverride
12
+
13
+ # Icon must end with two \r
14
+ Icon
15
+
16
+ # Thumbnails
17
+ ._*
18
+
19
+ # Files that might appear in the root of a volume
20
+ .DocumentRevisions-V100
21
+ .fseventsd
22
+ .Spotlight-V100
23
+ .TemporaryItems
24
+ .Trashes
25
+ .VolumeIcon.icns
26
+ .com.apple.timemachine.donotpresent
27
+
28
+ # Directories potentially created on remote AFP share
29
+ .AppleDB
30
+ .AppleDesktop
31
+ Network Trash Folder
32
+ Temporary Items
33
+ .apdisk
34
+
35
+ ### macOS Patch ###
36
+ # iCloud generated files
37
+ *.icloud
38
+
39
+ ### Python ###
40
+ # Byte-compiled / optimized / DLL files
41
+ __pycache__/
42
+ *.py[cod]
43
+ *$py.class
44
+
45
+ # C extensions
46
+ *.so
47
+
48
+ # Distribution / packaging
49
+ .Python
50
+ build/
51
+ develop-eggs/
52
+ dist/
53
+ downloads/
54
+ eggs/
55
+ .eggs/
56
+ lib/
57
+ lib64/
58
+ parts/
59
+ sdist/
60
+ var/
61
+ wheels/
62
+ share/python-wheels/
63
+ *.egg-info/
64
+ .installed.cfg
65
+ *.egg
66
+ MANIFEST
67
+
68
+ # PyInstaller
69
+ # Usually these files are written by a python script from a template
70
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
71
+ *.manifest
72
+ *.spec
73
+
74
+ # Installer logs
75
+ pip-log.txt
76
+ pip-delete-this-directory.txt
77
+
78
+ # Unit test / coverage reports
79
+ htmlcov/
80
+ .tox/
81
+ .nox/
82
+ .coverage
83
+ .coverage.*
84
+ .cache
85
+ nosetests.xml
86
+ coverage.xml
87
+ *.cover
88
+ *.py,cover
89
+ .hypothesis/
90
+ .pytest_cache/
91
+ cover/
92
+
93
+ # Translations
94
+ *.mo
95
+ *.pot
96
+
97
+ # Django stuff:
98
+ *.log
99
+ local_settings.py
100
+ db.sqlite3
101
+ db.sqlite3-journal
102
+
103
+ # Flask stuff:
104
+ instance/
105
+ .webassets-cache
106
+
107
+ # Scrapy stuff:
108
+ .scrapy
109
+
110
+ # Sphinx documentation
111
+ docs/_build/
112
+
113
+ # PyBuilder
114
+ .pybuilder/
115
+ target/
116
+
117
+ # Jupyter Notebook
118
+ .ipynb_checkpoints
119
+
120
+ # IPython
121
+ profile_default/
122
+ ipython_config.py
123
+
124
+ # pyenv
125
+ # For a library or package, you might want to ignore these files since the code is
126
+ # intended to run in multiple environments; otherwise, check them in:
127
+ # .python-version
128
+
129
+ # pipenv
130
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
131
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
132
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
133
+ # install all needed dependencies.
134
+ #Pipfile.lock
135
+
136
+ # poetry
137
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
138
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
139
+ # commonly ignored for libraries.
140
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
141
+ #poetry.lock
142
+
143
+ # pdm
144
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
145
+ #pdm.lock
146
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
147
+ # in version control.
148
+ # https://pdm.fming.dev/#use-with-ide
149
+ .pdm.toml
150
+
151
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
152
+ __pypackages__/
153
+
154
+ # Celery stuff
155
+ celerybeat-schedule
156
+ celerybeat.pid
157
+
158
+ # SageMath parsed files
159
+ *.sage.py
160
+
161
+ # Environments
162
+ .venv
163
+ env/
164
+ venv/
165
+ ENV/
166
+ env.bak/
167
+ venv.bak/
168
+
169
+ # Spyder project settings
170
+ .spyderproject
171
+ .spyproject
172
+
173
+ # Rope project settings
174
+ .ropeproject
175
+
176
+ # mkdocs documentation
177
+ /site
178
+
179
+ # mypy
180
+ .mypy_cache/
181
+ .dmypy.json
182
+ dmypy.json
183
+
184
+ # Pyre type checker
185
+ .pyre/
186
+
187
+ # pytype static type analyzer
188
+ .pytype/
189
+
190
+ # Cython debug symbols
191
+ cython_debug/
192
+
193
+ # PyCharm
194
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
195
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
196
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
197
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
198
+ #.idea/
199
+
200
+ ### Python Patch ###
201
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
202
+ poetry.toml
203
+
204
+ # ruff
205
+ .ruff_cache/
206
+
207
+ # LSP config files
208
+ pyrightconfig.json
209
+
210
+ # End of https://www.toptal.com/developers/gitignore/api/macos,python,dotenv
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
LICENSE ADDED
@@ -0,0 +1,165 @@
1
+ GNU LESSER GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+
9
+ This version of the GNU Lesser General Public License incorporates
10
+ the terms and conditions of version 3 of the GNU General Public
11
+ License, supplemented by the additional permissions listed below.
12
+
13
+ 0. Additional Definitions.
14
+
15
+ As used herein, "this License" refers to version 3 of the GNU Lesser
16
+ General Public License, and the "GNU GPL" refers to version 3 of the GNU
17
+ General Public License.
18
+
19
+ "The Library" refers to a covered work governed by this License,
20
+ other than an Application or a Combined Work as defined below.
21
+
22
+ An "Application" is any work that makes use of an interface provided
23
+ by the Library, but which is not otherwise based on the Library.
24
+ Defining a subclass of a class defined by the Library is deemed a mode
25
+ of using an interface provided by the Library.
26
+
27
+ A "Combined Work" is a work produced by combining or linking an
28
+ Application with the Library. The particular version of the Library
29
+ with which the Combined Work was made is also called the "Linked
30
+ Version".
31
+
32
+ The "Minimal Corresponding Source" for a Combined Work means the
33
+ Corresponding Source for the Combined Work, excluding any source code
34
+ for portions of the Combined Work that, considered in isolation, are
35
+ based on the Application, and not on the Linked Version.
36
+
37
+ The "Corresponding Application Code" for a Combined Work means the
38
+ object code and/or source code for the Application, including any data
39
+ and utility programs needed for reproducing the Combined Work from the
40
+ Application, but excluding the System Libraries of the Combined Work.
41
+
42
+ 1. Exception to Section 3 of the GNU GPL.
43
+
44
+ You may convey a covered work under sections 3 and 4 of this License
45
+ without being bound by section 3 of the GNU GPL.
46
+
47
+ 2. Conveying Modified Versions.
48
+
49
+ If you modify a copy of the Library, and, in your modifications, a
50
+ facility refers to a function or data to be supplied by an Application
51
+ that uses the facility (other than as an argument passed when the
52
+ facility is invoked), then you may convey a copy of the modified
53
+ version:
54
+
55
+ a) under this License, provided that you make a good faith effort to
56
+ ensure that, in the event an Application does not supply the
57
+ function or data, the facility still operates, and performs
58
+ whatever part of its purpose remains meaningful, or
59
+
60
+ b) under the GNU GPL, with none of the additional permissions of
61
+ this License applicable to that copy.
62
+
63
+ 3. Object Code Incorporating Material from Library Header Files.
64
+
65
+ The object code form of an Application may incorporate material from
66
+ a header file that is part of the Library. You may convey such object
67
+ code under terms of your choice, provided that, if the incorporated
68
+ material is not limited to numerical parameters, data structure
69
+ layouts and accessors, or small macros, inline functions and templates
70
+ (ten or fewer lines in length), you do both of the following:
71
+
72
+ a) Give prominent notice with each copy of the object code that the
73
+ Library is used in it and that the Library and its use are
74
+ covered by this License.
75
+
76
+ b) Accompany the object code with a copy of the GNU GPL and this license
77
+ document.
78
+
79
+ 4. Combined Works.
80
+
81
+ You may convey a Combined Work under terms of your choice that,
82
+ taken together, effectively do not restrict modification of the
83
+ portions of the Library contained in the Combined Work and reverse
84
+ engineering for debugging such modifications, if you also do each of
85
+ the following:
86
+
87
+ a) Give prominent notice with each copy of the Combined Work that
88
+ the Library is used in it and that the Library and its use are
89
+ covered by this License.
90
+
91
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
92
+ document.
93
+
94
+ c) For a Combined Work that displays copyright notices during
95
+ execution, include the copyright notice for the Library among
96
+ these notices, as well as a reference directing the user to the
97
+ copies of the GNU GPL and this license document.
98
+
99
+ d) Do one of the following:
100
+
101
+ 0) Convey the Minimal Corresponding Source under the terms of this
102
+ License, and the Corresponding Application Code in a form
103
+ suitable for, and under terms that permit, the user to
104
+ recombine or relink the Application with a modified version of
105
+ the Linked Version to produce a modified Combined Work, in the
106
+ manner specified by section 6 of the GNU GPL for conveying
107
+ Corresponding Source.
108
+
109
+ 1) Use a suitable shared library mechanism for linking with the
110
+ Library. A suitable mechanism is one that (a) uses at run time
111
+ a copy of the Library already present on the user's computer
112
+ system, and (b) will operate properly with a modified version
113
+ of the Library that is interface-compatible with the Linked
114
+ Version.
115
+
116
+ e) Provide Installation Information, but only if you would otherwise
117
+ be required to provide such information under section 6 of the
118
+ GNU GPL, and only to the extent that such information is
119
+ necessary to install and execute a modified version of the
120
+ Combined Work produced by recombining or relinking the
121
+ Application with a modified version of the Linked Version. (If
122
+ you use option 4d0, the Installation Information must accompany
123
+ the Minimal Corresponding Source and Corresponding Application
124
+ Code. If you use option 4d1, you must provide the Installation
125
+ Information in the manner specified by section 6 of the GNU GPL
126
+ for conveying Corresponding Source.)
127
+
128
+ 5. Combined Libraries.
129
+
130
+ You may place library facilities that are a work based on the
131
+ Library side by side in a single library together with other library
132
+ facilities that are not Applications and are not covered by this
133
+ License, and convey such a combined library under terms of your
134
+ choice, if you do both of the following:
135
+
136
+ a) Accompany the combined library with a copy of the same work based
137
+ on the Library, uncombined with any other library facilities,
138
+ conveyed under the terms of this License.
139
+
140
+ b) Give prominent notice with the combined library that part of it
141
+ is a work based on the Library, and explaining where to find the
142
+ accompanying uncombined form of the same work.
143
+
144
+ 6. Revised Versions of the GNU Lesser General Public License.
145
+
146
+ The Free Software Foundation may publish revised and/or new versions
147
+ of the GNU Lesser General Public License from time to time. Such new
148
+ versions will be similar in spirit to the present version, but may
149
+ differ in detail to address new problems or concerns.
150
+
151
+ Each version is given a distinguishing version number. If the
152
+ Library as you received it specifies that a certain numbered version
153
+ of the GNU Lesser General Public License "or any later version"
154
+ applies to it, you have the option of following the terms and
155
+ conditions either of that published version or of any later version
156
+ published by the Free Software Foundation. If the Library as you
157
+ received it does not specify a version number of the GNU Lesser
158
+ General Public License, you may choose any version of the GNU Lesser
159
+ General Public License ever published by the Free Software Foundation.
160
+
161
+ If the Library as you received it specifies that a proxy can decide
162
+ whether future versions of the GNU Lesser General Public License shall
163
+ apply, that proxy's public statement of acceptance of any version is
164
+ permanent authorization for you to choose that version for the
165
+ Library.
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: PodcastVox
- emoji:
+ emoji: 📻💠
  colorFrom: blue
  colorTo: indigo
  sdk: gradio
app.py ADDED
@@ -0,0 +1,451 @@
1
+ import tempfile
2
+ import asyncio
3
+ import aiohttp
4
+ import dotenv
5
+ import os
6
+ import time
7
+ import logging
8
+
9
+
10
+ from src.voicevox import VoiceVoxClient
11
+ from src.agent import Conversation
12
+ from src.podcast import PodcastStudio
13
+ from src.aivis import start_aivis_speech, download_model
14
+
15
+ import gradio as gr
16
+
17
+ dotenv.load_dotenv()
18
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
19
+
20
+ DEFAULT_MODELS = [
21
+ "https://hub.aivis-project.com/aivm-models/a59cb814-0083-4369-8542-f51a29e72af7", # Anneli
22
+ "https://hub.aivis-project.com/aivm-models/4cf3e1d8-5583-41a9-a554-b2d2cda2c569", # Anneli Whisper
23
+ "https://hub.aivis-project.com/aivm-models/6acf95e8-11a9-414e-aa9c-6dbebf9113ca", # F1
24
+ "https://hub.aivis-project.com/aivm-models/25b39db7-5757-47ef-9fe4-2b7aff328a18", # F2
25
+ "https://hub.aivis-project.com/aivm-models/d7255c2c-ddd0-425a-808c-662cd94c7f41", # M1
26
+ "https://hub.aivis-project.com/aivm-models/d1a7446f-230d-4077-afdf-923eddabe53c", # M2
27
+ "https://hub.aivis-project.com/aivm-models/6d11c6c2-f4a4-4435-887e-23dd60f8b8dd", # にせ
28
+ "https://hub.aivis-project.com/aivm-models/e9339137-2ae3-4d41-9394-fb757a7e61e6", # まい
29
+ "https://hub.aivis-project.com/aivm-models/eefe1fbd-d15a-49ae-bc83-fc4aaad680e1", # ハヤテ
30
+ "https://hub.aivis-project.com/aivm-models/5d804388-665e-4174-ab60-53d448c0d7eb", # 老当主
31
+ "https://hub.aivis-project.com/aivm-models/71e72188-2726-4739-9aa9-39567396fb2a", # ふみふみ
32
+ ]
33
+ AIVIS_ENDPOINT = "http://127.0.0.1:10101"
34
+
35
+ NAVIGATOR_SAMPLE = "こんにちは!私の名前は {nickname} です。今回は私がポッドキャストをナビゲートします。よろしくお願いします!"
36
+ ASSISTANT_SAMPLE = "こんにちは!私の名前は {nickname} です。私はサポーターとして、ナビゲーターと一緒にポッドキャストを盛り上げていきます。頑張ります!"
37
+
38
+
39
+ async def generate_podcast(
40
+ voicevox_endpoint: str,
41
+ llm_api_key: str,
42
+ pdf_url: str,
43
+ speaker_name: str,
44
+ supporter_name: str,
45
+ speaker2id: dict[str, int],
46
+ ) -> tuple[str, str, object, Conversation, str, dict]:
47
+ client = VoiceVoxClient(voicevox_endpoint)
48
+
49
+ speaker_id = speaker2id[speaker_name]
50
+ supporter_id = speaker2id[supporter_name]
51
+
52
+ podcast_studio = PodcastStudio(
53
+ api_key=llm_api_key,
54
+ logging_level=logging.DEBUG,
55
+ )
56
+
57
+ start_time = time.time()
58
+
59
+ blog, _dialogue, conversation = await podcast_studio.create_conversation(pdf_url)
60
+ podcast_audio = await podcast_studio.record_podcast(
61
+ conversation=conversation,
62
+ voicevox_client=client,
63
+ speaker_id=speaker_id,
64
+ supporter_id=supporter_id,
65
+ )
66
+
67
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
68
+ temp_file.write(podcast_audio.wav)
69
+ temp_file_path = temp_file.name
70
+
71
+ elapsed_time = time.time() - start_time
72
+ time_elapsed_text = f"処理時間: {elapsed_time:.2f} 秒"
73
+
74
+ return (
75
+ temp_file_path,
76
+ blog,
77
+ conversation.model_dump(),
78
+ conversation,
79
+ time_elapsed_text,
80
+ gr.update(visible=True),
81
+ )
82
+
83
+
84
+ async def change_speaker(
85
+ voicevox_endpoint: str,
86
+ speaker_name: str,
87
+ supporter_name: str,
88
+ speaker2id: dict[str, int],
89
+ conversation_cache: Conversation,
90
+ ) -> tuple[str, str]:
91
+ client = VoiceVoxClient(voicevox_endpoint)
92
+
93
+ speaker_id = speaker2id[speaker_name]
94
+ supporter_id = speaker2id[supporter_name]
95
+
96
+ podcast_studio = PodcastStudio(api_key="") # only voice synthesis
97
+
98
+ start_time = time.time()
99
+ podcast_audio = await podcast_studio.record_podcast(
100
+ conversation=conversation_cache,
101
+ voicevox_client=client,
102
+ speaker_id=speaker_id,
103
+ supporter_id=supporter_id,
104
+ )
105
+
106
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
107
+ temp_file.write(podcast_audio.wav)
108
+ temp_file_path = temp_file.name
109
+
110
+ elapsed_time = time.time() - start_time
111
+ time_elapsed_text = f"処理時間: {elapsed_time:.2f} 秒"
112
+
113
+ return temp_file_path, time_elapsed_text
114
+
115
+
116
+ async def get_speakers(endpoint: str):
117
+ client = VoiceVoxClient(endpoint)
118
+
119
+ speakers = await client.get_speakers()
120
+
121
+ print(f"Found {len(speakers)} speakers at {endpoint}")
122
+
123
+ choices = []
124
+ speaker_ids = []
125
+ for speaker in speakers:
126
+ for style in speaker.styles:
127
+ speaker_name = f"{speaker.name} ({style.name})"
128
+ print(f"Speaker: {speaker_name}, ID: {style.id}")
129
+ choices.append(speaker_name)
130
+ speaker_ids.append(style.id)
131
+
132
+ speaker2id = dict(zip(choices, speaker_ids))
133
+
134
+ return choices, speaker2id
135
+
136
+
137
+ async def on_endpoint_change(endpoint_text: str):
138
+ try:
139
+ speakers, speaker2id = await get_speakers(endpoint_text)
140
+ return (
141
+ gr.update(choices=speakers, value=speakers[0]),
142
+ gr.update(choices=speakers, value=speakers[1]),
143
+ speaker2id,
144
+ )
145
+ except Exception as e:
146
+ return gr.update(), gr.update(), gr.update()
147
+
148
+
149
+ async def preview_speaker_voice(
150
+ voicevox_endpoint: str,
151
+ speaker_name: str,
152
+ speaker_id: int,
153
+ is_main_speaker: bool = True,
154
+ ):
155
+ client = VoiceVoxClient(voicevox_endpoint)
156
+
157
+ speaker_nickname = speaker_name.split("(")[0].strip()
158
+
159
+ if is_main_speaker:
160
+ sample_text = NAVIGATOR_SAMPLE.format(nickname=speaker_nickname)
161
+ else:
162
+ sample_text = ASSISTANT_SAMPLE.format(nickname=speaker_nickname)
163
+
164
+ audio_query = await client.post_audio_query(
165
+ text=sample_text,
166
+ speaker=speaker_id,
167
+ )
168
+ if audio_query.tempoDynamicsScale is not None:
169
+ audio_query.tempoDynamicsScale = 1.1
170
+ else:
171
+ audio_query.speedScale = 1.1
172
+
173
+ audio = await client.post_synthesis(
174
+ speaker=speaker_id,
175
+ audio_query=audio_query,
176
+ )
177
+
178
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
179
+ temp_file.write(audio.wav)
180
+ temp_file_path = temp_file.name
181
+
182
+ return temp_file_path
183
+
184
+
185
+ async def on_change_speaker(
186
+ voicevox_endpoint: str,
187
+ speaker_name: str,
188
+ speaker2id: dict[str, int],
189
+ is_main_speaker: bool,
190
+ ):
191
+ speaker_id = speaker2id[speaker_name]
192
+ return await preview_speaker_voice(
193
+ voicevox_endpoint=voicevox_endpoint,
194
+ speaker_name=speaker_name,
195
+ speaker_id=speaker_id,
196
+ is_main_speaker=is_main_speaker,
197
+ )
198
+
199
+
200
+ async def download_default_models():
201
+ logging.info("Downloading default models...")
202
+
203
+ results = await asyncio.gather(
204
+ *[download_model(model_url) for model_url in DEFAULT_MODELS],
205
+ return_exceptions=True,
206
+ )
207
+
208
+ for result in results:
209
+ if isinstance(result, Exception):
210
+ logging.error(f"Failed to download model: {result}")
211
+
212
+
213
+ async def wait_for_endpoint(url: str, timeout: float = 30.0, interval: float = 0.5):
214
+ """url が 200 を返すまで待機"""
215
+ start = time.time()
216
+ while time.time() - start < timeout:
217
+ try:
218
+ async with aiohttp.ClientSession() as session:
219
+ async with session.get(url) as res:
220
+ if res.status == 200:
221
+ return
222
+ except Exception:
223
+ pass
224
+ await asyncio.sleep(interval)
225
+ raise RuntimeError(f"Endpoint {url} did not become ready in {timeout}s")
226
+
227
+
228
+ async def main():
229
+ await wait_for_endpoint(AIVIS_ENDPOINT)
230
+
231
+ initial_endpoint = AIVIS_ENDPOINT
232
+ try:
233
+ speakers, speaker2id = await get_speakers(initial_endpoint)
234
+ except Exception as _e:
235
+ speakers = []
236
+ speaker2id = {}
237
+
238
+ main_speaker_name = None if len(speakers) == 0 else speakers[0]
239
+ supporter_speaker_name = None if len(speakers) < 2 else speakers[1]
240
+
241
+ main_speaker_preview = None
242
+ supporter_speaker_preview = None
243
+ if main_speaker_name is not None:
244
+ main_speaker_preview = await preview_speaker_voice(
245
+ voicevox_endpoint=initial_endpoint,
246
+ speaker_name=main_speaker_name,
247
+ speaker_id=speaker2id.get(main_speaker_name, 0),
248
+ is_main_speaker=True,
249
+ )
250
+ if supporter_speaker_name is not None:
251
+ supporter_speaker_preview = await preview_speaker_voice(
252
+ voicevox_endpoint=initial_endpoint,
253
+ speaker_name=supporter_speaker_name,
254
+ speaker_id=speaker2id.get(supporter_speaker_name, 0),
255
+ is_main_speaker=False,
256
+ )
257
+
258
+ with gr.Blocks() as demo:
259
+ gr.Markdown(
260
+ """
261
+ # PodcastVox (Aivis Speech)
262
+
263
+ Gemini Flash 2.5 と Aivis Speech を利用して、Web サイトを情報源とした Podcast を生成することができます。
264
+
265
+ """
266
+ )
267
+
268
+ with gr.Row():
269
+ with gr.Column():
270
+ with gr.Group():
271
+ endpoint_text = gr.Textbox(
272
+ label="VOICEVOX エンドポイント",
273
+ value=initial_endpoint,
274
+ placeholder=AIVIS_ENDPOINT,
275
+ info="VOICEVOX 型 の REST API に対応したエンドポイントを入力してください",
276
+ visible=False,
277
+ )
278
+ with gr.Row():
279
+ with gr.Column():
280
+ speakers_dropdown = gr.Dropdown(
281
+ label="メイン話者",
282
+ choices=speakers,
283
+ value=main_speaker_name,
284
+ multiselect=False,
285
+ )
286
+ speaker_preview_audio = gr.Audio(
287
+ label="メイン話者音声プレビュー",
288
+ type="filepath",
289
+ value=main_speaker_preview,
290
+ )
291
+
292
+ with gr.Column():
293
+ supporter_dropdown = gr.Dropdown(
294
+ label="サポーター話者",
295
+ choices=speakers,
296
+ value=supporter_speaker_name,
297
+ multiselect=False,
298
+ )
299
+ supporter_preview_audio = gr.Audio(
300
+ label="サポーター音声プレビュー",
301
+ type="filepath",
302
+ value=supporter_speaker_preview,
303
+ )
304
+
305
+ speaker2id_map = gr.State(value=speaker2id)
306
+
307
+ change_speaker_button = gr.Button(
308
+ "この話者で再録音",
309
+ variant="secondary",
310
+ visible=False,
311
+ )
312
+
313
+ with gr.Group():
314
+ llm_api_key_text = gr.Textbox(
315
+ label="Gemini API Key",
316
+ info="Podcast を生成するには API キーが必要です。https://aistudio.google.com/apikey から取得できます。",
317
+ placeholder="Enter your Gemini API key",
318
+ value=GEMINI_API_KEY,
319
+ type="password",
320
+ visible=GEMINI_API_KEY == "",
321
+ )
322
+
323
+ with gr.Column():
324
+ with gr.Group():
325
+ pdf_url_text = gr.Textbox(
326
+ label="情報源となる Web サイト の URL",
327
+ placeholder="https://arxiv.org/pdf/2308.06721, https://example.com/index.html",
328
+ lines=1,
329
+ info="Podcast のテーマとなる Web サイト の URL を入力してください。HTML、PDF に対応しています。",
330
+ )
331
+ submit_button = gr.Button("Synthesize", variant="primary")
332
+
333
+ time_elapsed_text = gr.Markdown(
334
+ value="",
335
+ )
336
+
337
+ output_audio = gr.Audio(
338
+ label="Output Podcast Audio",
339
+ type="filepath",
340
+ autoplay=True,
341
+ )
342
+ conversation_cache = gr.State(value=None)
343
+
344
+ with gr.Accordion("生成されたブログ", open=False):
345
+ blog_output = gr.Markdown(
346
+ label="Blog Output",
347
+ value="生成されたブログはここに表示されます。",
348
+ )
349
+
350
+ with gr.Accordion("生成された会話", open=False):
351
+ conversation_output = gr.JSON(label="Conversation Output", value={})
352
+
353
+ gr.Examples(
354
+ examples=[
355
+ ["https://arxiv.org/pdf/2308.06721"],
356
+ ["https://www.aozora.gr.jp/cards/000879/files/127_15260.html"],
357
+ ],
358
+ inputs=[pdf_url_text],
359
+ )
360
+
361
+ gr.on(
362
+ triggers=[endpoint_text.change],
363
+ fn=on_endpoint_change,
364
+ inputs=[endpoint_text],
365
+ outputs=[
366
+ speakers_dropdown,
367
+ supporter_dropdown,
368
+ speaker2id_map,
369
+ ],
370
+ concurrency_limit=10,
371
+ )
372
+ gr.on(
373
+ triggers=[submit_button.click],
374
+ fn=generate_podcast,
375
+ inputs=[
376
+ endpoint_text,
377
+ llm_api_key_text,
378
+ pdf_url_text,
379
+ speakers_dropdown,
380
+ supporter_dropdown,
381
+ speaker2id_map,
382
+ ],
383
+ outputs=[
384
+ output_audio,
385
+ blog_output,
386
+ conversation_output,
387
+ conversation_cache,
388
+ time_elapsed_text,
389
+ change_speaker_button, # make visible after generation
390
+ ],
391
+ concurrency_limit=10,
392
+ )
393
+ gr.on(
394
+ triggers=[change_speaker_button.click],
395
+ fn=change_speaker,
396
+ inputs=[
397
+ endpoint_text,
398
+ speakers_dropdown,
399
+ supporter_dropdown,
400
+ spaker2id_map,
401
+ conversation_cache,
402
+ ],
403
+ outputs=[
404
+ output_audio,
405
+ time_elapsed_text,
406
+ ],
407
+ concurrency_limit=10,
408
+ )
409
+ gr.on(
410
+ triggers=[
411
+ speakers_dropdown.change,
412
+ ],
413
+ fn=on_change_speaker,
414
+ inputs=[
415
+ endpoint_text,
416
+ speakers_dropdown,
417
+ speaker2id_map,
418
+ gr.State(value=True),
419
+ ],
420
+ outputs=[speaker_preview_audio],
421
+ concurrency_limit=10,
422
+ )
423
+ gr.on(
424
+ triggers=[
425
+ supporter_dropdown.change,
426
+ ],
427
+ fn=on_change_speaker,
428
+ inputs=[
429
+ endpoint_text,
430
+ supporter_dropdown,
431
+ speaker2id_map,
432
+ gr.State(value=False),
433
+ ],
434
+ outputs=[supporter_preview_audio],
435
+ concurrency_limit=10,
436
+ )
437
+
438
+ demo.launch()
439
+
440
+
441
+ async def runner():
442
+ await download_default_models()
443
+
444
+ aivis = asyncio.to_thread(start_aivis_speech)
445
+ webui = asyncio.create_task(main())
446
+
447
+ await asyncio.gather(aivis, webui)
448
+
449
+
450
+ if __name__ == "__main__":
451
+ asyncio.run(runner())
assets/engine_manifest.json ADDED
@@ -0,0 +1,79 @@
1
+ {
2
+ "manifest_version": "0.13.1",
3
+ "name": "AivisSpeech Engine",
4
+ "brand_name": "AivisSpeech",
5
+ "uuid": "1b4a5014-d9fd-11ee-b97d-83c170a68ed3",
6
+ "version": "999.999.999",
7
+ "url": "https://github.com/Aivis-Project/AivisSpeech-Engine",
8
+ "command": "run",
9
+ "port": 10101,
10
+ "icon": "resources/engine_manifest_assets/icon.png",
11
+ "default_sampling_rate": 44100,
12
+ "frame_rate": 172.265625,
13
+ "terms_of_service": "resources/engine_manifest_assets/terms_of_service.md",
14
+ "update_infos": "resources/engine_manifest_assets/update_infos.json",
15
+ "dependency_licenses": "resources/engine_manifest_assets/dependency_licenses.json",
16
+ "supported_vvlib_manifest_version": null,
17
+ "supported_features": {
18
+ "adjust_mora_pitch": {
19
+ "type": "bool",
20
+ "value": false,
21
+ "name": "モーラごとの音高の調整"
22
+ },
23
+ "adjust_phoneme_length": {
24
+ "type": "bool",
25
+ "value": false,
26
+ "name": "音素ごとの長さの調整"
27
+ },
28
+ "adjust_speed_scale": {
29
+ "type": "bool",
30
+ "value": true,
31
+ "name": "全体の話速の調整"
32
+ },
33
+ "adjust_pitch_scale": {
34
+ "type": "bool",
35
+ "value": true,
36
+ "name": "全体の音高の調整"
37
+ },
38
+ "adjust_intonation_scale": {
39
+ "type": "bool",
40
+ "value": true,
41
+ "name": "全体の抑揚の調整"
42
+ },
43
+ "adjust_volume_scale": {
44
+ "type": "bool",
45
+ "value": true,
46
+ "name": "全体の音量の調整"
47
+ },
48
+ "adjust_pause_length": {
49
+ "type": "bool",
50
+ "value": false,
51
+ "name": "句読点などの無音時間の調整"
52
+ },
53
+ "interrogative_upspeak": {
54
+ "type": "bool",
55
+ "value": false,
56
+ "name": "疑問文の自動調整"
57
+ },
58
+ "synthesis_morphing" : {
59
+ "type": "bool",
60
+ "value": false,
61
+ "name": "2種類のスタイルでモーフィングした音声を合成"
62
+ },
63
+ "sing" : {
64
+ "type": "bool",
65
+ "value": false,
66
+ "name": "歌唱音声合成"
67
+ },
68
+ "manage_library": {
69
+ "type": "bool",
70
+ "value": false,
71
+ "name": "音声ライブラリのインストール・アンインストール"
72
+ },
73
+ "return_resource_url": {
74
+ "type": "bool",
75
+ "value": false,
76
+ "name": "キャラクター情報のリソースを URL で返送"
77
+ }
78
+ }
79
+ }
pyproject.toml ADDED
@@ -0,0 +1,27 @@
1
+ [project]
2
+ name = "podcastvox-demo"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11,<3.12"
7
+ dependencies = [
8
+ "aiohttp>=3.12.6",
9
+ "aivis-speech-engine",
10
+ "fastapi>=0.115.12",
11
+ "gradio>=5.32.0",
12
+ "hf-transfer>=0.1.9",
13
+ "hf-xet>=1.1.2",
14
+ "litellm>=1.72.0",
15
+ "markitdown[pdf]>=0.1.2",
16
+ "onnxruntime>=1.22.0",
17
+ "pydantic>=2.11.5",
18
+ "pyopenjtalk-plus==0.4.1.post3",
19
+ "setuptools>=80.9.0",
20
+ "wheel>=0.45.1",
21
+ ]
22
+
23
+ [dependency-groups]
24
+ dev = ["ruff>=0.11.12", "ty>=0.0.1a7"]
25
+
26
+ [tool.uv.sources]
27
+ aivis-speech-engine = { git = "https://github.com/p1atdev/AivisSpeech-Engine" }
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ aiohttp>=3.12.6
2
+ fastapi>=0.115.12
3
+ gradio>=5.32.0
4
+ hf-transfer>=0.1.9
5
+ hf-xet>=1.1.2
6
+ litellm>=1.72.0
7
+ markitdown[pdf]>=0.1.2
8
+ onnxruntime>=1.22.0
9
+ pydantic>=2.11.5
10
+ pyopenjtalk-plus==0.4.1.post3
11
+ setuptools>=80.9.0
12
+ wheel>=0.45.1
13
+ git+https://github.com/p1atdev/AivisSpeech-Engine
src/agent.py ADDED
@@ -0,0 +1,162 @@
1
+ import json
2
+ from typing import Literal
3
+ from pydantic import BaseModel
4
+
5
+ import litellm
6
+ from litellm.types.utils import ModelResponse
7
+
8
+ SAFETY_SETTINGS = [
9
+ {
10
+ "category": "HARM_CATEGORY_HARASSMENT",
11
+ "threshold": "BLOCK_NONE",
12
+ },
13
+ {
14
+ "category": "HARM_CATEGORY_HATE_SPEECH",
15
+ "threshold": "BLOCK_NONE",
16
+ },
17
+ {
18
+ "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
19
+ "threshold": "BLOCK_NONE",
20
+ },
21
+ {
22
+ "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
23
+ "threshold": "BLOCK_NONE",
24
+ },
25
+ ]
26
+
27
+
28
+ class BloggerAgent:
29
+ instructions = [
30
+ {
31
+ "role": "user",
32
+ "content": "与えられる情報について、重要なポイントを踏まえて平易な言葉で解説・紹介する記事を書いてください",
33
+ },
34
+ ]
35
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
36
+ temperature: float = 1.0
37
+ max_tokens: int = 4096
38
+ thinking_budget: int = 1024
39
+ api_key: str
40
+
41
+ def __init__(self, api_key: str):
42
+ self.api_key = api_key
43
+
44
+ async def task(self, information: str) -> str:
45
+ messages = self.instructions.copy()
46
+ messages.append({"role": "user", "content": information})
47
+
48
+ res = await litellm.acompletion(
49
+ api_key=self.api_key,
50
+ model=self.model,
51
+ messages=messages,
52
+ temperature=self.temperature,
53
+ max_completion_tokens=self.max_tokens,
54
+ thinking={"type": "enabled", "budget_tokens": self.thinking_budget},
55
+ safety_settings=SAFETY_SETTINGS,
56
+ )
57
+ assert isinstance(res, ModelResponse)
58
+
59
+ blog = res.choices[0].message.content
60
+ assert isinstance(blog, str)
61
+
62
+ return blog
63
+
64
+
65
+ class WriterAgent:
66
+ instructions = [
67
+ {
68
+ "role": "user",
69
+ "content": """与えられる情報ソースとその解説記事をもとに、コンテンツを紹介する Podcast の会話を作成してください。
70
+ Podcast では、二人の人物が交互に会話をします。
71
+
72
+ # 登場人物
73
+ - スピーカー: コンテンツ紹介をリードする人で、主にこの人物が解説を行う
74
+ - サポーター: スピーカーの説明を聞き、うなづいたり、さらに質問を投げかけることで、理解を助ける。
75
+
76
+ # 構成
77
+ 1. イントロ: まず、スピーカーとサポーターが何について話すのか、挨拶を交えながら会話します。自己紹介は省略する。
78
+ 2. 解説: 前提知識の確認をしながら、内容を解説していきます
79
+ 3. アウトロ: 今後の展望を交えながら締めくくります
80
+
81
+ ---
82
+ このような内容になるような Podcast の脚本を作成してください。
83
+ """.strip(),
84
+ },
85
+ ]
86
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
87
+ temperature: float = 1.0
88
+ max_tokens: int = 4096
89
+ thinking_budget: int = 1024
90
+ api_key: str
91
+
92
+ def __init__(self, api_key: str):
93
+ self.api_key = api_key
94
+
95
+ async def task(self, information: str, blog: str) -> str:
96
+ messages = self.instructions.copy()
97
+ messages.append(
98
+ {"role": "user", "content": f"# 情報\n{information}\n\n# 解説\n{blog}"}
99
+ )
100
+
101
+ res = await litellm.acompletion(
102
+ api_key=self.api_key,
103
+ model=self.model,
104
+ messages=messages,
105
+ temperature=self.temperature,
106
+ max_completion_tokens=self.max_tokens,
107
+ thinking={"type": "enabled", "budget_tokens": self.thinking_budget},
108
+ safety_settings=SAFETY_SETTINGS,
109
+ )
110
+ assert isinstance(res, ModelResponse)
111
+
112
+ dialogue = res.choices[0].message.content
113
+ assert isinstance(dialogue, str)
114
+
115
+ return dialogue
116
+
117
+
118
+ class Dialogue(BaseModel):
119
+ role: Literal["speaker", "supporter"]
120
+ content: str
121
+
122
+
123
+ class Conversation(BaseModel):
124
+ conversation: list[Dialogue]
125
+
126
+
127
+ class StructureAgent:
128
+ instructions = [
129
+ {
130
+ "role": "user",
131
+ "content": """この会話を指定されたスキーマに従った形に変換してください。スピーカーの role は `speaker`、サポーターは `supporter` です。""".strip(),
132
+ },
133
+ ]
134
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
135
+ temperature: float = 0.1
136
+ max_tokens: int = 12_288
137
+ thinking_budget: int = 0
138
+ api_key: str
139
+
140
+ def __init__(self, api_key: str):
141
+ self.api_key = api_key
142
+
143
+ async def task(self, dialogue: str) -> Conversation:
144
+ messages = self.instructions.copy()
145
+ messages.append({"role": "user", "content": dialogue})
146
+
147
+ res = await litellm.acompletion(
148
+ api_key=self.api_key,
149
+ model=self.model,
150
+ messages=messages,
151
+ temperature=self.temperature,
152
+ max_completion_tokens=self.max_tokens,
153
+ thinking={"type": "disabled"},
154
+ response_format=Conversation,
155
+ safety_settings=SAFETY_SETTINGS,
156
+ )
157
+
158
+ conversation = Conversation.model_validate(
159
+ json.loads(res.choices[0].message.content)
160
+ )
161
+
162
+ return conversation
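
`Conversation` is a plain Pydantic model, which is what lets app.py cache it in `gr.State` and render `conversation.model_dump()` in the JSON panel. A tiny round-trip sketch, assuming the repository root is on the import path; the sample dialogue lines are invented:

```python
# Round-trip sketch for the Conversation schema; the sample dialogue lines are invented.
from src.agent import Conversation, Dialogue

conv = Conversation(
    conversation=[
        Dialogue(role="speaker", content="今日は新しい論文を紹介します。"),
        Dialogue(role="supporter", content="楽しみです!よろしくお願いします。"),
    ]
)

data = conv.model_dump()                      # dict shown in the Gradio JSON output
restored = Conversation.model_validate(data)  # rebuilt from the cached dict
assert restored == conv
```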
src/aivis.py ADDED
@@ -0,0 +1,174 @@
1
+ # ref: https://github.com/Aivis-Project/AivisSpeech-Engine/blob/master/run.py
2
+
3
+ import gc
4
+ import uvicorn
5
+ from pathlib import Path
6
+ import random
7
+ import aiohttp
8
+ import aiofiles
9
+
10
+ from voicevox_engine import __version__
11
+ from voicevox_engine.aivm_manager import AivmManager
12
+ from voicevox_engine.app.application import generate_app
13
+ from voicevox_engine.core.core_initializer import MOCK_VER, initialize_cores
14
+ from voicevox_engine.engine_manifest import load_manifest
15
+ from voicevox_engine.library.library_manager import LibraryManager
16
+ from voicevox_engine.logging import LOGGING_CONFIG, logger
17
+ from voicevox_engine.preset.preset_manager import PresetManager
18
+ from voicevox_engine.setting.model import CorsPolicyMode
19
+ from voicevox_engine.setting.setting_manager import USER_SETTING_PATH, SettingHandler
20
+ from voicevox_engine.tts_pipeline.song_engine import make_song_engines_from_cores
21
+ from voicevox_engine.tts_pipeline.tts_engine import TTSEngineManager
22
+ from voicevox_engine.user_dict.user_dict_manager import UserDictionary
23
+ from voicevox_engine.utility.path_utility import (
24
+ engine_root,
25
+ get_save_dir,
26
+ )
27
+ from voicevox_engine.utility.user_agent_utility import generate_user_agent
28
+
29
+
30
+ def start_aivis_speech() -> None:
31
+ """AivisSpeech Engine を実行する"""
32
+ try:
33
+ # multiprocessing.freeze_support()
34
+
35
+ # Important: cache this result as early as possible during startup
36
+ generate_user_agent("CPU")
37
+
38
+ logger.info(f"AivisSpeech Engine version {__version__}")
39
+ logger.info(f"Engine root directory: {engine_root()}")
40
+ logger.info(f"User data directory: {get_save_dir()}")
41
+
42
+ # Initialize the AivmManager
43
+ aivm_manager = AivmManager(get_save_dir() / "Models")
44
+
45
+ # In rare environments, importing style_bert_vits2_tts_engine.py (or the onnxruntime it depends on) can fail,
46
+ # so import it here at router initialization on purpose, so the exception can be caught and logged
47
+ from voicevox_engine.tts_pipeline.style_bert_vits2_tts_engine import (
48
+ StyleBertVITS2TTSEngine,
49
+ )
50
+
51
+ # Use AivisSpeech Engine's own StyleBertVITS2TTSEngine in place of the regular TTSEngine
52
+ tts_engines = TTSEngineManager()
53
+ tts_engines.register_engine(
54
+ StyleBertVITS2TTSEngine(aivm_manager, use_gpu=False, load_all_models=False),
55
+ MOCK_VER,
56
+ )
57
+
58
+ core_manager = initialize_cores(
59
+ use_gpu=False,
60
+ voicelib_dirs=None,
61
+ voicevox_dir=None,
62
+ runtime_dirs=None,
63
+ cpu_num_threads=16,
64
+ enable_mock=True,
65
+ load_all_models=False,
66
+ )
67
+ # tts_engines = make_tts_engines_from_cores(core_manager)
68
+ song_engines = make_song_engines_from_cores(core_manager)
69
+ # assert len(tts_engines.versions()) != 0, "音声合成エンジンがありません。"
70
+ assert len(song_engines.versions()) != 0, "音声合成エンジンがありません。"
71
+
72
+ setting_loader = SettingHandler(USER_SETTING_PATH)
73
+
74
+ # When a value can be set in multiple ways, precedence from highest to lowest is: arguments, environment variables, settings file, defaults
75
+
76
+ cors_policy_mode = CorsPolicyMode.all
77
+ allow_origin = ["*"]
78
+
79
+ preset_path = get_save_dir() / "presets.yaml"
80
+ preset_manager = PresetManager(preset_path)
81
+
82
+ user_dict = UserDictionary()
83
+
84
+ engine_manifest = load_manifest(Path("engine/engine_manifest.json"))
85
+
86
+ library_manager = LibraryManager(
87
+ # get_save_dir() / "installed_libraries",
88
+ # Pass get_save_dir() directly to keep LibraryManager (unused in AivisSpeech) from creating its own directory
89
+ get_save_dir(),
90
+ engine_manifest.supported_vvlib_manifest_version,
91
+ engine_manifest.brand_name,
92
+ engine_manifest.name,
93
+ engine_manifest.uuid,
94
+ )
95
+
96
+ root_dir = engine_root()
97
+ character_info_dir = root_dir / "resources" / "character_info"
98
+ # NOTE: keep backward compatibility with ENGINE v0.19 and earlier
99
+ if not character_info_dir.exists():
100
+ character_info_dir = root_dir / "speaker_info"
101
+
102
+ # Generate the ASGI-compliant AivisSpeech Engine application
103
+ app = generate_app(
104
+ tts_engines,
105
+ song_engines,
106
+ aivm_manager,
107
+ core_manager,
108
+ setting_loader,
109
+ preset_manager,
110
+ user_dict,
111
+ engine_manifest,
112
+ library_manager,
113
+ cancellable_engine=None,
114
+ character_info_dir=character_info_dir,
115
+ cors_policy_mode=cors_policy_mode,
116
+ allow_origin=allow_origin,
117
+ disable_mutable_api=False,
118
+ )
119
+
120
+ # Free memory that was only needed for startup
121
+ gc.collect()
122
+
123
+ # Start the AivisSpeech Engine server
124
+ # NOTE: defaults to an ASGI-compliant HTTP/1.1 server
125
+ uvicorn.run(app, host="127.0.0.1", port=10101, log_config=LOGGING_CONFIG)
126
+
127
+ except Exception as e:
128
+ logger.error("Unexpected error occurred during engine startup:", exc_info=e)
129
+ raise e
130
+
131
+
132
+ def random_str() -> str:
133
+ num = random.randint(10000, 99999)
134
+ return str(num)
135
+
136
+
137
+ async def download_model(model_url: str) -> None:
138
+ save_dir = get_save_dir() / "Models"
139
+
140
+ url = Path(model_url)
141
+ model_id = url.stem
142
+ model_path = save_dir / f"{model_id}.aivmx"
143
+
144
+ if model_path.exists():
145
+ logger.info(
146
+ f"Model {model_id} already exists at {model_path}. Skipping download."
147
+ )
148
+ return
149
+
150
+ download_url = f"https://api.aivis-project.com/v1/aivm-models/{model_id}/download?model_type=AIVMX"
151
+
152
+ logger.info("Downloading model from {download_url} to {model_path}...")
153
+
154
+ async with aiohttp.ClientSession() as session:
155
+ try:
156
+ async with session.get(
157
+ download_url,
158
+ ) as res:
159
+ res.raise_for_status()
160
+
161
+ # streaming download
162
+ async with aiofiles.open(model_path, "wb") as f:
163
+ async for chunk in res.content.iter_chunked(1024 * 1024):
164
+ await f.write(chunk)
165
+
166
+ logger.info(f"Model downloaded to {model_path}")
167
+
168
+ except Exception as e:
169
+ logger.error(f"Failed to download model: {e}")
170
+
171
+
172
+ if __name__ == "__main__":
173
+ # Start the AivisSpeech Engine
174
+ start_aivis_speech()
src/fetcher.py ADDED
@@ -0,0 +1,83 @@
1
+ import aiohttp
2
+ import io
3
+ from markitdown import MarkItDown
4
+
5
+
6
+ class PDFFetcher:
7
+ def __init__(self):
8
+ self.md = MarkItDown(enable_plugins=True)
9
+
10
+ def read_local(self, pdf_path: str) -> str:
11
+ result = self.md.convert(pdf_path)
12
+
13
+ markdown = self.postprocess(result.text_content)
14
+
15
+ return markdown
16
+
17
+ def postprocess(self, markdown: str) -> str:
18
+ pages = markdown.split("\f")
19
+ markdown = "\n".join(pages)
20
+ return markdown.strip()
21
+
22
+ async def fetch(self, pdf_url: str) -> str:
23
+ async with aiohttp.ClientSession() as session:
24
+ async with session.get(pdf_url) as res:
25
+ if res.status != 200:
26
+ raise Exception(f"Failed to download PDF: {res.status}")
27
+
28
+ pdf_content = await res.read()
29
+
30
+ markdown = self.md.convert_stream(io.BytesIO(pdf_content)).text_content
31
+
32
+ markdown = self.postprocess(markdown)
33
+
34
+ return markdown
35
+
36
+
37
+ class HTMLFetcher:
38
+ def __init__(self):
39
+ self.md = MarkItDown(enable_plugins=True)
40
+
41
+ async def fetch(self, html_url: str) -> str:
42
+ async with aiohttp.ClientSession() as session:
43
+ async with session.get(html_url) as res:
44
+ if res.status != 200:
45
+ raise Exception(f"Failed to download HTML: {res.status}")
46
+
47
+ data = await res.read()
48
+
49
+ markdown = self.md.convert_stream(io.BytesIO(data))
50
+
51
+ return markdown.text_content
52
+
53
+
54
+ class AutoFetcher:
55
+ def __init__(self):
56
+ self.pdf_fetcher = PDFFetcher()
57
+ self.html_fetcher = HTMLFetcher()
58
+
59
+ self.md = MarkItDown(enable_plugins=True)
60
+
61
+ async def fetch(self, url: str) -> str:
62
+ async with aiohttp.ClientSession() as session:
63
+ async with session.get(url) as res:
64
+ if res.status != 200:
65
+ raise Exception(f"Failed to download HTML: {res.status}")
66
+
67
+ data = await res.read()
68
+ content_type = res.headers.get(
69
+ "Content-Type",
70
+ res.headers.get("content-type", "text/plain"),
71
+ )
72
+
73
+ if "application/pdf" in content_type:
74
+ return self.pdf_fetcher.postprocess(
75
+ self.md.convert_stream(io.BytesIO(data)).text_content
76
+ )
77
+
78
+ elif "text/html" in content_type:
79
+ return self.md.convert_stream(io.BytesIO(data)).text_content
80
+
81
+ else:
82
+ # plain?
83
+ return self.md.convert_stream(io.BytesIO(data)).text_content
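
A minimal usage sketch for `AutoFetcher`, assuming it is run from the repository root; the URL is just the arXiv example already used in app.py:

```python
# Minimal sketch: fetch a URL and let AutoFetcher pick the PDF or HTML path by Content-Type.
import asyncio

from src.fetcher import AutoFetcher


async def demo() -> None:
    fetcher = AutoFetcher()
    markdown = await fetcher.fetch("https://arxiv.org/pdf/2308.06721")
    print(markdown[:500])  # preview the converted Markdown


if __name__ == "__main__":
    asyncio.run(demo())
```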
src/podcast.py ADDED
@@ -0,0 +1,107 @@
1
+ from tqdm import tqdm
2
+ import logging
3
+
4
+ from .agent import BloggerAgent, WriterAgent, StructureAgent, Conversation
5
+ from .fetcher import AutoFetcher
6
+ from .voicevox import VoiceVoxClient, SpeakerId, Audio
7
+
8
+
9
+ class PodcastStudio:
10
+ def __init__(self, api_key: str, logging_level: int = logging.INFO):
11
+ self.blogger = BloggerAgent(api_key=api_key)
12
+ self.writer = WriterAgent(api_key=api_key)
13
+ self.structure_agent = StructureAgent(api_key=api_key)
14
+
15
+ self.logger = logging.getLogger(__name__)
16
+ self.logger.setLevel(logging_level)
17
+
18
+ self.fetcher = AutoFetcher()
19
+
20
+ async def create_conversation(self, url: str) -> tuple[str, str, Conversation]:
21
+ self.logger.info(f"Fetching paper from {url}...")
22
+ paper = await self.fetcher.fetch(url)
23
+ self.logger.info("Paper fetched successfully.")
24
+ self.logger.debug(
25
+ f"Paper content: {paper[:100]}..."
26
+ ) # Log first 100 characters
27
+
28
+ self.logger.info("Creating blog from paper...")
29
+ blog = await self.blogger.task(paper)
30
+ self.logger.info("Blog created successfully.")
31
+ self.logger.debug(f"{blog[:100]}...") # Log first 100 characters
32
+
33
+ self.logger.info("Creating dialogue from blog...")
34
+ dialogue = await self.writer.task(paper, blog)
35
+ self.logger.info("Dialogue created successfully.")
36
+ self.logger.debug(f"{dialogue[:100]}...") # Log first 100 characters
37
+
38
+ self.logger.info("Structuring conversation from dialogue...")
39
+ conversation = await self.structure_agent.task(dialogue)
40
+ self.logger.info("Conversation structured successfully.")
41
+ for _d in conversation.conversation:
42
+ self.logger.debug(f"{_d.role}: {_d.content[:100]}...")
43
+
44
+ return blog, dialogue, conversation
45
+
46
+ async def record_podcast(
47
+ self,
48
+ conversation: Conversation,
49
+ voicevox_client: VoiceVoxClient,
50
+ speaker_id: SpeakerId,
51
+ supporter_id: SpeakerId,
52
+ ) -> Audio:
53
+ progress_bar = tqdm(
54
+ total=len(conversation.conversation),
55
+ desc="Synthesizing audio",
56
+ ncols=100,
57
+ )
58
+
59
+ async def _synthesis(
60
+ speaker_id: SpeakerId,
61
+ text: str,
62
+ index: int,
63
+ progress: tqdm,
64
+ ) -> tuple[int, Audio]:
65
+ audio_query = await voicevox_client.post_audio_query(
66
+ text=text,
67
+ speaker=speaker_id,
68
+ )
69
+ if audio_query.tempoDynamicsScale is not None:
70
+ audio_query.tempoDynamicsScale = 1.1
71
+ else:
72
+ audio_query.speedScale = 1.1
73
+
74
+ audio = await voicevox_client.post_synthesis(
75
+ speaker=speaker_id,
76
+ audio_query=audio_query,
77
+ )
78
+ progress.update(1)
79
+
80
+ progress.set_postfix({"text": text[:20] + "..."})
81
+
82
+ return index, audio
83
+
84
+ results = []
85
+ for i, dialogue in enumerate(conversation.conversation):
86
+ results.append(
87
+ await _synthesis(
88
+ speaker_id=(
89
+ speaker_id if dialogue.role == "speaker" else supporter_id
90
+ ),
91
+ text=dialogue.content,
92
+ index=i,
93
+ progress=progress_bar,
94
+ )
95
+ )
96
+ progress_bar.close()
97
+
98
+ # sort results by index
99
+ results.sort(key=lambda x: x[0])
100
+
101
+ audios = [audio for _, audio in results]
102
+
103
+ # connect audio files
104
+ podcast = await voicevox_client.post_connect_waves(
105
+ audio_list=audios,
106
+ )
107
+ return podcast
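
Taken together, these pieces compose roughly the way `generate_podcast` in app.py wires them up. A condensed sketch: the endpoint is the AivisSpeech address from app.py, while the API key and the two numeric style IDs are placeholders you would look up via `VoiceVoxClient.get_speakers()`:

```python
# Condensed sketch of the app.py flow; the API key and style IDs are placeholders.
import asyncio

from src.podcast import PodcastStudio
from src.voicevox import VoiceVoxClient


async def demo() -> None:
    client = VoiceVoxClient("http://127.0.0.1:10101")  # AivisSpeech endpoint used by app.py
    studio = PodcastStudio(api_key="YOUR_GEMINI_API_KEY")

    # LLM stage: fetch the source, write a blog post, script the dialogue, structure it
    blog, dialogue, conversation = await studio.create_conversation(
        "https://arxiv.org/pdf/2308.06721"
    )

    # TTS stage: synthesize each turn and concatenate the clips
    audio = await studio.record_podcast(
        conversation=conversation,
        voicevox_client=client,
        speaker_id=0,      # placeholder style IDs; pick real ones from get_speakers()
        supporter_id=1,
    )
    with open("podcast.wav", "wb") as f:
        f.write(audio.wav)


if __name__ == "__main__":
    asyncio.run(demo())
```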
src/voicevox.py ADDED
@@ -0,0 +1,125 @@
1
+ import aiohttp
2
+ from typing import Literal
3
+ from pydantic import BaseModel
4
+ import io
5
+ import base64
6
+
7
+ SpeakerId = int
8
+
9
+
10
+ class SpeakerStyle(BaseModel):
11
+ name: str
12
+ id: SpeakerId
13
+ type: Literal["talk"]
14
+
15
+
16
+ class Speaker(BaseModel):
17
+ name: str
18
+ speaker_uuid: str
19
+ styles: list[SpeakerStyle]
20
+ version: str
21
+
22
+
23
+ class AudioQuery(BaseModel):
24
+ accent_phrases: list[dict]
25
+ speedScale: float
26
+ intonationScale: float
27
+ tempoDynamicsScale: float | None = None
28
+ pitchScale: float
29
+ volumeScale: float
30
+ prePhonemeLength: float
31
+ postPhonemeLength: float
32
+ pauseLength: float | None
33
+ pauseLengthScale: float
34
+ outputSamplingRate: int
35
+ outputStereo: bool
36
+ kana: str
37
+
38
+
39
+ class Audio(BaseModel):
40
+ wav: bytes
41
+
42
+
43
+ class VoiceVoxClient:
44
+ endpoint: str
45
+
46
+ def __init__(self, endpoint: str = "http://127.0.0.1:50021"):
47
+ self.endpoint = endpoint
48
+
49
+ async def get_speakers(self) -> list[Speaker]:
50
+ async with aiohttp.ClientSession() as session:
51
+ async with session.get(f"{self.endpoint}/speakers") as response:
52
+ if response.status != 200:
53
+ raise Exception(f"Failed to get speakers: {response.status}")
54
+ return [
55
+ Speaker.model_validate(speaker) for speaker in await response.json()
56
+ ]
57
+
58
+ async def get_core_versions(self) -> list[str]:
59
+ async with aiohttp.ClientSession() as session:
60
+ async with session.get(f"{self.endpoint}/core_versions") as response:
61
+ if response.status != 200:
62
+ raise Exception(f"Failed to get core version: {response.status}")
63
+ return await response.json()
64
+
65
+ async def post_audio_query(
66
+ self,
67
+ text: str,
68
+ speaker: SpeakerId,
69
+ core_version: str | None = None,
70
+ ) -> AudioQuery:
71
+ async with aiohttp.ClientSession() as session:
72
+ params: dict[str, str | int | float] = {"text": text, "speaker": speaker}
73
+ if core_version:
74
+ params["core_version"] = core_version
75
+ async with session.post(
76
+ f"{self.endpoint}/audio_query",
77
+ params=params,
78
+ ) as res:
79
+ if res.status != 200:
80
+ raise Exception(f"Failed to post audio query: {res.status}")
81
+ json_data = await res.json()
82
+ return AudioQuery.model_validate(json_data)
83
+
84
+ async def post_synthesis(
85
+ self,
86
+ speaker: SpeakerId,
87
+ audio_query: AudioQuery,
88
+ enable_interrogative_upspeak: bool = True,
89
+ core_version: str | None = None,
90
+ ) -> Audio:
91
+ async with aiohttp.ClientSession() as session:
92
+ params: dict[str, str | int | float] = {
93
+ "speaker": speaker,
94
+ "enable_interrogative_upspeak": (
95
+ "true" if enable_interrogative_upspeak else "false"
96
+ ),
97
+ }
98
+ if core_version:
99
+ params["core_version"] = core_version
100
+ async with session.post(
101
+ f"{self.endpoint}/synthesis",
102
+ params=params,
103
+ json=audio_query.model_dump(),
104
+ ) as response:
105
+ if response.status != 200:
106
+ raise Exception(f"Failed to post synthesis: {response.status}")
107
+ wav = io.BytesIO(await response.read())
108
+ return Audio(wav=wav.getvalue())
109
+
110
+ async def post_connect_waves(
111
+ self,
112
+ audio_list: list[Audio],
113
+ ) -> Audio:
114
+ async with aiohttp.ClientSession() as session:
115
+ audio_data = [
116
+ base64.b64encode(audio.wav).decode("utf-8") for audio in audio_list
117
+ ]
118
+ async with session.post(
119
+ f"{self.endpoint}/connect_waves",
120
+ json=audio_data,
121
+ ) as response:
122
+ if response.status != 200:
123
+ raise Exception(f"Failed to connect waves: {response.status}")
124
+ wav = io.BytesIO(await response.read())
125
+ return Audio(wav=wav.getvalue())
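
A minimal audio_query -> synthesis round trip with `VoiceVoxClient`, mirroring `preview_speaker_voice` in app.py; the endpoint and the sample text are assumptions:

```python
# Minimal synthesis round trip; the endpoint and sample text are assumptions.
import asyncio

from src.voicevox import VoiceVoxClient


async def demo() -> None:
    client = VoiceVoxClient("http://127.0.0.1:10101")

    speakers = await client.get_speakers()
    style_id = speakers[0].styles[0].id  # first available style

    query = await client.post_audio_query(text="こんにちは、テスト音声です。", speaker=style_id)
    audio = await client.post_synthesis(speaker=style_id, audio_query=query)

    with open("preview.wav", "wb") as f:
        f.write(audio.wav)


if __name__ == "__main__":
    asyncio.run(demo())
```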
uv.lock ADDED
The diff for this file is too large to render.