Plat committed on
Commit 3a09141 · 1 Parent(s): 0a5847d
Files changed (14)
  1. .gitignore +210 -0
  2. .python-version +1 -0
  3. LICENSE +165 -0
  4. README.md +1 -1
  5. app.py +451 -0
  6. assets/engine_manifest.json +79 -0
  7. pyproject.toml +27 -0
  8. requirements.txt +13 -0
  9. src/agent.py +162 -0
  10. src/aivis.py +174 -0
  11. src/fetcher.py +83 -0
  12. src/podcast.py +107 -0
  13. src/voicevox.py +125 -0
  14. uv.lock +0 -0
.gitignore ADDED
@@ -0,0 +1,210 @@
1
+ # Created by https://www.toptal.com/developers/gitignore/api/macos,python,dotenv
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=macos,python,dotenv
3
+
4
+ ### dotenv ###
5
+ .env
6
+
7
+ ### macOS ###
8
+ # General
9
+ .DS_Store
10
+ .AppleDouble
11
+ .LSOverride
12
+
13
+ # Icon must end with two \r
14
+ Icon
15
+
16
+ # Thumbnails
17
+ ._*
18
+
19
+ # Files that might appear in the root of a volume
20
+ .DocumentRevisions-V100
21
+ .fseventsd
22
+ .Spotlight-V100
23
+ .TemporaryItems
24
+ .Trashes
25
+ .VolumeIcon.icns
26
+ .com.apple.timemachine.donotpresent
27
+
28
+ # Directories potentially created on remote AFP share
29
+ .AppleDB
30
+ .AppleDesktop
31
+ Network Trash Folder
32
+ Temporary Items
33
+ .apdisk
34
+
35
+ ### macOS Patch ###
36
+ # iCloud generated files
37
+ *.icloud
38
+
39
+ ### Python ###
40
+ # Byte-compiled / optimized / DLL files
41
+ __pycache__/
42
+ *.py[cod]
43
+ *$py.class
44
+
45
+ # C extensions
46
+ *.so
47
+
48
+ # Distribution / packaging
49
+ .Python
50
+ build/
51
+ develop-eggs/
52
+ dist/
53
+ downloads/
54
+ eggs/
55
+ .eggs/
56
+ lib/
57
+ lib64/
58
+ parts/
59
+ sdist/
60
+ var/
61
+ wheels/
62
+ share/python-wheels/
63
+ *.egg-info/
64
+ .installed.cfg
65
+ *.egg
66
+ MANIFEST
67
+
68
+ # PyInstaller
69
+ # Usually these files are written by a python script from a template
70
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
71
+ *.manifest
72
+ *.spec
73
+
74
+ # Installer logs
75
+ pip-log.txt
76
+ pip-delete-this-directory.txt
77
+
78
+ # Unit test / coverage reports
79
+ htmlcov/
80
+ .tox/
81
+ .nox/
82
+ .coverage
83
+ .coverage.*
84
+ .cache
85
+ nosetests.xml
86
+ coverage.xml
87
+ *.cover
88
+ *.py,cover
89
+ .hypothesis/
90
+ .pytest_cache/
91
+ cover/
92
+
93
+ # Translations
94
+ *.mo
95
+ *.pot
96
+
97
+ # Django stuff:
98
+ *.log
99
+ local_settings.py
100
+ db.sqlite3
101
+ db.sqlite3-journal
102
+
103
+ # Flask stuff:
104
+ instance/
105
+ .webassets-cache
106
+
107
+ # Scrapy stuff:
108
+ .scrapy
109
+
110
+ # Sphinx documentation
111
+ docs/_build/
112
+
113
+ # PyBuilder
114
+ .pybuilder/
115
+ target/
116
+
117
+ # Jupyter Notebook
118
+ .ipynb_checkpoints
119
+
120
+ # IPython
121
+ profile_default/
122
+ ipython_config.py
123
+
124
+ # pyenv
125
+ # For a library or package, you might want to ignore these files since the code is
126
+ # intended to run in multiple environments; otherwise, check them in:
127
+ # .python-version
128
+
129
+ # pipenv
130
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
131
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
132
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
133
+ # install all needed dependencies.
134
+ #Pipfile.lock
135
+
136
+ # poetry
137
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
138
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
139
+ # commonly ignored for libraries.
140
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
141
+ #poetry.lock
142
+
143
+ # pdm
144
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
145
+ #pdm.lock
146
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
147
+ # in version control.
148
+ # https://pdm.fming.dev/#use-with-ide
149
+ .pdm.toml
150
+
151
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
152
+ __pypackages__/
153
+
154
+ # Celery stuff
155
+ celerybeat-schedule
156
+ celerybeat.pid
157
+
158
+ # SageMath parsed files
159
+ *.sage.py
160
+
161
+ # Environments
162
+ .venv
163
+ env/
164
+ venv/
165
+ ENV/
166
+ env.bak/
167
+ venv.bak/
168
+
169
+ # Spyder project settings
170
+ .spyderproject
171
+ .spyproject
172
+
173
+ # Rope project settings
174
+ .ropeproject
175
+
176
+ # mkdocs documentation
177
+ /site
178
+
179
+ # mypy
180
+ .mypy_cache/
181
+ .dmypy.json
182
+ dmypy.json
183
+
184
+ # Pyre type checker
185
+ .pyre/
186
+
187
+ # pytype static type analyzer
188
+ .pytype/
189
+
190
+ # Cython debug symbols
191
+ cython_debug/
192
+
193
+ # PyCharm
194
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
195
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
196
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
197
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
198
+ #.idea/
199
+
200
+ ### Python Patch ###
201
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
202
+ poetry.toml
203
+
204
+ # ruff
205
+ .ruff_cache/
206
+
207
+ # LSP config files
208
+ pyrightconfig.json
209
+
210
+ # End of https://www.toptal.com/developers/gitignore/api/macos,python,dotenv
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
LICENSE ADDED
@@ -0,0 +1,165 @@
1
+ GNU LESSER GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+
9
+ This version of the GNU Lesser General Public License incorporates
10
+ the terms and conditions of version 3 of the GNU General Public
11
+ License, supplemented by the additional permissions listed below.
12
+
13
+ 0. Additional Definitions.
14
+
15
+ As used herein, "this License" refers to version 3 of the GNU Lesser
16
+ General Public License, and the "GNU GPL" refers to version 3 of the GNU
17
+ General Public License.
18
+
19
+ "The Library" refers to a covered work governed by this License,
20
+ other than an Application or a Combined Work as defined below.
21
+
22
+ An "Application" is any work that makes use of an interface provided
23
+ by the Library, but which is not otherwise based on the Library.
24
+ Defining a subclass of a class defined by the Library is deemed a mode
25
+ of using an interface provided by the Library.
26
+
27
+ A "Combined Work" is a work produced by combining or linking an
28
+ Application with the Library. The particular version of the Library
29
+ with which the Combined Work was made is also called the "Linked
30
+ Version".
31
+
32
+ The "Minimal Corresponding Source" for a Combined Work means the
33
+ Corresponding Source for the Combined Work, excluding any source code
34
+ for portions of the Combined Work that, considered in isolation, are
35
+ based on the Application, and not on the Linked Version.
36
+
37
+ The "Corresponding Application Code" for a Combined Work means the
38
+ object code and/or source code for the Application, including any data
39
+ and utility programs needed for reproducing the Combined Work from the
40
+ Application, but excluding the System Libraries of the Combined Work.
41
+
42
+ 1. Exception to Section 3 of the GNU GPL.
43
+
44
+ You may convey a covered work under sections 3 and 4 of this License
45
+ without being bound by section 3 of the GNU GPL.
46
+
47
+ 2. Conveying Modified Versions.
48
+
49
+ If you modify a copy of the Library, and, in your modifications, a
50
+ facility refers to a function or data to be supplied by an Application
51
+ that uses the facility (other than as an argument passed when the
52
+ facility is invoked), then you may convey a copy of the modified
53
+ version:
54
+
55
+ a) under this License, provided that you make a good faith effort to
56
+ ensure that, in the event an Application does not supply the
57
+ function or data, the facility still operates, and performs
58
+ whatever part of its purpose remains meaningful, or
59
+
60
+ b) under the GNU GPL, with none of the additional permissions of
61
+ this License applicable to that copy.
62
+
63
+ 3. Object Code Incorporating Material from Library Header Files.
64
+
65
+ The object code form of an Application may incorporate material from
66
+ a header file that is part of the Library. You may convey such object
67
+ code under terms of your choice, provided that, if the incorporated
68
+ material is not limited to numerical parameters, data structure
69
+ layouts and accessors, or small macros, inline functions and templates
70
+ (ten or fewer lines in length), you do both of the following:
71
+
72
+ a) Give prominent notice with each copy of the object code that the
73
+ Library is used in it and that the Library and its use are
74
+ covered by this License.
75
+
76
+ b) Accompany the object code with a copy of the GNU GPL and this license
77
+ document.
78
+
79
+ 4. Combined Works.
80
+
81
+ You may convey a Combined Work under terms of your choice that,
82
+ taken together, effectively do not restrict modification of the
83
+ portions of the Library contained in the Combined Work and reverse
84
+ engineering for debugging such modifications, if you also do each of
85
+ the following:
86
+
87
+ a) Give prominent notice with each copy of the Combined Work that
88
+ the Library is used in it and that the Library and its use are
89
+ covered by this License.
90
+
91
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
92
+ document.
93
+
94
+ c) For a Combined Work that displays copyright notices during
95
+ execution, include the copyright notice for the Library among
96
+ these notices, as well as a reference directing the user to the
97
+ copies of the GNU GPL and this license document.
98
+
99
+ d) Do one of the following:
100
+
101
+ 0) Convey the Minimal Corresponding Source under the terms of this
102
+ License, and the Corresponding Application Code in a form
103
+ suitable for, and under terms that permit, the user to
104
+ recombine or relink the Application with a modified version of
105
+ the Linked Version to produce a modified Combined Work, in the
106
+ manner specified by section 6 of the GNU GPL for conveying
107
+ Corresponding Source.
108
+
109
+ 1) Use a suitable shared library mechanism for linking with the
110
+ Library. A suitable mechanism is one that (a) uses at run time
111
+ a copy of the Library already present on the user's computer
112
+ system, and (b) will operate properly with a modified version
113
+ of the Library that is interface-compatible with the Linked
114
+ Version.
115
+
116
+ e) Provide Installation Information, but only if you would otherwise
117
+ be required to provide such information under section 6 of the
118
+ GNU GPL, and only to the extent that such information is
119
+ necessary to install and execute a modified version of the
120
+ Combined Work produced by recombining or relinking the
121
+ Application with a modified version of the Linked Version. (If
122
+ you use option 4d0, the Installation Information must accompany
123
+ the Minimal Corresponding Source and Corresponding Application
124
+ Code. If you use option 4d1, you must provide the Installation
125
+ Information in the manner specified by section 6 of the GNU GPL
126
+ for conveying Corresponding Source.)
127
+
128
+ 5. Combined Libraries.
129
+
130
+ You may place library facilities that are a work based on the
131
+ Library side by side in a single library together with other library
132
+ facilities that are not Applications and are not covered by this
133
+ License, and convey such a combined library under terms of your
134
+ choice, if you do both of the following:
135
+
136
+ a) Accompany the combined library with a copy of the same work based
137
+ on the Library, uncombined with any other library facilities,
138
+ conveyed under the terms of this License.
139
+
140
+ b) Give prominent notice with the combined library that part of it
141
+ is a work based on the Library, and explaining where to find the
142
+ accompanying uncombined form of the same work.
143
+
144
+ 6. Revised Versions of the GNU Lesser General Public License.
145
+
146
+ The Free Software Foundation may publish revised and/or new versions
147
+ of the GNU Lesser General Public License from time to time. Such new
148
+ versions will be similar in spirit to the present version, but may
149
+ differ in detail to address new problems or concerns.
150
+
151
+ Each version is given a distinguishing version number. If the
152
+ Library as you received it specifies that a certain numbered version
153
+ of the GNU Lesser General Public License "or any later version"
154
+ applies to it, you have the option of following the terms and
155
+ conditions either of that published version or of any later version
156
+ published by the Free Software Foundation. If the Library as you
157
+ received it does not specify a version number of the GNU Lesser
158
+ General Public License, you may choose any version of the GNU Lesser
159
+ General Public License ever published by the Free Software Foundation.
160
+
161
+ If the Library as you received it specifies that a proxy can decide
162
+ whether future versions of the GNU Lesser General Public License shall
163
+ apply, that proxy's public statement of acceptance of any version is
164
+ permanent authorization for you to choose that version for the
165
+ Library.
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: PodcastVox
- emoji:
+ emoji: 📻💠
  colorFrom: blue
  colorTo: indigo
  sdk: gradio
app.py ADDED
@@ -0,0 +1,451 @@
1
+ import tempfile
2
+ import asyncio
3
+ import aiohttp
4
+ import dotenv
5
+ import os
6
+ import time
7
+ import logging
8
+
9
+
10
+ from src.voicevox import VoiceVoxClient
11
+ from src.agent import Conversation
12
+ from src.podcast import PodcastStudio
13
+ from src.aivis import start_aivis_speech, download_model
14
+
15
+ import gradio as gr
16
+
17
+ dotenv.load_dotenv()
18
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
19
+
20
+ DEFAULT_MODELS = [
21
+ "https://hub.aivis-project.com/aivm-models/a59cb814-0083-4369-8542-f51a29e72af7", # Anneli
22
+ "https://hub.aivis-project.com/aivm-models/4cf3e1d8-5583-41a9-a554-b2d2cda2c569", # Anneli Whisper
23
+ "https://hub.aivis-project.com/aivm-models/6acf95e8-11a9-414e-aa9c-6dbebf9113ca", # F1
24
+ "https://hub.aivis-project.com/aivm-models/25b39db7-5757-47ef-9fe4-2b7aff328a18", # F2
25
+ "https://hub.aivis-project.com/aivm-models/d7255c2c-ddd0-425a-808c-662cd94c7f41", # M1
26
+ "https://hub.aivis-project.com/aivm-models/d1a7446f-230d-4077-afdf-923eddabe53c", # M2
27
+ "https://hub.aivis-project.com/aivm-models/6d11c6c2-f4a4-4435-887e-23dd60f8b8dd", # にせ
28
+ "https://hub.aivis-project.com/aivm-models/e9339137-2ae3-4d41-9394-fb757a7e61e6", # まい
29
+ "https://hub.aivis-project.com/aivm-models/eefe1fbd-d15a-49ae-bc83-fc4aaad680e1", # ハヤテ
30
+ "https://hub.aivis-project.com/aivm-models/5d804388-665e-4174-ab60-53d448c0d7eb", # 老当主
31
+ "https://hub.aivis-project.com/aivm-models/71e72188-2726-4739-9aa9-39567396fb2a", # ふみふみ
32
+ ]
33
+ AIVIS_ENDPOINT = "http://127.0.0.1:10101"
34
+
35
+ NAVIGATOR_SAMPLE = "こんにちは!私の名前は {nickname} です。今回は私がポッドキャストをナビゲートします。よろしくお願いします!"
36
+ ASSISTANT_SAMPLE = "こんにちは!私の名前は {nickname} です。私はサポーターとして、ナビゲーターと一緒にポッドキャストを盛り上げていきます。頑張ります!"
37
+
38
+
39
+ async def generate_podcast(
40
+ voicevox_endpoint: str,
41
+ llm_api_key: str,
42
+ pdf_url: str,
43
+ speaker_name: str,
44
+ supporter_name: str,
45
+ speaker2id: dict[str, int],
46
+ ) -> tuple[str, str, object, Conversation, str, dict]:
47
+ client = VoiceVoxClient(voicevox_endpoint)
48
+
49
+ speaker_id = speaker2id[speaker_name]
50
+ supporter_id = speaker2id[supporter_name]
51
+
52
+ podcast_studio = PodcastStudio(
53
+ api_key=llm_api_key,
54
+ logging_level=logging.DEBUG,
55
+ )
56
+
57
+ start_time = time.time()
58
+
59
+ blog, _dialogue, conversation = await podcast_studio.create_conversation(pdf_url)
60
+ podcast_audio = await podcast_studio.record_podcast(
61
+ conversation=conversation,
62
+ voicevox_client=client,
63
+ speaker_id=speaker_id,
64
+ supporter_id=supporter_id,
65
+ )
66
+
67
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
68
+ temp_file.write(podcast_audio.wav)
69
+ temp_file_path = temp_file.name
70
+
71
+ elapsed_time = time.time() - start_time
72
+ time_elapsed_text = f"処理時間: {elapsed_time:.2f} 秒"
73
+
74
+ return (
75
+ temp_file_path,
76
+ blog,
77
+ conversation.model_dump(),
78
+ conversation,
79
+ time_elapsed_text,
80
+ gr.update(visible=True),
81
+ )
82
+
83
+
84
+ async def change_speaker(
85
+ voicevox_endpoint: str,
86
+ speaker_name: str,
87
+ supporter_name: str,
88
+ speaker2id: dict[str, int],
89
+ conversation_cache: Conversation,
90
+ ) -> tuple[str, str]:
91
+ client = VoiceVoxClient(voicevox_endpoint)
92
+
93
+ speaker_id = speaker2id[speaker_name]
94
+ supporter_id = speaker2id[supporter_name]
95
+
96
+ podcast_studio = PodcastStudio(api_key="") # only voice synthesis
97
+
98
+ start_time = time.time()
99
+ podcast_audio = await podcast_studio.record_podcast(
100
+ conversation=conversation_cache,
101
+ voicevox_client=client,
102
+ speaker_id=speaker_id,
103
+ supporter_id=supporter_id,
104
+ )
105
+
106
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
107
+ temp_file.write(podcast_audio.wav)
108
+ temp_file_path = temp_file.name
109
+
110
+ elapsed_time = time.time() - start_time
111
+ time_elapsed_text = f"処理時間: {elapsed_time:.2f} 秒"
112
+
113
+ return temp_file_path, time_elapsed_text
114
+
115
+
116
+ async def get_speakers(endpoint: str):
117
+ client = VoiceVoxClient(endpoint)
118
+
119
+ speakers = await client.get_speakers()
120
+
121
+ print(f"Found {len(speakers)} speakers at {endpoint}")
122
+
123
+ choices = []
124
+ speaker_ids = []
125
+ for speaker in speakers:
126
+ for style in speaker.styles:
127
+ speaker_name = f"{speaker.name} ({style.name})"
128
+ print(f"Speaker: {speaker_name}, ID: {style.id}")
129
+ choices.append(speaker_name)
130
+ speaker_ids.append(style.id)
131
+
132
+ speaker2id = dict(zip(choices, speaker_ids))
133
+
134
+ return choices, speaker2id
135
+
136
+
137
+ async def on_endpoint_change(endpoint_text: str):
138
+ try:
139
+ speakers, speaker2id = await get_speakers(endpoint_text)
140
+ return (
141
+ gr.update(choices=speakers, value=speakers[0]),
142
+ gr.update(choices=speakers, value=speakers[1]),
143
+ speaker2id,
144
+ )
145
+ except Exception as e:
146
+ return gr.update(), gr.update(), gr.update()
147
+
148
+
149
+ async def preview_speaker_voice(
150
+ voicevox_endpoint: str,
151
+ speaker_name: str,
152
+ speaker_id: int,
153
+ is_main_speaker: bool = True,
154
+ ):
155
+ client = VoiceVoxClient(voicevox_endpoint)
156
+
157
+ speaker_nickname = speaker_name.split("(")[0].strip()
158
+
159
+ if is_main_speaker:
160
+ sample_text = NAVIGATOR_SAMPLE.format(nickname=speaker_nickname)
161
+ else:
162
+ sample_text = ASSISTANT_SAMPLE.format(nickname=speaker_nickname)
163
+
164
+ audio_query = await client.post_audio_query(
165
+ text=sample_text,
166
+ speaker=speaker_id,
167
+ )
168
+ if audio_query.tempoDynamicsScale is not None:
169
+ audio_query.tempoDynamicsScale = 1.1
170
+ else:
171
+ audio_query.speedScale = 1.1
172
+
173
+ audio = await client.post_synthesis(
174
+ speaker=speaker_id,
175
+ audio_query=audio_query,
176
+ )
177
+
178
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
179
+ temp_file.write(audio.wav)
180
+ temp_file_path = temp_file.name
181
+
182
+ return temp_file_path
183
+
184
+
185
+ async def on_change_speaker(
186
+ voicevox_endpoint: str,
187
+ speaker_name: str,
188
+ speaker2id: dict[str, int],
189
+ is_main_speaker: bool,
190
+ ):
191
+ speaker_id = speaker2id[speaker_name]
192
+ return await preview_speaker_voice(
193
+ voicevox_endpoint=voicevox_endpoint,
194
+ speaker_name=speaker_name,
195
+ speaker_id=speaker_id,
196
+ is_main_speaker=is_main_speaker,
197
+ )
198
+
199
+
200
+ async def download_default_models():
201
+ logging.info("Downloading default models...")
202
+
203
+ results = await asyncio.gather(
204
+ *[download_model(model_url) for model_url in DEFAULT_MODELS],
205
+ return_exceptions=True,
206
+ )
207
+
208
+ for result in results:
209
+ if isinstance(result, Exception):
210
+ logging.error(f"Failed to download model: {result}")
211
+
212
+
213
+ async def wait_for_endpoint(url: str, timeout: float = 30.0, interval: float = 0.5):
214
+ """url が 200 を返すまで待機"""
215
+ start = time.time()
216
+ while time.time() - start < timeout:
217
+ try:
218
+ async with aiohttp.ClientSession() as session:
219
+ async with session.get(url) as res:
220
+ if res.status == 200:
221
+ return
222
+ except Exception:
223
+ pass
224
+ await asyncio.sleep(interval)
225
+ raise RuntimeError(f"Endpoint {url} did not become ready in {timeout}s")
226
+
227
+
228
+ async def main():
229
+ await wait_for_endpoint(AIVIS_ENDPOINT)
230
+
231
+ initial_endpoint = AIVIS_ENDPOINT
232
+ try:
233
+ speakers, speaker2id = await get_speakers(initial_endpoint)
234
+ except Exception as _e:
235
+ speakers = []
236
+ speaker2id = {}
237
+
238
+ main_speaker_name = None if len(speakers) == 0 else speakers[0]
239
+ supporter_speaker_name = None if len(speakers) < 2 else speakers[1]
240
+
241
+ main_speaker_preview = None
242
+ supporter_speaker_preview = None
243
+ if main_speaker_name is not None:
244
+ main_speaker_preview = await preview_speaker_voice(
245
+ voicevox_endpoint=initial_endpoint,
246
+ speaker_name=main_speaker_name,
247
+ speaker_id=speaker2id.get(main_speaker_name, 0),
248
+ is_main_speaker=True,
249
+ )
250
+ if supporter_speaker_name is not None:
251
+ supporter_speaker_preview = await preview_speaker_voice(
252
+ voicevox_endpoint=initial_endpoint,
253
+ speaker_name=supporter_speaker_name,
254
+ speaker_id=speaker2id.get(supporter_speaker_name, 0),
255
+ is_main_speaker=False,
256
+ )
257
+
258
+ with gr.Blocks() as demo:
259
+ gr.Markdown(
260
+ """
261
+ # PodcastVox (Aivis Speech)
262
+
263
+ Gemini Flash 2.5 と Aivis Speech を利用して、Web サイトを情報源とした Podcast を生成することができます。
264
+
265
+ """
266
+ )
267
+
268
+ with gr.Row():
269
+ with gr.Column():
270
+ with gr.Group():
271
+ endpoint_text = gr.Textbox(
272
+ label="VOICEVOX エンドポイント",
273
+ value=initial_endpoint,
274
+ placeholder=AIVIS_ENDPOINT,
275
+ info="VOICEVOX 型 の REST API に対応したエンドポイントを入力してください",
276
+ visible=False,
277
+ )
278
+ with gr.Row():
279
+ with gr.Column():
280
+ speakers_dropdown = gr.Dropdown(
281
+ label="メイン話者",
282
+ choices=speakers,
283
+ value=main_speaker_name,
284
+ multiselect=False,
285
+ )
286
+ speaker_preview_audio = gr.Audio(
287
+ label="メイン話者音声プレビュー",
288
+ type="filepath",
289
+ value=main_speaker_preview,
290
+ )
291
+
292
+ with gr.Column():
293
+ supporter_dropdown = gr.Dropdown(
294
+ label="サポーター話者",
295
+ choices=speakers,
296
+ value=supporter_speaker_name,
297
+ multiselect=False,
298
+ )
299
+ supporter_preview_audio = gr.Audio(
300
+ label="サポーター音声プレビュー",
301
+ type="filepath",
302
+ value=supporter_speaker_preview,
303
+ )
304
+
305
+ speaker2id_map = gr.State(value=speaker2id)
306
+
307
+ change_speaker_button = gr.Button(
308
+ "この話者で再録音",
309
+ variant="secondary",
310
+ visible=False,
311
+ )
312
+
313
+ with gr.Group():
314
+ llm_api_key_text = gr.Textbox(
315
+ label="Gemini API Key",
316
+ info="Podcast を生成するには API キーが必要です。https://aistudio.google.com/apikey から取得できます。",
317
+ placeholder="Enter your Gemini API key",
318
+ value=GEMINI_API_KEY,
319
+ type="password",
320
+ visible=GEMINI_API_KEY == "",
321
+ )
322
+
323
+ with gr.Column():
324
+ with gr.Group():
325
+ pdf_url_text = gr.Textbox(
326
+ label="情報源となる Web サイト の URL",
327
+ placeholder="https://arxiv.org/pdf/2308.06721, https://example.com/index.html",
328
+ lines=1,
329
+ info="Podcast のテーマとなる Web サイト の URL を入力してください。HTML、PDF に対応しています。",
330
+ )
331
+ submit_button = gr.Button("Synthesize", variant="primary")
332
+
333
+ time_elapsed_text = gr.Markdown(
334
+ value="",
335
+ )
336
+
337
+ output_audio = gr.Audio(
338
+ label="Output Podcast Audio",
339
+ type="filepath",
340
+ autoplay=True,
341
+ )
342
+ conversation_cache = gr.State(value=None)
343
+
344
+ with gr.Accordion("生成されたブログ", open=False):
345
+ blog_output = gr.Markdown(
346
+ label="Blog Output",
347
+ value="生成されたブログはここに表示されます。",
348
+ )
349
+
350
+ with gr.Accordion("生成された会話", open=False):
351
+ conversation_output = gr.JSON(label="Conversation Output", value={})
352
+
353
+ gr.Examples(
354
+ examples=[
355
+ ["https://arxiv.org/pdf/2308.06721"],
356
+ ["https://www.aozora.gr.jp/cards/000879/files/127_15260.html"],
357
+ ],
358
+ inputs=[pdf_url_text],
359
+ )
360
+
361
+ gr.on(
362
+ triggers=[endpoint_text.change],
363
+ fn=on_endpoint_change,
364
+ inputs=[endpoint_text],
365
+ outputs=[
366
+ speakers_dropdown,
367
+ supporter_dropdown,
368
+ speaker2id_map,
369
+ ],
370
+ concurrency_limit=10,
371
+ )
372
+ gr.on(
373
+ triggers=[submit_button.click],
374
+ fn=generate_podcast,
375
+ inputs=[
376
+ endpoint_text,
377
+ llm_api_key_text,
378
+ pdf_url_text,
379
+ speakers_dropdown,
380
+ supporter_dropdown,
381
+ speaker2id_map,
382
+ ],
383
+ outputs=[
384
+ output_audio,
385
+ blog_output,
386
+ conversation_output,
387
+ conversation_cache,
388
+ time_elapsed_text,
389
+ change_speaker_button, # make visible after generation
390
+ ],
391
+ concurrency_limit=10,
392
+ )
393
+ gr.on(
394
+ triggers=[change_speaker_button.click],
395
+ fn=change_speaker,
396
+ inputs=[
397
+ endpoint_text,
398
+ speakers_dropdown,
399
+ supporter_dropdown,
400
+ spaker2id_map,
401
+ conversation_cache,
402
+ ],
403
+ outputs=[
404
+ output_audio,
405
+ time_elapsed_text,
406
+ ],
407
+ concurrency_limit=10,
408
+ )
409
+ gr.on(
410
+ triggers=[
411
+ speakers_dropdown.change,
412
+ ],
413
+ fn=on_change_speaker,
414
+ inputs=[
415
+ endpoint_text,
416
+ speakers_dropdown,
417
+ speaker2id_map,
418
+ gr.State(value=True),
419
+ ],
420
+ outputs=[speaker_preview_audio],
421
+ concurrency_limit=10,
422
+ )
423
+ gr.on(
424
+ triggers=[
425
+ supporter_dropdown.change,
426
+ ],
427
+ fn=on_change_speaker,
428
+ inputs=[
429
+ endpoint_text,
430
+ supporter_dropdown,
431
+ speaker2id_map,
432
+ gr.State(value=False),
433
+ ],
434
+ outputs=[supporter_preview_audio],
435
+ concurrency_limit=10,
436
+ )
437
+
438
+ demo.launch()
439
+
440
+
441
+ async def runner():
442
+ await download_default_models()
443
+
444
+ aivis = asyncio.to_thread(start_aivis_speech)
445
+ webui = asyncio.create_task(main())
446
+
447
+ await asyncio.gather(aivis, webui)
448
+
449
+
450
+ if __name__ == "__main__":
451
+ asyncio.run(runner())
assets/engine_manifest.json ADDED
@@ -0,0 +1,79 @@
1
+ {
2
+ "manifest_version": "0.13.1",
3
+ "name": "AivisSpeech Engine",
4
+ "brand_name": "AivisSpeech",
5
+ "uuid": "1b4a5014-d9fd-11ee-b97d-83c170a68ed3",
6
+ "version": "999.999.999",
7
+ "url": "https://github.com/Aivis-Project/AivisSpeech-Engine",
8
+ "command": "run",
9
+ "port": 10101,
10
+ "icon": "resources/engine_manifest_assets/icon.png",
11
+ "default_sampling_rate": 44100,
12
+ "frame_rate": 172.265625,
13
+ "terms_of_service": "resources/engine_manifest_assets/terms_of_service.md",
14
+ "update_infos": "resources/engine_manifest_assets/update_infos.json",
15
+ "dependency_licenses": "resources/engine_manifest_assets/dependency_licenses.json",
16
+ "supported_vvlib_manifest_version": null,
17
+ "supported_features": {
18
+ "adjust_mora_pitch": {
19
+ "type": "bool",
20
+ "value": false,
21
+ "name": "モーラごとの音高の調整"
22
+ },
23
+ "adjust_phoneme_length": {
24
+ "type": "bool",
25
+ "value": false,
26
+ "name": "音素ごとの長さの調整"
27
+ },
28
+ "adjust_speed_scale": {
29
+ "type": "bool",
30
+ "value": true,
31
+ "name": "全体の話速の調整"
32
+ },
33
+ "adjust_pitch_scale": {
34
+ "type": "bool",
35
+ "value": true,
36
+ "name": "全体の音高の調整"
37
+ },
38
+ "adjust_intonation_scale": {
39
+ "type": "bool",
40
+ "value": true,
41
+ "name": "全体の抑揚の調整"
42
+ },
43
+ "adjust_volume_scale": {
44
+ "type": "bool",
45
+ "value": true,
46
+ "name": "全体の音量の調整"
47
+ },
48
+ "adjust_pause_length": {
49
+ "type": "bool",
50
+ "value": false,
51
+ "name": "句読点などの無音時間の調整"
52
+ },
53
+ "interrogative_upspeak": {
54
+ "type": "bool",
55
+ "value": false,
56
+ "name": "疑問文の自動調整"
57
+ },
58
+ "synthesis_morphing" : {
59
+ "type": "bool",
60
+ "value": false,
61
+ "name": "2種類のスタイルでモーフィングした音声を合成"
62
+ },
63
+ "sing" : {
64
+ "type": "bool",
65
+ "value": false,
66
+ "name": "歌唱音声合成"
67
+ },
68
+ "manage_library": {
69
+ "type": "bool",
70
+ "value": false,
71
+ "name": "音声ライブラリのインストール・アンインストール"
72
+ },
73
+ "return_resource_url": {
74
+ "type": "bool",
75
+ "value": false,
76
+ "name": "キャラクター情報のリソースを URL で返送"
77
+ }
78
+ }
79
+ }
pyproject.toml ADDED
@@ -0,0 +1,27 @@
1
+ [project]
2
+ name = "podcastvox-demo"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11,<3.12"
7
+ dependencies = [
8
+ "aiohttp>=3.12.6",
9
+ "aivis-speech-engine",
10
+ "fastapi>=0.115.12",
11
+ "gradio>=5.32.0",
12
+ "hf-transfer>=0.1.9",
13
+ "hf-xet>=1.1.2",
14
+ "litellm>=1.72.0",
15
+ "markitdown[pdf]>=0.1.2",
16
+ "onnxruntime>=1.22.0",
17
+ "pydantic>=2.11.5",
18
+ "pyopenjtalk-plus==0.4.1.post3",
19
+ "setuptools>=80.9.0",
20
+ "wheel>=0.45.1",
21
+ ]
22
+
23
+ [dependency-groups]
24
+ dev = ["ruff>=0.11.12", "ty>=0.0.1a7"]
25
+
26
+ [tool.uv.sources]
27
+ aivis-speech-engine = { git = "https://github.com/p1atdev/AivisSpeech-Engine" }
requirements.txt ADDED
@@ -0,0 +1,13 @@
1
+ aiohttp>=3.12.6
2
+ fastapi>=0.115.12
3
+ gradio>=5.32.0
4
+ hf-transfer>=0.1.9
5
+ hf-xet>=1.1.2
6
+ litellm>=1.72.0
7
+ markitdown[pdf]>=0.1.2
8
+ onnxruntime>=1.22.0
9
+ pydantic>=2.11.5
10
+ pyopenjtalk-plus==0.4.1.post3
11
+ setuptools>=80.9.0
12
+ wheel>=0.45.1
13
+ git+https://github.com/p1atdev/AivisSpeech-Engine
src/agent.py ADDED
@@ -0,0 +1,162 @@
1
+ import json
2
+ from typing import Literal
3
+ from pydantic import BaseModel
4
+
5
+ import litellm
6
+ from litellm.types.utils import ModelResponse
7
+
8
+ SAFETY_SETTINGS = [
9
+ {
10
+ "category": "HARM_CATEGORY_HARASSMENT",
11
+ "threshold": "BLOCK_NONE",
12
+ },
13
+ {
14
+ "category": "HARM_CATEGORY_HATE_SPEECH",
15
+ "threshold": "BLOCK_NONE",
16
+ },
17
+ {
18
+ "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
19
+ "threshold": "BLOCK_NONE",
20
+ },
21
+ {
22
+ "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
23
+ "threshold": "BLOCK_NONE",
24
+ },
25
+ ]
26
+
27
+
28
+ class BloggerAgent:
29
+ instructions = [
30
+ {
31
+ "role": "user",
32
+ "content": "与えられる情報について、重要なポイントを踏まえて平易な言葉で解説・紹介する記事を書いてください",
33
+ },
34
+ ]
35
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
36
+ temperature: float = 1.0
37
+ max_tokens: int = 4096
38
+ thinking_budget: int = 1024
39
+ api_key: str
40
+
41
+ def __init__(self, api_key: str):
42
+ self.api_key = api_key
43
+
44
+ async def task(self, information: str) -> str:
45
+ messages = self.instructions.copy()
46
+ messages.append({"role": "user", "content": information})
47
+
48
+ res = await litellm.acompletion(
49
+ api_key=self.api_key,
50
+ model=self.model,
51
+ messages=messages,
52
+ temperature=self.temperature,
53
+ max_completion_tokens=self.max_tokens,
54
+ thinking={"type": "enabled", "budget_tokens": self.thinking_budget},
55
+ safety_settings=SAFETY_SETTINGS,
56
+ )
57
+ assert isinstance(res, ModelResponse)
58
+
59
+ blog = res.choices[0].message.content
60
+ assert isinstance(blog, str)
61
+
62
+ return blog
63
+
64
+
65
+ class WriterAgent:
66
+ instructions = [
67
+ {
68
+ "role": "user",
69
+ "content": """与えられる情報ソースとその解説記事をもとに、コンテンツを紹介する Podcast の会話を作成してください。
70
+ Podcast では、二人の人物が交互に会話をします。
71
+
72
+ # 登場人物
73
+ - スピーカー: コンテンツ紹介をリードする人で、主にこの人物が解説を行う
74
+ - サポーター: スピーカーの説明を聞き、うなづいたり、さらに質問を投げかけることで、理解を助ける。
75
+
76
+ # 構成
77
+ 1. イントロ: まず、スピーカーとサポーターが何について話すのか、挨拶を交えながら会話します。自己紹介は省略する。
78
+ 2. 解説: 前提知識の確認をしながら、内容を解説していきます
79
+ 3. アウトロ: 今後の展望を交えながら締めくくります
80
+
81
+ ---
82
+ このような内容になるような Podcast の脚本を作成してください。
83
+ """.strip(),
84
+ },
85
+ ]
86
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
87
+ temperature: float = 1.0
88
+ max_tokens: int = 4096
89
+ thinking_budget: int = 1024
90
+ api_key: str
91
+
92
+ def __init__(self, api_key: str):
93
+ self.api_key = api_key
94
+
95
+ async def task(self, information: str, blog: str) -> str:
96
+ messages = self.instructions.copy()
97
+ messages.append(
98
+ {"role": "user", "content": f"# 情報\n{information}\n\n# 解説\n{blog}"}
99
+ )
100
+
101
+ res = await litellm.acompletion(
102
+ api_key=self.api_key,
103
+ model=self.model,
104
+ messages=messages,
105
+ temperature=self.temperature,
106
+ max_completion_tokens=self.max_tokens,
107
+ thinking={"type": "enabled", "budget_tokens": self.thinking_budget},
108
+ safety_settings=SAFETY_SETTINGS,
109
+ )
110
+ assert isinstance(res, ModelResponse)
111
+
112
+ dialogue = res.choices[0].message.content
113
+ assert isinstance(dialogue, str)
114
+
115
+ return dialogue
116
+
117
+
118
+ class Dialogue(BaseModel):
119
+ role: Literal["speaker", "supporter"]
120
+ content: str
121
+
122
+
123
+ class Conversation(BaseModel):
124
+ conversation: list[Dialogue]
125
+
126
+
127
+ class StructureAgent:
128
+ instructions = [
129
+ {
130
+ "role": "user",
131
+ "content": """この会話を指定されたスキーマに従った形に変換してください。スピーカーの role は `speaker`、サポーターは `supporter` です。""".strip(),
132
+ },
133
+ ]
134
+ model: str = "gemini/gemini-2.5-flash-preview-05-20"
135
+ temperature: float = 0.1
136
+ max_tokens: int = 12_288
137
+ thinking_budget: int = 0
138
+ api_key: str
139
+
140
+ def __init__(self, api_key: str):
141
+ self.api_key = api_key
142
+
143
+ async def task(self, dialogue: str) -> Conversation:
144
+ messages = self.instructions.copy()
145
+ messages.append({"role": "user", "content": dialogue})
146
+
147
+ res = await litellm.acompletion(
148
+ api_key=self.api_key,
149
+ model=self.model,
150
+ messages=messages,
151
+ temperature=self.temperature,
152
+ max_completion_tokens=self.max_tokens,
153
+ thinking={"type": "disabled"},
154
+ response_format=Conversation,
155
+ safety_settings=SAFETY_SETTINGS,
156
+ )
157
+
158
+ conversation = Conversation.model_validate(
159
+ json.loads(res.choices[0].message.content)
160
+ )
161
+
162
+ return conversation
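
`Conversation` is a plain Pydantic model, which is what lets app.py cache it in `gr.State` and render `conversation.model_dump()` in the JSON panel. A tiny round-trip sketch, assuming the repository root is on the import path; the sample dialogue lines are invented:

```python
# Round-trip sketch for the Conversation schema; the sample dialogue lines are invented.
from src.agent import Conversation, Dialogue

conv = Conversation(
    conversation=[
        Dialogue(role="speaker", content="今日は新しい論文を紹介します。"),
        Dialogue(role="supporter", content="楽しみです!よろしくお願いします。"),
    ]
)

data = conv.model_dump()                      # dict shown in the Gradio JSON output
restored = Conversation.model_validate(data)  # rebuilt from the cached dict
assert restored == conv
```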
src/aivis.py ADDED
@@ -0,0 +1,174 @@
1
+ # ref: https://github.com/Aivis-Project/AivisSpeech-Engine/blob/master/run.py
2
+
3
+ import gc
4
+ import uvicorn
5
+ from pathlib import Path
6
+ import random
7
+ import aiohttp
8
+ import aiofiles
9
+
10
+ from voicevox_engine import __version__
11
+ from voicevox_engine.aivm_manager import AivmManager
12
+ from voicevox_engine.app.application import generate_app
13
+ from voicevox_engine.core.core_initializer import MOCK_VER, initialize_cores
14
+ from voicevox_engine.engine_manifest import load_manifest
15
+ from voicevox_engine.library.library_manager import LibraryManager
16
+ from voicevox_engine.logging import LOGGING_CONFIG, logger
17
+ from voicevox_engine.preset.preset_manager import PresetManager
18
+ from voicevox_engine.setting.model import CorsPolicyMode
19
+ from voicevox_engine.setting.setting_manager import USER_SETTING_PATH, SettingHandler
20
+ from voicevox_engine.tts_pipeline.song_engine import make_song_engines_from_cores
21
+ from voicevox_engine.tts_pipeline.tts_engine import TTSEngineManager
22
+ from voicevox_engine.user_dict.user_dict_manager import UserDictionary
23
+ from voicevox_engine.utility.path_utility import (
24
+ engine_root,
25
+ get_save_dir,
26
+ )
27
+ from voicevox_engine.utility.user_agent_utility import generate_user_agent
28
+
29
+
30
+ def start_aivis_speech() -> None:
31
+ """AivisSpeech Engine を実行する"""
32
+ try:
33
+ # multiprocessing.freeze_support()
34
+
35
+ # Important: cache this result as early as possible during startup
36
+ generate_user_agent("CPU")
37
+
38
+ logger.info(f"AivisSpeech Engine version {__version__}")
39
+ logger.info(f"Engine root directory: {engine_root()}")
40
+ logger.info(f"User data directory: {get_save_dir()}")
41
+
42
+ # Initialize the AivmManager
43
+ aivm_manager = AivmManager(get_save_dir() / "Models")
44
+
45
+ # In rare environments, importing style_bert_vits2_tts_engine.py (or the onnxruntime it depends on) can fail,
46
+ # so import it here at router initialization on purpose, so the exception can be caught and logged
47
+ from voicevox_engine.tts_pipeline.style_bert_vits2_tts_engine import (
48
+ StyleBertVITS2TTSEngine,
49
+ )
50
+
51
+ # Use AivisSpeech Engine's own StyleBertVITS2TTSEngine in place of the regular TTSEngine
52
+ tts_engines = TTSEngineManager()
53
+ tts_engines.register_engine(
54
+ StyleBertVITS2TTSEngine(aivm_manager, use_gpu=False, load_all_models=False),
55
+ MOCK_VER,
56
+ )
57
+
58
+ core_manager = initialize_cores(
59
+ use_gpu=False,
60
+ voicelib_dirs=None,
61
+ voicevox_dir=None,
62
+ runtime_dirs=None,
63
+ cpu_num_threads=16,
64
+ enable_mock=True,
65
+ load_all_models=False,
66
+ )
67
+ # tts_engines = make_tts_engines_from_cores(core_manager)
68
+ song_engines = make_song_engines_from_cores(core_manager)
69
+ # assert len(tts_engines.versions()) != 0, "音声合成エンジンがありません。"
70
+ assert len(song_engines.versions()) != 0, "音声合成エンジンがありません。"
71
+
72
+ setting_loader = SettingHandler(USER_SETTING_PATH)
73
+
74
+ # When a value can be set in multiple ways, precedence from highest to lowest is: arguments, environment variables, settings file, defaults
75
+
76
+ cors_policy_mode = CorsPolicyMode.all
77
+ allow_origin = ["*"]
78
+
79
+ preset_path = get_save_dir() / "presets.yaml"
80
+ preset_manager = PresetManager(preset_path)
81
+
82
+ user_dict = UserDictionary()
83
+
84
+ engine_manifest = load_manifest(Path("engine/engine_manifest.json"))
85
+
86
+ library_manager = LibraryManager(
87
+ # get_save_dir() / "installed_libraries",
88
+ # Pass get_save_dir() directly to keep LibraryManager (unused in AivisSpeech) from creating its own directory
89
+ get_save_dir(),
90
+ engine_manifest.supported_vvlib_manifest_version,
91
+ engine_manifest.brand_name,
92
+ engine_manifest.name,
93
+ engine_manifest.uuid,
94
+ )
95
+
96
+ root_dir = engine_root()
97
+ character_info_dir = root_dir / "resources" / "character_info"
98
+ # NOTE: keep backward compatibility with ENGINE v0.19 and earlier
99
+ if not character_info_dir.exists():
100
+ character_info_dir = root_dir / "speaker_info"
101
+
102
+ # Generate the ASGI-compliant AivisSpeech Engine application
103
+ app = generate_app(
104
+ tts_engines,
105
+ song_engines,
106
+ aivm_manager,
107
+ core_manager,
108
+ setting_loader,
109
+ preset_manager,
110
+ user_dict,
111
+ engine_manifest,
112
+ library_manager,
113
+ cancellable_engine=None,
114
+ character_info_dir=character_info_dir,
115
+ cors_policy_mode=cors_policy_mode,
116
+ allow_origin=allow_origin,
117
+ disable_mutable_api=False,
118
+ )
119
+
120
+ # Free memory that was only needed for startup
121
+ gc.collect()
122
+
123
+ # Start the AivisSpeech Engine server
124
+ # NOTE: defaults to an ASGI-compliant HTTP/1.1 server
125
+ uvicorn.run(app, host="127.0.0.1", port=10101, log_config=LOGGING_CONFIG)
126
+
127
+ except Exception as e:
128
+ logger.error("Unexpected error occurred during engine startup:", exc_info=e)
129
+ raise e
130
+
131
+
132
+ def random_str() -> str:
133
+ num = random.randint(10000, 99999)
134
+ return str(num)
135
+
136
+
137
+ async def download_model(model_url: str) -> None:
138
+ save_dir = get_save_dir() / "Models"
139
+
140
+ url = Path(model_url)
141
+ model_id = url.stem
142
+ model_path = save_dir / f"{model_id}.aivmx"
143
+
144
+ if model_path.exists():
145
+ logger.info(
146
+ f"Model {model_id} already exists at {model_path}. Skipping download."
147
+ )
148
+ return
149
+
150
+ download_url = f"https://api.aivis-project.com/v1/aivm-models/{model_id}/download?model_type=AIVMX"
151
+
152
+ logger.info("Downloading model from {download_url} to {model_path}...")
153
+
154
+ async with aiohttp.ClientSession() as session:
155
+ try:
156
+ async with session.get(
157
+ download_url,
158
+ ) as res:
159
+ res.raise_for_status()
160
+
161
+ # streaming download
162
+ async with aiofiles.open(model_path, "wb") as f:
163
+ async for chunk in res.content.iter_chunked(1024 * 1024):
164
+ await f.write(chunk)
165
+
166
+ logger.info(f"Model downloaded to {model_path}")
167
+
168
+ except Exception as e:
169
+ logger.error(f"Failed to download model: {e}")
170
+
171
+
172
+ if __name__ == "__main__":
173
+ # Start the AivisSpeech Engine
174
+ start_aivis_speech()
src/fetcher.py ADDED
@@ -0,0 +1,83 @@
1
+ import aiohttp
2
+ import io
3
+ from markitdown import MarkItDown
4
+
5
+
6
+ class PDFFetcher:
7
+ def __init__(self):
8
+ self.md = MarkItDown(enable_plugins=True)
9
+
10
+ def read_local(self, pdf_path: str) -> str:
11
+ result = self.md.convert(pdf_path)
12
+
13
+ markdown = self.postprocess(result.text_content)
14
+
15
+ return markdown
16
+
17
+ def postprocess(self, markdown: str) -> str:
18
+ pages = markdown.split("\f")
19
+ markdown = "\n".join(pages)
20
+ return markdown.strip()
21
+
22
+ async def fetch(self, pdf_url: str) -> str:
23
+ async with aiohttp.ClientSession() as session:
24
+ async with session.get(pdf_url) as res:
25
+ if res.status != 200:
26
+ raise Exception(f"Failed to download PDF: {res.status}")
27
+
28
+ pdf_content = await res.read()
29
+
30
+ markdown = self.md.convert_stream(io.BytesIO(pdf_content)).text_content
31
+
32
+ markdown = self.postprocess(markdown)
33
+
34
+ return markdown
35
+
36
+
37
+ class HTMLFetcher:
38
+ def __init__(self):
39
+ self.md = MarkItDown(enable_plugins=True)
40
+
41
+ async def fetch(self, html_url: str) -> str:
42
+ async with aiohttp.ClientSession() as session:
43
+ async with session.get(html_url) as res:
44
+ if res.status != 200:
45
+ raise Exception(f"Failed to download HTML: {res.status}")
46
+
47
+ data = await res.read()
48
+
49
+ markdown = self.md.convert_stream(io.BytesIO(data))
50
+
51
+ return markdown.text_content
52
+
53
+
54
+ class AutoFetcher:
55
+ def __init__(self):
56
+ self.pdf_fetcher = PDFFetcher()
57
+ self.html_fetcher = HTMLFetcher()
58
+
59
+ self.md = MarkItDown(enable_plugins=True)
60
+
61
+ async def fetch(self, url: str) -> str:
62
+ async with aiohttp.ClientSession() as session:
63
+ async with session.get(url) as res:
64
+ if res.status != 200:
65
+ raise Exception(f"Failed to download HTML: {res.status}")
66
+
67
+ data = await res.read()
68
+ content_type = res.headers.get(
69
+ "Content-Type",
70
+ res.headers.get("content-type", "text/plain"),
71
+ )
72
+
73
+ if "application/pdf" in content_type:
74
+ return self.pdf_fetcher.postprocess(
75
+ self.md.convert_stream(io.BytesIO(data)).text_content
76
+ )
77
+
78
+ elif "text/html" in content_type:
79
+ return self.md.convert_stream(io.BytesIO(data)).text_content
80
+
81
+ else:
82
+ # plain?
83
+ return self.md.convert_stream(io.BytesIO(data)).text_content
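
A minimal usage sketch for `AutoFetcher`, assuming it is run from the repository root; the URL is just the arXiv example already used in app.py:

```python
# Minimal sketch: fetch a URL and let AutoFetcher pick the PDF or HTML path by Content-Type.
import asyncio

from src.fetcher import AutoFetcher


async def demo() -> None:
    fetcher = AutoFetcher()
    markdown = await fetcher.fetch("https://arxiv.org/pdf/2308.06721")
    print(markdown[:500])  # preview the converted Markdown


if __name__ == "__main__":
    asyncio.run(demo())
```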
src/podcast.py ADDED
@@ -0,0 +1,107 @@
1
+ from tqdm import tqdm
2
+ import logging
3
+
4
+ from .agent import BloggerAgent, WriterAgent, StructureAgent, Conversation
5
+ from .fetcher import AutoFetcher
6
+ from .voicevox import VoiceVoxClient, SpeakerId, Audio
7
+
8
+
9
+ class PodcastStudio:
10
+ def __init__(self, api_key: str, logging_level: int = logging.INFO):
11
+ self.blogger = BloggerAgent(api_key=api_key)
12
+ self.writer = WriterAgent(api_key=api_key)
13
+ self.structure_agent = StructureAgent(api_key=api_key)
14
+
15
+ self.logger = logging.getLogger(__name__)
16
+ self.logger.setLevel(logging_level)
17
+
18
+ self.fetcher = AutoFetcher()
19
+
20
+ async def create_conversation(self, url: str) -> tuple[str, str, Conversation]:
21
+ self.logger.info(f"Fetching paper from {url}...")
22
+ paper = await self.fetcher.fetch(url)
23
+ self.logger.info("Paper fetched successfully.")
24
+ self.logger.debug(
25
+ f"Paper content: {paper[:100]}..."
26
+ ) # Log first 100 characters
27
+
28
+ self.logger.info("Creating blog from paper...")
29
+ blog = await self.blogger.task(paper)
30
+ self.logger.info("Blog created successfully.")
31
+ self.logger.debug(f"{blog[:100]}...") # Log first 100 characters
32
+
33
+ self.logger.info("Creating dialogue from blog...")
34
+ dialogue = await self.writer.task(paper, blog)
35
+ self.logger.info("Dialogue created successfully.")
36
+ self.logger.debug(f"{dialogue[:100]}...") # Log first 100 characters
37
+
38
+ self.logger.info("Structuring conversation from dialogue...")
39
+ conversation = await self.structure_agent.task(dialogue)
40
+ self.logger.info("Conversation structured successfully.")
41
+ for _d in conversation.conversation:
42
+ self.logger.debug(f"{_d.role}: {_d.content[:100]}...")
43
+
44
+ return blog, dialogue, conversation
45
+
46
+ async def record_podcast(
47
+ self,
48
+ conversation: Conversation,
49
+ voicevox_client: VoiceVoxClient,
50
+ speaker_id: SpeakerId,
51
+ supporter_id: SpeakerId,
52
+ ) -> Audio:
53
+ progress_bar = tqdm(
54
+ total=len(conversation.conversation),
55
+ desc="Synthesizing audio",
56
+ ncols=100,
57
+ )
58
+
59
+ async def _synthesis(
60
+ speaker_id: SpeakerId,
61
+ text: str,
62
+ index: int,
63
+ progress: tqdm,
64
+ ) -> tuple[int, Audio]:
65
+ audio_query = await voicevox_client.post_audio_query(
66
+ text=text,
67
+ speaker=speaker_id,
68
+ )
69
+ if audio_query.tempoDynamicsScale is not None:
70
+ audio_query.tempoDynamicsScale = 1.1
71
+ else:
72
+ audio_query.speedScale = 1.1
73
+
74
+ audio = await voicevox_client.post_synthesis(
75
+ speaker=speaker_id,
76
+ audio_query=audio_query,
77
+ )
78
+ progress.update(1)
79
+
80
+ progress.set_postfix({"text": text[:20] + "..."})
81
+
82
+ return index, audio
83
+
84
+ results = []
85
+ for i, dialogue in enumerate(conversation.conversation):
86
+ results.append(
87
+ await _synthesis(
88
+ speaker_id=(
89
+ speaker_id if dialogue.role == "speaker" else supporter_id
90
+ ),
91
+ text=dialogue.content,
92
+ index=i,
93
+ progress=progress_bar,
94
+ )
95
+ )
96
+ progress_bar.close()
97
+
98
+ # sort results by index
99
+ results.sort(key=lambda x: x[0])
100
+
101
+ audios = [audio for _, audio in results]
102
+
103
+ # connect audio files
104
+ podcast = await voicevox_client.post_connect_waves(
105
+ audio_list=audios,
106
+ )
107
+ return podcast
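
Taken together, these pieces compose roughly the way `generate_podcast` in app.py wires them up. A condensed sketch: the endpoint is the AivisSpeech address from app.py, while the API key and the two numeric style IDs are placeholders you would look up via `VoiceVoxClient.get_speakers()`:

```python
# Condensed sketch of the app.py flow; the API key and style IDs are placeholders.
import asyncio

from src.podcast import PodcastStudio
from src.voicevox import VoiceVoxClient


async def demo() -> None:
    client = VoiceVoxClient("http://127.0.0.1:10101")  # AivisSpeech endpoint used by app.py
    studio = PodcastStudio(api_key="YOUR_GEMINI_API_KEY")

    # LLM stage: fetch the source, write a blog post, script the dialogue, structure it
    blog, dialogue, conversation = await studio.create_conversation(
        "https://arxiv.org/pdf/2308.06721"
    )

    # TTS stage: synthesize each turn and concatenate the clips
    audio = await studio.record_podcast(
        conversation=conversation,
        voicevox_client=client,
        speaker_id=0,      # placeholder style IDs; pick real ones from get_speakers()
        supporter_id=1,
    )
    with open("podcast.wav", "wb") as f:
        f.write(audio.wav)


if __name__ == "__main__":
    asyncio.run(demo())
```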
src/voicevox.py ADDED
@@ -0,0 +1,125 @@
1
+ import aiohttp
2
+ from typing import Literal
3
+ from pydantic import BaseModel
4
+ import io
5
+ import base64
6
+
7
+ SpeakerId = int
8
+
9
+
10
+ class SpeakerStyle(BaseModel):
11
+ name: str
12
+ id: SpeakerId
13
+ type: Literal["talk"]
14
+
15
+
16
+ class Speaker(BaseModel):
17
+ name: str
18
+ speaker_uuid: str
19
+ styles: list[SpeakerStyle]
20
+ version: str
21
+
22
+
23
+ class AudioQuery(BaseModel):
24
+ accent_phrases: list[dict]
25
+ speedScale: float
26
+ intonationScale: float
27
+ tempoDynamicsScale: float | None = None
28
+ pitchScale: float
29
+ volumeScale: float
30
+ prePhonemeLength: float
31
+ postPhonemeLength: float
32
+ pauseLength: float | None
33
+ pauseLengthScale: float
34
+ outputSamplingRate: int
35
+ outputStereo: bool
36
+ kana: str
37
+
38
+
39
+ class Audio(BaseModel):
40
+ wav: bytes
41
+
42
+
43
+ class VoiceVoxClient:
44
+ endpoint: str
45
+
46
+ def __init__(self, endpoint: str = "http://127.0.0.1:50021"):
47
+ self.endpoint = endpoint
48
+
49
+ async def get_speakers(self) -> list[Speaker]:
50
+ async with aiohttp.ClientSession() as session:
51
+ async with session.get(f"{self.endpoint}/speakers") as response:
52
+ if response.status != 200:
53
+ raise Exception(f"Failed to get speakers: {response.status}")
54
+ return [
55
+ Speaker.model_validate(speaker) for speaker in await response.json()
56
+ ]
57
+
58
+ async def get_core_versions(self) -> list[str]:
59
+ async with aiohttp.ClientSession() as session:
60
+ async with session.get(f"{self.endpoint}/core_versions") as response:
61
+ if response.status != 200:
62
+ raise Exception(f"Failed to get core version: {response.status}")
63
+ return await response.json()
64
+
65
+ async def post_audio_query(
66
+ self,
67
+ text: str,
68
+ speaker: SpeakerId,
69
+ core_version: str | None = None,
70
+ ) -> AudioQuery:
71
+ async with aiohttp.ClientSession() as session:
72
+ params: dict[str, str | int | float] = {"text": text, "speaker": speaker}
73
+ if core_version:
74
+ params["core_version"] = core_version
75
+ async with session.post(
76
+ f"{self.endpoint}/audio_query",
77
+ params=params,
78
+ ) as res:
79
+ if res.status != 200:
80
+ raise Exception(f"Failed to post audio query: {res.status}")
81
+ json_data = await res.json()
82
+ return AudioQuery.model_validate(json_data)
83
+
84
+ async def post_synthesis(
85
+ self,
86
+ speaker: SpeakerId,
87
+ audio_query: AudioQuery,
88
+ enable_interrogative_upspeak: bool = True,
89
+ core_version: str | None = None,
90
+ ) -> Audio:
91
+ async with aiohttp.ClientSession() as session:
92
+ params: dict[str, str | int | float] = {
93
+ "speaker": speaker,
94
+ "enable_interrogative_upspeak": (
95
+ "true" if enable_interrogative_upspeak else "false"
96
+ ),
97
+ }
98
+ if core_version:
99
+ params["core_version"] = core_version
100
+ async with session.post(
101
+ f"{self.endpoint}/synthesis",
102
+ params=params,
103
+ json=audio_query.model_dump(),
104
+ ) as response:
105
+ if response.status != 200:
106
+ raise Exception(f"Failed to post synthesis: {response.status}")
107
+ wav = io.BytesIO(await response.read())
108
+ return Audio(wav=wav.getvalue())
109
+
110
+ async def post_connect_waves(
111
+ self,
112
+ audio_list: list[Audio],
113
+ ) -> Audio:
114
+ async with aiohttp.ClientSession() as session:
115
+ audio_data = [
116
+ base64.b64encode(audio.wav).decode("utf-8") for audio in audio_list
117
+ ]
118
+ async with session.post(
119
+ f"{self.endpoint}/connect_waves",
120
+ json=audio_data,
121
+ ) as response:
122
+ if response.status != 200:
123
+ raise Exception(f"Failed to connect waves: {response.status}")
124
+ wav = io.BytesIO(await response.read())
125
+ return Audio(wav=wav.getvalue())
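
A minimal audio_query -> synthesis round trip with `VoiceVoxClient`, mirroring `preview_speaker_voice` in app.py; the endpoint and the sample text are assumptions:

```python
# Minimal synthesis round trip; the endpoint and sample text are assumptions.
import asyncio

from src.voicevox import VoiceVoxClient


async def demo() -> None:
    client = VoiceVoxClient("http://127.0.0.1:10101")

    speakers = await client.get_speakers()
    style_id = speakers[0].styles[0].id  # first available style

    query = await client.post_audio_query(text="こんにちは、テスト音声です。", speaker=style_id)
    audio = await client.post_synthesis(speaker=style_id, audio_query=query)

    with open("preview.wav", "wb") as f:
        f.write(audio.wav)


if __name__ == "__main__":
    asyncio.run(demo())
```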
uv.lock ADDED
The diff for this file is too large to render.