# PodcastVox / src/voicevox.py
# (repository-viewer residue kept for provenance: Plat · init · 3a09141)
import aiohttp
from typing import Literal
from pydantic import BaseModel
import io
import base64
# Integer identifier used both as a style's ``id`` and as the ``speaker``
# query parameter sent to the engine; kept as an alias for readability.
SpeakerId = int
class SpeakerStyle(BaseModel):
    """One voice style belonging to a :class:`Speaker`.

    Instances are built while validating the ``styles`` list of a speaker
    entry; only styles whose ``type`` is exactly ``"talk"`` validate.
    """

    # Human-readable style name.
    name: str
    # Integer identifier of the style (same alias the client uses for the
    # ``speaker`` request parameter).
    id: SpeakerId
    # Style category; restricted to the single literal value "talk".
    type: Literal["talk"]
class Speaker(BaseModel):
    """A speaker entry as validated from the ``/speakers`` response.

    Each element of the JSON array returned by that endpoint is validated
    into one instance of this model.
    """

    # Display name of the speaker.
    name: str
    # Identifier string for the speaker, as supplied in the response.
    speaker_uuid: str
    # The styles offered by this speaker.
    styles: list[SpeakerStyle]
    # Version string reported for this speaker entry.
    version: str
class AudioQuery(BaseModel):
    """Synthesis parameters exchanged with the engine.

    The model is validated from the JSON body returned by ``/audio_query``
    and dumped back as the JSON body of ``/synthesis``; field names are
    camelCase because they are used verbatim as the JSON keys.

    ``pauseLength`` and ``pauseLengthScale`` carry defaults so that a
    response which omits those keys still validates.
    """

    # Accent phrases are kept as opaque dicts and passed through unchanged.
    accent_phrases: list[dict]
    speedScale: float
    intonationScale: float
    # Optional; absent from the response unless the engine provides it.
    # NOTE(review): presumably specific to some engine variants — confirm.
    tempoDynamicsScale: float | None = None
    pitchScale: float
    volumeScale: float
    prePhonemeLength: float
    postPhonemeLength: float
    # A nullable annotation alone still makes the key *required* under
    # pydantic v2; the explicit default makes it genuinely optional.
    pauseLength: float | None = None
    # Defaults to 1.0 when the key is missing.
    # NOTE(review): 1.0 mirrors the engine schema's default — confirm against
    # the engine version being targeted.
    pauseLengthScale: float = 1.0
    outputSamplingRate: int
    outputStereo: bool
    kana: str
class Audio(BaseModel):
    """Container for one piece of audio returned by the engine."""

    # Raw response body of ``/synthesis`` or ``/connect_waves``.
    wav: bytes
class VoiceVoxClient:
    """Small async HTTP client for the engine reachable at ``endpoint``.

    Every public method opens its own short-lived ``aiohttp.ClientSession``,
    performs a single request and raises ``Exception`` (message
    ``"Failed to ...: <status>"``) when the response status is not 200.
    """

    endpoint: str

    def __init__(self, endpoint: str = "http://127.0.0.1:50021"):
        """Store the engine base URL unchanged on ``self.endpoint``."""
        self.endpoint = endpoint

    def _url(self, path: str) -> str:
        """Join ``path`` onto the base URL, tolerating a trailing slash on it."""
        return f"{self.endpoint.rstrip('/')}/{path}"

    @staticmethod
    def _ensure_ok(status: int, failure: str) -> None:
        """Raise ``Exception(f"{failure}: {status}")`` unless ``status`` is 200."""
        if status != 200:
            raise Exception(f"{failure}: {status}")

    async def get_speakers(self) -> list[Speaker]:
        """Fetch ``/speakers`` and validate every entry into a ``Speaker``.

        Raises:
            Exception: if the response status is not 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self._url("speakers")) as response:
                self._ensure_ok(response.status, "Failed to get speakers")
                payload = await response.json()
                return [Speaker.model_validate(entry) for entry in payload]

    async def get_core_versions(self) -> list[str]:
        """Fetch ``/core_versions`` and return the decoded JSON body as-is.

        Raises:
            Exception: if the response status is not 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self._url("core_versions")) as response:
                self._ensure_ok(response.status, "Failed to get core version")
                return await response.json()

    async def post_audio_query(
        self,
        text: str,
        speaker: SpeakerId,
        core_version: str | None = None,
    ) -> AudioQuery:
        """POST ``/audio_query`` for ``text`` and validate the reply.

        Args:
            text: Sent as the ``text`` query parameter.
            speaker: Sent as the ``speaker`` query parameter.
            core_version: Sent as ``core_version`` only when truthy.

        Returns:
            The response JSON validated into an ``AudioQuery``.

        Raises:
            Exception: if the response status is not 200.
        """
        params: dict[str, str | int | float] = {"text": text, "speaker": speaker}
        if core_version:
            params["core_version"] = core_version
        async with aiohttp.ClientSession() as session:
            async with session.post(self._url("audio_query"), params=params) as res:
                self._ensure_ok(res.status, "Failed to post audio query")
                return AudioQuery.model_validate(await res.json())

    async def post_synthesis(
        self,
        speaker: SpeakerId,
        audio_query: AudioQuery,
        enable_interrogative_upspeak: bool = True,
        core_version: str | None = None,
    ) -> Audio:
        """POST ``/synthesis`` with ``audio_query`` as the JSON body.

        Args:
            speaker: Sent as the ``speaker`` query parameter.
            audio_query: Dumped with ``model_dump()`` and sent as JSON.
            enable_interrogative_upspeak: Sent as the string ``"true"`` or
                ``"false"`` in the query string.
            core_version: Sent as ``core_version`` only when truthy.

        Returns:
            An ``Audio`` wrapping the raw response body.

        Raises:
            Exception: if the response status is not 200.
        """
        params: dict[str, str | int | float] = {
            "speaker": speaker,
            # The flag is spelled out as text because the params mapping is
            # typed to hold only str/int/float values.
            "enable_interrogative_upspeak": (
                "true" if enable_interrogative_upspeak else "false"
            ),
        }
        if core_version:
            params["core_version"] = core_version
        async with aiohttp.ClientSession() as session:
            async with session.post(
                self._url("synthesis"),
                params=params,
                json=audio_query.model_dump(),
            ) as response:
                self._ensure_ok(response.status, "Failed to post synthesis")
                # read() already yields bytes; no intermediate buffer needed.
                return Audio(wav=await response.read())

    async def post_connect_waves(
        self,
        audio_list: list[Audio],
    ) -> Audio:
        """POST ``/connect_waves`` with the given clips and return the result.

        Args:
            audio_list: Clips whose ``wav`` bytes are base64-encoded and sent
                as a JSON array of strings, in the given order.

        Returns:
            An ``Audio`` wrapping the raw response body.

        Raises:
            Exception: if the response status is not 200.
        """
        encoded = [
            base64.b64encode(audio.wav).decode("utf-8") for audio in audio_list
        ]
        async with aiohttp.ClientSession() as session:
            async with session.post(
                self._url("connect_waves"),
                json=encoded,
            ) as response:
                self._ensure_ok(response.status, "Failed to connect waves")
                # read() already yields bytes; no intermediate buffer needed.
                return Audio(wav=await response.read())