# Spaces: p1atdev / PodcastVox (Running)
# File size: 4,099 Bytes
# 3a09141

import aiohttp
from typing import Literal
from pydantic import BaseModel
import io
import base64

# Alias for the integer id type used by SpeakerStyle.id and by the `speaker`
# argument of VoiceVoxClient.post_audio_query / post_synthesis.
SpeakerId = int


class SpeakerStyle(BaseModel):
    """One entry of a :class:`Speaker`'s ``styles`` list."""

    # Name of the style.
    name: str
    # Integer id of the style.
    # NOTE(review): presumably the value to pass as the `speaker` argument of
    # the client methods (both are typed SpeakerId) -- confirm against the API.
    id: SpeakerId
    # Only the literal "talk" is accepted; any other value fails validation.
    type: Literal["talk"]


class Speaker(BaseModel):
    """One element of the JSON list returned by the ``/speakers`` endpoint.

    ``VoiceVoxClient.get_speakers`` validates every element of that response
    into this model.
    """

    # Name of the speaker.
    name: str
    # Identifier of the speaker, kept as a plain string (not parsed as a UUID).
    speaker_uuid: str
    # Styles offered by this speaker; each one carries its own integer id.
    styles: list[SpeakerStyle]
    # Version string reported for the speaker.
    version: str


class AudioQuery(BaseModel):
    """Synthesis query returned by ``/audio_query`` and posted to ``/synthesis``.

    ``VoiceVoxClient.post_audio_query`` validates the endpoint's JSON response
    into this model, and ``VoiceVoxClient.post_synthesis`` sends
    ``model_dump()`` of it back as the JSON request body. Field names are
    camelCase so they line up with the JSON keys.
    """

    # Kept as untyped dicts; their inner structure is not validated here.
    accent_phrases: list[dict]
    speedScale: float
    intonationScale: float
    # Optional: may be absent from the response, in which case it is None.
    # NOTE(review): presumably engine-specific -- confirm which engines send it.
    tempoDynamicsScale: float | None = None
    pitchScale: float
    volumeScale: float
    prePhonemeLength: float
    postPhonemeLength: float
    # Nullable but required: with no default, the key must be present in the
    # response (its value may be null), unlike tempoDynamicsScale above.
    pauseLength: float | None
    pauseLengthScale: float
    outputSamplingRate: int
    outputStereo: bool
    kana: str


class Audio(BaseModel):
    """Audio payload returned by ``/synthesis`` and ``/connect_waves``."""

    # Raw response body exactly as received from the server.
    # NOTE(review): presumably WAV-encoded, going by the field name -- confirm.
    wav: bytes


class VoiceVoxClient:
    """Small async HTTP client for a VOICEVOX-style speech engine.

    Every public method opens a short-lived ``aiohttp.ClientSession``, sends a
    single request to ``endpoint`` and raises ``Exception`` when the response
    status is anything other than 200.
    """

    endpoint: str

    def __init__(self, endpoint: str = "http://127.0.0.1:50021"):
        """Store the base URL of the engine.

        Args:
            endpoint: Base URL of the engine, with or without a trailing slash.
        """
        self.endpoint = endpoint

    def _url(self, path: str) -> str:
        """Join ``path`` onto the base URL with exactly one separating slash."""
        # An endpoint configured as "http://host:50021/" would otherwise
        # produce "//speakers"-style paths, which the server may not route.
        return f"{self.endpoint.rstrip('/')}/{path}"

    @staticmethod
    def _ensure_ok(response: "aiohttp.ClientResponse", action: str) -> None:
        """Raise ``Exception`` unless ``response`` has status 200."""
        if response.status != 200:
            raise Exception(f"Failed to {action}: {response.status}")

    async def get_speakers(self) -> list[Speaker]:
        """Fetch ``/speakers`` and validate each element as a ``Speaker``.

        Returns:
            The speakers in the order the server listed them.

        Raises:
            Exception: If the response status is not 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self._url("speakers")) as response:
                self._ensure_ok(response, "get speakers")
                payload = await response.json()
                return [Speaker.model_validate(item) for item in payload]

    async def get_core_versions(self) -> list[str]:
        """Fetch ``/core_versions`` and return the decoded JSON body as-is.

        Raises:
            Exception: If the response status is not 200.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self._url("core_versions")) as response:
                self._ensure_ok(response, "get core version")
                return await response.json()

    async def post_audio_query(
        self,
        text: str,
        speaker: SpeakerId,
        core_version: str | None = None,
    ) -> AudioQuery:
        """POST to ``/audio_query`` and validate the reply as an ``AudioQuery``.

        Args:
            text: Sent as the ``text`` query parameter.
            speaker: Sent as the ``speaker`` query parameter.
            core_version: Sent as ``core_version`` only when truthy.

        Raises:
            Exception: If the response status is not 200.
        """
        params: dict[str, str | int | float] = {"text": text, "speaker": speaker}
        if core_version:
            params["core_version"] = core_version
        async with aiohttp.ClientSession() as session:
            async with session.post(
                self._url("audio_query"),
                params=params,
            ) as response:
                self._ensure_ok(response, "post audio query")
                return AudioQuery.model_validate(await response.json())

    async def post_synthesis(
        self,
        speaker: SpeakerId,
        audio_query: AudioQuery,
        enable_interrogative_upspeak: bool = True,
        core_version: str | None = None,
    ) -> Audio:
        """POST an ``AudioQuery`` to ``/synthesis`` and return the audio bytes.

        Args:
            speaker: Sent as the ``speaker`` query parameter.
            audio_query: Dumped with ``model_dump()`` and sent as the JSON body.
            enable_interrogative_upspeak: Sent as the string ``"true"`` or
                ``"false"`` in the query string.
            core_version: Sent as ``core_version`` only when truthy.

        Returns:
            An ``Audio`` wrapping the raw response body.

        Raises:
            Exception: If the response status is not 200.
        """
        params: dict[str, str | int | float] = {
            "speaker": speaker,
            # Spelled out as a string: query values must be str/int/float,
            # a Python bool is not accepted there.
            "enable_interrogative_upspeak": (
                "true" if enable_interrogative_upspeak else "false"
            ),
        }
        if core_version:
            params["core_version"] = core_version
        async with aiohttp.ClientSession() as session:
            async with session.post(
                self._url("synthesis"),
                params=params,
                json=audio_query.model_dump(),
            ) as response:
                self._ensure_ok(response, "post synthesis")
                # read() already yields bytes; no intermediate buffer needed.
                return Audio(wav=await response.read())

    async def post_connect_waves(
        self,
        audio_list: list[Audio],
    ) -> Audio:
        """POST several clips to ``/connect_waves`` and return the reply body.

        Args:
            audio_list: Clips to send, in order. Each clip's bytes are
                base64-encoded and the resulting strings are sent as a JSON
                array.

        Returns:
            An ``Audio`` wrapping the raw response body.

        Raises:
            Exception: If the response status is not 200.
        """
        encoded_clips = [
            base64.b64encode(audio.wav).decode("utf-8") for audio in audio_list
        ]
        async with aiohttp.ClientSession() as session:
            async with session.post(
                self._url("connect_waves"),
                json=encoded_clips,
            ) as response:
                self._ensure_ok(response, "connect waves")
                # read() already yields bytes; no intermediate buffer needed.
                return Audio(wav=await response.read())