#!/usr/bin/env python

import os
import shlex
import subprocess

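# On Hugging Face Spaces (where SPACE_ID is set), download the UniDic dictionary
# used by the Japanese tokenization stack before the pipeline is imported.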
if os.getenv("SPACE_ID"):
    subprocess.run(shlex.split("python -m unidic download"), timeout=100, check=True)  # noqa: S603

import gradio as gr
import numpy as np
import spaces
from kokoro import KPipeline

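# lang_code="j" selects Kokoro's Japanese pipeline.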
pipeline = KPipeline(lang_code="j")

VOICES = [
    "jf_alpha",
    "jf_gongitsune",
    "jf_nezumi",
    "jf_tebukuro",
    "jm_kumo",
]


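# @spaces.GPU requests ZeroGPU hardware for each call; duration is the expected
# maximum runtime in seconds.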
@spaces.GPU(duration=20)
def run(text: str, voice: str = "jf_alpha", speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
    """Synthesizes speech from Japanese text using the Kokoro TTS model.

    Note:
        This model supports only Japanese input text.

    Voice Selection:
        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
        `<language/accent><gender>_<voice_name>`
        - `<language/accent>`:
            - 'j' for Japanese
        - `<gender>`:
            - 'f' for female
            - 'm' for male
        - Example: 'jf_alpha' indicates a Japanese female voice named Alpha.

    Available voices:
        - jf_alpha
        - jf_gongitsune
        - jf_nezumi
        - jf_tebukuro
        - jm_kumo

    Args:
        text: Input text to be synthesized. Only Japanese text is supported. Non-Japanese input may result in errors or mispronunciations.
        voice: Identifier for the voice to be used in synthesis. Defaults to "jf_alpha".
        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.

    Returns:
        A tuple containing the audio as a (sample_rate, waveform) pair and the phoneme tokens used to generate it.
    """
    generator = pipeline(
        text,
        voice=voice,
        speed=speed,
        split_pattern=r"\n+",
    )
    _, ps, audio = next(generator)
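    # Kokoro generates 24 kHz audio; gr.Audio expects a (sample_rate, ndarray) tuple.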
    return (24000, audio.numpy()), ps


with gr.Blocks(css_paths="style.css") as demo:
    gr.Markdown("# Kokoro (ja)")
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text")
            voice = gr.Dropdown(label="Voice", choices=VOICES, value=VOICES[0])
            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
            run_button = gr.Button()
        with gr.Column():
            output_audio = gr.Audio(label="Output Audio", autoplay=True)
            output_tokens = gr.Textbox(label="Output Tokens")
    gr.Examples(
        examples=[
            "どうもこんにちは。今日はいい天気ですね。",
            "隣の竹垣に竹立てかけたのは隣の竹垣に竹立てかけたかったからと骨粗鬆症の東京特許許可局局長がマサチューセッツ州の美術室で魔術師が呪術師と手術中に供述。",
            "李も桃も桃のうち",
            "隣の客はよく柿食う客だ",
            "裏庭には二羽、庭には二羽、鶏がいる。",
            "貴社の記者が汽車で帰社した。橋の端にある箸。石でできた医師。お食事券にまつわる汚職事件。",
        ],
        inputs=input_text,
    )

    run_button.click(
        fn=run,
        inputs=[input_text, voice, speed],
        outputs=[output_audio, output_tokens],
    )

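# mcp_server=True also exposes the app's API endpoint (the run function) as an MCP tool.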
if __name__ == "__main__":
    demo.launch(mcp_server=True)