Spaces:

hayas
/

kokoro-ja

Running on Zero

App Files Files Community

kokoro-ja / app.py

hayas

Update

20c2334 8 days ago

raw

history blame contribute delete

3.33 kB

	#!/usr/bin/env python

	import os
	import shlex
	import subprocess

	# When SPACE_ID is set (presumably only inside a Hugging Face Space), fetch
	# the UniDic dictionary before the imports further down run. A non-zero exit
	# status or a download taking longer than 100 seconds aborts start-up.
	if os.environ.get("SPACE_ID"):
	    subprocess.run(  # noqa: S603
	        shlex.split("python -m unidic download"),
	        check=True,
	        timeout=100,
	    )

	import gradio as gr
	import numpy as np
	import spaces
	from kokoro import KPipeline

	# Module-level Kokoro pipeline, built once at import time and reused by every
	# call to `run`. The "j" language code matches the 'j' (Japanese) prefix of
	# the voice names documented in `run`.
	pipeline = KPipeline(lang_code="j")

	# Voice identifiers offered in the "Voice" dropdown. Order matters: the first
	# entry is the dropdown's initial value.
	VOICES = [
	    "jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro",  # 'jf_' = Japanese female
	    "jm_kumo",  # 'jm_' = Japanese male
	]


	@spaces.GPU(duration=20)
	def run(text: str, voice: str, speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
	    """Synthesizes speech from Japanese text using the Kokoro TTS model.

	    Note:
	        This model supports only Japanese input texts.

	        Only the first segment yielded by the pipeline is synthesized. The
	        pipeline is asked to split the text on line breaks, so text after the
	        first line break is presumably not spoken.

	    Voice Selection:
	        - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
	          `<language/accent><gender>_<voice_name>`
	            - `<language/accent>`:
	                - 'j' for Japanese
	            - `<gender>`:
	                - 'f' for female
	                - 'm' for male
	        - Example: 'jf_alpha' indicates a Japanese female voice named Alpha.

	    Available voices:
	        - jf_alpha
	        - jf_gongitsune
	        - jf_nezumi
	        - jf_tebukuro
	        - jm_kumo

	    Args:
	        text: Input text to be synthesized. Only Japanese text is supported. Non-Japanese input may result in errors or mispronunciations.
	        voice: Identifier for the voice to be used in synthesis, e.g. "jf_alpha". Required; choose one of the available voices listed above.
	        speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.

	    Returns:
	        A tuple `((sample_rate, waveform), tokens)`: the audio as a fixed sample rate of 24000 plus the waveform as a NumPy array, and the tokens used to generate the audio.

	    Raises:
	        gr.Error: If the pipeline produces no output for `text` (for example when the text is empty).
	    """
	    generator = pipeline(
	        text,
	        voice=voice,
	        speed=speed,
	        split_pattern=r"\n+",
	    )
	    # A bare next() would leak StopIteration out of this handler when the
	    # pipeline yields nothing; report a readable error to the user instead.
	    first_segment = next(generator, None)
	    if first_segment is None:
	        raise gr.Error("No speech could be generated. Please enter some Japanese text.")
	    _, ps, audio = first_segment
	    return (24000, audio.numpy()), ps


	# Gradio UI: inputs in the left column, synthesized audio and tokens on the right.
	# NOTE(review): nesting was reconstructed from a listing that had lost its
	# indentation; confirm that the examples and the click handler belong at the
	# Blocks level rather than inside the row or a column.
	with gr.Blocks(css_paths="style.css") as demo:
	    gr.Markdown("# Kokoro (ja)")

	    with gr.Row():
	        # Input side: text, voice choice, speed and the trigger button.
	        with gr.Column():
	            input_text = gr.Textbox(label="Input Text")
	            voice = gr.Dropdown(label="Voice", choices=VOICES, value=VOICES[0])
	            speed = gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0)
	            run_button = gr.Button()

	        # Output side: the generated audio (auto-played) and the tokens behind it.
	        with gr.Column():
	            output_audio = gr.Audio(label="Output Audio", autoplay=True)
	            output_tokens = gr.Textbox(label="Output Tokens")

	    # Sample sentences that fill the input textbox when selected.
	    gr.Examples(
	        examples=[
	            "どうもこんにちは。今日はいい天気ですね。",
	            "隣の竹垣に竹立てかけたのは隣の竹垣に竹立てかけたかったからと骨粗鬆症の東京特許許可局局長がマサチューセッツ州の美術室で魔術師が呪術師と手術中に供述。",
	            "李も桃も桃のうち",
	            "隣の客はよく柿食う客だ",
	            "裏庭には二羽、庭には二羽、鶏がいる。",
	            "貴社の記者が汽車で帰社した。橋の端にある箸。石でできた医師。お食事券にまつわる汚職事件。",
	        ],
	        inputs=input_text,
	    )

	    # Clicking the button runs synthesis with the three inputs and fills both outputs.
	    run_button.click(
	        fn=run,
	        inputs=[input_text, voice, speed],
	        outputs=[output_audio, output_tokens],
	    )

	# Start the app only when this file is executed as a script; the
	# mcp_server flag is passed through to Gradio's launch().
	if __name__ == "__main__":
	    demo.launch(mcp_server=True)