hayas committed on
Commit
20c2334
·
1 Parent(s): ef2657b
Files changed (1) hide show
  1. app.py +19 -6
app.py CHANGED
@@ -25,9 +25,22 @@ VOICES = [
25
 
26
  @spaces.GPU(duration=20)
27
  def run(text: str, voice: str, speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
28
- """Generate audio from text using Kokoro.
29
 
30
- Available voices are:
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  - jf_alpha
32
  - jf_gongitsune
33
  - jf_nezumi
@@ -35,12 +48,12 @@ def run(text: str, voice: str, speed: float = 1.0) -> tuple[tuple[int, np.ndarra
35
  - jm_kumo
36
 
37
  Args:
38
- text (str): Text to generate audio from.
39
- voice (str): Voice to use.
40
- speed (float): Speed of the audio. Defaults to 1.0.
41
 
42
  Returns:
43
- tuple[tuple[int, np.ndarray], str]: Tuple of (sample rate, audio data) and the text.
44
  """
45
  generator = pipeline(
46
  text,
 
25
 
26
  @spaces.GPU(duration=20)
27
  def run(text: str, voice: str, speed: float = 1.0) -> tuple[tuple[int, np.ndarray], str]:
28
+ """Synthesizes speech from Japanese text using the Kokoro TTS model.
29
 
30
+ Note:
31
+ This model supports only Japanese input texts.
32
+
33
+ Voice Selection:
34
+ - The `voice` parameter specifies the speaker's characteristics and should follow the naming convention:
35
+ `<language/accent><gender>_<voice_name>`
36
+ - `<language/accent>`:
37
+ - 'j' for Japanese
38
+ - `<gender>`:
39
+ - 'f' for female
40
+ - 'm' for male
41
+ - Example: 'jf_alpha' indicates a Japanese female voice named Alpha.
42
+
43
+ Available voices:
44
  - jf_alpha
45
  - jf_gongitsune
46
  - jf_nezumi
 
48
  - jm_kumo
49
 
50
  Args:
51
+ text: Input text to be synthesized. Only Japanese text is supported. Non-Japanese input may result in errors or mispronunciations.
52
+ voice: Identifier of the voice to use for synthesis, chosen from the available voices in this docstring (e.g. "jf_alpha").
53
+ speed: Playback speed multiplier. A value of 1.0 means normal speed; values above or below adjust the speech rate accordingly. Defaults to 1.0.
54
 
55
  Returns:
56
+ A tuple containing the audio, as a (sample rate, audio data) pair, and the tokens used to generate the audio.
57
  """
58
  generator = pipeline(
59
  text,