Commit ba791a8 · 1 Parent(s): 9eda15b
update
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -55,10 +55,12 @@ class CosyVoice:
         self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
         del configs
 
+    @spaces.GPU
     def list_avaliable_spks(self):
         spks = list(self.frontend.spk2info.keys())
         return spks
 
+    @spaces.GPU
     def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_sft(i, spk_id)
@@ -70,6 +72,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
+    @spaces.GPU
     def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
@@ -84,6 +87,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
+    @spaces.GPU
     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
         if self.frontend.instruct is True:
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
@@ -97,6 +101,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
+    @spaces.GPU
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
@@ -111,6 +116,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
+    @spaces.GPU
     def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
@@ -122,6 +128,7 @@ class CosyVoice:
             yield model_output
             start_time = time.time()
 
+    @spaces.GPU
     def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
         model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
         start_time = time.time()
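
Context for the change, as a hedged note: the only substantive edit in this commit is adding the @spaces.GPU decorator to the speaker-listing and inference entry points. spaces.GPU comes from the Hugging Face `spaces` package used by ZeroGPU Spaces; each decorated call requests a GPU for its duration and releases it when the call returns. The matching `import spaces` is not visible in these hunks and is assumed to exist elsewhere in the file. A minimal sketch of the pattern, assuming a ZeroGPU Space with the `spaces` package installed (the function name and body below are illustrative, not part of this commit):

import spaces

@spaces.GPU  # a GPU is attached only while this decorated call is running
def synthesize(text):
    # Illustrative placeholder; the real inference_* methods in cosyvoice.py
    # are generators that yield model_output dicts, and the decorator also
    # works on generator functions.
    ...

The decorator can also be parameterized, e.g. @spaces.GPU(duration=120), to extend the allocation window for longer syntheses.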