Spaces:
Runtime error
Runtime error
Commit
·
a692a02
1
Parent(s):
975e651
update
Browse files
- cosyvoice/cli/cosyvoice.py +31 -2
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -26,11 +26,13 @@ class CosyVoice:
|
|
26 |
@spaces.GPU
|
27 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
28 |
instruct = True if '-Instruct' in model_dir else False
|
|
|
29 |
self.model_dir = model_dir
|
30 |
if not os.path.exists(model_dir):
|
31 |
model_dir = snapshot_download(model_dir)
|
32 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
33 |
configs = load_hyperpyyaml(f)
|
|
|
34 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
35 |
configs['feat_extractor'],
|
36 |
'{}/campplus.onnx'.format(model_dir),
|
@@ -53,15 +55,25 @@ class CosyVoice:
|
|
53 |
'{}/flow.encoder.fp32.zip'.format(model_dir))
|
54 |
if load_onnx:
|
55 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
56 |
-
del configs
|
57 |
|
58 |
@spaces.GPU
|
59 |
def list_avaliable_spks(self):
|
60 |
spks = list(self.frontend.spk2info.keys())
|
61 |
return spks
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
@spaces.GPU
|
64 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
|
|
65 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
66 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
67 |
start_time = time.time()
|
@@ -74,6 +86,7 @@ class CosyVoice:
|
|
74 |
|
75 |
@spaces.GPU
|
76 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
77 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
78 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
79 |
if len(i) < 0.5 * len(prompt_text):
|
@@ -89,6 +102,7 @@ class CosyVoice:
|
|
89 |
|
90 |
@spaces.GPU
|
91 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
92 |
if self.frontend.instruct is True:
|
93 |
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
|
94 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
@@ -103,6 +117,7 @@ class CosyVoice:
|
|
103 |
|
104 |
@spaces.GPU
|
105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
|
|
106 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
107 |
if self.frontend.instruct is False:
|
108 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
@@ -119,6 +134,7 @@ class CosyVoice:
|
|
119 |
|
120 |
@spaces.GPU
|
121 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
122 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
123 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
124 |
start_time = time.time()
|
@@ -131,6 +147,7 @@ class CosyVoice:
|
|
131 |
|
132 |
@spaces.GPU
|
133 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
134 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
135 |
start_time = time.time()
|
136 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
@@ -143,11 +160,13 @@ class CosyVoice2(CosyVoice):
|
|
143 |
@spaces.GPU
|
144 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
145 |
instruct = True if '-Instruct' in model_dir else False
|
|
|
146 |
self.model_dir = model_dir
|
147 |
if not os.path.exists(model_dir):
|
148 |
model_dir = snapshot_download(model_dir)
|
149 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
150 |
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
|
|
151 |
# print(f"Loading configs:{configs}")
|
152 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
153 |
configs['feat_extractor'],
|
@@ -177,4 +196,14 @@ class CosyVoice2(CosyVoice):
|
|
177 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
178 |
if load_trt:
|
179 |
self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
|
180 |
-
del configs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
@spaces.GPU
|
27 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
28 |
instruct = True if '-Instruct' in model_dir else False
|
29 |
+
self.instruct = instruct
|
30 |
self.model_dir = model_dir
|
31 |
if not os.path.exists(model_dir):
|
32 |
model_dir = snapshot_download(model_dir)
|
33 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
34 |
configs = load_hyperpyyaml(f)
|
35 |
+
self.configs = configs
|
36 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
37 |
configs['feat_extractor'],
|
38 |
'{}/campplus.onnx'.format(model_dir),
|
|
|
55 |
'{}/flow.encoder.fp32.zip'.format(model_dir))
|
56 |
if load_onnx:
|
57 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
|
|
58 |
|
59 |
@spaces.GPU
|
60 |
def list_avaliable_spks(self):
|
61 |
spks = list(self.frontend.spk2info.keys())
|
62 |
return spks
|
63 |
|
64 |
+
@spaces.GPU
|
65 |
+
def reload_frontend(self):
|
66 |
+
self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
|
67 |
+
self.configs['feat_extractor'],
|
68 |
+
'{}/campplus.onnx'.format(self.model_dir),
|
69 |
+
'{}/speech_tokenizer_v1.onnx'.format(self.model_dir),
|
70 |
+
'{}/spk2info.pt'.format(self.model_dir),
|
71 |
+
self.instruct,
|
72 |
+
self.configs['allowed_special'])
|
73 |
+
|
74 |
@spaces.GPU
|
75 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
76 |
+
self.reload_frontend()
|
77 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
78 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
79 |
start_time = time.time()
|
|
|
86 |
|
87 |
@spaces.GPU
|
88 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
89 |
+
self.reload_frontend()
|
90 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
91 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
92 |
if len(i) < 0.5 * len(prompt_text):
|
|
|
102 |
|
103 |
@spaces.GPU
|
104 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
105 |
+
self.reload_frontend()
|
106 |
if self.frontend.instruct is True:
|
107 |
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
|
108 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
|
|
117 |
|
118 |
@spaces.GPU
|
119 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
120 |
+
self.reload_frontend()
|
121 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
122 |
if self.frontend.instruct is False:
|
123 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
|
|
134 |
|
135 |
@spaces.GPU
|
136 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
137 |
+
self.reload_frontend()
|
138 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
139 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
140 |
start_time = time.time()
|
|
|
147 |
|
148 |
@spaces.GPU
|
149 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
150 |
+
self.reload_frontend()
|
151 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
152 |
start_time = time.time()
|
153 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
|
|
160 |
@spaces.GPU
|
161 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
162 |
instruct = True if '-Instruct' in model_dir else False
|
163 |
+
self.instruct = instruct
|
164 |
self.model_dir = model_dir
|
165 |
if not os.path.exists(model_dir):
|
166 |
model_dir = snapshot_download(model_dir)
|
167 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
168 |
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
169 |
+
self.configs = configs
|
170 |
# print(f"Loading configs:{configs}")
|
171 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
172 |
configs['feat_extractor'],
|
|
|
196 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
197 |
if load_trt:
|
198 |
self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
|
199 |
+
del configs
|
200 |
+
|
201 |
+
@spaces.GPU
|
202 |
+
def reload_frontend(self):
|
203 |
+
self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
|
204 |
+
self.configs['feat_extractor'],
|
205 |
+
'{}/campplus.onnx'.format(self.model_dir),
|
206 |
+
'{}/speech_tokenizer_v2.onnx'.format(self.model_dir),
|
207 |
+
'{}/spk2info.pt'.format(self.model_dir),
|
208 |
+
self.instruct,
|
209 |
+
self.configs['allowed_special'])
|