kemuriririn committed on
Commit a692a02 · 1 Parent(s): 975e651
Files changed (1)
  1. cosyvoice/cli/cosyvoice.py +31 -2
cosyvoice/cli/cosyvoice.py CHANGED
@@ -26,11 +26,13 @@ class CosyVoice:
     @spaces.GPU
     def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
         instruct = True if '-Instruct' in model_dir else False
+        self.instruct = instruct
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f)
+        self.configs = configs
         self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                           configs['feat_extractor'],
                                           '{}/campplus.onnx'.format(model_dir),
@@ -53,15 +55,25 @@ class CosyVoice:
                                 '{}/flow.encoder.fp32.zip'.format(model_dir))
         if load_onnx:
             self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
-        del configs
 
     @spaces.GPU
     def list_avaliable_spks(self):
         spks = list(self.frontend.spk2info.keys())
         return spks
 
+    @spaces.GPU
+    def reload_frontend(self):
+        self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
+                                          self.configs['feat_extractor'],
+                                          '{}/campplus.onnx'.format(self.model_dir),
+                                          '{}/speech_tokenizer_v1.onnx'.format(self.model_dir),
+                                          '{}/spk2info.pt'.format(self.model_dir),
+                                          self.instruct,
+                                          self.configs['allowed_special'])
+
     @spaces.GPU
     def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
+        self.reload_frontend()
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_sft(i, spk_id)
             start_time = time.time()
@@ -74,6 +86,7 @@ class CosyVoice:
 
     @spaces.GPU
     def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
+        self.reload_frontend()
         prompt_text = self.frontend.text_normalize(prompt_text, split=False)
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             if len(i) < 0.5 * len(prompt_text):
@@ -89,6 +102,7 @@ class CosyVoice:
 
     @spaces.GPU
     def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
+        self.reload_frontend()
         if self.frontend.instruct is True:
             raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
@@ -103,6 +117,7 @@ class CosyVoice:
 
     @spaces.GPU
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
+        self.reload_frontend()
         assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
@@ -119,6 +134,7 @@ class CosyVoice:
 
     @spaces.GPU
     def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
+        self.reload_frontend()
         for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
             model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
             start_time = time.time()
@@ -131,6 +147,7 @@ class CosyVoice:
 
     @spaces.GPU
     def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
+        self.reload_frontend()
         model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
         start_time = time.time()
         for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
@@ -143,11 +160,13 @@ class CosyVoice2(CosyVoice):
     @spaces.GPU
     def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
         instruct = True if '-Instruct' in model_dir else False
+        self.instruct = instruct
         self.model_dir = model_dir
         if not os.path.exists(model_dir):
             model_dir = snapshot_download(model_dir)
         with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
             configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
+        self.configs = configs
         # print(f"Loading configs:{configs}")
         self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
                                           configs['feat_extractor'],
@@ -177,4 +196,14 @@ class CosyVoice2(CosyVoice):
             self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
         if load_trt:
             self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
-        del configs
+        del configs
+
+    @spaces.GPU
+    def reload_frontend(self):
+        self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
+                                          self.configs['feat_extractor'],
+                                          '{}/campplus.onnx'.format(self.model_dir),
+                                          '{}/speech_tokenizer_v2.onnx'.format(self.model_dir),
+                                          '{}/spk2info.pt'.format(self.model_dir),
+                                          self.instruct,
+                                          self.configs['allowed_special'])
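
A minimal usage sketch of the patched class, assuming the upstream CosyVoice README pattern: the pretrained-model path is a placeholder, and the assumption that each yielded model_output carries a 'tts_speech' tensor comes from that README rather than from this diff.

import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice

# Placeholder model directory; snapshot_download() fetches it when the local path is missing.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')

# Pick any speaker shipped with the model (the method name is spelled this way upstream).
spk_id = cosyvoice.list_avaliable_spks()[0]

# Each inference_* call now runs self.reload_frontend() first, so every request
# starts from a freshly built CosyVoiceFrontEnd inside the @spaces.GPU context.
for idx, output in enumerate(cosyvoice.inference_sft('Hello, this is a test.', spk_id, stream=False)):
    torchaudio.save('sft_{}.wav'.format(idx), output['tts_speech'], cosyvoice.sample_rate)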