kemuriririn committed
Commit b8368df · 1 Parent(s): a692a02
cosyvoice/cli/cosyvoice.py CHANGED
@@ -63,13 +63,7 @@ class CosyVoice:
 
     @spaces.GPU
     def reload_frontend(self):
-        self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
-                                          self.configs['feat_extractor'],
-                                          '{}/campplus.onnx'.format(self.model_dir),
-                                          '{}/speech_tokenizer_v1.onnx'.format(self.model_dir),
-                                          '{}/spk2info.pt'.format(self.model_dir),
-                                          self.instruct,
-                                          self.configs['allowed_special'])
+        self.frontend.reload_onnx()
 
     @spaces.GPU
     def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
@@ -198,12 +192,4 @@ class CosyVoice2(CosyVoice):
             self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
         del configs
 
-    @spaces.GPU
-    def reload_frontend(self):
-        self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
-                                          self.configs['feat_extractor'],
-                                          '{}/campplus.onnx'.format(self.model_dir),
-                                          '{}/speech_tokenizer_v2.onnx'.format(self.model_dir),
-                                          '{}/spk2info.pt'.format(self.model_dir),
-                                          self.instruct,
-                                          self.configs['allowed_special'])
+
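With this change, reload_frontend() no longer rebuilds the whole CosyVoiceFrontEnd (and CosyVoice2 no longer overrides it); both classes now ask the existing frontend to re-create its ONNX Runtime sessions via reload_onnx(), added in frontend.py below. A minimal usage sketch for a Spaces ZeroGPU app follows; the model directory and the surrounding tts() function are hypothetical, not part of this commit:

import spaces  # HF Spaces ZeroGPU decorator, the same one used in the diff
from cosyvoice.cli.cosyvoice import CosyVoice

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')  # hypothetical model dir

@spaces.GPU
def tts(tts_text, spk_id):
    # Sessions built in the parent process may not survive the ZeroGPU
    # process switch, so they are rebuilt on entry before inference.
    cosyvoice.reload_frontend()
    return list(cosyvoice.inference_sft(tts_text, spk_id, stream=False))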
cosyvoice/cli/frontend.py CHANGED
@@ -51,11 +51,16 @@ class CosyVoiceFrontEnd:
         option = onnxruntime.SessionOptions()
         option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
         option.intra_op_num_threads = 1
+        self.campplus_model = campplus_model
+        self.option = option
+        self.speech_tokenizer_model = speech_tokenizer_model
         self.campplus_session = onnxruntime.InferenceSession(campplus_model, sess_options=option, providers=["CPUExecutionProvider"])
         print("load campplus model from {}".format(campplus_model))
+        # self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
+        #                                                              providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
+        #                                                                         "CPUExecutionProvider"])
         self.speech_tokenizer_session = onnxruntime.InferenceSession(speech_tokenizer_model, sess_options=option,
-                                                                     providers=["CUDAExecutionProvider" if torch.cuda.is_available() else
-                                                                                "CPUExecutionProvider"])
+                                                                     providers=["CPUExecutionProvider"])
         print("load speech-tokenizer model from {}".format(speech_tokenizer_model))
         if os.path.exists(spk2info):
             self.spk2info = torch.load(spk2info, map_location=self.device)
@@ -75,6 +80,11 @@ class CosyVoiceFrontEnd:
         self.zh_tn_model = ZhNormalizer(remove_erhua=False, full_to_half=False)
         self.en_tn_model = EnNormalizer()
 
+    def reload_onnx(self):
+        self.campplus_session = onnxruntime.InferenceSession(self.campplus_model, sess_options=self.option, providers=["CPUExecutionProvider"])
+        self.speech_tokenizer_session = onnxruntime.InferenceSession(self.speech_tokenizer_model, sess_options=self.option,
+                                                                     providers=["CPUExecutionProvider"])
+
     def _extract_text_token(self, text):
         text_token = self.tokenizer.encode(text, allowed_special=self.allowed_special)
         text_token = torch.tensor([text_token], dtype=torch.int32).to(self.device)
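For reference, a self-contained sketch of the session-rebuild pattern reload_onnx() relies on; the model path is a placeholder, and only the onnxruntime calls mirror the diff:

import onnxruntime

def make_session(model_path):
    option = onnxruntime.SessionOptions()
    option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    option.intra_op_num_threads = 1
    # Pinning to CPUExecutionProvider keeps the session independent of any
    # CUDA context that may be torn down between GPU invocations.
    return onnxruntime.InferenceSession(model_path, sess_options=option,
                                        providers=["CPUExecutionProvider"])

# Stashing the path and options (as the diff does with self.campplus_model,
# self.option, self.speech_tokenizer_model) makes a reload a one-liner:
session = make_session('speech_tokenizer_v2.onnx')  # placeholder path
session = make_session('speech_tokenizer_v2.onnx')  # fresh session after a device switch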