kemuriririn committed on
Commit
4172058
·
1 Parent(s): 577fef3

(wip)debug

Browse files
Files changed (2) hide show
  1. models.py +9 -1
  2. tts.py +54 -3
models.py CHANGED
@@ -438,9 +438,17 @@ def insert_initial_models():
438
  name="Spark TTS",
439
  model_type=ModelType.TTS,
440
  is_open=False,
441
- is_active=True, # API stopped working
442
  model_url="https://github.com/SparkAudio/Spark-TTS",
443
  ),
 
 
 
 
 
 
 
 
444
  # Model(
445
  # id="playht-2.0",
446
  # name="PlayHT 2.0",
 
438
  name="Spark TTS",
439
  model_type=ModelType.TTS,
440
  is_open=False,
441
+ is_active=False, # API stopped working
442
  model_url="https://github.com/SparkAudio/Spark-TTS",
443
  ),
444
+ Model(
445
+ id="maskgct",
446
+ name="maskgct",
447
+ model_type=ModelType.TTS,
448
+ is_open=False,
449
+ is_active=True,
450
+ model_url="https://github.com/open-mmlab/Amphion/tree/main/models/tts/maskgct",
451
+ ),
452
  # Model(
453
  # id="playht-2.0",
454
  # name="PlayHT 2.0",
tts.py CHANGED
@@ -85,6 +85,14 @@ model_mapping = {
85
  "provider": "bilibili",
86
  "model": "index-tts",
87
  },
 
 
 
 
 
 
 
 
88
  }
89
  url = "https://tts-agi-tts-router-v2.hf.space/tts"
90
  headers = {
@@ -230,15 +238,22 @@ def predict_spark_tts(text, reference_audio_path=None):
230
 
231
 
232
  def predict_cosyvoice_tts(text, reference_audio_path=None):
233
- from gradio_client import Client, file
234
  client = Client("https://iic-cosyvoice2-0-5b.ms.show/")
235
  if not reference_audio_path:
236
  raise ValueError("cosyvoice-2.0 ιœ€θ¦ reference_audio_path")
237
  prompt_wav = handle_file(reference_audio_path)
 
 
 
 
 
 
 
238
  result = client.predict(
239
  tts_text=text,
240
  mode_checkbox_group="3sζžι€Ÿε€εˆ»",
241
- prompt_text="",
242
  prompt_wav_upload=prompt_wav,
243
  prompt_wav_record=prompt_wav,
244
  instruct_text="",
@@ -246,7 +261,39 @@ def predict_cosyvoice_tts(text, reference_audio_path=None):
246
  api_name="/generate_audio"
247
  )
248
  print("cosyvoice-2.0 result:", result)
 
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  return result
251
 
252
 
@@ -266,6 +313,10 @@ def predict_tts(text, model, reference_audio_path=None):
266
  return predict_spark_tts(text, reference_audio_path)
267
  elif model == "cosyvoice-2.0":
268
  return predict_cosyvoice_tts(text, reference_audio_path)
 
 
 
 
269
 
270
  if not model in model_mapping:
271
  raise ValueError(f"Model {model} not found")
@@ -276,7 +327,7 @@ def predict_tts(text, model, reference_audio_path=None):
276
  "provider": model_mapping[model]["provider"],
277
  "model": model_mapping[model]["model"],
278
  }
279
- # δ»…ε―Ήζ”―ζŒιŸ³θ‰²ε…‹ιš†ηš„ζ¨‘εž‹δΌ ι€’ε‚θ€ƒιŸ³θ‰²
280
  supports_reference = model in [
281
  "styletts2", "eleven-multilingual-v2", "eleven-turbo-v2.5", "eleven-flash-v2.5"
282
  ]
 
85
  "provider": "bilibili",
86
  "model": "index-tts",
87
  },
88
+ "step-audio-tts-3b": {
89
+ "provider": "swarmeta_ai",
90
+ "model": "step-audio-tts-3b",
91
+ },
92
+ "maskgct": {
93
+ "provider": "amphion",
94
+ "model": "maskgct",
95
+ },
96
  }
97
  url = "https://tts-agi-tts-router-v2.hf.space/tts"
98
  headers = {
 
238
 
239
 
240
  def predict_cosyvoice_tts(text, reference_audio_path=None):
241
+ from gradio_client import Client, file, handle_file
242
  client = Client("https://iic-cosyvoice2-0-5b.ms.show/")
243
  if not reference_audio_path:
244
  raise ValueError("cosyvoice-2.0 ιœ€θ¦ reference_audio_path")
245
  prompt_wav = handle_file(reference_audio_path)
246
+ # ε…ˆθ―†εˆ«ε‚θ€ƒιŸ³ι’‘ζ–‡ζœ¬
247
+ recog_result = client.predict(
248
+ prompt_wav=file(reference_audio_path),
249
+ api_name="/prompt_wav_recognition"
250
+ )
251
+ print("cosyvoice-2.0 prompt_wav_recognition result:", recog_result)
252
+ prompt_text = recog_result if isinstance(recog_result, str) else str(recog_result)
253
  result = client.predict(
254
  tts_text=text,
255
  mode_checkbox_group="3sζžι€Ÿε€εˆ»",
256
+ prompt_text=prompt_text,
257
  prompt_wav_upload=prompt_wav,
258
  prompt_wav_record=prompt_wav,
259
  instruct_text="",
 
261
  api_name="/generate_audio"
262
  )
263
  print("cosyvoice-2.0 result:", result)
264
+ return result
265
 
266
+
267
def predict_step_audio_tts_3b(text, reference_audio_path=None):
    """Clone a reference voice through the hosted Step-Audio-TTS-3B Gradio app.

    Args:
        text: Text to synthesize.
        reference_audio_path: Path of the reference audio to clone. Required
            even though it defaults to ``None``; the default only keeps the
            signature parallel to the other ``predict_*`` helpers.

    Returns:
        Whatever the ``/generate_clone`` endpoint returns, passed through
        unchanged.

    Raises:
        ValueError: If ``reference_audio_path`` is missing or empty.
    """
    # Validate before importing gradio_client or building the Client, so an
    # invalid call fails immediately instead of after that setup work.
    if not reference_audio_path:
        raise ValueError("step-audio-tts-3b 需要 reference_audio_path")

    # Imported locally, as in the neighbouring predict_* helpers.
    from gradio_client import Client, handle_file

    client = Client("https://swarmeta-ai-step-audio-tts-3b.ms.show/")
    # handle_file replaces the deprecated gradio_client.file helper.
    prompt_audio = handle_file(reference_audio_path)
    result = client.predict(
        text=text,
        prompt_audio=prompt_audio,
        prompt_text="",
        api_name="/generate_clone",
    )
    print("step-audio-tts-3b result:", result)
    return result
281
+
282
+
283
def predict_maskgct(text, reference_audio_path=None):
    """Clone a reference voice through the hosted MaskGCT Gradio app.

    Args:
        text: Target text to synthesize.
        reference_audio_path: Path of the reference audio used as the prompt.
            Required even though it defaults to ``None``; the default only
            keeps the signature parallel to the other ``predict_*`` helpers.

    Returns:
        Whatever the ``/predict`` endpoint returns, passed through unchanged.

    Raises:
        ValueError: If ``reference_audio_path`` is missing or empty.
    """
    # Validate before importing gradio_client or building the Client, so an
    # invalid call fails immediately instead of after that setup work.
    if not reference_audio_path:
        raise ValueError("maskgct 需要 reference_audio_path")

    # Imported locally, as in the neighbouring predict_* helpers.
    from gradio_client import Client, handle_file

    client = Client("https://s5k.cn/api/v1/studio/amphion/maskgct/gradio/")
    prompt_wav = handle_file(reference_audio_path)
    result = client.predict(
        prompt_wav=prompt_wav,
        target_text=text,
        target_len=-1,
        n_timesteps=25,
        api_name="/predict",
    )
    print("maskgct result:", result)
    return result
298
 
299
 
 
313
  return predict_spark_tts(text, reference_audio_path)
314
  elif model == "cosyvoice-2.0":
315
  return predict_cosyvoice_tts(text, reference_audio_path)
316
+ elif model == "step-audio-tts-3b":
317
+ return predict_step_audio_tts_3b(text, reference_audio_path)
318
+ elif model == "maskgct":
319
+ return predict_maskgct(text, reference_audio_path)
320
 
321
  if not model in model_mapping:
322
  raise ValueError(f"Model {model} not found")
 
327
  "provider": model_mapping[model]["provider"],
328
  "model": model_mapping[model]["model"],
329
  }
330
+ # δ»…ζ”―ζŒιŸ³θ‰²ε…‹ιš†ηš„ζ¨‘εž‹δΌ ι€’ε‚θ€ƒιŸ³θ‰²
331
  supports_reference = model in [
332
  "styletts2", "eleven-multilingual-v2", "eleven-turbo-v2.5", "eleven-flash-v2.5"
333
  ]