zachzzc committed on
Commit
708d515
·
1 Parent(s): 07f1f64

Fix engine initialization and add model preload

Browse files
Files changed (2) hide show
  1. README.md +3 -0
  2. app.py +12 -30
README.md CHANGED
@@ -8,6 +8,9 @@ sdk_version: 5.36.2
8
  app_file: app.py
9
  pinned: false
10
  short_description: Higgs Audio Demo
 
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  short_description: Higgs Audio Demo
11
+ preload_from_hub:
12
+ - "bosonai/higgs-audio-v2-generation-3B-staging"
13
+ - "bosonai/higgs-audio-v2-tokenizer-staging"
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -15,7 +15,7 @@ import time
15
  from functools import lru_cache
16
  import re
17
  import spaces
18
-
19
 
20
  # Import HiggsAudio components
21
  from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
@@ -64,12 +64,7 @@ PREDEFINED_EXAMPLES = {
64
  "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
65
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
66
  "\n"
67
- "So here's the big question: Do you want to understand how deep learning works?\n"
68
- "How to use it to build powerful models that can predict, automate, and transform industries?\n"
69
- "Well, today, I've got some exciting news for you.\n"
70
- "\n"
71
- "We're going to talk about a course that I highly recommend: Dive into Deep Learning.\n"
72
- "It's not just another course; it's an entire experience that will take you from a beginner to someone who is well-versed in deep learning techniques.",
73
  "description": "Single speaker example",
74
  },
75
  "single-speaker-zh": {
@@ -80,7 +75,6 @@ PREDEFINED_EXAMPLES = {
80
  "<|scene_desc_end|>",
81
  "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
82
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
83
- "无论你是开发者, 数据科学爱好者, 还是只是对人工智能感兴趣的人都一定听说过这个词. 它已经成为AI时代的一个研究热点.\n"
84
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
85
  "或者说, 你能察觉到我其实是个机器人吗?",
86
  "description": "Single speaker with Chinese text",
@@ -95,6 +89,11 @@ def encode_audio_file(file_path):
95
  return base64.b64encode(audio_file.read()).decode("utf-8")
96
 
97
 
 
 
 
 
 
98
  def load_voice_presets():
99
  """Load the voice presets from the voice_examples directory."""
100
  try:
@@ -127,14 +126,15 @@ def get_voice_present(voice_preset):
127
 
128
 
129
  @spaces.GPU
130
- def initialize_engine(model_path, audio_tokenizer_path, device="cuda") -> bool:
131
  """Initialize the HiggsAudioServeEngine."""
132
  global engine
133
  try:
 
134
  engine = HiggsAudioServeEngine(
135
  model_name_or_path=model_path,
136
  audio_tokenizer_name_or_path=audio_tokenizer_path,
137
- device=device,
138
  )
139
  logger.info(f"Successfully initialized HiggsAudioServeEngine with model: {model_path}")
140
  return True
@@ -217,10 +217,7 @@ def text_to_speech(
217
  global engine
218
 
219
  if engine is None:
220
- error_msg = "Engine not initialized. Please load a model first."
221
- logger.error(error_msg)
222
- gr.Error(error_msg)
223
- return f"❌ {error_msg}", None
224
 
225
  try:
226
  # Prepare ChatML sample
@@ -482,18 +479,6 @@ def main():
482
  global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
483
 
484
  parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
485
- parser.add_argument(
486
- "--model-path",
487
- type=str,
488
- default=DEFAULT_MODEL_PATH,
489
- help="Path to the Higgs Audio model.",
490
- )
491
- parser.add_argument(
492
- "--audio-tokenizer-path",
493
- type=str,
494
- default=DEFAULT_AUDIO_TOKENIZER_PATH,
495
- help="Path to the audio tokenizer.",
496
- )
497
  parser.add_argument(
498
  "--device",
499
  type=str,
@@ -507,13 +492,10 @@ def main():
507
  args = parser.parse_args()
508
 
509
  # Update default values if provided via command line
510
- DEFAULT_MODEL_PATH = args.model_path
511
- DEFAULT_AUDIO_TOKENIZER_PATH = args.audio_tokenizer_path
512
  VOICE_PRESETS = load_voice_presets()
513
 
514
  # Load model on startup
515
- logger.info("Loading model...")
516
- result = initialize_engine(args.model_path, args.audio_tokenizer_path, args.device)
517
 
518
  # Exit if model loading failed
519
  if not result:
 
15
  from functools import lru_cache
16
  import re
17
  import spaces
18
+ import torch
19
 
20
  # Import HiggsAudio components
21
  from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
 
64
  "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
65
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
66
  "\n"
67
+ "So here's the big question: Do you want to understand how deep learning works?\n",
 
 
 
 
 
68
  "description": "Single speaker example",
69
  },
70
  "single-speaker-zh": {
 
75
  "<|scene_desc_end|>",
76
  "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
77
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
 
78
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
79
  "或者说, 你能察觉到我其实是个机器人吗?",
80
  "description": "Single speaker with Chinese text",
 
89
  return base64.b64encode(audio_file.read()).decode("utf-8")
90
 
91
 
92
+ def get_current_device():
93
+ """Get the current device."""
94
+ return "cuda" if torch.cuda.is_available() else "cpu"
95
+
96
+
97
  def load_voice_presets():
98
  """Load the voice presets from the voice_examples directory."""
99
  try:
 
126
 
127
 
128
  @spaces.GPU
129
+ def initialize_engine(model_path, audio_tokenizer_path) -> bool:
130
  """Initialize the HiggsAudioServeEngine."""
131
  global engine
132
  try:
133
+ logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
134
  engine = HiggsAudioServeEngine(
135
  model_name_or_path=model_path,
136
  audio_tokenizer_name_or_path=audio_tokenizer_path,
137
+ device=get_current_device(),
138
  )
139
  logger.info(f"Successfully initialized HiggsAudioServeEngine with model: {model_path}")
140
  return True
 
217
  global engine
218
 
219
  if engine is None:
220
+ initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
 
 
 
221
 
222
  try:
223
  # Prepare ChatML sample
 
479
  global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
480
 
481
  parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
 
 
 
 
 
 
 
 
 
 
 
 
482
  parser.add_argument(
483
  "--device",
484
  type=str,
 
492
  args = parser.parse_args()
493
 
494
  # Update default values if provided via command line
 
 
495
  VOICE_PRESETS = load_voice_presets()
496
 
497
  # Load model on startup
498
+ result = initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
 
499
 
500
  # Exit if model loading failed
501
  if not result: