Spaces: Running on Zero
Fix engine initialization and add model preload
README.md
CHANGED
@@ -8,6 +8,9 @@ sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 short_description: Higgs Audio Demo
+preload_from_hub:
+  - "bosonai/higgs-audio-v2-generation-3B-staging"
+  - "bosonai/higgs-audio-v2-tokenizer-staging"
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
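Note: per the Spaces configuration reference linked above, preload_from_hub lists Hub repositories that are downloaded while the Space is built, so the generation model and the audio tokenizer are already cached when the app starts rather than being fetched on the first request.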
app.py
CHANGED
@@ -15,7 +15,7 @@ import time
 from functools import lru_cache
 import re
 import spaces
-
+import torch

 # Import HiggsAudio components
 from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
@@ -64,12 +64,7 @@ PREDEFINED_EXAMPLES = {
         "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
         "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
         "\n"
-        "So here's the big question: Do you want to understand how deep learning works?\n"
-        "How to use it to build powerful models that can predict, automate, and transform industries?\n"
-        "Well, today, I've got some exciting news for you.\n"
-        "\n"
-        "We're going to talk about a course that I highly recommend: Dive into Deep Learning.\n"
-        "It's not just another course; it's an entire experience that will take you from a beginner to someone who is well-versed in deep learning techniques.",
+        "So here's the big question: Do you want to understand how deep learning works?\n",
         "description": "Single speaker example",
     },
     "single-speaker-zh": {
@@ -80,7 +75,6 @@ PREDEFINED_EXAMPLES = {
         "<|scene_desc_end|>",
         "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
         "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
-        "无论你是开发者, 数据科学爱好者, 还是只是对人工智能感兴趣的人都一定听说过这个词. 它已经成为AI时代的一个研究热点.\n"
         "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
         "或者说, 你能察觉到我其实是个机器人吗?",
         "description": "Single speaker with Chinese text",
@@ -95,6 +89,11 @@ def encode_audio_file(file_path):
         return base64.b64encode(audio_file.read()).decode("utf-8")


+def get_current_device():
+    """Get the current device."""
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
 def load_voice_presets():
     """Load the voice presets from the voice_examples directory."""
     try:
@@ -127,14 +126,15 @@ def get_voice_present(voice_preset):


 @spaces.GPU
-def initialize_engine(model_path, audio_tokenizer_path, device):
+def initialize_engine(model_path, audio_tokenizer_path) -> bool:
     """Initialize the HiggsAudioServeEngine."""
     global engine
     try:
+        logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
         engine = HiggsAudioServeEngine(
             model_name_or_path=model_path,
             audio_tokenizer_name_or_path=audio_tokenizer_path,
-            device=device,
+            device=get_current_device(),
         )
         logger.info(f"Successfully initialized HiggsAudioServeEngine with model: {model_path}")
         return True
@@ -217,10 +217,7 @@ def text_to_speech(
     global engine

     if engine is None:
-
-        logger.error(error_msg)
-        gr.Error(error_msg)
-        return f"❌ {error_msg}", None
+        initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)

     try:
         # Prepare ChatML sample
@@ -482,18 +479,6 @@ def main():
     global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS

     parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
-    parser.add_argument(
-        "--model-path",
-        type=str,
-        default=DEFAULT_MODEL_PATH,
-        help="Path to the Higgs Audio model.",
-    )
-    parser.add_argument(
-        "--audio-tokenizer-path",
-        type=str,
-        default=DEFAULT_AUDIO_TOKENIZER_PATH,
-        help="Path to the audio tokenizer.",
-    )
     parser.add_argument(
         "--device",
         type=str,
@@ -507,13 +492,10 @@ def main():
     args = parser.parse_args()

     # Update default values if provided via command line
-    DEFAULT_MODEL_PATH = args.model_path
-    DEFAULT_AUDIO_TOKENIZER_PATH = args.audio_tokenizer_path
     VOICE_PRESETS = load_voice_presets()

     # Load model on startup
-
-    result = initialize_engine(args.model_path, args.audio_tokenizer_path, args.device)
+    result = initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)

     # Exit if model loading failed
     if not result: