Spaces: Running on Zero
Update app.py
Changed it to a 2-step process to deal with the Meta error.
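The "Meta error" referenced here is PyTorch's meta-tensor copy failure: tensors created on the meta device carry shape and dtype but no storage, so a plain .to() move has nothing to copy. A minimal sketch of how it surfaces (illustrative only, not code from this Space):

import torch

# Meta tensors have metadata but no data, so copying them to a real
# device raises an error; this is what the two-step load works around.
t = torch.empty(2, 2, device="meta")
try:
    t.to("cpu")
except NotImplementedError as e:
    print(e)  # Cannot copy out of meta tensor; no data!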
app.py
CHANGED
@@ -7,6 +7,7 @@ import torch
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from huggingface_hub import snapshot_download


# --- Constants ---

@@ -76,17 +77,37 @@ class ModelWrapper:
        self.tokenizer.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id

        print(f"Loading model: {model_name}...")
-
-        # Load the model in one step with automatic device mapping.
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            device_map="auto",
-            torch_dtype=torch.bfloat16,
-            offload_folder="offload"  # Keep this for memory management
-        ).eval()
+
+        # For large models, we use a more robust, memory-safe loading method.
+        # This explicitly handles the "meta tensor" device placement.
+        if "8b" in model_name.lower() or "4b" in model_name.lower():
+
+            # Step 1: Download the model files and get the local path.
+            print(f"Ensuring model checkpoint is available locally for {model_name}...")
+            checkpoint_path = snapshot_download(repo_id=model_name)
+            print(f"Checkpoint is at: {checkpoint_path}")
+
+            # Step 2: Create the model's "skeleton" on the meta device (no memory used).
+            config = AutoConfig.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+            with init_empty_weights():
+                model_empty = AutoModelForCausalLM.from_config(config)
+
+            # Step 3: Load the real weights from the local files directly onto the GPU(s).
+            # This function is designed to handle the meta->device transition correctly.
+            self.model = load_checkpoint_and_dispatch(
+                model_empty,
+                checkpoint_path,
+                device_map="auto",
+                offload_folder="offload"
+            ).eval()

+        else:  # For smaller models, the simpler method is fine.
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                device_map="auto",
+                torch_dtype=torch.bfloat16
+            ).eval()
+
        print(f"Model {model_name} loaded successfully.")

    def get_message_template(self, system_content=None, user_content=None, assistant_content=None):
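The same three-step pattern can be exercised outside the Space. A minimal sketch, assuming a small placeholder checkpoint (sshleifer/tiny-gpt2 stands in for the Space's actual model id):

from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "sshleifer/tiny-gpt2"  # placeholder model id for illustration

# Step 1: materialize the checkpoint files locally.
path = snapshot_download(repo_id=repo_id)

# Step 2: build the architecture on the meta device (no weight memory yet).
config = AutoConfig.from_pretrained(repo_id)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# Step 3: stream the real weights from disk onto the available device(s);
# this performs the meta->device transition that a plain .to() cannot.
model = load_checkpoint_and_dispatch(model, path, device_map="auto").eval()

For multi-GPU dispatch, load_checkpoint_and_dispatch also accepts no_split_module_classes to keep blocks that must stay together on one device; the Space's version additionally passes offload_folder so overflow weights can spill to disk.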