big refactor to include original model card
app.py
CHANGED
@@ -3,7 +3,6 @@ import subprocess
 import signal
 import tempfile
 from pathlib import Path
-from textwrap import dedent
 import logging
 import gradio as gr
 from huggingface_hub import HfApi, ModelCard, whoami
@@ -32,12 +31,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)


-def get_llama_cpp_notes(
-    gguf_files,
-    new_repo_url,
-    split_model,
-    model_id=None,
-):
+def get_llama_cpp_version():
     try:
         result = subprocess.run(
             ["git", "-C", "./llama.cpp", "describe", "--tags", "--always"],
@@ -47,58 +41,13 @@ def get_llama_cpp_notes(
             text=True,
         )
         version = result.stdout.strip().split("-")[0]
-        text = f"""
-*Produced by [Antigma Labs](https://antigma.ai), [Antigma Quantize Space](https://huggingface.co/spaces/Antigma/quantize-my-repo)*
-
-*Follow Antigma Labs in X [https://x.com/antigma_labs](https://x.com/antigma_labs)*
-
-*Antigma's GitHub Homepage [https://github.com/AntigmaLabs](https://github.com/AntigmaLabs)*
-
-## llama.cpp quantization
-Using <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> release <a href="https://github.com/ggml-org/llama.cpp/releases/tag/{version}">{version}</a> for quantization.
-Original model: https://huggingface.co/{model_id}
-Run them directly with [llama.cpp](https://github.com/ggml-org/llama.cpp), or any other llama.cpp based project
-## Prompt format
-```
-<|begin▁of▁sentence|>{{system_prompt}}<|User|>{{prompt}}<|Assistant|><|end▁of▁sentence|><|Assistant|>
-```
-## Download a file (not the whole branch) from below:
-| Filename | Quant type | File Size | Split |
-| -------- | ---------- | --------- | ----- |
-| {'|'.join(['|'.join([gguf_files[i][0][:-5] if split_model else ('['+gguf_files[i][0]+']'+'(' + new_repo_url+'/blob/main/'+gguf_files[i][0] + ')'), gguf_files[i][3], f"{gguf_files[i][2]:.2f}" + ' GB', str(split_model),'''
-''']) for i in range(len(gguf_files))]) }
-## Downloading using huggingface-cli
-<details>
-<summary>Click to view download instructions</summary>
-First, make sure you have hugginface-cli installed:
-
-```
-pip install -U "huggingface_hub[cli]"
-```
-
-Then, you can target the specific file you want:
-
-```
-huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}" --local-dir ./
-```
-
-If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
-
-```
-huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}/*" --local-dir ./
-```
-
-You can either specify a new local-dir (deepseek-ai_DeepSeek-V3-0324-Q8_0) or download them all in place (./)
-
-</details>
-"""
-        return text
+        return version
     except subprocess.CalledProcessError as e:
-
+        logger.error("Error getting llama.cpp version: %s", e.stderr.strip())
         return None


-def get_repo_namespace(repo_owner, username, user_orgs):
+def get_repo_namespace(repo_owner: str, username: str, user_orgs: list) -> str:
     if repo_owner == "self":
         return username
     for org in user_orgs:
@@ -117,7 +66,7 @@ def escape(s: str) -> str:
     )


-def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
+def toggle_repo_owner(export_to_org: bool, oauth_token: gr.OAuthToken | None) -> tuple:
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use quantize-my-repo")
     if not export_to_org:
@@ -131,7 +80,9 @@ def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
     )


-def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
+def generate_importance_matrix(
+    model_path: str, train_data_path: str, output_path: str
+) -> None:
     imatrix_command = [
         "./llama.cpp/llama-imatrix",
         "-m",
@@ -147,25 +98,27 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
     ]

     if not os.path.isfile(model_path):
-        raise
+        raise FileNotFoundError(f"Model file not found: {model_path}")

-
+    logger.info("Running imatrix command...")
     process = subprocess.Popen(imatrix_command, shell=False)

     try:
-        process.wait(timeout=60)
+        process.wait(timeout=60)
     except subprocess.TimeoutExpired:
-
+        logger.warning(
             "Imatrix computation timed out. Sending SIGINT to allow graceful termination..."
         )
         process.send_signal(signal.SIGINT)
         try:
-            process.wait(timeout=5)
+            process.wait(timeout=5)
         except subprocess.TimeoutExpired:
-
+            logger.error(
+                "Imatrix proc still didn't term. Forecfully terming process..."
+            )
             process.kill()

-
+    logger.info("Importance matrix generation completed.")


 def split_upload_model(
@@ -173,101 +126,160 @@ def split_upload_model(
     outdir: str,
     repo_id: str,
     oauth_token: gr.OAuthToken | None,
-    split_max_tensors=256,
-    split_max_size=None,
-    org_token=None,
-    export_to_org=False,
-):
-
-
+    split_max_tensors: int = 256,
+    split_max_size: str | None = None,
+    org_token: str | None = None,
+    export_to_org: bool = False,
+) -> None:
+    logger.info("Model path: %s", model_path)
+    logger.info("Output dir: %s", outdir)

     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")

-    split_cmd = [
-        "./llama.cpp/llama-gguf-split",
-        "--split",
-    ]
+    split_cmd = ["./llama.cpp/llama-gguf-split", "--split"]
     if split_max_size:
-        split_cmd.append("--split-max-size")
-        split_cmd.append(split_max_size)
+        split_cmd.extend(["--split-max-size", split_max_size])
     else:
-        split_cmd.append("--split-max-tensors")
-        split_cmd.append(str(split_max_tensors))
+        split_cmd.extend(["--split-max-tensors", str(split_max_tensors)])

-
-    model_path_prefix = ".".join(
-        model_path.split(".")[:-1]
-    ) # remove the file extension
-    split_cmd.append(model_path)
-    split_cmd.append(model_path_prefix)
+    model_path_prefix = ".".join(model_path.split(".")[:-1])
+    split_cmd.extend([model_path, model_path_prefix])

-
+    logger.info("Split command: %s", split_cmd)

     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
-
-
+    logger.info("Split command stdout: %s", result.stdout)
+    logger.info("Split command stderr: %s", result.stderr)

     if result.returncode != 0:
-
-
-    print("Model split successfully!")
+        raise RuntimeError(f"Error splitting the model: {result.stderr}")
+    logger.info("Model split successfully!")

-    # remove the original model file if needed
     if os.path.exists(model_path):
         os.remove(model_path)

     model_file_prefix = model_path_prefix.split("/")[-1]
-
+    logger.info("Model file name prefix: %s", model_file_prefix)
     sharded_model_files = [
         f
         for f in os.listdir(outdir)
         if f.startswith(model_file_prefix) and f.endswith(".gguf")
     ]
-
-
-
-
+
+    if not sharded_model_files:
+        raise RuntimeError("No sharded files found.")
+
+    logger.info("Sharded model files: %s", sharded_model_files)
+    api = HfApi(token=org_token if (export_to_org and org_token) else oauth_token.token)
+
+    for file in sharded_model_files:
+        file_path = os.path.join(outdir, file)
+        logger.info("Uploading file: %s", file_path)
+        try:
+            api.upload_file(
+                path_or_fileobj=file_path,
+                path_in_repo=file,
+                repo_id=repo_id,
+            )
+        except Exception as e:
+            raise RuntimeError(f"Error uploading file {file_path}: {e}") from e
+
+    logger.info("Sharded model has been uploaded successfully!")
+
+
+def get_new_model_card(
+    original_card: ModelCard,
+    original_model_id: str,
+    gguf_files: list,
+    new_repo_url: str,
+    split_model: bool,
+) -> ModelCard:
+    version = get_llama_cpp_version()
+    model_card = original_card.copy()
+    model_card.data.tags = (model_card.data.tags or []) + [
+        "antigma",
+        "quantize-my-repo",
+    ]
+
+    # Format the table rows
+    table_rows = []
+    for file_info in gguf_files:
+        name, _, size, method = file_info
+        if split_model:
+            display_name = name[:-5]
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            display_name = f"[{name}]({new_repo_url}/blob/main/{name})"
+        table_rows.append(f"{display_name}|{method}|{size:.2f} GB|{split_model}")
+
+    model_card.text = f"""
+*Produced by [Antigma Labs](https://antigma.ai), [Antigma Quantize Space](https://huggingface.co/spaces/Antigma/quantize-my-repo)*
+
+*Follow Antigma Labs in X [https://x.com/antigma_labs](https://x.com/antigma_labs)*
+
+*Antigma's GitHub Homepage [https://github.com/AntigmaLabs](https://github.com/AntigmaLabs)*
+
+## Quantization Format (GGUF)
+We use <a href="https://github.com/ggml-org/llama.cpp">llama.cpp</a> release <a href="https://github.com/ggml-org/llama.cpp/releases/tag/{version}">{version}</a> for quantization.
+Original model: https://huggingface.co/{original_model_id}
+
+## Download a file (not the whole branch) from below:
+| Filename | Quant type | File Size | Split |
+| -------- | ---------- | --------- | ----- |
+| {'|'.join(table_rows)}
+
+## Original Model Card
+{original_card.text}
+
+## Downloading using huggingface-cli
+<details>
+<summary>Click to view download instructions</summary>
+First, make sure you have hugginface-cli installed:

-
+```
+pip install -U "huggingface_hub[cli]"
+```
+
+Then, you can target the specific file you want:
+
+```
+huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}" --local-dir ./
+```
+
+If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
+
+```
+huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}/*" --local-dir ./
+```
+
+You can either specify a new local-dir (e.g. deepseek-ai_DeepSeek-V3-0324-Q8_0) or it will be in default hugging face cache
+
+</details>
+"""
+    return model_card


 def process_model(
-    model_id,
-    q_method,
-    use_imatrix,
-    imatrix_q_method,
-    private_repo,
-    train_data_file,
-    split_model,
-    split_max_tensors,
-    split_max_size,
-    export_to_org,
-    repo_owner,
-    org_token,
+    model_id: str,
+    q_method: str | list,
+    use_imatrix: bool,
+    imatrix_q_method: str,
+    private_repo: bool,
+    train_data_file: gr.File | None,
+    split_model: bool,
+    split_max_tensors: int,
+    split_max_size: str | None,
+    export_to_org: bool,
+    repo_owner: str,
+    org_token: str | None,
     oauth_token: gr.OAuthToken | None,
-):
+) -> tuple[str, str]:
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use quantize-my-repo")
     try:
         whoami(oauth_token.token)
     except Exception as e:
-        raise gr.Error("You must be logged in to use quantize-my-repo")
+        raise gr.Error("You must be logged in to use quantize-my-repo") from e

     user_info = whoami(oauth_token.token)
     username = user_info["name"]
@@ -277,15 +289,17 @@ def process_model(

     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     logger.info(
-
+        "Time %s, Username %s, Model_ID %s, q_method %s",
+        current_time,
+        username,
+        model_id,
+        ",".join(q_method) if isinstance(q_method, list) else q_method,
     )

     repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
     model_name = model_id.split("/")[-1]
     try:
-        api_token = (
-            org_token if (export_to_org and org_token != "") else oauth_token.token
-        )
+        api_token = org_token if (export_to_org and org_token) else oauth_token.token
         api = HfApi(token=api_token)

         dl_pattern = ["*.md", "*.json", "*.model"]
@@ -297,7 +311,7 @@ def process_model(
             )
             else "*.bin"
         )
-        dl_pattern
+        dl_pattern.append(pattern)

         os.makedirs(downloads_dir, exist_ok=True)
         os.makedirs(outputs_dir, exist_ok=True)
@@ -306,10 +320,7 @@ def process_model(
         fp16 = str(Path(outdir) / f"{model_name}.fp16.gguf")

         with tempfile.TemporaryDirectory(dir=downloads_dir) as tmpdir:
-
-            logger.info(
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Start download"
-            )
+            logger.info("Start download")
             local_dir = Path(tmpdir) / model_name
             api.snapshot_download(
                 repo_id=model_id,
@@ -323,17 +334,10 @@ def process_model(
             if os.path.exists(adapter_config_dir) and not os.path.exists(
                 config_dir
             ):
-                raise
+                raise RuntimeError(
                     "adapter_config.json is present. If converting LoRA, use GGUF-my-lora."
                 )
-
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                + " Download successfully"
-            )
-            logger.info(
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                + " Download successfully"
-            )
+            logger.info("Download successfully")

             result = subprocess.run(
                 [
@@ -348,15 +352,10 @@ def process_model(
                 shell=False,
                 capture_output=True,
             )
-
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Converted to f16"
-            )
-            logger.info(
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Converted to f16"
-            )
+            logger.info("Converted to f16")

             if result.returncode != 0:
-                raise
+                raise RuntimeError(
                     f"Error converting to fp16: {result.stderr.decode()}"
                 )
         shutil.rmtree(downloads_dir)
@@ -369,7 +368,9 @@ def process_model(
                 else "llama.cpp/groups_merged.txt"
            )
            if not os.path.isfile(train_data_path):
-                raise
+                raise FileNotFoundError(
+                    f"Training data not found: {train_data_path}"
+                )
            generate_importance_matrix(fp16, train_data_path, imatrix_path)

        quant_methods = (
@@ -381,11 +382,7 @@ def process_model(

        gguf_files = []
        for method in quant_methods:
-
-            logger.info(
-                datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Begin quantize"
-            )
-
+            logger.info("Begin quantize")
            name = (
                f"{model_name.lower()}-{method.lower()}-{suffix}.gguf"
                if suffix
@@ -406,18 +403,13 @@ def process_model(
            )
            result = subprocess.run(quant_cmd, shell=False, capture_output=True)
            if result.returncode != 0:
-                raise
+                raise RuntimeError(
                    f"Quantization failed ({method}): {result.stderr.decode()}"
                )
            size = os.path.getsize(path) / 1024 / 1024 / 1024
            gguf_files.append((name, path, size, method))

-
-            datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Quantize successfully!"
-        )
-        logger.info(
-            datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Quantize successfully!"
-        )
+        logger.info("Quantize successfully!")

        suffix_for_repo = (
            f"{imatrix_q_method}-imat" if use_imatrix else "-".join(quant_methods)
@@ -428,13 +420,12 @@ def process_model(
        )

        try:
-
-        except:
-
-
-            card
-
-                get_llama_cpp_notes(gguf_files, new_repo_url, split_model, model_id)
+            original_card = ModelCard.load(model_id, token=oauth_token.token)
+        except Exception:
+            original_card = ModelCard("")
+
+        card = get_new_model_card(
+            original_card, model_id, gguf_files, new_repo_url, split_model
        )
        readme_path = Path(outdir) / "README.md"
        card.save(readme_path)
@@ -478,6 +469,7 @@ def process_model(
 css = """/* Custom CSS to allow scrolling */
 .gradio-container {overflow-y: auto;}
 """
+
 model_id = HuggingfaceHubSearch(
     label="Hub Model ID",
     placeholder="Search for model id on Huggingface",
@@ -578,6 +570,7 @@ iface = gr.Interface(
     description="We take your Hugging Face repo — a terrific repo — we quantize it, we package it beautifully, and we give you your very own repo. It's smart. It's efficient. It's huge. You're gonna love it.",
     api_name=False,
 )
+
 with gr.Blocks(css=".gradio-container {overflow-y: auto;}") as demo:
     gr.Markdown("Logged in, you must be. Classy, secure, and victorious, it keeps us.")
     gr.LoginButton(min_width=250)
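For orientation, here is a minimal sketch of how the refactored helpers compose after this commit, assuming the `get_new_model_card` signature defined in the diff above. The repo ID, GGUF tuple, and repo URL are illustrative placeholders, and the call is meant to run where app.py's functions are in scope:

```python
from huggingface_hub import ModelCard

# Hypothetical inputs mirroring what process_model assembles in app.py:
# (filename, local_path, size_in_GB, quant_method) tuples plus the target repo URL.
gguf_files = [("my-model-q4_k_m.gguf", "./outputs/my-model-q4_k_m.gguf", 4.37, "Q4_K_M")]
new_repo_url = "https://huggingface.co/example-org/my-model-Q4_K_M-GGUF"

# Load the source model card if it exists, otherwise start from an empty card,
# as the new try/except in process_model does.
try:
    original_card = ModelCard.load("example-org/my-model")
except Exception:
    original_card = ModelCard("")

# get_new_model_card (defined in app.py above) copies the original card, adds the
# "antigma"/"quantize-my-repo" tags, renders the download table, and embeds the
# original card text under "## Original Model Card".
card = get_new_model_card(
    original_card, "example-org/my-model", gguf_files, new_repo_url, split_model=False
)
card.save("README.md")
```

The saved README.md is the file the Space then writes next to the quantized GGUF outputs before uploading the new repository.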