Brianpuz committed
Commit 999dc22 · verified · Parent: d6d2e11

Add multi-quantize functions, usage logging, and export to organizations


If you want to export the model to an organization repository, you must either authorize the Space for that organization the first time (you can then leave the token field blank in the interface) or provide an organization access token.
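For reference, here is a minimal sketch (not part of the commit) of how the new `process_model`/`get_repo_namespace` logic resolves the upload token, target namespace, and repo name; the wrapper function and the example model/organization names below are hypothetical:

```python
from huggingface_hub import HfApi, whoami

def resolve_upload_target(oauth_token, export_to_org, repo_owner, org_token, model_name, quant_methods):
    """Illustrative only: pick the token and namespace used for the exported GGUF repo.

    When exporting to an organization, either the Space was authorized for that org
    at login (the user's OAuth token then suffices) or an org access token is supplied.
    """
    user = whoami(oauth_token)                      # validates the token and returns user info
    orgs = [o["name"] for o in user.get("orgs", [])]
    namespace = user["name"] if (not export_to_org or repo_owner == "self") else repo_owner
    if namespace != user["name"] and namespace not in orgs:
        raise ValueError(f"Invalid repo_owner: {repo_owner}")

    token = org_token if (export_to_org and org_token != "") else oauth_token
    repo_id = f"{namespace}/{model_name}-{'-'.join(quant_methods)}-GGUF"
    return HfApi(token=token), repo_id

# Hypothetical usage: two quantization methods selected, exporting to an organization.
# api, repo_id = resolve_upload_target("hf_user_token", True, "my-org", "hf_org_token",
#                                      "Llama-3.2-1B-Instruct", ["Q4_K_M", "Q5_K_M"])
# api.create_repo(repo_id=repo_id, exist_ok=True)  # -> my-org/Llama-3.2-1B-Instruct-Q4_K_M-Q5_K_M-GGUF
```

With multiple methods selected in the new multiselect `q_method` dropdown, each method produces its own `.gguf` file and the repo name joins the methods, as in the diff below.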

Files changed (1)
app.py (+181, -261)
app.py CHANGED
@@ -1,30 +1,39 @@
 import os
 import subprocess
 import signal
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
-import gradio as gr
 import tempfile
-
-from huggingface_hub import HfApi, ModelCard, whoami
-from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from pathlib import Path
 from textwrap import dedent
 from apscheduler.schedulers.background import BackgroundScheduler

-
-# used for restarting the space
-HF_TOKEN = os.environ.get("HF_TOKEN")
 CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"

-# escape HTML for logging
 def escape(s: str) -> str:
-    s = s.replace("&", "&amp;")  # Must be done first!
-    s = s.replace("<", "&lt;")
-    s = s.replace(">", "&gt;")
-    s = s.replace('"', "&quot;")
-    s = s.replace("\n", "<br/>")
-    return s

 def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
     imatrix_command = [
         "./llama.cpp/llama-imatrix",
@@ -54,13 +63,13 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):

     print("Importance matrix generation completed.")

-def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     print(f"Model path: {model_path}")
     print(f"Output dir: {outdir}")

     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")
-
     split_cmd = [
         "./llama.cpp/llama-gguf-split",
         "--split",
@@ -77,12 +86,12 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     split_cmd.append(model_path)
     split_cmd.append(model_path_prefix)

-    print(f"Split command: {split_cmd}")
-
     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")
-
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
         raise Exception(f"Error splitting the model: {stderr_str}")
@@ -93,11 +102,14 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
         os.remove(model_path)

     model_file_prefix = model_path_prefix.split('/')[-1]
-    print(f"Model file name prefix: {model_file_prefix}")
     sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
-        api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
             file_path = os.path.join(outdir, file)
             print(f"Uploading file: {file_path}")
@@ -111,214 +123,111 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
-
     print("Sharded model has been uploaded successfully!")

-def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use GGUF-my-repo")

-    # validate the oauth token
-    try:
-        whoami(oauth_token.token)
-    except Exception as e:
-        raise gr.Error("You must be logged in to use GGUF-my-repo")

     model_name = model_id.split('/')[-1]

-    try:
-        api = HfApi(token=oauth_token.token)

-        dl_pattern = ["*.md", "*.json", "*.model"]

-        pattern = (
-            "*.safetensors"
-            if any(
-                file.path.endswith(".safetensors")
-                for file in api.list_repo_tree(
-                    repo_id=model_id,
-                    recursive=True,
-                )
-            )
-            else "*.bin"
-        )
-
-        dl_pattern += [pattern]
-
-        if not os.path.exists("downloads"):
-            os.makedirs("downloads")
-
-        if not os.path.exists("outputs"):
-            os.makedirs("outputs")
-
-        with tempfile.TemporaryDirectory(dir="outputs") as outdir:
-            fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
-
-            with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
-                # Keep the model name as the dirname so the model name metadata is populated correctly
-                local_dir = Path(tmpdir)/model_name
-                print(local_dir)
-                api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-                print("Model downloaded successfully!")
-                print(f"Current working directory: {os.getcwd()}")
-                print(f"Model directory contents: {os.listdir(local_dir)}")
-
-                config_dir = local_dir/"config.json"
-                adapter_config_dir = local_dir/"adapter_config.json"
-                if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
-                    raise Exception('adapter_config.json is present.<br/><br/>If you are converting a LoRA adapter to GGUF, please use <a href="https://huggingface.co/spaces/ggml-org/gguf-my-lora" target="_blank" style="text-decoration:underline">GGUF-my-lora</a>.')
-
-                result = subprocess.run([
-                    "python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16
-                ], shell=False, capture_output=True)
-                print(result)
-                if result.returncode != 0:
-                    stderr_str = result.stderr.decode("utf-8")
-                    raise Exception(f"Error converting to fp16: {stderr_str}")
-                print("Model converted to fp16 successfully!")
-                print(f"Converted model path: {fp16}")
-
-            imatrix_path = Path(outdir)/"imatrix.dat"
-
-            if use_imatrix:
-                if train_data_file:
-                    train_data_path = train_data_file.name
-                else:
-                    train_data_path = "llama.cpp/groups_merged.txt"  # fallback calibration dataset
-
-                print(f"Training data file path: {train_data_path}")
-
-                if not os.path.isfile(train_data_path):
-                    raise Exception(f"Training data file not found: {train_data_path}")
-
-                generate_importance_matrix(fp16, train_data_path, imatrix_path)
-            else:
-                print("Not using imatrix quantization.")
-
-            # Quantize the model
-            quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
-            quantized_gguf_path = str(Path(outdir)/quantized_gguf_name)
-            if use_imatrix:
-                quantise_ggml = [
-                    "./llama.cpp/llama-quantize",
-                    "--imatrix", imatrix_path, fp16, quantized_gguf_path, imatrix_q_method
-                ]
-            else:
-                quantise_ggml = [
-                    "./llama.cpp/llama-quantize",
-                    fp16, quantized_gguf_path, q_method
-                ]
-            result = subprocess.run(quantise_ggml, shell=False, capture_output=True)
-            if result.returncode != 0:
-                stderr_str = result.stderr.decode("utf-8")
-                raise Exception(f"Error quantizing: {stderr_str}")
-            print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-            print(f"Quantized model path: {quantized_gguf_path}")

-            # Create empty repo
-            username = whoami(oauth_token.token)["name"]
-            new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
-            new_repo_id = new_repo_url.repo_id
-            print("Repo created successfully!", new_repo_url)

-            try:
-                card = ModelCard.load(model_id, token=oauth_token.token)
-            except:
-                card = ModelCard("")
-            if card.data.tags is None:
-                card.data.tags = []
-            card.data.tags.append("llama-cpp")
-            card.data.tags.append("gguf-my-repo")
-            card.data.base_model = model_id
-            card.text = dedent(
-                f"""
-                # {new_repo_id}
-                This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
-                Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
-                ## Use with llama.cpp
-                Install llama.cpp through brew (works on Mac and Linux)
-
-                ```bash
-                brew install llama.cpp
-
-                ```
-                Invoke the llama.cpp server or the CLI.
-
-                ### CLI:
-                ```bash
-                llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-                ```
-
-                ### Server:
-                ```bash
-                llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-                ```
-
-                Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-
-                Step 1: Clone llama.cpp from GitHub.
-                ```
-                git clone https://github.com/ggerganov/llama.cpp
-                ```
-
-                Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
-                ```
-                cd llama.cpp && LLAMA_CURL=1 make
-                ```
-
-                Step 3: Run inference through the main binary.
-                ```
-                ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-                ```
-                or
-                ```
-                ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-                ```
-                """
-            )
-            readme_path = Path(outdir)/"README.md"
-            card.save(readme_path)

             if split_model:
-                split_upload_model(str(quantized_gguf_path), outdir, new_repo_id, oauth_token, split_max_tensors, split_max_size)
             else:
-                try:
-                    print(f"Uploading quantized model: {quantized_gguf_path}")
-                    api.upload_file(
-                        path_or_fileobj=quantized_gguf_path,
-                        path_in_repo=quantized_gguf_name,
-                        repo_id=new_repo_id,
-                    )
-                except Exception as e:
-                    raise Exception(f"Error uploading quantized model: {e}")
-
-            if os.path.isfile(imatrix_path):
-                try:
-                    print(f"Uploading imatrix.dat: {imatrix_path}")
-                    api.upload_file(
-                        path_or_fileobj=imatrix_path,
-                        path_in_repo="imatrix.dat",
-                        repo_id=new_repo_id,
-                    )
-                except Exception as e:
-                    raise Exception(f"Error uploading imatrix.dat: {e}")
-
-            api.upload_file(
-                path_or_fileobj=readme_path,
-                path_in_repo="README.md",
-                repo_id=new_repo_id,
-            )
-            print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-
-        # end of the TemporaryDirectory(dir="outputs") block; temporary outputs are deleted here
-
-        return (
-            f'<h1>✅ DONE</h1><br/>Find your repo here: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{new_repo_id}</a>',
-            "llama.png",
-        )
-    except Exception as e:
-        return (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")


 css="""/* Custom CSS to allow scrolling */
@@ -330,20 +239,40 @@ model_id = HuggingfaceHubSearch(
     search_type="model",
 )

 q_method = gr.Dropdown(
     ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
     filterable=False,
-    visible=True
 )

 imatrix_q_method = gr.Dropdown(
     ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
-    value="IQ4_NL",
     filterable=False,
     visible=False
 )
@@ -386,58 +315,49 @@ split_max_size = gr.Textbox(
 )

 iface = gr.Interface(
-    fn=process_model,
-    inputs=[
-        model_id,
-        q_method,
-        use_imatrix,
-        imatrix_q_method,
-        private_repo,
-        train_data_file,
-        split_model,
-        split_max_tensors,
-        split_max_size,
-    ],
-    outputs=[
-        gr.Markdown(label="output"),
-        gr.Image(show_label=False),
-    ],
-    title="Create your own GGUF Quants, blazingly fast ⚡!",
-    description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
-    api_name=False
-)
-
-# Create Gradio interface
-with gr.Blocks(css=css) as demo:
-    gr.Markdown("You must be logged in to use GGUF-my-repo.")
     gr.LoginButton(min_width=250)

-    iface.render()

-    def update_split_visibility(split_model):
-        return gr.update(visible=split_model), gr.update(visible=split_model)

-    split_model.change(
-        fn=update_split_visibility,
-        inputs=split_model,
-        outputs=[split_max_tensors, split_max_size]
-    )

-    def update_visibility(use_imatrix):
-        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
-
-    use_imatrix.change(
-        fn=update_visibility,
-        inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file]
-    )

 def restart_space():
-    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()

-# Launch the interface
 demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
 
@@ -1,30 +1,39 @@
 import os
 import subprocess
 import signal
 import tempfile
 from pathlib import Path
 from textwrap import dedent
+import logging
+import gradio as gr
+from huggingface_hub import HfApi, ModelCard, whoami
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from apscheduler.schedulers.background import BackgroundScheduler
+from datetime import datetime

+os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
+logger = logging.getLogger(__name__)
+
+def get_repo_namespace(repo_owner, username, user_orgs):
+    if repo_owner == 'self':
+        return username
+    for org in user_orgs:
+        if org['name'] == repo_owner:
+            return org['name']
+    raise ValueError(f"Invalid repo_owner: {repo_owner}")

 def escape(s: str) -> str:
+    return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;").replace("\n", "<br/>")

+def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
+    if oauth_token is None or oauth_token.token is None:
+        raise gr.Error("You must be logged in to use GGUF-my-repo")
+    if not export_to_org:
+        return gr.update(visible=False, choices=["self"], value="self"), gr.update(visible=False, value="")
+    info = whoami(oauth_token.token)
+    orgs = [org["name"] for org in info.get("orgs", [])]
+    return gr.update(visible=True, choices=["self"] + orgs, value="self"), gr.update(visible=True)
 def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
     imatrix_command = [
         "./llama.cpp/llama-imatrix",
 
@@ -54,13 +63,13 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):

     print("Importance matrix generation completed.")

+def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None, org_token=None, export_to_org=False):
     print(f"Model path: {model_path}")
     print(f"Output dir: {outdir}")

     if oauth_token is None or oauth_token.token is None:
         raise ValueError("You have to be logged in.")
+
     split_cmd = [
         "./llama.cpp/llama-gguf-split",
         "--split",
 
@@ -77,12 +86,12 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
     split_cmd.append(model_path)
     split_cmd.append(model_path_prefix)

+    print(f"Split command: {split_cmd}")
+
     result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
+    print(f"Split command stdout: {result.stdout}")
+    print(f"Split command stderr: {result.stderr}")
+
     if result.returncode != 0:
         stderr_str = result.stderr.decode("utf-8")
         raise Exception(f"Error splitting the model: {stderr_str}")
 
@@ -93,11 +102,14 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
         os.remove(model_path)

     model_file_prefix = model_path_prefix.split('/')[-1]
+    print(f"Model file name prefix: {model_file_prefix}")
     sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
     if sharded_model_files:
         print(f"Sharded model files: {sharded_model_files}")
+        if export_to_org and org_token != "":
+            api = HfApi(token=org_token)
+        else:
+            api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
             file_path = os.path.join(outdir, file)
             print(f"Uploading file: {file_path}")
 
@@ -111,214 +123,111 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
+
     print("Sharded model has been uploaded successfully!")

+def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo,
+                  train_data_file, split_model, split_max_tensors, split_max_size,
+                  export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None):
     if oauth_token is None or oauth_token.token is None:
         raise gr.Error("You must be logged in to use GGUF-my-repo")

+    user_info = whoami(oauth_token.token)
+    username = user_info["name"]
+    user_orgs = user_info.get("orgs", [])
+    if not export_to_org:
+        repo_owner = "self"
+
+    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(f"Time {current_time}, Username {username}, Model_ID, {model_id}, q_method {','.join(q_method)}")

+    repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
     model_name = model_id.split('/')[-1]
+    api_token = org_token if (export_to_org and org_token != "") else oauth_token.token
+    api = HfApi(token=api_token)

+    dl_pattern = ["*.md", "*.json", "*.model"]
+    pattern = "*.safetensors" if any(
+        f.path.endswith(".safetensors")
+        for f in api.list_repo_tree(repo_id=model_id, recursive=True)
+    ) else "*.bin"
+    dl_pattern += [pattern]

+    os.makedirs("downloads", exist_ok=True)
+    os.makedirs("outputs", exist_ok=True)

+    with tempfile.TemporaryDirectory(dir="outputs") as outdir:
+        fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")

+        with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
+            local_dir = Path(tmpdir)/model_name
+            api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)

+            config_dir = local_dir/"config.json"
+            adapter_config_dir = local_dir/"adapter_config.json"
+            if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
+                raise Exception("adapter_config.json is present. If converting LoRA, use GGUF-my-lora.")
+
+            result = subprocess.run(["python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16], shell=False, capture_output=True)
+            if result.returncode != 0:
+                raise Exception(f"Error converting to fp16: {result.stderr.decode()}")
+
+        imatrix_path = Path(outdir)/"imatrix.dat"
+        if use_imatrix:
+            train_data_path = train_data_file.name if train_data_file else "llama.cpp/groups_merged.txt"
+            if not os.path.isfile(train_data_path):
+                raise Exception(f"Training data not found: {train_data_path}")
+            generate_importance_matrix(fp16, train_data_path, imatrix_path)
+
+        quant_methods = [imatrix_q_method] if use_imatrix else (q_method if isinstance(q_method, list) else [q_method])
+        suffix = "imat" if use_imatrix else None
+
+        gguf_files = []
+        for method in quant_methods:
+            name = f"{model_name.lower()}-{method.lower()}-{suffix}.gguf" if suffix else f"{model_name.lower()}-{method.lower()}.gguf"
+            path = str(Path(outdir)/name)
+            quant_cmd = ["./llama.cpp/llama-quantize", "--imatrix", imatrix_path, fp16, path, method] if use_imatrix else ["./llama.cpp/llama-quantize", fp16, path, method]
+            result = subprocess.run(quant_cmd, shell=False, capture_output=True)
+            if result.returncode != 0:
+                raise Exception(f"Quantization failed ({method}): {result.stderr.decode()}")
+            gguf_files.append((name, path))

+        suffix_for_repo = f"{imatrix_q_method}-imat" if use_imatrix else "-".join(quant_methods)
+        repo_id = f"{repo_namespace}/{model_name}-{suffix_for_repo}-GGUF"
+        new_repo_url = api.create_repo(repo_id=repo_id, exist_ok=True, private=private_repo)
+
+        try:
+            card = ModelCard.load(model_id, token=oauth_token.token)
+        except:
+            card = ModelCard("")
+        card.data.tags = (card.data.tags or []) + ["llama-cpp", "gguf-my-repo"]
+        card.data.base_model = model_id
+        card.text = dedent(f"""
+        # {repo_id}
+        Absolutely tremendous! This repo features **GGUF quantized** versions of [{model_id}](https://huggingface.co/{model_id}) — made possible using the *very powerful* `llama.cpp`. Believe me, it's fast, it's smart, it's winning.
+        ## Quantized Versions:
+        Only the best quantization. You’ll love it.
+        ## Run with llama.cpp
+        Just plug it in, hit the command line, and boom — you're running world-class AI, folks:
+        ```bash
+        llama-cli --hf-repo {repo_id} --hf-file {gguf_files[0][0]} -p "AI First, but also..."
+        ```
+        This beautiful Hugging Face Space was brought to you by the **amazing team at [Antigma Labs](https://antigma.ai)**. Great people. Big vision. Doing things that matter — and doing them right.
+        Total winners.
+        """)
+        readme_path = Path(outdir)/"README.md"
+        card.save(readme_path)
+        for name, path in gguf_files:
             if split_model:
+                split_upload_model(path, outdir, repo_id, oauth_token, split_max_tensors, split_max_size, org_token, export_to_org)
             else:
+                api.upload_file(path_or_fileobj=path, path_in_repo=name, repo_id=repo_id)
+        if use_imatrix and os.path.isfile(imatrix_path):
+            api.upload_file(path_or_fileobj=imatrix_path, path_in_repo="imatrix.dat", repo_id=repo_id)
+        api.upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id)
+
+        return (f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>', "llama.png")


 css="""/* Custom CSS to allow scrolling */
 
@@ -330,20 +239,40 @@ model_id = HuggingfaceHubSearch(
     search_type="model",
 )

+export_to_org = gr.Checkbox(
+    label="Export to Organization Repository",
+    value=False,
+    info="If checked, you can select an organization to export to."
+)
+
+repo_owner = gr.Dropdown(
+    choices=["self"],
+    value="self",
+    label="Repository Owner",
+    visible=False
+)
+
+org_token = gr.Textbox(
+    label="Org Access Token",
+    type="password",
+    visible=False
+)
+
 q_method = gr.Dropdown(
     ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
     label="Quantization Method",
     info="GGML quantization type",
     value="Q4_K_M",
     filterable=False,
+    visible=True,
+    multiselect=True
 )

 imatrix_q_method = gr.Dropdown(
     ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
     label="Imatrix Quantization Method",
     info="GGML imatrix quants type",
+    value="IQ4_NL",
     filterable=False,
     visible=False
 )
 
@@ -386,58 +315,49 @@ split_max_size = gr.Textbox(
 )

 iface = gr.Interface(
+    fn=process_model,
+    inputs=[
+        model_id,
+        q_method,
+        use_imatrix,
+        imatrix_q_method,
+        private_repo,
+        train_data_file,
+        split_model,
+        split_max_tensors,
+        split_max_size,
+        export_to_org,
+        repo_owner,
+        org_token
+    ],
+    outputs=[
+        gr.Markdown(label="Output"),
+        gr.Image(show_label=False)
+    ],
+    title="Make your own GGUF Quants — faster than ever before, believe me.",
+    description="We take your Hugging Face repo — a terrific repo — we quantize it, we package it beautifully, and we give you your very own repo. It's smart. It's efficient. It's huge. You're gonna love it.",
+    api_name=False
+)
+with gr.Blocks(css=".gradio-container {overflow-y: auto;}") as demo:
+    gr.Markdown("Logged in, you must be. Classy, secure, and victorious, it keeps us.")
     gr.LoginButton(min_width=250)

+    export_to_org.change(fn=toggle_repo_owner, inputs=[export_to_org], outputs=[repo_owner, org_token])
+
+    split_model.change(fn=lambda sm: (gr.update(visible=sm), gr.update(visible=sm)), inputs=split_model, outputs=[split_max_tensors, split_max_size])
+    use_imatrix.change(fn=lambda use: (gr.update(visible=not use), gr.update(visible=use), gr.update(visible=use)), inputs=use_imatrix, outputs=[q_method, imatrix_q_method, train_data_file])
+
+    iface.render()

 def restart_space():
+    HfApi().restart_space(repo_id="Brianpuz/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)

 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
 scheduler.start()

 demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)