mohanz committed on
Commit acf163c · 1 Parent(s): 3ea29f2
Files changed (1)
  1. app.py +296 -146
app.py CHANGED
@@ -21,21 +21,27 @@ os.makedirs(log_dir, exist_ok=True)
21
  logging.basicConfig(
22
  filename=os.path.join(log_dir, "app.log"),
23
  level=logging.INFO,
24
- format="%(asctime)s - %(levelname)s - %(message)s"
25
  )
26
 
27
  logger = logging.getLogger(__name__)
28
 
29
- def get_llama_cpp_notes(gguf_files, new_repo_url, split_model, model_id = None,):
30
  try:
31
  result = subprocess.run(
32
- ['git', '-C', './llama.cpp', 'describe', '--tags', '--always'],
33
  stdout=subprocess.PIPE,
34
  stderr=subprocess.PIPE,
35
  check=True,
36
- text=True
37
  )
38
- version = result.stdout.strip().split('-')[0]
39
  text = f"""
40
  *Produced by [Antigma Labs](https://antigma.ai)*
41
  ## llama.cpp quantization
@@ -76,32 +82,51 @@ You can either specify a new local-dir (deepseek-ai_DeepSeek-V3-0324-Q8_0) or do
76
 
77
 
78
  def get_repo_namespace(repo_owner, username, user_orgs):
79
- if repo_owner == 'self':
80
  return username
81
  for org in user_orgs:
82
- if org['name'] == repo_owner:
83
- return org['name']
84
  raise ValueError(f"Invalid repo_owner: {repo_owner}")
85
 
 
86
  def escape(s: str) -> str:
87
- return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;").replace("\n", "<br/>")
88
 
89
  def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
90
  if oauth_token is None or oauth_token.token is None:
91
  raise gr.Error("You must be logged in to use GGUF-my-repo")
92
  if not export_to_org:
93
- return gr.update(visible=False, choices=["self"], value="self"), gr.update(visible=False, value="")
94
  info = whoami(oauth_token.token)
95
  orgs = [org["name"] for org in info.get("orgs", [])]
96
- return gr.update(visible=True, choices=["self"] + orgs, value="self"), gr.update(visible=True)
97
  def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
98
  imatrix_command = [
99
  "./llama.cpp/llama-imatrix",
100
- "-m", model_path,
101
- "-f", train_data_path,
102
- "-ngl", "99",
103
- "--output-frequency", "10",
104
- "-o", output_path,
105
  ]
106
 
107
  if not os.path.isfile(model_path):
@@ -113,7 +138,9 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
113
  try:
114
  process.wait(timeout=60) # added wait
115
  except subprocess.TimeoutExpired:
116
- print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
117
  process.send_signal(signal.SIGINT)
118
  try:
119
  process.wait(timeout=5) # grace period
@@ -123,7 +150,17 @@ def generate_importance_matrix(model_path: str, train_data_path: str, output_pat
123
 
124
  print("Importance matrix generation completed.")
125
 
126
- def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None, org_token=None, export_to_org=False):
127
  print(f"Model path: {model_path}")
128
  print(f"Output dir: {outdir}")
129
 
@@ -142,7 +179,9 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
142
  split_cmd.append(str(split_max_tensors))
143
 
144
  # args for output
145
- model_path_prefix = '.'.join(model_path.split('.')[:-1]) # remove the file extension
146
  split_cmd.append(model_path)
147
  split_cmd.append(model_path_prefix)
148
 
@@ -161,15 +200,19 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
161
  if os.path.exists(model_path):
162
  os.remove(model_path)
163
 
164
- model_file_prefix = model_path_prefix.split('/')[-1]
165
  print(f"Model file name prefix: {model_file_prefix}")
166
- sharded_model_files = [f for f in os.listdir(outdir) if f.startswith(model_file_prefix) and f.endswith(".gguf")]
167
  if sharded_model_files:
168
  print(f"Sharded model files: {sharded_model_files}")
169
- if export_to_org and org_token!="":
170
- api = HfApi(token = org_token)
171
  else:
172
- api = HfApi(token=oauth_token.token)
173
  for file in sharded_model_files:
174
  file_path = os.path.join(outdir, file)
175
  print(f"Uploading file: {file_path}")
@@ -186,9 +229,22 @@ def split_upload_model(model_path: str, outdir: str, repo_id: str, oauth_token:
186
 
187
  print("Sharded model has been uploaded successfully!")
188
 
189
- def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo,
190
- train_data_file, split_model, split_max_tensors, split_max_size,
191
- export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None):
192
  if oauth_token is None or oauth_token.token is None:
193
  raise gr.Error("You must be logged in to use GGUF-my-repo")
194
 
@@ -198,91 +254,175 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
198
  if not export_to_org:
199
  repo_owner = "self"
200
 
201
-
202
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
203
- logger.info(f"Time {current_time}, Username {username}, Model_ID, {model_id}, q_method {','.join(q_method)}")
204
 
205
  repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
206
- model_name = model_id.split('/')[-1]
207
  try:
208
- api_token = org_token if (export_to_org and org_token!="") else oauth_token.token
209
- api = HfApi(token=api_token)
210
-
211
- dl_pattern = ["*.md", "*.json", "*.model"]
212
- pattern = "*.safetensors" if any(
213
- f.path.endswith(".safetensors")
214
- for f in api.list_repo_tree(repo_id=model_id, recursive=True)
215
- ) else "*.bin"
216
- dl_pattern += [pattern]
217
-
218
- os.makedirs("downloads", exist_ok=True)
219
- os.makedirs("outputs", exist_ok=True)
220
-
221
- with tempfile.TemporaryDirectory(dir="outputs") as outdir:
222
- fp16 = str(Path(outdir)/f"{model_name}.fp16.gguf")
223
-
224
- with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
225
- local_dir = Path(tmpdir)/model_name
226
- api.snapshot_download(repo_id=model_id, local_dir=local_dir, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
227
-
228
- config_dir = local_dir/"config.json"
229
- adapter_config_dir = local_dir/"adapter_config.json"
230
- if os.path.exists(adapter_config_dir) and not os.path.exists(config_dir):
231
- raise Exception("adapter_config.json is present. If converting LoRA, use GGUF-my-lora.")
232
-
233
- result = subprocess.run(["python", CONVERSION_SCRIPT, local_dir, "--outtype", "f16", "--outfile", fp16], shell=False, capture_output=True)
234
- if result.returncode != 0:
235
- raise Exception(f"Error converting to fp16: {result.stderr.decode()}")
236
-
237
- imatrix_path = Path(outdir)/"imatrix.dat"
238
- if use_imatrix:
239
- train_data_path = train_data_file.name if train_data_file else "llama.cpp/groups_merged.txt"
240
- if not os.path.isfile(train_data_path):
241
- raise Exception(f"Training data not found: {train_data_path}")
242
- generate_importance_matrix(fp16, train_data_path, imatrix_path)
243
-
244
- quant_methods = [imatrix_q_method] if use_imatrix else (q_method if isinstance(q_method, list) else [q_method])
245
- suffix = "imat" if use_imatrix else None
246
-
247
- gguf_files = []
248
- for method in quant_methods:
249
- name = f"{model_name.lower()}-{method.lower()}-{suffix}.gguf" if suffix else f"{model_name.lower()}-{method.lower()}.gguf"
250
- path = str(Path(outdir)/name)
251
- quant_cmd = ["./llama.cpp/llama-quantize", "--imatrix", imatrix_path, fp16, path, method] if use_imatrix else ["./llama.cpp/llama-quantize", fp16, path, method]
252
- result = subprocess.run(quant_cmd, shell=False, capture_output=True)
253
- if result.returncode != 0:
254
- raise Exception(f"Quantization failed ({method}): {result.stderr.decode()}")
255
- size = os.path.getsize(path)/1024/1024/1024
256
- gguf_files.append((name, path, size, method))
257
-
258
- suffix_for_repo = f"{imatrix_q_method}-imat" if use_imatrix else "-".join(quant_methods)
259
- repo_id = f"{repo_namespace}/{model_name}-{suffix_for_repo}-GGUF"
260
- new_repo_url = api.create_repo(repo_id=repo_id, exist_ok=True, private=private_repo)
261
-
262
- try:
263
- card = ModelCard.load(model_id, token=oauth_token.token)
264
- except:
265
- card = ModelCard("")
266
- card.data.tags = (card.data.tags or []) + ["llama-cpp", "gguf-my-repo"]
267
- card.data.base_model = model_id
268
- card.text = dedent(get_llama_cpp_notes(gguf_files, new_repo_url, split_model, model_id))
269
- readme_path = Path(outdir)/"README.md"
270
- card.save(readme_path)
271
- for name, path, _, _ in gguf_files:
272
- if split_model:
273
- split_upload_model(path, outdir, repo_id, oauth_token, split_max_tensors, split_max_size, org_token, export_to_org)
274
- else:
275
- api.upload_file(path_or_fileobj=path, path_in_repo=name, repo_id=repo_id)
276
- if use_imatrix and os.path.isfile(imatrix_path):
277
- api.upload_file(path_or_fileobj=imatrix_path, path_in_repo="imatrix.dat", repo_id=repo_id)
278
- api.upload_file(path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id)
279
-
280
- return (f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>', f"llama{np.random.randint(9)}.png")
281
  except Exception as e:
282
- raise (f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>', "error.png")
283
 
284
 
285
- css="""/* Custom CSS to allow scrolling */
286
  .gradio-container {overflow-y: auto;}
287
  """
288
  model_id = HuggingfaceHubSearch(
@@ -294,30 +434,36 @@ model_id = HuggingfaceHubSearch(
294
  export_to_org = gr.Checkbox(
295
  label="Export to Organization Repository",
296
  value=False,
297
- info="If checked, you can select an organization to export to."
298
  )
299
 
300
  repo_owner = gr.Dropdown(
301
- choices=["self"],
302
- value="self",
303
- label="Repository Owner",
304
- visible=False
305
  )
306
 
307
- org_token = gr.Textbox(
308
- label="Org Access Token",
309
- type="password",
310
- visible=False
311
- )
312
 
313
  q_method = gr.Dropdown(
314
315
  label="Quantization Method",
316
  info="GGML quantization type",
317
  value="Q4_K_M",
318
  filterable=False,
319
  visible=True,
320
- multiselect=True
321
  )
322
 
323
  imatrix_q_method = gr.Dropdown(
@@ -326,44 +472,36 @@ imatrix_q_method = gr.Dropdown(
326
  info="GGML imatrix quants type",
327
  value="IQ4_NL",
328
  filterable=False,
329
- visible=False
330
  )
331
 
332
  use_imatrix = gr.Checkbox(
333
  value=False,
334
  label="Use Imatrix Quantization",
335
- info="Use importance matrix for quantization."
336
  )
337
 
338
  private_repo = gr.Checkbox(
339
- value=False,
340
- label="Private Repo",
341
- info="Create a private repo under your username."
342
  )
343
 
344
- train_data_file = gr.File(
345
- label="Training Data File",
346
- file_types=["txt"],
347
- visible=False
348
- )
349
 
350
  split_model = gr.Checkbox(
351
- value=False,
352
- label="Split Model",
353
- info="Shard the model using gguf-split."
354
  )
355
 
356
  split_max_tensors = gr.Number(
357
  value=256,
358
  label="Max Tensors per File",
359
  info="Maximum number of tensors per file when splitting model.",
360
- visible=False
361
  )
362
 
363
  split_max_size = gr.Textbox(
364
  label="Max File Size",
365
  info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
366
- visible=False
367
  )
368
 
369
  iface = gr.Interface(
@@ -380,35 +518,47 @@ iface = gr.Interface(
380
  split_max_size,
381
  export_to_org,
382
  repo_owner,
383
- org_token
384
- ],
385
- outputs=[
386
- gr.Markdown(label="Output"),
387
- gr.Image(show_label=False)
388
  ],
 
389
  title="Make your own GGUF Quants — faster than ever before, believe me.",
390
  description="We take your Hugging Face repo — a terrific repo — we quantize it, we package it beautifully, and we give you your very own repo. It's smart. It's efficient. It's huge. You're gonna love it.",
391
- api_name=False
392
  )
393
  with gr.Blocks(css=".gradio-container {overflow-y: auto;}") as demo:
394
  gr.Markdown("Logged in, you must be. Classy, secure, and victorious, it keeps us.")
395
  gr.LoginButton(min_width=250)
396
 
397
-
398
-
399
- export_to_org.change(fn=toggle_repo_owner, inputs=[export_to_org], outputs=[repo_owner, org_token])
400
-
401
- split_model.change(fn=lambda sm: (gr.update(visible=sm), gr.update(visible=sm)), inputs=split_model, outputs=[split_max_tensors, split_max_size])
402
- use_imatrix.change(fn=lambda use: (gr.update(visible=not use), gr.update(visible=use), gr.update(visible=use)), inputs=use_imatrix, outputs=[q_method, imatrix_q_method, train_data_file])
403
 
404
  iface.render()
405
 
406
 
407
  def restart_space():
408
- HfApi().restart_space(repo_id="Antigma/quantize-my-repo", token=HF_TOKEN, factory_reboot=True)
409
 
410
  scheduler = BackgroundScheduler()
411
  scheduler.add_job(restart_space, "interval", seconds=21600)
412
  scheduler.start()
413
 
414
- demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
 
21
  logging.basicConfig(
22
  filename=os.path.join(log_dir, "app.log"),
23
  level=logging.INFO,
24
+ format="%(asctime)s - %(levelname)s - %(message)s",
25
  )
26
 
27
  logger = logging.getLogger(__name__)
28
 
29
+
30
+ def get_llama_cpp_notes(
31
+ gguf_files,
32
+ new_repo_url,
33
+ split_model,
34
+ model_id=None,
35
+ ):
36
  try:
37
  result = subprocess.run(
38
+ ["git", "-C", "./llama.cpp", "describe", "--tags", "--always"],
39
  stdout=subprocess.PIPE,
40
  stderr=subprocess.PIPE,
41
  check=True,
42
+ text=True,
43
  )
44
+ version = result.stdout.strip().split("-")[0]
45
  text = f"""
46
  *Produced by [Antigma Labs](https://antigma.ai)*
47
  ## llama.cpp quantization
 
82
 
83
 
84
  def get_repo_namespace(repo_owner, username, user_orgs):
85
+ if repo_owner == "self":
86
  return username
87
  for org in user_orgs:
88
+ if org["name"] == repo_owner:
89
+ return org["name"]
90
  raise ValueError(f"Invalid repo_owner: {repo_owner}")
91
 
92
+
93
  def escape(s: str) -> str:
94
+ return (
95
+ s.replace("&", "&amp;")
96
+ .replace("<", "&lt;")
97
+ .replace(">", "&gt;")
98
+ .replace('"', "&quot;")
99
+ .replace("\n", "<br/>")
100
+ )
101
+
102
 
103
  def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
104
  if oauth_token is None or oauth_token.token is None:
105
  raise gr.Error("You must be logged in to use GGUF-my-repo")
106
  if not export_to_org:
107
+ return gr.update(visible=False, choices=["self"], value="self"), gr.update(
108
+ visible=False, value=""
109
+ )
110
  info = whoami(oauth_token.token)
111
  orgs = [org["name"] for org in info.get("orgs", [])]
112
+ return gr.update(visible=True, choices=["self"] + orgs, value="self"), gr.update(
113
+ visible=True
114
+ )
115
+
116
+
117
  def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
118
  imatrix_command = [
119
  "./llama.cpp/llama-imatrix",
120
+ "-m",
121
+ model_path,
122
+ "-f",
123
+ train_data_path,
124
+ "-ngl",
125
+ "99",
126
+ "--output-frequency",
127
+ "10",
128
+ "-o",
129
+ output_path,
130
  ]
131
 
132
  if not os.path.isfile(model_path):
 
138
  try:
139
  process.wait(timeout=60) # added wait
140
  except subprocess.TimeoutExpired:
141
+ print(
142
+ "Imatrix computation timed out. Sending SIGINT to allow graceful termination..."
143
+ )
144
  process.send_signal(signal.SIGINT)
145
  try:
146
  process.wait(timeout=5) # grace period
 
150
 
151
  print("Importance matrix generation completed.")
152
 
153
+
154
+ def split_upload_model(
155
+ model_path: str,
156
+ outdir: str,
157
+ repo_id: str,
158
+ oauth_token: gr.OAuthToken | None,
159
+ split_max_tensors=256,
160
+ split_max_size=None,
161
+ org_token=None,
162
+ export_to_org=False,
163
+ ):
164
  print(f"Model path: {model_path}")
165
  print(f"Output dir: {outdir}")
166
 
 
179
  split_cmd.append(str(split_max_tensors))
180
 
181
  # args for output
182
+ model_path_prefix = ".".join(
183
+ model_path.split(".")[:-1]
184
+ ) # remove the file extension
185
  split_cmd.append(model_path)
186
  split_cmd.append(model_path_prefix)
187
 
 
200
  if os.path.exists(model_path):
201
  os.remove(model_path)
202
 
203
+ model_file_prefix = model_path_prefix.split("/")[-1]
204
  print(f"Model file name prefix: {model_file_prefix}")
205
+ sharded_model_files = [
206
+ f
207
+ for f in os.listdir(outdir)
208
+ if f.startswith(model_file_prefix) and f.endswith(".gguf")
209
+ ]
210
  if sharded_model_files:
211
  print(f"Sharded model files: {sharded_model_files}")
212
+ if export_to_org and org_token != "":
213
+ api = HfApi(token=org_token)
214
  else:
215
+ api = HfApi(token=oauth_token.token)
216
  for file in sharded_model_files:
217
  file_path = os.path.join(outdir, file)
218
  print(f"Uploading file: {file_path}")
 
229
 
230
  print("Sharded model has been uploaded successfully!")
231
 
232
+
233
+ def process_model(
234
+ model_id,
235
+ q_method,
236
+ use_imatrix,
237
+ imatrix_q_method,
238
+ private_repo,
239
+ train_data_file,
240
+ split_model,
241
+ split_max_tensors,
242
+ split_max_size,
243
+ export_to_org,
244
+ repo_owner,
245
+ org_token,
246
+ oauth_token: gr.OAuthToken | None,
247
+ ):
248
  if oauth_token is None or oauth_token.token is None:
249
  raise gr.Error("You must be logged in to use GGUF-my-repo")
250
 
 
254
  if not export_to_org:
255
  repo_owner = "self"
256
 
 
257
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
258
+ logger.info(
259
+ f"Time {current_time}, Username {username}, Model_ID, {model_id}, q_method {','.join(q_method)}"
260
+ )
261
 
262
  repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
263
+ model_name = model_id.split("/")[-1]
264
  try:
265
+ api_token = (
266
+ org_token if (export_to_org and org_token != "") else oauth_token.token
267
+ )
268
+ api = HfApi(token=api_token)
269
+
270
+ dl_pattern = ["*.md", "*.json", "*.model"]
271
+ pattern = (
272
+ "*.safetensors"
273
+ if any(
274
+ f.path.endswith(".safetensors")
275
+ for f in api.list_repo_tree(repo_id=model_id, recursive=True)
276
+ )
277
+ else "*.bin"
278
+ )
279
+ dl_pattern += [pattern]
280
+
281
+ os.makedirs("downloads", exist_ok=True)
282
+ os.makedirs("outputs", exist_ok=True)
283
+
284
+ with tempfile.TemporaryDirectory(dir="outputs") as outdir:
285
+ fp16 = str(Path(outdir) / f"{model_name}.fp16.gguf")
286
+
287
+ with tempfile.TemporaryDirectory(dir="downloads") as tmpdir:
288
+ local_dir = Path(tmpdir) / model_name
289
+ api.snapshot_download(
290
+ repo_id=model_id,
291
+ local_dir=local_dir,
292
+ local_dir_use_symlinks=False,
293
+ allow_patterns=dl_pattern,
294
+ )
295
+
296
+ config_dir = local_dir / "config.json"
297
+ adapter_config_dir = local_dir / "adapter_config.json"
298
+ if os.path.exists(adapter_config_dir) and not os.path.exists(
299
+ config_dir
300
+ ):
301
+ raise Exception(
302
+ "adapter_config.json is present. If converting LoRA, use GGUF-my-lora."
303
+ )
304
+
305
+ result = subprocess.run(
306
+ [
307
+ "python",
308
+ CONVERSION_SCRIPT,
309
+ local_dir,
310
+ "--outtype",
311
+ "f16",
312
+ "--outfile",
313
+ fp16,
314
+ ],
315
+ shell=False,
316
+ capture_output=True,
317
+ )
318
+ if result.returncode != 0:
319
+ raise Exception(
320
+ f"Error converting to fp16: {result.stderr.decode()}"
321
+ )
322
+
323
+ imatrix_path = Path(outdir) / "imatrix.dat"
324
+ if use_imatrix:
325
+ train_data_path = (
326
+ train_data_file.name
327
+ if train_data_file
328
+ else "llama.cpp/groups_merged.txt"
329
+ )
330
+ if not os.path.isfile(train_data_path):
331
+ raise Exception(f"Training data not found: {train_data_path}")
332
+ generate_importance_matrix(fp16, train_data_path, imatrix_path)
333
+
334
+ quant_methods = (
335
+ [imatrix_q_method]
336
+ if use_imatrix
337
+ else (q_method if isinstance(q_method, list) else [q_method])
338
+ )
339
+ suffix = "imat" if use_imatrix else None
340
+
341
+ gguf_files = []
342
+ for method in quant_methods:
343
+ name = (
344
+ f"{model_name.lower()}-{method.lower()}-{suffix}.gguf"
345
+ if suffix
346
+ else f"{model_name.lower()}-{method.lower()}.gguf"
347
+ )
348
+ path = str(Path(outdir) / name)
349
+ quant_cmd = (
350
+ [
351
+ "./llama.cpp/llama-quantize",
352
+ "--imatrix",
353
+ imatrix_path,
354
+ fp16,
355
+ path,
356
+ method,
357
+ ]
358
+ if use_imatrix
359
+ else ["./llama.cpp/llama-quantize", fp16, path, method]
360
+ )
361
+ result = subprocess.run(quant_cmd, shell=False, capture_output=True)
362
+ if result.returncode != 0:
363
+ raise Exception(
364
+ f"Quantization failed ({method}): {result.stderr.decode()}"
365
+ )
366
+ size = os.path.getsize(path) / 1024 / 1024 / 1024
367
+ gguf_files.append((name, path, size, method))
368
+
369
+ suffix_for_repo = (
370
+ f"{imatrix_q_method}-imat" if use_imatrix else "-".join(quant_methods)
371
+ )
372
+ repo_id = f"{repo_namespace}/{model_name}-{suffix_for_repo}-GGUF"
373
+ new_repo_url = api.create_repo(
374
+ repo_id=repo_id, exist_ok=True, private=private_repo
375
+ )
376
+
377
+ try:
378
+ card = ModelCard.load(model_id, token=oauth_token.token)
379
+ except:
380
+ card = ModelCard("")
381
+ card.data.tags = (card.data.tags or []) + ["llama-cpp", "gguf-my-repo"]
382
+ card.data.base_model = model_id
383
+ card.text = dedent(
384
+ get_llama_cpp_notes(gguf_files, new_repo_url, split_model, model_id)
385
+ )
386
+ readme_path = Path(outdir) / "README.md"
387
+ card.save(readme_path)
388
+ for name, path, _, _ in gguf_files:
389
+ if split_model:
390
+ split_upload_model(
391
+ path,
392
+ outdir,
393
+ repo_id,
394
+ oauth_token,
395
+ split_max_tensors,
396
+ split_max_size,
397
+ org_token,
398
+ export_to_org,
399
+ )
400
+ else:
401
+ api.upload_file(
402
+ path_or_fileobj=path, path_in_repo=name, repo_id=repo_id
403
+ )
404
+ if use_imatrix and os.path.isfile(imatrix_path):
405
+ api.upload_file(
406
+ path_or_fileobj=imatrix_path,
407
+ path_in_repo="imatrix.dat",
408
+ repo_id=repo_id,
409
+ )
410
+ api.upload_file(
411
+ path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id
412
+ )
413
+
414
+ return (
415
+ f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
416
+ f"llama{np.random.randint(9)}.png",
417
+ )
418
  except Exception as e:
419
+ raise (
420
+ f'<h1>❌ ERROR</h1><br/><pre style="white-space:pre-wrap;">{escape(str(e))}</pre>',
421
+ "error.png",
422
+ )
423
 
424
 
425
+ css = """/* Custom CSS to allow scrolling */
426
  .gradio-container {overflow-y: auto;}
427
  """
428
  model_id = HuggingfaceHubSearch(
 
434
  export_to_org = gr.Checkbox(
435
  label="Export to Organization Repository",
436
  value=False,
437
+ info="If checked, you can select an organization to export to.",
438
  )
439
 
440
  repo_owner = gr.Dropdown(
441
+ choices=["self"], value="self", label="Repository Owner", visible=False
442
  )
443
 
444
+ org_token = gr.Textbox(label="Org Access Token", type="password", visible=False)
445
 
446
  q_method = gr.Dropdown(
447
+ [
448
+ "Q2_K",
449
+ "Q3_K_S",
450
+ "Q3_K_M",
451
+ "Q3_K_L",
452
+ "Q4_0",
453
+ "Q4_K_S",
454
+ "Q4_K_M",
455
+ "Q5_0",
456
+ "Q5_K_S",
457
+ "Q5_K_M",
458
+ "Q6_K",
459
+ "Q8_0",
460
+ ],
461
  label="Quantization Method",
462
  info="GGML quantization type",
463
  value="Q4_K_M",
464
  filterable=False,
465
  visible=True,
466
+ multiselect=True,
467
  )
468
 
469
  imatrix_q_method = gr.Dropdown(
 
472
  info="GGML imatrix quants type",
473
  value="IQ4_NL",
474
  filterable=False,
475
+ visible=False,
476
  )
477
 
478
  use_imatrix = gr.Checkbox(
479
  value=False,
480
  label="Use Imatrix Quantization",
481
+ info="Use importance matrix for quantization.",
482
  )
483
 
484
  private_repo = gr.Checkbox(
485
+ value=False, label="Private Repo", info="Create a private repo under your username."
486
  )
487
 
488
+ train_data_file = gr.File(label="Training Data File", file_types=["txt"], visible=False)
489
 
490
  split_model = gr.Checkbox(
491
+ value=False, label="Split Model", info="Shard the model using gguf-split."
492
  )
493
 
494
  split_max_tensors = gr.Number(
495
  value=256,
496
  label="Max Tensors per File",
497
  info="Maximum number of tensors per file when splitting model.",
498
+ visible=False,
499
  )
500
 
501
  split_max_size = gr.Textbox(
502
  label="Max File Size",
503
  info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
504
+ visible=False,
505
  )
506
 
507
  iface = gr.Interface(
 
518
  split_max_size,
519
  export_to_org,
520
  repo_owner,
521
+ org_token,
522
  ],
523
+ outputs=[gr.Markdown(label="Output"), gr.Image(show_label=False)],
524
  title="Make your own GGUF Quants — faster than ever before, believe me.",
525
  description="We take your Hugging Face repo — a terrific repo — we quantize it, we package it beautifully, and we give you your very own repo. It's smart. It's efficient. It's huge. You're gonna love it.",
526
+ api_name=False,
527
  )
528
  with gr.Blocks(css=".gradio-container {overflow-y: auto;}") as demo:
529
  gr.Markdown("Logged in, you must be. Classy, secure, and victorious, it keeps us.")
530
  gr.LoginButton(min_width=250)
531
 
532
+ export_to_org.change(
533
+ fn=toggle_repo_owner, inputs=[export_to_org], outputs=[repo_owner, org_token]
534
+ )
535
+
536
+ split_model.change(
537
+ fn=lambda sm: (gr.update(visible=sm), gr.update(visible=sm)),
538
+ inputs=split_model,
539
+ outputs=[split_max_tensors, split_max_size],
540
+ )
541
+ use_imatrix.change(
542
+ fn=lambda use: (
543
+ gr.update(visible=not use),
544
+ gr.update(visible=use),
545
+ gr.update(visible=use),
546
+ ),
547
+ inputs=use_imatrix,
548
+ outputs=[q_method, imatrix_q_method, train_data_file],
549
+ )
550
 
551
  iface.render()
552
 
553
 
554
  def restart_space():
555
+ HfApi().restart_space(
556
+ repo_id="Antigma/quantize-my-repo", token=HF_TOKEN, factory_reboot=True
557
+ )
558
+
559
 
560
  scheduler = BackgroundScheduler()
561
  scheduler.add_job(restart_space, "interval", seconds=21600)
562
  scheduler.start()
563
 
564
+ demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)