import os
import subprocess
import signal
import tempfile
from pathlib import Path
from textwrap import dedent
import logging
import gradio as gr
from huggingface_hub import HfApi, ModelCard, whoami
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from apscheduler.schedulers.background import BackgroundScheduler
from datetime import datetime
from html import escape
import numpy as np
import shutil
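# Runtime configuration: HF_TOKEN is used only for the scheduled Space restart;
# Gradio usage analytics are disabled.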
HF_TOKEN = os.environ.get("HF_TOKEN")
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
CONVERSION_SCRIPT = "./llama.cpp/convert_hf_to_gguf.py"
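# Persistent storage layout: logs, downloaded checkpoints, and quantized outputs
# are kept under /data so they outlive individual requests.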
log_dir = "/data/logs"
downloads_dir = "/data/downloads"
outputs_dir = "/data/outputs"
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(
filename=os.path.join(log_dir, "app.log"),
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
def get_llama_cpp_notes(
gguf_files,
new_repo_url,
split_model,
model_id=None,
):
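    """Build the README body for the quantized repo.

    Produces a markdown section with the llama.cpp version used, the prompt format,
    a download table for every generated GGUF file, and huggingface-cli download
    instructions. gguf_files is a list of (filename, path, size_gb, quant_method)
    tuples.
    """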
try:
result = subprocess.run(
["git", "-C", "./llama.cpp", "describe", "--tags", "--always"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
text=True,
)
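        # "git describe" returns e.g. "b4944" or "b4944-12-gabc123"; keep only the release tag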
        version = result.stdout.strip().split("-")[0]
        # Build one markdown table row per produced GGUF file:
        # filename (or link), quant type, size in GB, and whether the model is split.
        table_rows = ""
        for name, _path, size_gb, method in gguf_files:
            entry = (
                name[: -len(".gguf")]
                if split_model
                else f"[{name}]({new_repo_url}/blob/main/{name})"
            )
            table_rows += f"| {entry} | {method} | {size_gb:.2f} GB | {split_model} |\n"
        text = f"""
*Produced by [Antigma Labs](https://antigma.ai)*
*Follow Antigma Labs on X: [https://x.com/antigma_labs](https://x.com/antigma_labs)*
*Antigma's GitHub Homepage [https://github.com/AntigmaLabs](https://github.com/AntigmaLabs)*
## llama.cpp quantization
Using llama.cpp release {version} for quantization.
Original model: https://huggingface.co/{model_id}
Run them directly with [llama.cpp](https://github.com/ggml-org/llama.cpp), or any other llama.cpp based project
## Prompt format
```
<|begin▁of▁sentence|>{{system_prompt}}<|User|>{{prompt}}<|Assistant|><|end▁of▁sentence|><|Assistant|>
```
## Download a file (not the whole branch) from below:
| Filename | Quant type | File Size | Split |
| -------- | ---------- | --------- | ----- |
{table_rows}
## Downloading using huggingface-cli
Click to view download instructions
First, make sure you have huggingface-cli installed:
```
pip install -U "huggingface_hub[cli]"
```
Then, you can target the specific file you want:
```
huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}" --local-dir ./
```
If the model is bigger than 50GB, it will have been split into multiple files. In order to download them all to a local folder, run:
```
huggingface-cli download {new_repo_url} --include "{gguf_files[0][0]}/*" --local-dir ./
```
You can either specify a new local-dir (e.g. deepseek-ai_DeepSeek-V3-0324-Q8_0) or download them all in place (./).
"""
        return text
    except Exception as e:
        # If anything above fails (e.g. the git version lookup), fall back to an empty card body
        logger.error(f"Failed to build model card notes: {e}")
        return ""
def toggle_repo_owner(export_to_org, oauth_token: gr.OAuthToken | None):
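    """Show or hide the organization selector and org-token field.

    Returns gr.update() values for the repo_owner dropdown and the org_token textbox.
    """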
if oauth_token is None or oauth_token.token is None:
raise gr.Error("You must be logged in to use GGUF-my-repo")
if not export_to_org:
return gr.update(visible=False, choices=["self"], value="self"), gr.update(
visible=False, value=""
)
info = whoami(oauth_token.token)
orgs = [org["name"] for org in info.get("orgs", [])]
return gr.update(visible=True, choices=["self"] + orgs, value="self"), gr.update(
visible=True
)
def generate_importance_matrix(model_path: str, train_data_path: str, output_path: str):
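    """Run llama-imatrix on the fp16 GGUF to produce an importance matrix.

    The resulting file is later passed to llama-quantize via --imatrix when an
    imatrix quant type is selected. train_data_path is the calibration text file.
    """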
imatrix_command = [
"./llama.cpp/llama-imatrix",
"-m",
model_path,
"-f",
train_data_path,
"-ngl",
"99",
"--output-frequency",
"10",
"-o",
output_path,
]
if not os.path.isfile(model_path):
raise Exception(f"Model file not found: {model_path}")
print("Running imatrix command...")
process = subprocess.Popen(imatrix_command, shell=False)
try:
process.wait(timeout=60) # added wait
except subprocess.TimeoutExpired:
print(
"Imatrix computation timed out. Sending SIGINT to allow graceful termination..."
)
process.send_signal(signal.SIGINT)
try:
process.wait(timeout=5) # grace period
except subprocess.TimeoutExpired:
print("Imatrix proc still didn't term. Forecfully terming process...")
process.kill()
print("Importance matrix generation completed.")
def split_upload_model(
model_path: str,
outdir: str,
repo_id: str,
oauth_token: gr.OAuthToken | None,
split_max_tensors=256,
split_max_size=None,
org_token=None,
export_to_org=False,
):
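    """Shard a GGUF file with llama-gguf-split and upload each shard to the Hub repo."""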
print(f"Model path: {model_path}")
print(f"Output dir: {outdir}")
if oauth_token is None or oauth_token.token is None:
raise ValueError("You have to be logged in.")
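    # llama-gguf-split shards either by tensor count or by maximum file size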
split_cmd = [
"./llama.cpp/llama-gguf-split",
"--split",
]
if split_max_size:
split_cmd.append("--split-max-size")
split_cmd.append(split_max_size)
else:
split_cmd.append("--split-max-tensors")
split_cmd.append(str(split_max_tensors))
# args for output
model_path_prefix = ".".join(
model_path.split(".")[:-1]
) # remove the file extension
split_cmd.append(model_path)
split_cmd.append(model_path_prefix)
print(f"Split command: {split_cmd}")
result = subprocess.run(split_cmd, shell=False, capture_output=True, text=True)
print(f"Split command stdout: {result.stdout}")
print(f"Split command stderr: {result.stderr}")
    if result.returncode != 0:
        raise Exception(f"Error splitting the model: {result.stderr}")
print("Model split successfully!")
# remove the original model file if needed
if os.path.exists(model_path):
os.remove(model_path)
model_file_prefix = model_path_prefix.split("/")[-1]
print(f"Model file name prefix: {model_file_prefix}")
sharded_model_files = [
f
for f in os.listdir(outdir)
if f.startswith(model_file_prefix) and f.endswith(".gguf")
]
if sharded_model_files:
print(f"Sharded model files: {sharded_model_files}")
if export_to_org and org_token != "":
api = HfApi(token=org_token)
else:
api = HfApi(token=oauth_token.token)
for file in sharded_model_files:
file_path = os.path.join(outdir, file)
print(f"Uploading file: {file_path}")
try:
api.upload_file(
path_or_fileobj=file_path,
path_in_repo=file,
repo_id=repo_id,
)
except Exception as e:
raise Exception(f"Error uploading file {file_path}: {e}")
else:
raise Exception("No sharded files found.")
print("Sharded model has been uploaded successfully!")
def process_model(
model_id,
q_method,
use_imatrix,
imatrix_q_method,
private_repo,
train_data_file,
split_model,
split_max_tensors,
split_max_size,
export_to_org,
repo_owner,
org_token,
oauth_token: gr.OAuthToken | None,
):
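    """Main Gradio callback.

    Downloads the Hub model, converts it to an fp16 GGUF, quantizes it with
    llama.cpp (optionally with an importance matrix), and uploads the results to a
    new "<model>-GGUF" repo. Returns (markdown_message, image_path) for the two
    output components.
    """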
if oauth_token is None or oauth_token.token is None:
raise gr.Error("You must be logged in to use GGUF-my-repo")
    try:
        user_info = whoami(oauth_token.token)
    except Exception:
        raise gr.Error("You must be logged in to use GGUF-my-repo")
    username = user_info["name"]
user_orgs = user_info.get("orgs", [])
if not export_to_org:
repo_owner = "self"
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logger.info(
        f"Time {current_time}, Username {username}, Model_ID {model_id}, q_method {','.join(q_method)}"
    )
repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
model_name = model_id.split("/")[-1]
try:
api_token = (
org_token if (export_to_org and org_token != "") else oauth_token.token
)
api = HfApi(token=api_token)
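        # Download the metadata plus whichever weight format the source repo actually ships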
dl_pattern = ["*.md", "*.json", "*.model"]
pattern = (
"*.safetensors"
if any(
f.path.endswith(".safetensors")
for f in api.list_repo_tree(repo_id=model_id, recursive=True)
)
else "*.bin"
)
dl_pattern += [pattern]
os.makedirs(downloads_dir, exist_ok=True)
os.makedirs(outputs_dir, exist_ok=True)
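        # Work in temporary directories under the persistent volume so intermediate
        # files are cleaned up after each run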
with tempfile.TemporaryDirectory(dir=outputs_dir) as outdir:
fp16 = str(Path(outdir) / f"{model_name}.fp16.gguf")
with tempfile.TemporaryDirectory(dir=downloads_dir) as tmpdir:
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Start download")
logger.info(
datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Start download"
)
local_dir = Path(tmpdir) / model_name
api.snapshot_download(
repo_id=model_id,
local_dir=local_dir,
local_dir_use_symlinks=False,
allow_patterns=dl_pattern,
)
config_dir = local_dir / "config.json"
adapter_config_dir = local_dir / "adapter_config.json"
if os.path.exists(adapter_config_dir) and not os.path.exists(
config_dir
):
raise Exception(
"adapter_config.json is present. If converting LoRA, use GGUF-my-lora."
)
                print(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    + " Downloaded successfully"
                )
                logger.info(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    + " Downloaded successfully"
                )
result = subprocess.run(
[
"python",
CONVERSION_SCRIPT,
local_dir,
"--outtype",
"f16",
"--outfile",
fp16,
],
shell=False,
capture_output=True,
)
                if result.returncode != 0:
                    raise Exception(
                        f"Error converting to fp16: {result.stderr.decode()}"
                    )
                print(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Converted to f16"
                )
                logger.info(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Converted to f16"
                )
shutil.rmtree(downloads_dir)
imatrix_path = Path(outdir) / "imatrix.dat"
if use_imatrix:
train_data_path = (
train_data_file.name
if train_data_file
else "llama.cpp/groups_merged.txt"
)
if not os.path.isfile(train_data_path):
raise Exception(f"Training data not found: {train_data_path}")
generate_importance_matrix(fp16, train_data_path, imatrix_path)
quant_methods = (
[imatrix_q_method]
if use_imatrix
else (q_method if isinstance(q_method, list) else [q_method])
)
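            # imatrix quants get an "-imat" suffix in their output filenames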
suffix = "imat" if use_imatrix else None
gguf_files = []
for method in quant_methods:
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Begin quantize")
logger.info(
datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " Begin quantize"
)
name = (
f"{model_name.lower()}-{method.lower()}-{suffix}.gguf"
if suffix
else f"{model_name.lower()}-{method.lower()}.gguf"
)
path = str(Path(outdir) / name)
quant_cmd = (
[
"./llama.cpp/llama-quantize",
"--imatrix",
imatrix_path,
fp16,
path,
method,
]
if use_imatrix
else ["./llama.cpp/llama-quantize", fp16, path, method]
)
result = subprocess.run(quant_cmd, shell=False, capture_output=True)
if result.returncode != 0:
raise Exception(
f"Quantization failed ({method}): {result.stderr.decode()}"
)
size = os.path.getsize(path) / 1024 / 1024 / 1024
gguf_files.append((name, path, size, method))
                print(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    + " Quantized successfully!"
                )
                logger.info(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    + " Quantized successfully!"
                )
suffix_for_repo = (
f"{imatrix_q_method}-imat" if use_imatrix else "-".join(quant_methods)
)
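            # Create (or reuse) the destination repo under the resolved namespace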
repo_id = f"{repo_namespace}/{model_name}-GGUF"
new_repo_url = api.create_repo(
repo_id=repo_id, exist_ok=True, private=private_repo
)
            try:
                card = ModelCard.load(model_id, token=oauth_token.token)
            except Exception:
                card = ModelCard("")
card.data.tags = (card.data.tags or []) + ["llama-cpp", "gguf-my-repo"]
card.data.base_model = model_id
card.text = dedent(
get_llama_cpp_notes(gguf_files, new_repo_url, split_model, model_id)
)
readme_path = Path(outdir) / "README.md"
card.save(readme_path)
for name, path, _, _ in gguf_files:
if split_model:
split_upload_model(
path,
outdir,
repo_id,
oauth_token,
split_max_tensors,
split_max_size,
org_token,
export_to_org,
)
else:
api.upload_file(
path_or_fileobj=path, path_in_repo=name, repo_id=repo_id
)
if use_imatrix and os.path.isfile(imatrix_path):
api.upload_file(
path_or_fileobj=imatrix_path,
path_in_repo="imatrix.dat",
repo_id=repo_id,
)
api.upload_file(
path_or_fileobj=readme_path, path_in_repo="README.md", repo_id=repo_id
)
            return (
                # Success message shown in the Markdown output; "llama.png" is assumed
                # to be bundled with the Space as the success image.
                f'Your repo is ready: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
                "llama.png",
            )
    except Exception as e:
        return (
            f'ERROR: {escape(str(e))}',
            "error.png",
        )


css = """/* Custom CSS to allow scrolling */
.gradio-container {overflow-y: auto;}
"""

model_id = HuggingfaceHubSearch(
    label="Hub Model ID",
    placeholder="Search for model id on Huggingface",
    search_type="model",
)

export_to_org = gr.Checkbox(
    label="Export to Organization Repository",
    value=False,
    info="If checked, you can select an organization to export to.",
)

repo_owner = gr.Dropdown(
    choices=["self"], value="self", label="Repository Owner", visible=False
)

org_token = gr.Textbox(label="Org Access Token", type="password", visible=False)

q_method = gr.Dropdown(
    [
        "Q2_K",
        "Q3_K_S",
        "Q3_K_M",
        "Q3_K_L",
        "Q4_0",
        "Q4_K_S",
        "Q4_K_M",
        "Q5_0",
        "Q5_K_S",
        "Q5_K_M",
        "Q6_K",
        "Q8_0",
    ],
    label="Quantization Method",
    info="GGML quantization type",
    value="Q4_K_M",
    filterable=False,
    visible=True,
    multiselect=True,
)

imatrix_q_method = gr.Dropdown(
    ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
    label="Imatrix Quantization Method",
    info="GGML imatrix quants type",
    value="IQ4_NL",
    filterable=False,
    visible=False,
)

use_imatrix = gr.Checkbox(
    value=False,
    label="Use Imatrix Quantization",
    info="Use importance matrix for quantization.",
)

private_repo = gr.Checkbox(
    value=False, label="Private Repo", info="Create a private repo under your username."
)

train_data_file = gr.File(label="Training Data File", file_types=["txt"], visible=False)

split_model = gr.Checkbox(
    value=False, label="Split Model", info="Shard the model using gguf-split."
)

split_max_tensors = gr.Number(
    value=256,
    label="Max Tensors per File",
    info="Maximum number of tensors per file when splitting model.",
    visible=False,
)

split_max_size = gr.Textbox(
    label="Max File Size",
    info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default. Accepted suffixes: M, G. Example: 256M, 5G",
    visible=False,
)

iface = gr.Interface(
    fn=process_model,
    inputs=[
        model_id,
        q_method,
        use_imatrix,
        imatrix_q_method,
        private_repo,
        train_data_file,
        split_model,
        split_max_tensors,
        split_max_size,
        export_to_org,
        repo_owner,
        org_token,
    ],
    outputs=[gr.Markdown(label="Output"), gr.Image(show_label=False)],
    title="Make your own GGUF Quants — faster than ever before, believe me.",
    description="We take your Hugging Face repo — a terrific repo — we quantize it, we package it beautifully, and we give you your very own repo. It's smart. It's efficient. It's huge. You're gonna love it.",
    api_name=False,
)

with gr.Blocks(css=".gradio-container {overflow-y: auto;}") as demo:
    gr.Markdown("Logged in, you must be. Classy, secure, and victorious, it keeps us.")
    gr.LoginButton(min_width=250)
    export_to_org.change(
        fn=toggle_repo_owner, inputs=[export_to_org], outputs=[repo_owner, org_token]
    )
    split_model.change(
        fn=lambda sm: (gr.update(visible=sm), gr.update(visible=sm)),
        inputs=split_model,
        outputs=[split_max_tensors, split_max_size],
    )
    use_imatrix.change(
        fn=lambda use: (
            gr.update(visible=not use),
            gr.update(visible=use),
            gr.update(visible=use),
        ),
        inputs=use_imatrix,
        outputs=[q_method, imatrix_q_method, train_data_file],
    )
    iface.render()


def restart_space():
    HfApi().restart_space(
        repo_id="Antigma/quantize-my-repo", token=HF_TOKEN, factory_reboot=True
    )


scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()

demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)