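"""ZeroGPU quantizer Space: runs one-shot quantization of a Hugging Face model
with llm-compressor inside a dynamically allocated GPU context and optionally
pushes the quantized weights to the Hub."""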
import os
import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
# Held on CPU outside the GPU context
zero = torch.tensor([0])
# Mapping of friendly names -> quantization recipes
SCHEMES = {
    # 4-bit weights, 16-bit activations (GPTQ only)
    "W4A16": lambda: [
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
    ],
    # 8-bit weights and activations (SmoothQuant smooths activation outliers before GPTQ)
    "W8A8": lambda: [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ],
}


def build_recipe(scheme: str):
    if scheme not in SCHEMES:
        raise ValueError(f"Unsupported quantization scheme: {scheme}")
    return SCHEMES[scheme]()


@spaces.GPU  # Dynamic GPU allocation
def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
    """Quantize `model_id` with the selected scheme; optionally push the result to `dest_repo`."""
    # Demonstrate that we're now on GPU
    print("zero.device inside GPU context:", zero.to("cuda").device)

    recipe = build_recipe(scheme)
    out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"

    # One-shot quantization
    oneshot(
        model=model_id,
        dataset="open_platypus",  # tiny calibration set bundled with the lib
        recipe=recipe,
        output_dir=out_dir,
        max_seq_length=2048,
        num_calibration_samples=512,
    )

    # Optional push to Hub
    if push_to_hub and dest_repo:
        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
        if not token:
            return "❌ No HF token provided or set via environment. Model saved locally."
        api = HfApi(token=token)
        api.create_repo(dest_repo, exist_ok=True)
        api.upload_folder(
            repo_id=dest_repo,
            folder_path=out_dir,
            commit_message=f"Add {model_id} quantized with {scheme}",
        )
        return f"✅ Quantized and pushed to https://huggingface.co/{dest_repo}"

    return f"✅ Quantized and saved locally at {out_dir}"


# ──────────────────────────────── Interface ────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown(
        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
    )
    with gr.Row():
        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
    with gr.Row():
        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
        token_in = gr.Textbox(label="HF Token (optional)", type="password")
    push_chk = gr.Checkbox(label="Push to Hub", value=True)
    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")

    run_btn.click(
        fn=quantize,
        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
        outputs=status_out,
    )


if __name__ == "__main__":
    demo.queue().launch()