import os

import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

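# Sentinel tensor, created on CPU at import time. Under ZeroGPU, CUDA is only
# available inside functions decorated with @spaces.GPU (see quantize below).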
zero = torch.tensor([0])

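# Each scheme maps to a list of llm-compressor modifiers:
#   W4A16 - GPTQ with 4-bit weights and 16-bit activations.
#   W8A8  - SmoothQuant (migrates activation outliers into the weights),
#           followed by GPTQ with 8-bit weights and 8-bit activations.
# lm_head is excluded from quantization, keeping the output projection at
# full precision (common practice to preserve generation quality).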
SCHEMES = {
    "W4A16": lambda: [
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    ],
    "W8A8": lambda: [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ],
}


def build_recipe(scheme: str):
    if scheme not in SCHEMES:
        raise ValueError(f"Unsupported quantization scheme: {scheme}")
    return SCHEMES[scheme]()


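# @spaces.GPU allocates ZeroGPU hardware only for the duration of each call.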
@spaces.GPU
def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
    # Sanity check: the sentinel tensor should now be movable to CUDA.
    print("zero.device inside GPU context:", zero.to("cuda").device)

    recipe = build_recipe(scheme)
    out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"

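    # One-shot post-training quantization: loads the model, runs the
    # calibration samples through it, and applies the recipe, no finetuning.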
    oneshot(
        model=model_id,
        dataset="open_platypus",
        recipe=recipe,
        output_dir=out_dir,
        max_seq_length=2048,
        num_calibration_samples=512,
    )

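    # Optionally publish the quantized checkpoint to the Hugging Face Hub.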
    if push_to_hub and dest_repo:
        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
        if not token:
            return "❌ No HF token provided or set via environment. Model saved locally."

        api = HfApi(token=token)
        api.create_repo(dest_repo, exist_ok=True)
        api.upload_folder(
            repo_id=dest_repo,
            folder_path=out_dir,
            commit_message=f"Add {model_id} quantized with {scheme}",
        )
        return f"✅ Quantized and pushed to https://huggingface.co/{dest_repo}"

    return f"✅ Quantized and saved locally at {out_dir}"


with gr.Blocks() as demo:
    gr.Markdown(
        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
    )

    with gr.Row():
        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")

    with gr.Row():
        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
        token_in = gr.Textbox(label="HF Token (optional)", type="password")

    push_chk = gr.Checkbox(label="Push to Hub", value=True)
    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")

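    # Note: the inputs list must stay in the same order as the quantize()
    # parameters: (model_id, scheme, dest_repo, push_to_hub, token).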
    run_btn.click(
        fn=quantize,
        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
        outputs=status_out,
    )

if __name__ == "__main__": |
|
demo.queue().launch() |
|
|