import os

import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Held on CPU outside the GPU context
zero = torch.tensor([0])

# Mapping of friendly names → quantization recipes
SCHEMES = {
    "W4A16": lambda: [
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
    ],
    "W8A8": lambda: [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ],
}


def build_recipe(scheme: str):
    """Return the llm-compressor recipe for a given scheme name."""
    if scheme not in SCHEMES:
        raise ValueError(f"Unsupported quantization scheme: {scheme}")
    return SCHEMES[scheme]()


@spaces.GPU  # Dynamic GPU allocation
def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
    """Quantize `model_id` with the chosen scheme; optionally push the result to the Hub."""
    # Demonstrate that we're now on GPU
    print("zero.device inside GPU context:", zero.to("cuda").device)

    recipe = build_recipe(scheme)
    out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"

    # One-shot quantization
    oneshot(
        model=model_id,
        dataset="open_platypus",  # tiny calibration set bundled with the lib
        recipe=recipe,
        output_dir=out_dir,
        max_seq_length=2048,
        num_calibration_samples=512,
    )

    # Optional push to Hub
    if push_to_hub and dest_repo:
        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
        if not token:
            return "❌ No HF token provided or set via environment. Model saved locally."
        api = HfApi(token=token)
        api.create_repo(dest_repo, exist_ok=True)
        api.upload_folder(
            repo_id=dest_repo,
            folder_path=out_dir,
            commit_message=f"Add {model_id} quantized with {scheme}",
        )
        return f"✅ Quantized and pushed to https://huggingface.co/{dest_repo}"

    return f"✅ Quantized and saved locally at {out_dir}"


# ───────────────────────────────────────── Interface ─────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown(
        """# ⚡️ Zero-GPU Quantizer
Select a model, choose a scheme, and quantize on demand with **llm-compressor**."""
    )
    with gr.Row():
        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
    with gr.Row():
        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
        token_in = gr.Textbox(label="HF Token (optional)", type="password")
    push_chk = gr.Checkbox(label="Push to Hub", value=True)
    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")

    run_btn.click(
        fn=quantize,
        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
        outputs=status_out,
    )

if __name__ == "__main__":
    demo.queue().launch()
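
# ──────────────────────────────────────── Usage note ─────────────────────────────────────────
# A minimal sketch of consuming the output: llm-compressor writes compressed-tensors
# checkpoints that vLLM can load directly. The repo id below is a placeholder
# (assumption: it is whatever you entered as the destination repo above), and
# vLLM must be installed separately.
#
#     from vllm import LLM, SamplingParams
#
#     llm = LLM(model="your-user/your-quantized-repo")  # hypothetical repo id
#     out = llm.generate(["Hello!"], SamplingParams(max_tokens=32))
#     print(out[0].outputs[0].text)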