import os

import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

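# Sentinel tensor, created on CPU at import time. Under ZeroGPU, CUDA is only
# available inside functions decorated with @spaces.GPU (see quantize below).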
zero = torch.tensor([0])

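# Each scheme maps to a list of llm-compressor modifiers:
#   W4A16 - GPTQ with 4-bit weights and 16-bit activations.
#   W8A8  - SmoothQuant (migrates activation outliers into the weights),
#           followed by GPTQ with 8-bit weights and 8-bit activations.
# lm_head is excluded from quantization, keeping the output projection at
# full precision (common practice to preserve generation quality).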
SCHEMES = {
    "W4A16": lambda: [
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
    ],
    "W8A8": lambda: [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ],
}


def build_recipe(scheme: str):
    if scheme not in SCHEMES:
        raise ValueError(f"Unsupported quantization scheme: {scheme}")
    return SCHEMES[scheme]()


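# @spaces.GPU allocates ZeroGPU hardware only for the duration of each call.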
@spaces.GPU
def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
    # Sanity check: the sentinel tensor should now be movable to CUDA.
    print("zero.device inside GPU context:", zero.to("cuda").device)

    recipe = build_recipe(scheme)
    out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"

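    # One-shot post-training quantization: loads the model, runs the
    # calibration samples through it, and applies the recipe, no finetuning.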
    oneshot(
        model=model_id,
        dataset="open_platypus",
        recipe=recipe,
        output_dir=out_dir,
        max_seq_length=2048,
        num_calibration_samples=512,
    )

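    # Optionally publish the quantized checkpoint to the Hugging Face Hub.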
    if push_to_hub and dest_repo:
        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
        if not token:
            return "❌ No HF token provided or set via environment. Model saved locally."

        api = HfApi(token=token)
        api.create_repo(dest_repo, exist_ok=True)
        api.upload_folder(
            repo_id=dest_repo,
            folder_path=out_dir,
            commit_message=f"Add {model_id} quantized with {scheme}",
        )
        return f"✅ Quantized and pushed to https://huggingface.co/{dest_repo}"

    return f"✅ Quantized and saved locally at {out_dir}"


with gr.Blocks() as demo:
    gr.Markdown(
        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
    )

    with gr.Row():
        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")

    with gr.Row():
        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
        token_in = gr.Textbox(label="HF Token (optional)", type="password")

    push_chk = gr.Checkbox(label="Push to Hub", value=True)
    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")

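    # Note: the inputs list must stay in the same order as the quantize()
    # parameters: (model_id, scheme, dest_repo, push_to_hub, token).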
    run_btn.click(
        fn=quantize,
        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
        outputs=status_out,
    )

if __name__ == "__main__": |
|
demo.queue().launch() |
|
|