Update app.py
app.py CHANGED
@@ -1,15 +1,20 @@
-import os
+import os
+import gradio as gr
+import spaces
+import torch
 from huggingface_hub import HfApi
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
 
-#
+# Held on CPU outside the GPU context
 zero = torch.tensor([0])
 
-#
+# Mapping of friendly names → quantization recipes
 SCHEMES = {
-    "W4A16": lambda: [
+    "W4A16": lambda: [
+        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    ],
     "W8A8": lambda: [
         SmoothQuantModifier(smoothing_strength=0.8),
         GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
@@ -17,31 +22,34 @@ SCHEMES = {
 }
 
 def build_recipe(scheme: str):
-
-
-
-
+    if scheme not in SCHEMES:
+        raise ValueError(f"Unsupported quantization scheme: {scheme}")
+    return SCHEMES[scheme]()
+
+@spaces.GPU  # Dynamic GPU allocation
+def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
+    # Demonstrate that we're now on GPU
+    print("zero.device inside GPU context:", zero.to("cuda").device)
 
-@spaces.GPU  # dynamic GPU allocation on demand
-def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
-    print(f"zero.device before GPU context: {zero.device}")
     recipe = build_recipe(scheme)
     out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"
 
-    #
+    # One-shot quantization
     oneshot(
         model=model_id,
-        dataset="open_platypus",  # tiny calibration set
+        dataset="open_platypus",  # tiny calibration set bundled with the lib
         recipe=recipe,
         output_dir=out_dir,
         max_seq_length=2048,
         num_calibration_samples=512,
     )
 
+    # Optional push to Hub
     if push_to_hub and dest_repo:
-        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
+        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         if not token:
-            return "❌ No HF token
+            return "❌ No HF token provided or set via environment. Model saved locally."
+
         api = HfApi(token=token)
         api.create_repo(dest_repo, exist_ok=True)
         api.upload_folder(
@@ -53,19 +61,27 @@ def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
 
     return f"✅ Quantized and saved locally at {out_dir}"
 
+# ───────────────────────────── Interface ─────────────────────────────
 with gr.Blocks() as demo:
-    gr.Markdown(
+    gr.Markdown(
+        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
+    )
 
-    model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-    scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
-    dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
+    with gr.Row():
+        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
+
+    with gr.Row():
+        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
+        token_in = gr.Textbox(label="HF Token (optional)", type="password")
+
     push_chk = gr.Checkbox(label="Push to Hub", value=True)
+    run_btn = gr.Button("🚀 Quantize")
     status_out = gr.Textbox(label="Status log")
 
-    run_btn = gr.Button("🚀 Quantize")
     run_btn.click(
         fn=quantize,
-        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk],
+        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
         outputs=status_out,
     )
 
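For reference, a minimal standalone sketch of the one-shot W4A16 path that the updated quantize handler runs, without the Gradio/Spaces wrapper. It assumes llmcompressor is installed and a GPU is available; the model ID, output directory, and calibration settings mirror the app's defaults but are illustrative, not prescriptive.

# Standalone sketch of the W4A16 one-shot quantization the Space performs.
# Assumes llmcompressor is installed and a CUDA GPU is available.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])]

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # default shown in the UI textbox
    dataset="open_platypus",                     # small calibration dataset
    recipe=recipe,
    output_dir="/tmp/tinyllama-w4a16",           # illustrative output path
    max_seq_length=2048,
    num_calibration_samples=512,
)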