cmcmaster committed on
Commit 0eb3167 · verified · 1 Parent(s): 58e2d13

Update app.py

Files changed (1)
  1. app.py +37 -21
app.py CHANGED
@@ -1,15 +1,20 @@
-import os, gradio as gr, spaces, torch
+import os
+import gradio as gr
+import spaces
+import torch
 from huggingface_hub import HfApi
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
 
-# stays on CPU until we enter the @spaces.GPU context
+# Held on CPU outside the GPU context
 zero = torch.tensor([0])
 
-# minimal mapping ↦ recipes; extend as needed
+# Mapping of friendly names → quantization recipes
 SCHEMES = {
-    "W4A16": lambda: [GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])],
+    "W4A16": lambda: [
+        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    ],
     "W8A8": lambda: [
         SmoothQuantModifier(smoothing_strength=0.8),
         GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
@@ -17,31 +22,34 @@ SCHEMES = {
 }
 
 def build_recipe(scheme: str):
-    try:
-        return SCHEMES[scheme]()
-    except KeyError as e:
-        raise ValueError(f"Unsupported quantization scheme: {scheme}") from e
+    if scheme not in SCHEMES:
+        raise ValueError(f"Unsupported quantization scheme: {scheme}")
+    return SCHEMES[scheme]()
+
+@spaces.GPU  # Dynamic GPU allocation
+def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
+    # Demonstrate that we're now on GPU
+    print("zero.device inside GPU context:", zero.to("cuda").device)
 
-@spaces.GPU  # dynamic GPU allocation on demand
-def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
-    print(f"zero.device before GPU context: {zero.device}")
     recipe = build_recipe(scheme)
     out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"
 
-    # run LLM-Compressor one-shot quantization
+    # One-shot quantization
     oneshot(
         model=model_id,
-        dataset="open_platypus",  # tiny calibration set shipped with the lib
+        dataset="open_platypus",  # tiny calibration set bundled with the lib
         recipe=recipe,
         output_dir=out_dir,
         max_seq_length=2048,
        num_calibration_samples=512,
     )
 
+    # Optional push to Hub
     if push_to_hub and dest_repo:
-        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
+        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         if not token:
-            return "❌ No HF token found (set $HF_TOKEN). Model saved locally."  # early exit
+            return "❌ No HF token provided or set via environment. Model saved locally."
+
         api = HfApi(token=token)
         api.create_repo(dest_repo, exist_ok=True)
         api.upload_folder(
@@ -53,19 +61,27 @@ def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
 
     return f"✅ Quantized and saved locally at {out_dir}"
 
+# ───────────────────────── Interface ─────────────────────────
 with gr.Blocks() as demo:
-    gr.Markdown("""# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**.""")
+    gr.Markdown(
+        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
+    )
+
+    with gr.Row():
+        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
+
+    with gr.Row():
+        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
+        token_in = gr.Textbox(label="HF Token (optional)", type="password")
 
-    model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-    scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
-    dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
     push_chk = gr.Checkbox(label="Push to Hub", value=True)
+    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")
 
-    run_btn = gr.Button("🚀 Quantize")
     run_btn.click(
         fn=quantize,
-        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk],
+        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
         outputs=status_out,
     )
87