Update app.py
app.py CHANGED
@@ -1,15 +1,20 @@
-import os
+import os
+import gradio as gr
+import spaces
+import torch
 from huggingface_hub import HfApi
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
 
-#
+# Held on CPU outside the GPU context
 zero = torch.tensor([0])
 
-#
+# Mapping of friendly names → quantization recipes
 SCHEMES = {
-    "W4A16": lambda: [
+    "W4A16": lambda: [
+        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
+    ],
     "W8A8": lambda: [
         SmoothQuantModifier(smoothing_strength=0.8),
         GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
@@ -17,31 +22,34 @@ SCHEMES = {
 }
 
 def build_recipe(scheme: str):
-
-
-
-
+    if scheme not in SCHEMES:
+        raise ValueError(f"Unsupported quantization scheme: {scheme}")
+    return SCHEMES[scheme]()
+
+@spaces.GPU  # Dynamic GPU allocation
+def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
+    # Demonstrate that we're now on GPU
+    print("zero.device inside GPU context:", zero.to("cuda").device)
 
-@spaces.GPU  # dynamic GPU allocation on demand
-def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
-    print(f"zero.device before GPU context: {zero.device}")
     recipe = build_recipe(scheme)
     out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"
 
-    #
+    # One-shot quantization
     oneshot(
         model=model_id,
-        dataset="open_platypus",  # tiny calibration set
+        dataset="open_platypus",  # tiny calibration set bundled with the lib
         recipe=recipe,
         output_dir=out_dir,
         max_seq_length=2048,
         num_calibration_samples=512,
     )
 
+    # Optional push to Hub
     if push_to_hub and dest_repo:
-        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
+        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
         if not token:
-            return "❌ No HF token
+            return "❌ No HF token provided or set via environment. Model saved locally."
+
         api = HfApi(token=token)
         api.create_repo(dest_repo, exist_ok=True)
         api.upload_folder(
@@ -53,19 +61,27 @@ def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool):
 
     return f"✅ Quantized and saved locally at {out_dir}"
 
+# ───────────────────────────── Interface ─────────────────────────────
 with gr.Blocks() as demo:
-    gr.Markdown(
+    gr.Markdown(
+        """# ⚡️ Zero-GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm-compressor**."""
+    )
 
-    model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-    scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
-    dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
+    with gr.Row():
+        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")
+
+    with gr.Row():
+        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
+        token_in = gr.Textbox(label="HF Token (optional)", type="password")
+
     push_chk = gr.Checkbox(label="Push to Hub", value=True)
+    run_btn = gr.Button("🚀 Quantize")
     status_out = gr.Textbox(label="Status log")
 
-    run_btn = gr.Button("🚀 Quantize")
     run_btn.click(
         fn=quantize,
-        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk],
+        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
         outputs=status_out,
     )
 
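For reference, a minimal standalone sketch of the one-shot W4A16 path that the updated quantize handler runs, without the Gradio/Spaces wrapper. It assumes llmcompressor is installed and a GPU is available; the model ID, output directory, and calibration settings mirror the app's defaults but are illustrative, not prescriptive.

# Standalone sketch of the W4A16 one-shot quantization the Space performs.
# Assumes llmcompressor is installed and a CUDA GPU is available.
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

recipe = [GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])]

oneshot(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # default shown in the UI textbox
    dataset="open_platypus",                     # small calibration dataset
    recipe=recipe,
    output_dir="/tmp/tinyllama-w4a16",           # illustrative output path
    max_seq_length=2048,
    num_calibration_samples=512,
)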