import os
import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

# Held on CPU outside the GPU context
zero = torch.tensor([0])

# Mapping of friendly names → quantization recipes
SCHEMES = {
    "W4A16": lambda: [
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
    ],
    "W8A8": lambda: [
        SmoothQuantModifier(smoothing_strength=0.8),
        GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
    ],
}

def build_recipe(scheme: str):
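    """Return the list of llm-compressor modifiers for the selected scheme."""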
    if scheme not in SCHEMES:
        raise ValueError(f"Unsupported quantization scheme: {scheme}")
    return SCHEMES[scheme]()

@spaces.GPU  # Dynamic GPU allocation
def quantize(model_id: str, scheme: str, dest_repo: str, push_to_hub: bool, token: str):
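    """Run one-shot quantization on `model_id` and optionally push the result to the Hub."""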
    # Demonstrate that we're now on GPU
    print("zero.device inside GPU context:", zero.to("cuda").device)

    recipe = build_recipe(scheme)
    out_dir = f"/tmp/{(dest_repo or 'quantized-model').split('/')[-1]}"

    # One-shot quantization
    oneshot(
        model=model_id,
        dataset="open_platypus",  # tiny calibration set bundled with the lib
        recipe=recipe,
        output_dir=out_dir,
        max_seq_length=2048,
        num_calibration_samples=512,
    )

    # Optional push to Hub
    if push_to_hub and dest_repo:
        token = token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
        if not token:
            return "❌ No HF token provided or set via environment. Model saved locally."

        api = HfApi(token=token)
        api.create_repo(dest_repo, exist_ok=True)
        api.upload_folder(
            repo_id=dest_repo,
            folder_path=out_dir,
            commit_message=f"Add {model_id} quantized with {scheme}",
        )
        return f"βœ… Quantized and pushed to https://huggingface.co/{dest_repo}"

    return f"βœ… Quantized and saved locally at {out_dir}"

# ───────────────────────────── Interface ─────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown(
        """# ⚑️ Zero‑GPU Quantizer\nSelect a model, choose a scheme, and quantize on demand with **llm‑compressor**."""
    )

    with gr.Row():
        model_id_in = gr.Textbox(label="Model ID", value="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
        scheme_in = gr.Dropdown(list(SCHEMES.keys()), label="Quantization Scheme", value="W4A16")

    with gr.Row():
        dest_repo_in = gr.Textbox(label="Destination HF repo (user/repo)")
        token_in = gr.Textbox(label="HF Token (optional)", type="password")

    push_chk = gr.Checkbox(label="Push to Hub", value=True)
    run_btn = gr.Button("🚀 Quantize")
    status_out = gr.Textbox(label="Status log")

    run_btn.click(
        fn=quantize,
        inputs=[model_id_in, scheme_in, dest_repo_in, push_chk, token_in],
        outputs=status_out,
    )

if __name__ == "__main__":
    demo.queue().launch()