markredito committed
Commit d886ca7 · verified · 1 Parent(s): 6af65ad

Update app.py


Added 4-bit quantization code
Added examples

Files changed (1)
app.py  +67 -38
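For orientation before reading the diff: the 4-bit quantization mentioned in the commit message amounts to building a BitsAndBytesConfig and passing it to AutoModelForCausalLM.from_pretrained before attaching the LoRA adapter. Below is a minimal sketch, reusing the model IDs and config values that appear in the diff; it assumes a CUDA-capable GPU, the bitsandbytes and accelerate packages, and an HF_TOKEN Space secret.

```python
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

HF_TOKEN = os.environ.get("HF_TOKEN")
BASE_MODEL = "google/gemma-3-1b-it"
LORA_ADAPTER = "markredito/gemma-pip-finetuned-v2"

# NF4 4-bit weights with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",              # place the quantized weights on the GPU
    quantization_config=bnb_config,
    token=HF_TOKEN,
)
model = PeftModel.from_pretrained(model, LORA_ADAPTER, token=HF_TOKEN)  # LoRA on top of the 4-bit base
```

Everything else in the diff is the new Gradio Blocks UI (example prompts plus temperature/top-p/top-k sliders) wired to generate_response.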
app.py CHANGED
@@ -1,73 +1,102 @@
  import os
  import torch
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM
  from peft import PeftModel

- # Read your Hugging Face token from Space Secrets
  HF_TOKEN = os.environ.get("HF_TOKEN")

- # Hugging Face model identifiers
  BASE_MODEL = "google/gemma-3-1b-it"
- LORA_ADAPTER = "markredito/gemma-pip-finetuned-v2"  # 🔁

- # Load base model with token (required for gated models)
- tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
-
- # Detect if GPU is available
  device = "cuda" if torch.cuda.is_available() else "cpu"
- dtype = torch.bfloat16 if device == "cuda" else torch.float32

  model = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL,
-     device_map="auto" if device == "cuda" else None,
-     torch_dtype=dtype,
-     token=HF_TOKEN
  )

- model = PeftModel.from_pretrained(
-     model,
-     LORA_ADAPTER,
-     token=HF_TOKEN
- )

  # Pad token fallback
  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token
      tokenizer.padding_side = "right"

- def generate_response(user_input):
-     prompt = (
          "<start_of_turn>user\n"
-         f"{user_input.strip()}\n"
          "<end_of_turn>\n"
          "<start_of_turn>model\n"
      )
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
      outputs = model.generate(
          **inputs,
          max_new_tokens=200,
          do_sample=True,
-         temperature=0.7,
-         top_p=0.9,
-         top_k=50,
          pad_token_id=tokenizer.pad_token_id,
          eos_token_id=tokenizer.eos_token_id,
      )

-     # Decode and clean output
-     response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-     response = response.split("<end_of_turn>")[0].replace("model\n", "").strip()
-
-     return response

  # Gradio UI
- gr.Interface(
-     fn=generate_response,
-     inputs=gr.Textbox(label="Enter your prompt", placeholder="E.g. Describe a universe made of sound..."),
-     outputs=gr.Textbox(label="Model's response"),
-     title="Gemma LoRA: Abstract Thought Generator",
-     description="LoRA fine-tuned `gemma-3-1b-it` on poetic/philosophical prompts. Run your own abstract experiments.",
-     theme="soft"
- ).launch()

  import os
  import torch
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  from peft import PeftModel

+ # Hugging Face Token from Space Secrets
  HF_TOKEN = os.environ.get("HF_TOKEN")

+ # Model IDs
  BASE_MODEL = "google/gemma-3-1b-it"
+ LORA_ADAPTER = "markredito/gemma-pip-finetuned-v2"  # 🔁 Replace with your actual LoRA repo

+ # Check device
  device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Quantization config for 4-bit (recommended on T4 GPU)
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16,
+     bnb_4bit_use_double_quant=True,
+ )
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)

  model = AutoModelForCausalLM.from_pretrained(
      BASE_MODEL,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+     quantization_config=bnb_config,
+     token=HF_TOKEN,
+     attn_implementation="eager"  # Required for Gemma3 + quant
  )

+ model = PeftModel.from_pretrained(model, LORA_ADAPTER, token=HF_TOKEN)

  # Pad token fallback
  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token
      tokenizer.padding_side = "right"

+ # Generation function
+ def generate_response(prompt, temperature, top_p, top_k):
+     formatted = (
          "<start_of_turn>user\n"
+         f"{prompt.strip()}\n"
          "<end_of_turn>\n"
          "<start_of_turn>model\n"
      )
+
+     inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
+
      outputs = model.generate(
          **inputs,
          max_new_tokens=200,
          do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k,
          pad_token_id=tokenizer.pad_token_id,
          eos_token_id=tokenizer.eos_token_id,
      )

+     decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+     cleaned = decoded.split("<end_of_turn>")[0].replace("model\n", "").strip()
+     return cleaned

  # Gradio UI
+ with gr.Blocks() as demo:
+     gr.Markdown("## ✨ Gemma LoRA Inference Demo")
+     gr.Markdown("Use your imagination or try one of the examples below to explore poetic and philosophical responses.")
+
+     examples = [
+         "Describe a world where clouds are solid and people walk on them",
+         "Contrast quantum realities phenomena from the perspective of a starship navigator, using a spiral into infinity.",
+         "Dream up futuristic phenomena from the perspective of a timeless oracle, using a fractal blooming in chaos.",
+     ]
+
+     with gr.Row():
+         with gr.Column():
+             prompt_input = gr.Textbox(label="Enter your prompt", lines=4, placeholder="Try something like: What if gravity took a day off?")
+
+             gr.Examples(
+                 examples=examples,
+                 inputs=prompt_input,
+                 label="Example Prompts"
+             )
+
+             temperature = gr.Slider(0.1, 1.5, value=0.7, label="Temperature")
+             top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p (nucleus sampling)")
+             top_k = gr.Slider(0, 100, step=1, value=50, label="Top-k")
+
+             submit = gr.Button("Generate")
+
+         with gr.Column():
+             output = gr.Textbox(label="Model Response", lines=10)
+
+     submit.click(fn=generate_response, inputs=[prompt_input, temperature, top_p, top_k], outputs=output)
+
+ demo.launch()
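A quick way to sanity-check the updated generate_response signature outside the UI is to call it directly with the slider defaults. A minimal sketch, assuming the model-loading code above has already run (for example, paste it just above demo.launch(), or run it in a notebook after the setup cells); note that the 4-bit path generally requires a CUDA GPU with bitsandbytes installed:

```python
# Smoke test: call the generation function directly with the UI's default
# sampling settings (temperature=0.7, top_p=0.9, top_k=50).
sample = generate_response(
    "Describe a world where clouds are solid and people walk on them",
    temperature=0.7,
    top_p=0.9,
    top_k=50,
)
print(sample)
```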