Futuresony committed
Commit 6d9c19c · verified · 1 Parent(s): 09c34c9

Update app.py

Files changed (1)
  1. app.py +48 -81
app.py CHANGED
@@ -1,22 +1,29 @@
+import os
+import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-import torch
+from peft import PeftModel, PeftConfig
 
-# Use a CPU-compatible base model (replace this with your actual full-precision model)
-base_model_id = "unsloth/gemma-2-9b"  # Replace with real CPU-compatible model
-lora_model_id = "import gradio as gr"
-from huggingface_hub import InferenceClient
-import os
+# Set the HF repo and LoRA model location
+base_model_id = "unsloth/gemma-2-9b"
+lora_model_id = "Futuresony/gemma2-9b-lora-alpaca"
 
-# 🔹 Hugging Face Credentials
-HF_REPO = "Futuresony/gemma2-9b-lora-alpaca"
-HF_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+# Load base model on CPU
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_id,
+    device_map="cpu",
+    torch_dtype=torch.float32,
+)
 
-client = InferenceClient(HF_REPO, token=HF_TOKEN)
+# Load tokenizer from base model
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
 
+# Load LoRA adapter
+model = PeftModel.from_pretrained(base_model, lora_model_id)
+model.eval()
+
+# === Alpaca-style formatter ===
 def format_alpaca_prompt(user_input, system_prompt, history):
-    """Formats input in Alpaca/LLaMA style"""
     history_str = "\n".join([f"### Instruction:\n{h[0]}\n### Response:\n{h[1]}" for h in history])
     prompt = f"""{system_prompt}
 {history_str}
@@ -24,84 +31,44 @@ def format_alpaca_prompt(user_input, system_prompt, history):
 ### Instruction:
 {user_input}
 
-### Response:
-"""
+### Response:"""
     return prompt
 
+# === Chat logic ===
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    formatted_prompt = format_alpaca_prompt(message, system_message, history)
-
-    response = client.text_generation(
-        formatted_prompt,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-    )
-
-    # ✅ Extract only the response
-    cleaned_response = response.split("### Response:")[-1].strip()
-
-    history.append((message, cleaned_response))  # ✅ Update history with the new message and response
-
-    yield cleaned_response  # Output only the answer
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=250, value=128, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.9, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.99, step=0.01, label="Top-p (nucleus sampling)"),
-    ],
-)
-
-if __name__ == "__main__":
-    demo.launch()"
-
-# Load the base model on CPU
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_id,
-    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-    device_map="cpu"
-)
-
-# Load the PEFT LoRA model
-model = PeftModel.from_pretrained(base_model, lora_model_id)
-
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-
-# Chat function
-def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    messages = [{"role": "system", "content": system_message}]
-    for user_msg, bot_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if bot_msg:
-            messages.append({"role": "assistant", "content": bot_msg})
-    messages.append({"role": "user", "content": message})
-
-    # Generate response (simulated loop for streaming output)
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cpu")
-    outputs = model.generate(
-        inputs,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        do_sample=True,
-    )
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    yield response
-
-# Gradio UI
+    prompt = format_alpaca_prompt(message, system_message, history)
+    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Only return the part after "### Response:"
+    if "### Response:" in response_text:
+        final_output = response_text.split("### Response:")[-1].strip()
+    else:
+        final_output = response_text.strip()
+
+    history.append((message, final_output))
+    yield final_output
+
+# === Gradio Interface ===
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         gr.Textbox(value="You are a friendly chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.01, label="Top-p"),
     ],
+    title="Offline Gemma-2B Alpaca Chatbot (LoRA)",
 )
 
 if __name__ == "__main__":
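
Note on the updated respond(): model.generate() runs to completion and the function yields the finished string once, so the Gradio chat does not actually stream token by token. A streaming variant is possible with transformers' TextIteratorStreamer. The sketch below is not part of this commit; it reuses model, tokenizer, and format_alpaca_prompt from the new app.py and only illustrates the pattern.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Same prompt construction as respond() in app.py
    prompt = format_alpaca_prompt(message, system_message, history)
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # The streamer yields decoded text chunks as generate() produces tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # Run generation in a background thread so the chunks can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # gr.ChatInterface re-renders the growing reply on each yield
    thread.join()

gr.ChatInterface accepts such a generator in place of respond, with the same additional_inputs.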
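
Note on serving: the app keeps the adapter wrapped in PeftModel, so every forward pass applies the LoRA deltas on top of the frozen base weights. For CPU inference it can be simpler to merge the adapter into the base model once and save a standalone checkpoint. A minimal sketch, assuming peft's merge_and_unload(); the output directory name is made up for illustration.

# Fold the LoRA deltas into the base weights (returns a plain transformers model)
merged_model = model.merge_and_unload()

# Save a self-contained checkpoint that no longer needs peft at load time
merged_model.save_pretrained("gemma2-9b-alpaca-merged")
tokenizer.save_pretrained("gemma2-9b-alpaca-merged")

# Later: reload it like any causal LM
# reloaded = AutoModelForCausalLM.from_pretrained("gemma2-9b-alpaca-merged", torch_dtype=torch.float32)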