Futuresony committed
Commit 6d9c19c · verified · 1 Parent(s): 09c34c9

Update app.py

Files changed (1)
  1. app.py +48 -81
app.py CHANGED
@@ -1,22 +1,29 @@
+import os
+import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-import torch
+from peft import PeftModel, PeftConfig
 
-# Use a CPU-compatible base model (replace this with your actual full-precision model)
-base_model_id = "unsloth/gemma-2-9b"  # Replace with real CPU-compatible model
-lora_model_id = "import gradio as gr"
-from huggingface_hub import InferenceClient
-import os
+# Set the HF repo and LoRA model location
+base_model_id = "unsloth/gemma-2-9b"
+lora_model_id = "Futuresony/gemma2-9b-lora-alpaca"
 
-# 🔹 Hugging Face Credentials
-HF_REPO = "Futuresony/gemma2-9b-lora-alpaca"
-HF_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+# Load base model on CPU
+base_model = AutoModelForCausalLM.from_pretrained(
+    base_model_id,
+    device_map="cpu",
+    torch_dtype=torch.float32,
+)
 
-client = InferenceClient(HF_REPO, token=HF_TOKEN)
+# Load tokenizer from base model
+tokenizer = AutoTokenizer.from_pretrained(base_model_id)
 
+# Load LoRA adapter
+model = PeftModel.from_pretrained(base_model, lora_model_id)
+model.eval()
+
+# === Alpaca-style formatter ===
 def format_alpaca_prompt(user_input, system_prompt, history):
-    """Formats input in Alpaca/LLaMA style"""
     history_str = "\n".join([f"### Instruction:\n{h[0]}\n### Response:\n{h[1]}" for h in history])
     prompt = f"""{system_prompt}
 {history_str}
@@ -24,84 +31,44 @@ def format_alpaca_prompt(user_input, system_prompt, history):
 ### Instruction:
 {user_input}
 
-### Response:
-"""
+### Response:"""
     return prompt
 
+# === Chat logic ===
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    formatted_prompt = format_alpaca_prompt(message, system_message, history)
-
-    response = client.text_generation(
-        formatted_prompt,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-    )
-
-    # ✅ Extract only the response
-    cleaned_response = response.split("### Response:")[-1].strip()
-
-    history.append((message, cleaned_response))  # ✅ Update history with the new message and response
-
-    yield cleaned_response  # Output only the answer
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=250, value=128, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.9, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.99, step=0.01, label="Top-p (nucleus sampling)"),
-    ],
-)
-
-if __name__ == "__main__":
-    demo.launch()"
-
-# Load the base model on CPU
-base_model = AutoModelForCausalLM.from_pretrained(
-    base_model_id,
-    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
-    device_map="cpu"
-)
-
-# Load the PEFT LoRA model
-model = PeftModel.from_pretrained(base_model, lora_model_id)
-
-# Load tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-
-# Chat function
-def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    messages = [{"role": "system", "content": system_message}]
-    for user_msg, bot_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if bot_msg:
-            messages.append({"role": "assistant", "content": bot_msg})
-    messages.append({"role": "user", "content": message})
-
-    # Generate response (simulated loop for streaming output)
-    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cpu")
-    outputs = model.generate(
-        inputs,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        top_p=top_p,
-        do_sample=True,
-    )
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    yield response
-
-# Gradio UI
+    prompt = format_alpaca_prompt(message, system_message, history)
+    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            do_sample=True,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+
+    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Only return the part after "### Response:"
+    if "### Response:" in response_text:
+        final_output = response_text.split("### Response:")[-1].strip()
+    else:
+        final_output = response_text.strip()
+
+    history.append((message, final_output))
+    yield final_output
+
+# === Gradio Interface ===
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
         gr.Textbox(value="You are a friendly chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.01, label="Top-p"),
     ],
+    title="Offline Gemma-2B Alpaca Chatbot (LoRA)",
 )
 
 if __name__ == "__main__":
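
Note on the updated respond(): model.generate() runs to completion and the function yields the finished string once, so the Gradio chat does not actually stream token by token. A streaming variant is possible with transformers' TextIteratorStreamer. The sketch below is not part of this commit; it reuses model, tokenizer, and format_alpaca_prompt from the new app.py and only illustrates the pattern.

from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    # Same prompt construction as respond() in app.py
    prompt = format_alpaca_prompt(message, system_message, history)
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

    # The streamer yields decoded text chunks as generate() produces tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # Run generation in a background thread so the chunks can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial  # gr.ChatInterface re-renders the growing reply on each yield
    thread.join()

gr.ChatInterface accepts such a generator in place of respond, with the same additional_inputs.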
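
Note on serving: the app keeps the adapter wrapped in PeftModel, so every forward pass applies the LoRA deltas on top of the frozen base weights. For CPU inference it can be simpler to merge the adapter into the base model once and save a standalone checkpoint. A minimal sketch, assuming peft's merge_and_unload(); the output directory name is made up for illustration.

# Fold the LoRA deltas into the base weights (returns a plain transformers model)
merged_model = model.merge_and_unload()

# Save a self-contained checkpoint that no longer needs peft at load time
merged_model.save_pretrained("gemma2-9b-alpaca-merged")
tokenizer.save_pretrained("gemma2-9b-alpaca-merged")

# Later: reload it like any causal LM
# reloaded = AutoModelForCausalLM.from_pretrained("gemma2-9b-alpaca-merged", torch_dtype=torch.float32)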