import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Use a CPU-compatible base model (replace this with your actual full-precision model)
base_model_id = "unsloth/gemma-2-9b"  # Replace with real CPU-compatible model
lora_model_id = "Futuresony/gemma2-9b-lora-alpaca"  # LoRA adapter repo (same repo the Inference API variant below points at)

# --- Alternative: remote generation via the Hugging Face Inference API ----------------
# This variant sends the Alpaca-formatted prompt to the hosted model instead of loading
# any weights locally. It is defined as respond_api / demo_api so it does not clash with
# the local PEFT chatbot further down, and it is not launched by default.
from huggingface_hub import InferenceClient
import os

# 🔹 Hugging Face Credentials
HF_REPO = "Futuresony/gemma2-9b-lora-alpaca"
HF_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
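# The token is read from the environment; set it before launching, e.g. (placeholder value):
#   export HUGGINGFACEHUB_API_TOKEN=hf_xxxxxxxxxxxx
# On a Hugging Face Space, add it as a repository secret with the same name instead.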

client = InferenceClient(HF_REPO, token=HF_TOKEN)

def format_alpaca_prompt(user_input, system_prompt, history):
    """Formats input in Alpaca/LLaMA style"""
    history_str = "\n".join([f"### Instruction:\n{h[0]}\n### Response:\n{h[1]}" for h in history])
    prompt = f"""{system_prompt}
{history_str}

### Instruction:
{user_input}

### Response:
"""
    return prompt
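
# Illustrative prompt produced by format_alpaca_prompt for one prior turn
# (example strings only; the layout is what matters):
#
#   You are a friendly Chatbot.
#   ### Instruction:
#   Hi there
#   ### Response:
#   Hello! How can I help?
#
#   ### Instruction:
#   What is LoRA?
#
#   ### Response: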

def respond_api(message, history, system_message, max_tokens, temperature, top_p):
    formatted_prompt = format_alpaca_prompt(message, system_message, history)

    response = client.text_generation(
        formatted_prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )

    # ✅ Extract only the response
    cleaned_response = response.split("### Response:")[-1].strip()
    
    history.append((message, cleaned_response))  # Track history locally (ChatInterface keeps its own copy for the UI)
    
    yield cleaned_response  # ✅ Output only the answer

demo_api = gr.ChatInterface(
    respond_api,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=250, value=128, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.9, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.99, step=0.01, label="Top-p (nucleus sampling)"),
    ],
)

# The remote variant is not launched here; to use it instead of the local PEFT chatbot,
# swap demo_api in at the launch call at the bottom of this file.
# if __name__ == "__main__":
#     demo_api.launch()
# ---------------------------------------------------------------------------------------

# Load the base model on CPU
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float32,  # Use float32 for CPU compatibility
    device_map="cpu"
)

# Load the PEFT LoRA model
model = PeftModel.from_pretrained(base_model, lora_model_id)
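
# Optional (not part of the original app): merging the LoRA weights into the base model
# removes the adapter indirection and can make CPU inference a bit faster, at the cost
# of extra memory while merging. Uncomment to try it.
# model = model.merge_and_unload()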

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Chat function
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Generate the full response in one pass (single yield; no token-level streaming).
    # Note: some chat templates (Gemma's default among them) do not accept a "system" role;
    # if this tokenizer's template is one of those, fold the system message into the first user turn.
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cpu")
    outputs = model.generate(
        inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back to the user
    response = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    yield response

# Gradio UI
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()
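
# Illustrative client call once the app is running (assumes Gradio's default local URL and
# the auto-generated "/chat" endpoint that ChatInterface exposes; check client.view_api()
# for the exact signature and adjust the values as needed):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   client.view_api()  # prints the generated endpoint signature
#   print(client.predict(
#       "Hello!",                        # message
#       "You are a friendly chatbot.",   # system message
#       512, 0.7, 0.95,                  # max new tokens, temperature, top-p
#       api_name="/chat",
#   ))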