import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the Hugging Face access token from the environment
access_token = os.getenv("HF_TOKEN")

# Repository ID of the fine-tuned model
repo_id = "Mikhil-jivus/Llama-32-3B-FineTuned"

# Load the tokenizer and model from the Hugging Face repository
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=access_token)

# Llama tokenizers ship without a pad token; reuse EOS so padding=True works
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    token=access_token,
    torch_dtype=torch.bfloat16,  # use torch.float16 if bf16 is unsupported
    device_map="auto",  # place weights on available GPU(s), spilling to CPU if needed
)
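
# A hedged alternative (not used here): on a memory-constrained GPU the same
# checkpoint could be loaded 4-bit quantized via bitsandbytes. Sketch only;
# assumes `pip install bitsandbytes` and an available CUDA device.
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       repo_id,
#       token=access_token,
#       quantization_config=BitsAndBytesConfig(
#           load_in_4bit=True,
#           bnb_4bit_compute_dtype=torch.bfloat16,
#       ),
#       device_map="auto",
#   )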

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    # Build the prompt with the model's chat template. `messages` already
    # contains the system turn, so it is not prepended a second time here.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,  # a no-op for a single sequence, but safe if batched later
        truncation=True,
        max_length=tokenizer.model_max_length,  # cap the prompt, not the reply budget
    ).to(model.device)  # move tensors to wherever device_map placed the model

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate a response. max_new_tokens counts only generated tokens, so a
    # long prompt cannot eat into the reply budget (max_length would).
    chat_history_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        attention_mask=attention_mask,
    )

    # Decode only the newly generated tokens, slicing off the echoed prompt
    response = tokenizer.decode(chat_history_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)

    yield response
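
# A sketch (not wired into the UI) of the same chat flow with token-by-token
# streaming via transformers' TextIteratorStreamer, so partial replies could be
# rendered as they are generated. The function name and wiring are hypothetical.
from threading import Thread
from transformers import TextIteratorStreamer

def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # generate() blocks, so it runs in a worker thread while the streamer
    # yields decoded text chunks on this one
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            streamer=streamer,
        ),
    ).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial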

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
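
# gr.ChatInterface also takes cosmetic options; a hypothetical example (title
# and description values are illustrative, not from the original app):
#   demo = gr.ChatInterface(respond, additional_inputs=[...],
#                           title="Llama 3.2 3B (fine-tuned)",
#                           description="Chat with the fine-tuned model.")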

if __name__ == "__main__":
    demo.launch(share=True)  # note: share=True is ignored when running on HF Spaces