import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Read the Hugging Face access token from the environment (e.g. a Space secret)
access_token = os.getenv("HF_TOKEN")

# Repository ID of the fine-tuned model
repo_id = "Mikhil-jivus/Llama-32-3B-FineTuned"

# Load the tokenizer and model from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(repo_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    token=access_token,
    torch_dtype=torch.bfloat16,  # halves memory vs float32; use float16/float32 if bfloat16 is unsupported
    device_map="auto",           # place layers on available GPU(s)/CPU automatically
)
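# Note: device_map="auto" requires the `accelerate` package to be installed.
# If this Space runs on CPU-only hardware, bfloat16 kernels may be slow or
# unavailable; a guarded dtype such as
#     torch.bfloat16 if torch.cuda.is_available() else torch.float32
# is a safer default (an assumption about the Space's hardware, not something
# the original code checks).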
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the conversation in chat-message format
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    # Serialize the conversation with the tokenizer's chat template (this
    # assumes the checkpoint ships one, as Llama 3.2 checkpoints do). The
    # system message is already the first entry of `messages`, so it must
    # not be prepended a second time as the original code did.
    input_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Tokenize the prompt. No padding is needed for a single sequence (the
    # Llama tokenizer has no pad token by default), and truncation guards
    # against prompts longer than the model's context window. The tensors
    # are moved to the model's device, which device_map="auto" chose.
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
    ).to(model.device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    # Generate a response; max_new_tokens bounds only the completion, matching
    # the "Max new tokens" slider (max_length would count the prompt as well)
    chat_history_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        attention_mask=attention_mask,
    )
    # Decode only the newly generated tokens (everything after the prompt)
    response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    yield response
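

# A possible streaming variant (a sketch, not wired into the demo below):
# transformers' TextIteratorStreamer yields decoded text while generation
# runs, and gr.ChatInterface re-renders each successive string a generator
# yields. It reuses the `tokenizer` and `model` defined above; the function
# name `respond_stream` and its exact wiring are illustrative assumptions,
# not part of the original app.
from threading import Thread
from transformers import TextIteratorStreamer


def respond_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg:
            messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so it runs in a worker thread while this generator
    # consumes the streamer and yields progressively longer responses
    Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        ),
    ).start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial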
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
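# Note: on Gradio 3.x, generator handlers such as `respond` stream to the UI
# only when queuing is enabled; uncomment the line below if responses appear
# all at once (queuing is enabled by default in Gradio 4.x):
# demo.queue()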
if __name__ == "__main__":
    demo.launch(share=True)