import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, LlamaConfig
from peft import PeftModel # For loading adapter files
# Path to the base model and adapter
BASE_MODEL_PATH = "unsloth/Llama-3.2-3B-Instruct" # Replace with your base model path
ADAPTER_PATH = "Futuresony/future_ai_12_10_2024.gguf/adapter" # Your Hugging Face repo
# Function to clean rope_scaling in the model config so it only keeps the fields the loader expects
def clean_rope_scaling(config):
    if config.get("rope_scaling"):  # guard against a missing or None rope_scaling entry
        valid_rope_scaling = {"type": "linear", "factor": config["rope_scaling"].get("factor", 1.0)}
        config["rope_scaling"] = valid_rope_scaling
    return config
# Load base model and tokenizer
print("Loading base model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
# Load the model config, clean rope_scaling, and rebuild a LlamaConfig from the cleaned dict
# (from_pretrained expects a PretrainedConfig object, not a plain dict)
config = LlamaConfig.from_pretrained(BASE_MODEL_PATH)
clean_config = LlamaConfig.from_dict(clean_rope_scaling(config.to_dict()))
# Load model with the cleaned config
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    config=clean_config,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Load adapter using PEFT
print("Loading adapter...")
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
# Set model to evaluation mode
model.eval()
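# Optional (sketch, assumption): if the adapter in ADAPTER_PATH is a standard LoRA adapter,
# its weights could be merged into the base model so inference runs without the PEFT wrapper.
# merge_and_unload() is PEFT's API for LoRA-style adapters; whether it applies here depends
# on how that adapter was trained.
#   model = model.merge_and_unload()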
# Function to generate responses
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    # Prepare input
    input_text = "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
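    # Alternative (sketch, assumption): Llama-3-style instruct models are usually prompted
    # through the tokenizer's chat template rather than plain "role: content" lines, e.g.:
    #   input_ids = tokenizer.apply_chat_template(
    #       messages, add_generation_prompt=True, return_tensors="pt"
    #   ).to(model.device)
    # This is a standard transformers API; switching to it would also change how the reply
    # is split out of the decoded text below.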
    # Generate response
    generation_config = GenerationConfig(
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    output_ids = model.generate(**inputs, generation_config=generation_config)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the text after the last "assistant:" marker in the decoded output
    return response.split("assistant:")[-1].strip()
# Gradio Interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
if __name__ == "__main__":
    demo.launch()
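# Quick check (sketch, assumption): to smoke-test respond() without launching the UI,
# something like the following could be run instead of demo.launch(); the message,
# history, and sampling values below are illustrative only.
#   reply = respond(
#       "Hello! What can you help me with?",  # user message
#       [],                                   # empty chat history
#       "You are a friendly Chatbot.",        # system message
#       max_tokens=128,
#       temperature=0.7,
#       top_p=0.95,
#   )
#   print(reply)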