import torch
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

checkpoint = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

# Load the GPTQ-quantized model (fp16 is the usual compute dtype for GPTQ checkpoints)
model = AutoGPTQForCausalLM.from_quantized(
    checkpoint,
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Format the Mistral-Instruct prompt and generate a response
def predict(message, history):
    prompt = f"[INST] {message.strip()} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=256,
        eos_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text generated after the closing instruction tag
    response = decoded.split("[/INST]")[-1].strip()
    return response

# Launch Gradio chatbot
demo = gr.ChatInterface(predict)
demo.launch(debug=True)