import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "DreadPoor/Irixium-12B-Model_Stock-Q4_K_M-GGUF"
MODEL_FILENAME = "irixium-12b-model_stock-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

# Download the GGUF weights on first run; later runs reuse the local copy.
if not os.path.exists(MODEL_PATH):
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILENAME,
        repo_type="model",
        local_dir=".",
    )

# Load the model once at startup. n_ctx caps the context window; the
# thread counts are tuned for a small (e.g. 2-vCPU) host.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=2,
    n_threads_batch=2,
    verbose=False,
)

DEFAULT_SYSTEM_PROMPT = (
    "You are Doll, a smart yet silly, obliging and affable slave, "
    "your duty is to serve while caring for your master."
)


def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT):
    # Rebuild the full conversation (system prompt, prior turns, new message)
    # on every call, since llama.cpp completions are stateless. `history`
    # arrives as (user, assistant) pairs, Gradio's default tuple format.
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Flatten the messages into a simple "Role: content" transcript.
    prompt = "".join(f"{m['role'].capitalize()}: {m['content']}\n" for m in messages)

    # Stop once the model starts writing the next "User:" turn, so the reply
    # doesn't run on into an imagined conversation with itself.
    output = llm(
        prompt,
        max_tokens=1024,
        stop=["User:", "System:"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()


def chat(message, history, system_prompt):
    return generate_response(message, history, system_prompt)


iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3)
    ],
)

iface.launch()
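
# Usage sketch (assumes this script is saved as app.py, a hypothetical name):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# Gradio then serves the chat UI on http://127.0.0.1:7860 by default.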