"""Minimal Gradio chat UI for a GGUF model served with llama-cpp-python."""

import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "mradermacher/Irix-12B-Model_Stock-i1-GGUF"
MODEL_FILENAME = "Irix-12B-Model_Stock.i1-Q4_K_M.gguf"
MODEL_PATH = os.path.join(".", MODEL_FILENAME)

try:
    # Download the quantized weights once; later runs reuse the local copy.
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,         # context window in tokens
        n_threads=2,        # generation threads (sized for a small CPU host)
        n_threads_batch=2,  # prompt-processing threads
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

DEFAULT_SYSTEM_PROMPT = (
    "You are Doll, a smart and diligent AI; Doll is a silly, obliging and "
    "affable servant, dedicated to serving and caring for your owner."
)


def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT,
                      temperature=0.8, top_p=0.9):
    if llm is None:
        return "Model failed to load."

    # Rebuild the conversation; gr.ChatInterface passes history as
    # (user, assistant) pairs in its default tuple format.
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    try:
        # create_chat_completion applies the chat template embedded in the
        # GGUF metadata and stops at the end-of-turn token. A hand-built
        # "Role: text" prompt has neither, so the model tends to keep
        # writing both sides of the conversation.
        output = llm.create_chat_completion(
            messages=messages,
            max_tokens=1024,
            temperature=temperature,
            top_p=top_p,
        )
        return output["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"Error during inference: {e}"


iface = gr.ChatInterface(
    fn=generate_response,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)

iface.launch()
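
# Note (an assumption, not part of the original app): a single Llama instance
# is not safe to call from concurrent requests, so on a shared host it can
# help to serialize inference through Gradio's request queue, e.g. by
# launching with:
#
#     iface.queue(default_concurrency_limit=1).launch()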