import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "DreadPoor/Irixium-12B-Model_Stock-Q4_K_M-GGUF"
MODEL_FILENAME = "irixium-12b-model_stock-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

# Download the GGUF weights on first run; later runs reuse the local copy.
if not os.path.exists(MODEL_PATH):
    hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILENAME,
        repo_type="model",
        local_dir=".",
    )

# Load the model once at startup. n_ctx caps the context window; the
# thread counts are tuned for a small (e.g. 2-vCPU) host.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=2,
    n_threads_batch=2,
    verbose=False,
)

DEFAULT_SYSTEM_PROMPT = (
    "You are Doll, a smart yet silly, obliging and affable slave, "
    "your duty is to serve while caring for your master."
)


def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT):
    # Rebuild the full conversation (system prompt, prior turns, new message)
    # on every call, since llama.cpp completions are stateless. `history`
    # arrives as (user, assistant) pairs, Gradio's default tuple format.
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    # Flatten the messages into a simple "Role: content" transcript.
    prompt = "".join(f"{m['role'].capitalize()}: {m['content']}\n" for m in messages)

    # Stop once the model starts writing the next "User:" turn, so the reply
    # doesn't run on into an imagined conversation with itself.
    output = llm(
        prompt,
        max_tokens=1024,
        stop=["User:", "System:"],
        echo=False,
    )
    return output["choices"][0]["text"].strip()


def chat(message, history, system_prompt):
    return generate_response(message, history, system_prompt)


iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3)
    ],
)

iface.launch()
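
# Usage sketch (assumes this script is saved as app.py, a hypothetical name):
#   pip install gradio llama-cpp-python huggingface_hub
#   python app.py
# Gradio then serves the chat UI on http://127.0.0.1:7860 by default.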