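# Minimal Gradio chat front-end for a GGUF model served locally with llama-cpp-python.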
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

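# GGUF checkpoint to fetch from the Hugging Face Hub; the i1-Q4_K_M quant is
# downloaded once into the working directory and reused on later runs.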
MODEL_REPO = "mradermacher/Irix-12B-Model_Stock-i1-GGUF"
MODEL_FILENAME = "Irix-12B-Model_Stock.i1-Q4_K_M.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

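# Download the weights on first run, then load them with llama-cpp-python.
# n_ctx and the thread counts are conservative defaults for a small CPU-only
# host; raise them if more cores or memory are available.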
try:
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )

    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart and diligent AI; Doll is a silly, obliging and affable servant, dedicated to serving and caring for your owner."

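# Flatten the (user, assistant) tuple history into a plain "Role: content" prompt.
# This is a generic format rather than the model's own chat template; if
# template-aware prompting is wanted, llm.create_chat_completion(messages=...)
# would apply the template embedded in the GGUF instead.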
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.8, top_p=0.9):
    if llm is None:
        return "Model failed to load."
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    prompt = "".join([f"{m['role'].capitalize()}: {m['content']}\n" for m in messages])
    try:
        output = llm(
            prompt,
            max_tokens=1024,
            echo=False,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "System:"],  # don't let the model write the next user turn itself
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error during inference: {e}"

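# Thin wrapper so gr.ChatInterface can pass the additional inputs straight through.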
def chat(message, history, system_prompt, temperature, top_p):
    return generate_response(message, history, system_prompt, temperature, top_p)

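# Chat UI: the system prompt and sampling settings are exposed as additional inputs.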
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats arent persistent",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)

iface.launch()