import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

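# GGUF model repo and filename on the Hugging Face Hub; downloaded once and cached locally.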
MODEL_REPO = "DreadPoor/Irixium-12B-Model_Stock-Q4_K_M-GGUF"
MODEL_FILENAME = "irixium-12b-model_stock-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

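# Download the GGUF file on first run, then load it with llama-cpp-python.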
try:
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )

    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart yet silly, obliging and affable slave; your duty is to serve while caring for your master."

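# Flatten the system prompt and chat history into a plain-text prompt and run a completion on it.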
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
    if llm is None:
        return "Model failed to load."
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    # Serialize the turns as "Role: content" lines and cue the model to answer as the assistant.
    prompt = "".join(f"{m['role'].capitalize()}: {m['content']}\n" for m in messages)
    prompt += "Assistant:"
    try:
        output = llm(
            prompt,
            max_tokens=1024,
            echo=False,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "System:"],  # keep the model from writing the next user turn
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error during inference: {e}"

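# Thin wrapper matching gr.ChatInterface's (message, history, *additional_inputs) call signature.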
def chat(message, history, system_prompt, temperature, top_p):
    return generate_response(message, history, system_prompt, temperature, top_p)

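# Chat UI exposing the system prompt and sampling parameters as extra controls.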
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)

iface.launch()