import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

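# GGUF checkpoint to download from the Hugging Face Hub and load with llama-cpp-python.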
MODEL_REPO = "DreadPoor/Sylvan_Lux-Q4_K_M-GGUF"
MODEL_FILENAME = "sylvan_lux-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

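# Download the weights on first run, then load them with a small CPU-friendly configuration.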
try:
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=6144,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart and capable AI; A silly, obliging and affable slave, dedicated to serving and caring for your master."

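# Turn the chat history into a single prompt and run one completion against the loaded model.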
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
    if llm is None:
        return "Model failed to load."

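    # Rebuild the conversation as role/content messages; assumes tuple-style
    # (user, assistant) history pairs as passed by gr.ChatInterface.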
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

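    # Flatten the messages into a plain "Role: content" transcript; this does not apply the
    # model's native chat template. (llama-cpp-python also offers llm.create_chat_completion(
    # messages=...), which would use the template, but the manual prompt is kept here.)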
    prompt = "".join([f"{m['role'].capitalize()}: {m['content']}\n" for m in messages])

    try:
        stream = llm(
            prompt,
            max_tokens=1024,
            echo=False,
            temperature=temperature,
            top_p=top_p,
            stream=True,  # stream tokens from llama.cpp; they are collected below into one string
        )

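        # Accumulate the streamed chunks; the UI still receives the full reply at once.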
        response_text = ""
        for part in stream:
            response_text += part["choices"][0]["text"]

        return response_text.strip()

    except Exception as e:
        return f"Error during inference: {e}"

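# Thin wrapper so gr.ChatInterface can forward the additional inputs to the generator.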
def chat(message, history, system_prompt, temperature, top_p):
    return generate_response(message, history, system_prompt, temperature, top_p)

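# Chat UI; the system prompt and sampling parameters are exposed as additional controls.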
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Duplicate the Space, then edit in a GGUF model to test. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.8, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)

iface.launch()