import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

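# GGUF model repo and filename on the Hugging Face Hub; downloaded once and cached locally.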
MODEL_REPO = "DreadPoor/Irixium-12B-Model_Stock-Q4_K_M-GGUF"
MODEL_FILENAME = "irixium-12b-model_stock-q4_k_m.gguf"
MODEL_PATH = "./" + MODEL_FILENAME

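# Download the GGUF file on first run, then load it with llama-cpp-python.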
try:
    if not os.path.exists(MODEL_PATH):
        hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir=".",
        )

    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=4096,
        n_threads=2,
        n_threads_batch=2,
        verbose=False,
    )
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

DEFAULT_SYSTEM_PROMPT = "You are Doll, a smart yet silly, obliging and affable slave; your duty is to serve while caring for your master."

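# Flatten the system prompt and chat history into a plain-text prompt and run a completion on it.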
def generate_response(message, history, system_prompt=DEFAULT_SYSTEM_PROMPT, temperature=0.7, top_p=0.9):
    if llm is None:
        return "Model failed to load."
    messages = [{"role": "system", "content": system_prompt}]
    for human, assistant in history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    # Serialize the turns as "Role: content" lines and cue the model to answer as the assistant.
    prompt = "".join(f"{m['role'].capitalize()}: {m['content']}\n" for m in messages)
    prompt += "Assistant:"
    try:
        output = llm(
            prompt,
            max_tokens=1024,
            echo=False,
            temperature=temperature,
            top_p=top_p,
            stop=["User:", "System:"],  # keep the model from writing the next user turn
        )
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error during inference: {e}"

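# Thin wrapper matching gr.ChatInterface's (message, history, *additional_inputs) call signature.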
def chat(message, history, system_prompt, temperature, top_p):
    return generate_response(message, history, system_prompt, temperature, top_p)

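# Chat UI exposing the system prompt and sampling parameters as extra controls.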
iface = gr.ChatInterface(
    fn=chat,
    title="llama.cpp Chat",
    description="Test a GGUF model. Chats aren't persistent.",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=3),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1),
        gr.Slider(label="Top P", minimum=0.1, maximum=1.0, value=0.9, step=0.1),
    ],
    cache_examples=False,
)

iface.launch()