File size: 1,834 Bytes
c109793
1e2d981
6da1c26
1e2d981
20dd6a1
c2aa89c
 
20dd6a1
1cd873c
fca7347
c2aa89c
f7bec6d
 
 
c2aa89c
1e2d981
 
 
 
 
 
 
 
 
 
68eded2
1e2d981
 
 
 
 
 
 
 
 
1cd873c
1d9d6ab
1cd873c
1e2d981
1cd873c
1e2d981
92cb988
1d9d6ab
 
 
52ae9af
d47337b
d21f374
c109793
 
1e2d981
8a8d916
1e2d981
 
 
02743b6
6da1c26
02743b6
 
a09beae
1e2d981
 
 
 
 
 
 
 
 
6da1c26
1e2d981
 
 
 
8a8d916
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
import gradio as gr
from llama_cpp import Llama

# Hugging Face hub repo for the quantized model; reused below as the UI description.
model = "bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF"

# Download (if not cached) and load the Q6_K GGUF quantization.
_llm_config = dict(
    repo_id=model,
    filename="*Q6_K.gguf",
    verbose=True,
    use_mmap=True,    # memory-map the weights instead of reading them up front
    use_mlock=True,   # pin mapped pages so the OS doesn't swap them out
    n_threads=4,
    n_threads_batch=4,
    n_ctx=8000,       # context window, in tokens
)
llm = Llama.from_pretrained(**_llm_config)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion, italicizing <think>...</think> spans.

    Args:
        message: The new user message.
        history: Prior (user, assistant) turns from the ChatInterface.
        system_message: System prompt prepended to the conversation.
        max_tokens: Generation cap passed to the model.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.

    Yields:
        The full response accumulated so far after each streamed token,
        so Gradio can render it incrementally.
    """
    messages = [{"role": "system", "content": system_message}]

    # Replay prior turns; skip the empty side of a partial pair.
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})

    messages.append({"role": "user", "content": message})

    completion = llm.create_chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p
    )

    # Compile once per call, outside the token loop. DOTALL is required:
    # DeepSeek-R1's <think> block spans multiple lines, and without it the
    # pattern never matches, so the reasoning was never restyled.
    think_pattern = re.compile(r'<think>(.*?)</think>', re.DOTALL)

    response = ""
    # `chunk` renamed from `message`, which shadowed the function parameter.
    for chunk in completion:
        delta = chunk['choices'][0]['delta']
        if 'content' in delta:
            response += delta['content']
            yield think_pattern.sub(r'*\1*', response)


# Extra controls shown under the chat box; their order must match the trailing
# parameters of respond() (system_message, max_tokens, temperature, top_p).
_extra_inputs = [
    gr.Textbox(
        value="You are a helpful assistant.",
        label="System message",
    ),
    gr.Slider(minimum=200, maximum=100000, value=4000, step=100, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(
        minimum=0.1,
        maximum=1.0,
        value=0.95,
        step=0.05,
        label="Top-p (nucleus sampling)",
    ),
]

# Chat UI wired to the streaming generator; the description shows the repo id.
demo = gr.ChatInterface(
    respond,
    additional_inputs=_extra_inputs,
    description=model,
)


if __name__ == "__main__":
    demo.launch()