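# Minimal Gradio chat demo: runs Qwen2-7B-Instruct locally through llama-cpp-python
# (quantized GGUF weights) and streams the reply token by token into the chat UI.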
import gradio as gr
from llama_cpp import Llama

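# Download the 4-bit (Q4_K_M) GGUF weights from the Hugging Face Hub and load them
# with llama.cpp. use_mmap=False plus use_mlock=True reads the whole model into RAM
# and pins it there, trading start-up time and memory for steadier CPU inference.
# n_ctx sets the context window; n_threads / n_threads_batch cap the CPU threads
# used for generation and prompt processing.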
model = "Qwen/Qwen2-7B-Instruct-GGUF"
llm = Llama.from_pretrained(
    repo_id=model,
    filename="qwen2-7b-instruct-q4_k_m.gguf",
    verbose=True,
    use_mmap=False,
    use_mlock=True,
    n_threads=2,
    n_threads_batch=2,
    n_ctx=8000,
)


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
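    # Rebuild the full conversation in OpenAI-style chat format: the system prompt
    # first, then the prior (user, assistant) turns, then the new user message.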
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

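    # Non-streaming variant, kept for reference: a single blocking call that returns
    # the complete reply at once instead of yielding partial output.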
    # response = llm.create_chat_completion(
    #     messages=messages,
    #     max_tokens=max_tokens,
    #     temperature=temperature,
    #     top_p=top_p,
    # )
    # return response["choices"][0]["message"]["content"]
    response = ""
    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    )

    for chunk in completion:
        # Each streamed chunk carries a "delta" dict; the first chunk holds only the
        # role and the last one is empty, so fall back to "" when "content" is absent.
        token = chunk["choices"][0]["delta"].get("content", "")
        response += token
        yield response


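# gr.ChatInterface wires the respond generator to a chat UI. Because respond yields
# progressively longer strings, the reply streams into the chat window as it is
# generated; additional_inputs exposes the system prompt and sampling controls.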
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    description=model,
)


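# launch() starts the Gradio web server and blocks until it is stopped.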
if __name__ == "__main__":
    demo.launch()