import socket
import subprocess
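import time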
import gradio as gr
from openai import OpenAI


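# Launch the llama.cpp server in the background; start.sh is expected to bring
# up an OpenAI-compatible endpoint (see base_url below).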
subprocess.Popen("bash /home/user/app/start.sh", shell=True)
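
# A minimal readiness check, assuming start.sh serves on 127.0.0.1:8000 to
# match base_url below. Popen returns immediately, so poll the port until the
# server accepts connections; otherwise the first request can hit a refused
# socket before the model has loaded.
for _ in range(120):
    try:
        with socket.create_connection(("127.0.0.1", 8000), timeout=1):
            break
    except OSError:
        time.sleep(1)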

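# The OpenAI client targets the local llama.cpp endpoint; "sk-local" is a
# dummy key, and the long timeout accommodates slow generations.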
client = OpenAI(
    base_url="http://0.0.0.0:8000/v1",
    api_key="sk-local",
    timeout=600
)


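# Streaming callback for gr.ChatInterface: rebuilds the message list from the
# chat history and yields the growing reply as chunks arrive.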
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the full conversation each turn; the chat-completions API is
    # stateless, so the history must be resent with every request.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})

    messages.append({"role": "user", "content": message})

    try:
        stream = client.chat.completions.create(
            model="Deepseek-R1-0528-Qwen3-8B",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        )

        print("messages", messages)
        output = ""
        for chunk in stream:
            delta = chunk.choices[0].delta
            print(delta)
            
            try:
                output += delta.reasoning_content
            except:
                output += delta.content or ""

            yield output

    except Exception as e:
        print(f"[Error] {e}")
        yield "⚠️ Llama.cpp server error"

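# Expose the sampling parameters as extra controls alongside the chat box.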
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=8000, value=4096, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()