import subprocess

import gradio as gr
from openai import OpenAI

# Start the local llama.cpp server in the background; start.sh is expected to
# launch it with an OpenAI-compatible API on port 8000.
subprocess.Popen("bash /home/user/app/start.sh", shell=True)

# Point the OpenAI client at the local server; the API key is a placeholder.
client = OpenAI(
    base_url="http://0.0.0.0:8000/v1",
    api_key="sk-local",
    timeout=600,
)
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Build an OpenAI-style message list from the system prompt and chat history.
    messages = []
    if system_message:
        messages = [{"role": "system", "content": system_message}]
    for user, assistant in history:
        if user:
            messages.append({"role": "user", "content": user})
        if assistant:
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    try:
        stream = client.chat.completions.create(
            model="Deepseek-R1-0528-Qwen3-8B",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        )
        print("messages", messages)
        output = ""
        for chunk in stream:
            delta = chunk.choices[0].delta
            print(delta)
            # The server may stream reasoning tokens in `reasoning_content`;
            # fall back to the regular `content` field otherwise.
            reasoning = getattr(delta, "reasoning_content", None)
            output += reasoning if reasoning else (delta.content or "")
            yield output
    except Exception as e:
        print(f"[Error] {e}")
        yield "⚠️ Llama.cpp server error"
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="", label="System message"),
        gr.Slider(minimum=1, maximum=8000, value=4096, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
if __name__ == "__main__":
    demo.launch()