yzhuang committed
Commit c1965a3 · 1 Parent(s): 335b83f
Files changed (1)
  1. app.py +56 -49
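Note: the demo assumes an OpenAI-compatible vLLM server already listening at http://localhost:8000 and the sseclient-py package, which provides the sseclient.SSEClient(response) streaming API used in app.py. A plausible setup (the exact vLLM CLI invocation is an assumption here, not part of the commit) is pip install gradio requests sseclient-py followed by vllm serve Qwen/Qwen3-4B --port 8000; the custom X-MIXINPUTS-BETA header only takes effect if the server has been patched to read it.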
app.py CHANGED
@@ -1,74 +1,81 @@
-import gradio as gr
 import requests
 import sseclient
-import os
 
 API_URL = "http://localhost:8000/v1/chat/completions"
 
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    beta,
-):
-    # Build message history
-    messages = [{"role": "system", "content": system_message}]
-    for user, assistant in history:
-        if user:
-            messages.append({"role": "user", "content": user})
-        if assistant:
-            messages.append({"role": "assistant", "content": assistant})
     messages.append({"role": "user", "content": message})
 
-    # Prepare request payload
     payload = {
-        "model": "Qwen/Qwen3-4B",  # Update to your actual model if needed
         "messages": messages,
         "temperature": temperature,
         "top_p": top_p,
-        "max_tokens": max_tokens,
         "stream": True,
     }
-
-    # Optional: send beta as a custom OpenAI field
     headers = {
         "Content-Type": "application/json",
-        "X-MIXINPUTS-BETA": str(beta),  # or modify your vLLM code to read this
     }
 
-    # Stream response using SSE (Server-Sent Events)
     try:
-        response = requests.post(API_URL, json=payload, stream=True, headers=headers)
-        response.raise_for_status()
-        client = sseclient.SSEClient(response)
 
-        full_text = ""
        for event in client.events():
-            if event.data == "[DONE]":
                break
-            delta = event.json()["choices"][0]["delta"].get("content", "")
-            full_text += delta
-            yield full_text
 
-    except Exception as e:
-        yield f"[ERROR] {e}"
 
-# UI layout using ChatInterface
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a helpful assistant using Mixture of Inputs.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
-        gr.Slider(minimum=0.0, maximum=10.0, value=1.0, step=0.1, label="MoI Beta"),
-    ],
-    title="🧪 Mixture of Inputs (MoI) Demo",
-    description="Streaming local vLLM demo with dynamic MoI beta adjustment.",
-)
 
 if __name__ == "__main__":
-    demo.launch()

+# app.py
+import json
 import requests
 import sseclient
+import gradio as gr
 
 API_URL = "http://localhost:8000/v1/chat/completions"
 
+
+def stream_completion(message, history, max_tokens, temperature, top_p, beta):
+    """
+    Gradio callback: takes the newest user message + full chat history,
+    returns an updated history while streaming assistant tokens.
+    """
+    # ------- build OpenAI-style message list (no system prompt) -------------
+    messages = []
+    for usr, bot in history:
+        if usr:
+            messages.append({"role": "user", "content": usr})
+        if bot:
+            messages.append({"role": "assistant", "content": bot})
     messages.append({"role": "user", "content": message})
 
     payload = {
+        "model": "Qwen/Qwen3-4B",
         "messages": messages,
         "temperature": temperature,
         "top_p": top_p,
+        "max_tokens": int(max_tokens),
         "stream": True,
     }
     headers = {
         "Content-Type": "application/json",
+        "X-MIXINPUTS-BETA": str(beta),
     }
 
     try:
+        resp = requests.post(API_URL, json=payload, stream=True, headers=headers, timeout=60)
+        resp.raise_for_status()
+        client = sseclient.SSEClient(resp)
 
+        assistant = ""
         for event in client.events():
+            if event.data.strip() == "[DONE]":
                 break
+            delta = json.loads(event.data)["choices"][0]["delta"].get("content", "")
+            assistant += delta
+            yield history + [(message, assistant)]  # update the chat box live
+
+    except Exception as err:
+        yield history + [(message, f"[ERROR] {err}")]
+
+
+# ----------------------- UI ---------------------------------------------
+with gr.Blocks(title="🧪 Mixture of Inputs (MoI) Demo") as demo:
+    gr.Markdown(
+        "## 🧪 Mixture of Inputs (MoI) Demo \n"
+        "Streaming local vLLM demo with dynamic **beta** adjustment."
+    )
+
+    # sliders first – all on one row
+    with gr.Row():
+        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
+        temperature = gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p")
+        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI Beta")
 
+    chatbot = gr.Chatbot(height=450)
+    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
+    clear_btn = gr.Button("Clear chat")
 
+    # wiring
+    user_box.submit(
+        stream_completion,
+        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
+        outputs=chatbot,
+    )
+    clear_btn.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
+    demo.launch()
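For reference, the streaming request app.py issues can be reproduced without Gradio. Below is a minimal sketch, assuming the same local endpoint, model name, and sseclient-py package as the code above; the file name and the beta value are illustrative only.

# standalone_client.py (hypothetical name, not part of this commit)
import json

import requests
import sseclient  # provided by the sseclient-py package

API_URL = "http://localhost:8000/v1/chat/completions"

payload = {
    "model": "Qwen/Qwen3-4B",
    "messages": [{"role": "user", "content": "Hello!"}],
    "temperature": 0.7,
    "top_p": 0.95,
    "max_tokens": 64,
    "stream": True,
}
# Custom header read only by a patched vLLM server; stock vLLM ignores it.
headers = {"Content-Type": "application/json", "X-MIXINPUTS-BETA": "1.0"}

resp = requests.post(API_URL, json=payload, stream=True, headers=headers, timeout=60)
resp.raise_for_status()

# The server streams OpenAI-style SSE chunks and terminates with [DONE].
for event in sseclient.SSEClient(resp).events():
    if event.data.strip() == "[DONE]":
        break
    delta = json.loads(event.data)["choices"][0]["delta"].get("content", "")
    print(delta, end="", flush=True)
print()

As in app.py, .get("content", "") tolerates chunks whose delta carries only a role field and no text.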