yzhuang committed
Commit 2d7f359 · verified · 1 Parent(s): 3579acb

Update app.py

Files changed (1): app.py +78 -52
app.py CHANGED
@@ -1,62 +1,86 @@
-# app.py
-import json, requests, gradio as gr
-import server
+"""Gradio chat demo that streams responses from a (local) OpenAI-compatible
+endpoint using the official `openai` Python SDK. The server is assumed to be
+running at http://0.0.0.0:8000 with the v1 REST routes. A custom header
+`X-MIXINPUTS-BETA` is forwarded so MoI can adjust its blending strength at
+runtime.
 
-API_URL = "http://0.0.0.0:8000/v1/chat/completions"
+Launch with:
+    python app_openai.py
+"""
 
-def stream_completion(message, history, max_tokens, temperature, top_p, beta):
-    """Gradio callback: stream the assistant's reply token-by-token."""
-    # -------- build OpenAI-style message list (no system prompt) -------------
-    messages = []
+from __future__ import annotations
+
+import os
+import openai
+import gradio as gr
+
+# ──────────────────────────────────────────────────────────────────────────────
+# OpenAI client configuration
+# ──────────────────────────────────────────────────────────────────────────────
+# ``openai`` still expects an API key even if the backend ignores it, so we use
+# a dummy value when none is provided. The *base_url* points to the local
+# vLLM server that speaks the OpenAI REST dialect.
+# -----------------------------------------------------------------------------
+openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
+openai.base_url = "http://0.0.0.0:8000/v1"
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Chat handler
+# ──────────────────────────────────────────────────────────────────────────────
+
+def stream_completion(message: str,
+                      history: list[tuple[str, str]],
+                      max_tokens: int,
+                      temperature: float,
+                      top_p: float,
+                      beta: float):
+    """Gradio callback that yields streaming assistant replies.
+
+    The function reconstructs the conversation *excluding* any system prompt
+    and then calls ``openai.chat.completions.create`` with ``stream=True``.
+    Each incoming delta is appended to an ``assistant`` buffer which is sent
+    back to the Chatbot component for real-time display.
+    """
+
+    # Build OpenAI-style message list from prior turns
+    messages: list[dict[str, str]] = []
     for user_msg, assistant_msg in history:
-        if user_msg:  # past user turn
+        if user_msg:
             messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:  # past assistant turn
+        if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
 
+    # Current user input comes last
     messages.append({"role": "user", "content": message})
 
-    payload = {
-        "model": "Qwen/Qwen3-4B",
-        "messages": messages,
-        "temperature": temperature,
-        "top_p": top_p,
-        "max_tokens": int(max_tokens),
-        "stream": True,
-    }
-    headers = {
-        "Content-Type": "application/json",
-        "X-MIXINPUTS-BETA": str(beta),
-    }
-
     try:
-        with requests.post(API_URL,
-                           json=payload,
-                           stream=True,
-                           headers=headers,
-                           timeout=(10, None)) as resp:
-            resp.raise_for_status()
-
-            assistant = ""
-            # iterate over the HTTP chunks
-            for raw in resp.iter_lines(decode_unicode=True, delimiter=b"\n"):
-                if not raw:
-                    continue
-                if raw.startswith("data: "):
-                    data = raw[6:]  # strip the 'data: ' prefix
-                else:
-                    data = raw
-
-                if data.strip() == "[DONE]":
-                    break
-
-                delta = json.loads(data)["choices"][0]["delta"].get("content", "")
-                assistant += delta
-                yield history + [(message, assistant)]  # live update in Gradio
-    except Exception as err:
+        # Kick off streaming completion
+        response = openai.chat.completions.create(
+            model="Qwen/Qwen3-4B",
+            messages=messages,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+            stream=True,
+            # Forward MoI blending coefficient to the backend
+            extra_headers={"X-MIXINPUTS-BETA": str(beta)},
+        )
+
+        assistant = ""
+        for chunk in response:
+            # ``delta.content`` is None for e.g. role announcements; guard with or ""
+            delta = chunk.choices[0].delta.content or ""
+            assistant += delta
+            yield history + [(message, assistant)]  # live update
+
+    except Exception as err:  # pylint: disable=broad-except
         yield history + [(message, f"[ERROR] {err}")]
 
-# ---------------------------- UI --------------------------------------------
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Gradio UI
+# ──────────────────────────────────────────────────────────────────────────────
+
 with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     gr.Markdown(
         "## 🎨 Mixture of Inputs (MoI) Demo \n"
@@ -65,10 +89,10 @@ with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     )
 
     with gr.Row():  # sliders first
-        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
-        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
-        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
-        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
+        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
+        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
+        max_tokens = gr.Slider(1, 2048, value=512, step=1, label="Max new tokens")
 
     chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
@@ -79,7 +103,9 @@ with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
         inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
         outputs=chatbot,
     )
+
    clear_btn.click(lambda: None, None, chatbot, queue=False)
 
+# ──────────────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
     demo.launch()
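
For reference, the new request path can be exercised without the Gradio UI. The snippet below is a minimal smoke-test sketch, not part of the commit: it assumes the same vLLM OpenAI-compatible server is listening on http://0.0.0.0:8000, that the `openai` 1.x SDK is installed, and that "EMPTY" is an acceptable dummy key; the prompt text and the beta value are arbitrary.

    # smoke_test.py: hypothetical helper, not included in this commit
    from openai import OpenAI

    client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="EMPTY")

    stream = client.chat.completions.create(
        model="Qwen/Qwen3-4B",
        messages=[{"role": "user", "content": "Say hello in five words."}],
        max_tokens=32,
        stream=True,
        # Same per-request MoI header the Gradio callback forwards
        extra_headers={"X-MIXINPUTS-BETA": "1.0"},
    )
    for chunk in stream:
        # delta.content can be None (e.g. role-only chunks); guard with or ""
        print(chunk.choices[0].delta.content or "", end="", flush=True)
    print()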
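
The commit only changes the client; it does not show how the backend consumes `X-MIXINPUTS-BETA`. As a purely illustrative sketch (hypothetical names, not this repo's code), an OpenAI-compatible FastAPI server could surface the header to its request handlers with a middleware like this:

    # Hypothetical server-side sketch; the real MoI/vLLM integration may differ.
    from fastapi import FastAPI, Request

    app = FastAPI()

    @app.middleware("http")
    async def read_moi_beta(request: Request, call_next):
        # Default to beta = 1.0 when the client omits the header
        request.state.moi_beta = float(request.headers.get("x-mixinputs-beta", 1.0))
        return await call_next(request)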