from __future__ import annotations

import os

import gradio as gr
import openai

import server  # noqa: F401  (project-local; presumably sets up the MoI-patched vLLM backend)

# ──────────────────────────────────────────────────────────────────────────────
# OpenAI client configuration
# ──────────────────────────────────────────────────────────────────────────────
# ``openai`` still expects an API key even if the backend ignores it, so we use
# a dummy value when none is provided. The *base_url* points to the local
# vLLM server that speaks the OpenAI REST dialect.
# ------------------------------------------------------------------------------
openai_api_key = "EMPTY"
openai_api_base = "http://0.0.0.0:8000/v1"

client = openai.OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# ──────────────────────────────────────────────────────────────────────────────
# Chat handler
# ──────────────────────────────────────────────────────────────────────────────
def stream_completion(message: str,
                      history: list[tuple[str, str]],
                      max_tokens: int,
                      temperature: float,
                      top_p: float,
                      beta: float):
    """Gradio callback that yields streaming assistant replies.

    The function reconstructs the conversation *excluding* any system prompt
    and then calls ``openai.chat.completions.create`` with ``stream=True``.
    Each incoming delta is appended to an ``assistant`` buffer which is sent
    back to the Chatbot component for real‑time display.
    """
    # Build OpenAI‑style message list from prior turns
    messages: list[dict[str, str]] = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Current user input comes last
    messages.append({"role": "user", "content": message})

    # MoI reads its blending strength from this environment variable. Note
    # that this only takes effect if the backend shares (or re-reads) this
    # process's environment.
    os.environ["MIXINPUTS_BETA"] = str(beta)

    try:
        # Kick off the streaming completion
        response = client.chat.completions.create(
            model="Qwen/Qwen3-4B",
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=True,
        )

        # Accumulate deltas and push the partial reply to the Chatbot
        assistant = ""
        for chunk in response:
            if not chunk.choices:  # e.g. trailing usage-only chunks
                continue
            assistant += chunk.choices[0].delta.content or ""
            yield history + [(message, assistant)]  # live update
    except Exception as exc:  # surface backend errors in the chat window
        yield history + [(message, f"[error] {exc}")]

# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo with Qwen3-4B\n"
        "Streaming vLLM demo with dynamic **beta** adjustment for MoI. "
        "Play with β and feel how it affects the model (higher beta → less blending).\n"
        "📕 Paper: https://arxiv.org/abs/2505.14827 \n"
        "💻 Code: https://github.com/EvanZhuang/mixinputs \n"
    )

    with gr.Row():  # sliders first
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top‑p")
        max_tokens = gr.Slider(1, 3072, value=2048, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")

    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )

    clear_btn.click(lambda: None, None, chatbot, queue=False)

# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()
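# ──────────────────────────────────────────────────────────────────────────────
# Usage note (a sketch, not part of the original script): the client above
# assumes an OpenAI-compatible vLLM endpoint is already listening at
# http://0.0.0.0:8000/v1 and serving Qwen/Qwen3-4B. If ``import server`` does
# not launch one in-process, a standard vLLM invocation such as
#
#     MIXINPUTS_BETA=1.0 vllm serve Qwen/Qwen3-4B --port 8000
#
# would provide it. ``MIXINPUTS_BETA`` mirrors the environment variable the
# chat handler sets on each request; whether the backend re-reads it per
# request is specific to the mixinputs integration.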