from __future__ import annotations

import os

import openai
import gradio as gr

import server  # side-effect import: presumably starts the local vLLM backend
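# ``server`` itself is not shown in this file. A minimal sketch of what it
# might do (hypothetical, under the assumption that it boots the
# OpenAI-compatible vLLM server that the client below talks to):
#
#     import subprocess
#     subprocess.Popen([
#         "python", "-m", "vllm.entrypoints.openai.api_server",
#         "--model", "Qwen/Qwen3-4B", "--port", "8000",
#     ])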
# ──────────────────────────────────────────────────────────────────────────────
# OpenAI client configuration
# ──────────────────────────────────────────────────────────────────────────────
# ``openai`` still expects an API key even if the backend ignores it, so we use
# a dummy value when none is provided. The *base_url* points to the local
# vLLM server that speaks the OpenAI REST dialect.
# ──────────────────────────────────────────────────────────────────────────────
openai_api_key = "EMPTY"
openai_api_base = "http://0.0.0.0:8000/v1"

client = openai.OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
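# Optional sanity check, shown as a sketch only (the helper name and the idea
# of probing the backend are additions, not part of the original demo):
# listing the served models confirms the vLLM server is reachable before the
# UI sends its first chat request.
def _backend_model_ids() -> list[str]:
    """Return the model ids served by the local vLLM instance."""
    return [m.id for m in client.models.list().data]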
# ──────────────────────────────────────────────────────────────────────────────
# Chat handler
# ──────────────────────────────────────────────────────────────────────────────
def stream_completion(message: str,
                      history: list[tuple[str, str]],
                      max_tokens: int,
                      temperature: float,
                      top_p: float,
                      beta: float):
    """Gradio callback that yields streaming assistant replies.

    The function reconstructs the conversation *excluding* any system prompt
    and then calls ``openai.chat.completions.create`` with ``stream=True``.
    Each incoming delta is appended to an ``assistant`` buffer which is sent
    back to the Chatbot component for real-time display.
    """
    # Build an OpenAI-style message list from prior turns
    messages: list[dict[str, str]] = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})

    # Current user input comes last
    messages.append({"role": "user", "content": message})
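    # For a one-turn history this yields, e.g.:
    #   [{"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "<current message>"}]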
    # MoI picks the blending coefficient β up from this environment variable.
    os.environ["MIXINPUTS_BETA"] = str(beta)

    # Kick off the streaming completion
    response = client.chat.completions.create(
        model="Qwen/Qwen3-4B",
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        stream=True,
    )

    # Accumulate deltas and stream the growing reply back to the Chatbot
    assistant = ""
    for chunk in response:
        delta = chunk.choices[0].delta.content or ""
        assistant += delta
        yield history + [(message, assistant)]  # live update
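# A quick manual test of the handler outside Gradio (hypothetical usage; it
# assumes the backend is already up): iterate the generator and keep the last
# partial history.
#
#     last = None
#     for partial in stream_completion("Hello!", [], 64, 0.6, 0.8, 1.0):
#         last = partial
#     print(last[-1][1])  # final assistant reply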
# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
    gr.Markdown(
        "## 🎨 Mixture of Inputs (MoI) Demo with Qwen3-4B\n"
        "Streaming vLLM demo with dynamic **beta** adjustment in MoI; feel how it affects the model!\n"
        "(higher beta → less blending).\n"
        "📄 Paper: https://arxiv.org/abs/2505.14827 \n"
        "💻 Code: https://github.com/EvanZhuang/mixinputs \n"
    )
    with gr.Row():  # sliders first
        beta = gr.Slider(0.0, 10.0, value=1.0, step=0.1, label="MoI β")
        temperature = gr.Slider(0.1, 1.0, value=0.6, step=0.1, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.80, step=0.05, label="Top-p")
        max_tokens = gr.Slider(1, 3072, value=2048, step=1, label="Max new tokens")

    chatbot = gr.Chatbot(height=450)
    user_box = gr.Textbox(placeholder="Type a message and press Enter…", show_label=False)
    clear_btn = gr.Button("Clear chat")
    user_box.submit(
        fn=stream_completion,
        inputs=[user_box, chatbot, max_tokens, temperature, top_p, beta],
        outputs=chatbot,
    )

    clear_btn.click(lambda: None, None, chatbot, queue=False)
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch()