yzhuang committed
Commit 25fd9cd · verified · 1 Parent(s): 524243a

Update app.py

Files changed (1)
  1. app.py +10 -13
app.py CHANGED
@@ -21,8 +21,13 @@ import gradio as gr
 # a dummy value when none is provided. The *base_url* points to the local
 # vLLM server that speaks the OpenAI REST dialect.
 # -----------------------------------------------------------------------------
-openai.api_key = os.getenv("OPENAI_API_KEY", "EMPTY")
-openai.base_url = "http://0.0.0.0:8000/v1"
+openai_api_key = "EMPTY"
+openai_api_base = "http://0.0.0.0:8000/v1"
+
+client = OpenAI(
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
 
 # ──────────────────────────────────────────────────────────────────────────────
 # Chat handler
@@ -57,24 +62,16 @@ def stream_completion(message: str,
 
     #try:
     # Kick off streaming completion
-    response = openai.chat.completions.create(
+    response = client.chat.completions.create(
         model="Qwen/Qwen3-4B",
         messages=messages,
         temperature=temperature,
         top_p=top_p,
         max_tokens=max_tokens,
-        stream=True,
     )
 
-    assistant = ""
-    for chunk in response:
-        # ``delta.content`` is None for e.g. role announcements; guard with or ""
-        delta = chunk.choices[0].delta.content or ""
-        assistant += delta
-        yield history + [(message, assistant)]  # live update
-
-    # except Exception as err:  # pylint: disable=broad-except
-    #     yield history + [(message, f"[ERROR] {err}")]
+    assistant = response.choices[0].message
+    yield history + [(message, assistant)]  # live update
 
 
 # ──────────────────────────────────────────────────────────────────────────────
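The commit swaps the module-level configuration (`openai.api_key` / `openai.base_url`) for an explicit `OpenAI` client and replaces the streamed completion with a single blocking call. Below is a minimal self-contained sketch of the new flow, assuming a vLLM OpenAI-compatible server is already serving Qwen/Qwen3-4B at http://0.0.0.0:8000/v1; the sampling values are placeholders for the app's slider inputs.

    from openai import OpenAI

    # vLLM's OpenAI-compatible server does not validate the key, but the
    # client requires a non-empty value, hence the "EMPTY" placeholder.
    client = OpenAI(api_key="EMPTY", base_url="http://0.0.0.0:8000/v1")

    messages = [{"role": "user", "content": "Hello!"}]

    response = client.chat.completions.create(
        model="Qwen/Qwen3-4B",
        messages=messages,
        temperature=0.7,   # placeholder; app.py passes its UI values here
        top_p=0.95,        # placeholder
        max_tokens=256,    # placeholder
    )

    # choices[0].message is a ChatCompletionMessage object; its .content
    # attribute holds the generated text as a plain string.
    print(response.choices[0].message.content)

Worth noting: in the openai v1 SDK, `response.choices[0].message` is a message object rather than a string, so the plain-text reply lives in its `.content` attribute.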
 
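For reference, the streaming pattern this commit removes reads the completion chunk by chunk so the chat window can update as tokens arrive. A sketch of that generator style under the same assumptions (`stream_reply` is a hypothetical helper, not part of app.py):

    def stream_reply(client, history, message, messages):
        """Yield the chat history with the assistant reply growing as chunks arrive."""
        stream = client.chat.completions.create(
            model="Qwen/Qwen3-4B",
            messages=messages,
            stream=True,  # ask the server for incremental deltas
        )
        assistant = ""
        for chunk in stream:
            # delta.content is None for e.g. role announcements; guard with `or ""`.
            assistant += chunk.choices[0].delta.content or ""
            yield history + [(message, assistant)]  # live update per chunk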