yzhuang committed
Commit c2ec273 · Parent(s): 3d9b062
Files changed (2):
  1. app.py    +4 -2
  2. server.py +17 -60
app.py CHANGED
@@ -3,7 +3,7 @@ import json
 import requests
 import sseclient
 import gradio as gr
-import server
+from server import setup_mixinputs, launch_vllm_server

 API_URL = "http://localhost:8000/v1/chat/completions"

@@ -53,7 +53,7 @@ def stream_completion(message, history, max_tokens, temperature, top_p, beta):


 # ----------------------- UI ---------------------------------------------
-with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo", teardown=lambda: _kill_proc_tree(server_proc)) as demo:
+with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     gr.Markdown(
         "## 🎨 Mixture of Inputs (MoI) Demo \n"
         "Streaming vLLM demo with dynamic **beta** adjustment in MoI, higher beta means less blending."
@@ -80,4 +80,6 @@ with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo", teardown=lambda: _kill
     clear_btn.click(lambda: None, None, chatbot, queue=False)

 if __name__ == "__main__":
+    setup_mixinputs()
+    launch_vllm_server(beta=1.0)
     demo.launch()
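
Note: the stream_completion handler named in the hunk header above consumes the server's OpenAI-compatible endpoint at API_URL. The sketch below is illustrative only, not the code hidden by the hunk: it assumes the standard chat-completions payload fields and streams with plain requests rather than the sseclient wrapper that app.py imports.

    # Minimal streaming-client sketch (assumed payload; app.py's real handler may differ).
    import json
    import requests

    API_URL = "http://localhost:8000/v1/chat/completions"

    def stream_chat(prompt, max_tokens=512, temperature=0.7, top_p=0.95):
        payload = {
            "model": "Qwen/Qwen3-4B",          # the model served by server.py
            "messages": [{"role": "user", "content": prompt}],
            "stream": True,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
        }
        with requests.post(API_URL, json=payload, stream=True) as resp:
            resp.raise_for_status()
            for raw in resp.iter_lines():
                if not raw:
                    continue
                line = raw.decode("utf-8")
                if not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data == "[DONE]":           # OpenAI-style streams end with [DONE]
                    break
                delta = json.loads(data)["choices"][0]["delta"].get("content", "")
                if delta:
                    yield delta

    # Usage: print tokens as they arrive.
    # for piece in stream_chat("Hello!"):
    #     print(piece, end="", flush=True)
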
server.py CHANGED
@@ -1,45 +1,20 @@
1
- # app.py ── launch vLLM inside a Hugging Face Space (with clean shutdown)
2
- import os, signal, sys, atexit, time, socket, subprocess
3
- import spaces # only needed for the GPU decorator
 
 
4
 
5
- # ----------------------------------------------------------------------
6
- # Helpers
7
- # ----------------------------------------------------------------------
8
- def _wait_for_port(host: str, port: int, timeout: int = 240):
9
- """Block until (host, port) accepts TCP connections or timeout."""
10
- deadline = time.time() + timeout
11
- while time.time() < deadline:
12
- with socket.socket() as sock:
13
- sock.settimeout(2)
14
- if sock.connect_ex((host, port)) == 0:
15
- return
16
- time.sleep(1)
17
- raise RuntimeError(f"vLLM server on {host}:{port} never came up")
18
-
19
- def _kill_proc_tree(proc: subprocess.Popen):
20
- """SIGTERM the whole process-group started by `proc` (if still alive)."""
21
- if proc and proc.poll() is None: # still running
22
- pgid = os.getpgid(proc.pid)
23
- os.killpg(pgid, signal.SIGTERM) # graceful
24
- try:
25
- proc.wait(15)
26
- except subprocess.TimeoutExpired:
27
- os.killpg(pgid, signal.SIGKILL) # force
28
-
29
- # ----------------------------------------------------------------------
30
- # Setup – runs on *CPU* only; fast.
31
- # ----------------------------------------------------------------------
32
  def setup_mixinputs():
 
33
  subprocess.run(["mixinputs", "setup"], check=True)
34
 
35
- # ----------------------------------------------------------------------
36
- # Serve runs on the GPU; heavy, so we mark it.
37
- # ----------------------------------------------------------------------
38
- def launch_vllm_server(beta: float = 1.0, port: int = 8000) -> subprocess.Popen:
39
  env = os.environ.copy()
40
  env["MIXINPUTS_BETA"] = str(beta)
41
  env["VLLM_USE_V1"] = "1"
42
 
 
43
  cmd = [
44
  "vllm", "serve",
45
  "Qwen/Qwen3-4B",
@@ -48,33 +23,15 @@ def launch_vllm_server(beta: float = 1.0, port: int = 8000) -> subprocess.Popen:
48
  "--max-model-len", "2048",
49
  "--max-seq-len-to-capture", "2048",
50
  "--max-num-seqs", "1",
51
- "--port", str(port)
52
  ]
 
53
 
54
- # new session its own process-group
55
- proc = subprocess.Popen(cmd, env=env, start_new_session=True)
56
- _wait_for_port("localhost", port) # block until ready
57
- return proc
58
-
59
- # ----------------------------------------------------------------------
60
- # MAIN
61
- # ----------------------------------------------------------------------
62
- if __name__ == "__main__":
63
- setup_mixinputs() # fast
64
- server_proc = launch_vllm_server() # heavy
65
 
66
- # Ensures the GPU process dies when the Space stops / reloads
67
- atexit.register(_kill_proc_tree, server_proc)
68
 
69
- # ---- your Gradio / FastAPI app goes below ----
70
- # e.g. import gradio as gr
71
- # with gr.Blocks(teardown=lambda: _kill_proc_tree(server_proc)) as demo:
72
- # ...
73
- # demo.launch(server_name="0.0.0.0", server_port=7860)
74
- #
75
- # For this snippet we’ll just block forever so the container
76
- # doesn’t exit immediately.
77
- try:
78
- server_proc.wait()
79
- except KeyboardInterrupt:
80
- pass
 
1
+ import subprocess
2
+ import threading
3
+ import os
4
+ import time
5
+ import spaces
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def setup_mixinputs():
8
+ # Step 1: Run mixinputs setup
9
  subprocess.run(["mixinputs", "setup"], check=True)
10
 
11
+ def launch_vllm_server(beta=1.0):
12
+ # Step 2: Set environment variables
 
 
13
  env = os.environ.copy()
14
  env["MIXINPUTS_BETA"] = str(beta)
15
  env["VLLM_USE_V1"] = "1"
16
 
17
+ # Step 3: Launch vLLM with custom options
18
  cmd = [
19
  "vllm", "serve",
20
  "Qwen/Qwen3-4B",
 
23
  "--max-model-len", "2048",
24
  "--max-seq-len-to-capture", "2048",
25
  "--max-num-seqs", "1",
26
+ "--port", "8000"
27
  ]
28
+ subprocess.run(cmd, env=env)
29
 
30
+ # # Step 1: Setup
31
+ # setup_mixinputs()
 
 
 
 
 
 
 
 
 
32
 
33
+ # # Step 2: Launch vLLM server in background
34
+ # threading.Thread(target=launch_vllm_server, daemon=True).start()
35
 
36
+ # # Step 3: Give time for server to initialize
37
+ # time.sleep(60)
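
Note: the rewritten launch_vllm_server wraps a blocking subprocess.run, and the commented-out steps above sketch starting it on a daemon thread instead. The snippet below is a hedged sketch, under the assumption that app.py's __main__ block adopts that threaded pattern; the placeholder Blocks stands in for the real MoI demo UI, and the 60-second sleep is the crude wait from the comments (a port probe like the removed _wait_for_port helper would be sturdier).

    # Sketch only: threaded startup mirroring server.py's commented-out Steps 1-3.
    import threading
    import time

    import gradio as gr
    from server import setup_mixinputs, launch_vllm_server

    # Placeholder UI so the sketch is self-contained; app.py's real `demo`
    # is the MoI Blocks app defined earlier in that file.
    with gr.Blocks(title="placeholder") as demo:
        gr.Markdown("placeholder")

    if __name__ == "__main__":
        setup_mixinputs()  # Step 1: one-off mixinputs setup (fast)

        # Step 2: launch_vllm_server blocks in subprocess.run, so run it on a
        # daemon thread; otherwise demo.launch() below would never be reached.
        threading.Thread(
            target=launch_vllm_server, kwargs={"beta": 1.0}, daemon=True
        ).start()

        # Step 3: crude readiness wait taken from the commented-out steps;
        # polling the port until it accepts connections would be more robust.
        time.sleep(60)

        demo.launch()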