Spaces:

yzhuang
/

MixtureOfInputs

Sleeping

App Files Community

yzhuang committed on May 22

Commit

3d9b062

1 Parent(s): 5a70ba7

fix

Browse files

Files changed (2) hide show

app.py +1 -1
server.py +60 -18

app.py CHANGED Viewed

@@ -53,7 +53,7 @@ def stream_completion(message, history, max_tokens, temperature, top_p, beta):
 # ----------------------- UI ---------------------------------------------
-with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo") as demo:
     gr.Markdown(
         "## 🎨 Mixture of Inputs (MoI) Demo  \n"
         "Streaming vLLM demo with dynamic **beta** adjustment in MoI, higher beta means less blending."

 # ----------------------- UI ---------------------------------------------
+with gr.Blocks(title="🎨 Mixture of Inputs (MoI) Demo", teardown=lambda: _kill_proc_tree(server_proc)) as demo:
     gr.Markdown(
         "## 🎨 Mixture of Inputs (MoI) Demo  \n"
         "Streaming vLLM demo with dynamic **beta** adjustment in MoI, higher beta means less blending."

server.py CHANGED Viewed

@@ -1,21 +1,45 @@
-import subprocess
-import threading
-import os
-import time
-import spaces
 def setup_mixinputs():
-    # Step 1: Run mixinputs setup
     subprocess.run(["mixinputs", "setup"], check=True)
-# @spaces.GPU(duration=240)
-def launch_vllm_server(beta=1.0):
-    # Step 2: Set environment variables
     env = os.environ.copy()
     env["MIXINPUTS_BETA"] = str(beta)
     env["VLLM_USE_V1"] = "1"
-    # Step 3: Launch vLLM with custom options
     cmd = [
         "vllm", "serve",
         "Qwen/Qwen3-4B",
@@ -24,15 +48,33 @@ def launch_vllm_server(beta=1.0):
         "--max-model-len", "2048",
         "--max-seq-len-to-capture", "2048",
         "--max-num-seqs", "1",
-        "--port", "8000"
     ]
-    subprocess.run(cmd, env=env)
-# Step 1: Setup
-setup_mixinputs()
-# Step 2: Launch vLLM server in background
-threading.Thread(target=launch_vllm_server, daemon=True).start()
-# Step 3: Give time for server to initialize
-time.sleep(60)

+# server.py ── launch vLLM inside a Hugging Face Space (with clean shutdown)
+import os, signal, sys, atexit, time, socket, subprocess
+import spaces          # only needed for the GPU decorator
+# ----------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------
+def _wait_for_port(host: str, port: int, timeout: int = 240):
+    """Block until (host, port) accepts TCP connections or timeout."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        with socket.socket() as sock:
+            sock.settimeout(2)
+            if sock.connect_ex((host, port)) == 0:
+                return
+        time.sleep(1)
+    raise RuntimeError(f"vLLM server on {host}:{port} never came up")
+def _kill_proc_tree(proc: subprocess.Popen):
+    """SIGTERM the whole process-group started by `proc` (if still alive)."""
+    if proc and proc.poll() is None:           # still running
+        pgid = os.getpgid(proc.pid)
+        os.killpg(pgid, signal.SIGTERM)        # graceful
+        try:
+            proc.wait(15)
+        except subprocess.TimeoutExpired:
+            os.killpg(pgid, signal.SIGKILL)    # force
+# ----------------------------------------------------------------------
+# Setup – runs on *CPU* only; fast.
+# ----------------------------------------------------------------------
 def setup_mixinputs():
     subprocess.run(["mixinputs", "setup"], check=True)
+# ----------------------------------------------------------------------
+# Serve – runs on the GPU; heavy. NOTE: no @spaces.GPU decorator is applied to this function yet.
+# ----------------------------------------------------------------------
+def launch_vllm_server(beta: float = 1.0, port: int = 8000) -> subprocess.Popen:
     env = os.environ.copy()
     env["MIXINPUTS_BETA"] = str(beta)
     env["VLLM_USE_V1"] = "1"
     cmd = [
         "vllm", "serve",
         "Qwen/Qwen3-4B",
         "--max-model-len", "2048",
         "--max-seq-len-to-capture", "2048",
         "--max-num-seqs", "1",
+        "--port", str(port)
     ]
+    # new session ⇒ its own process-group
+    proc = subprocess.Popen(cmd, env=env, start_new_session=True)
+    _wait_for_port("localhost", port)          # block until ready
+    return proc
+# ----------------------------------------------------------------------
+# MAIN
+# ----------------------------------------------------------------------
+if __name__ == "__main__":
+    setup_mixinputs()                          # fast
+    server_proc = launch_vllm_server()         # heavy
+    # Ensures the GPU process dies when the Space stops / reloads
+    atexit.register(_kill_proc_tree, server_proc)
+    # ---- your Gradio / FastAPI app goes below ----
+    #   e.g. import gradio as gr
+    #        with gr.Blocks(teardown=lambda: _kill_proc_tree(server_proc)) as demo:
+    #            ...
+    #        demo.launch(server_name="0.0.0.0", server_port=7860)
+    #
+    # For this snippet we’ll just block forever so the container
+    # doesn’t exit immediately.
+    try:
+        server_proc.wait()
+    except KeyboardInterrupt:
+        pass