Spaces:
Sleeping
Sleeping
File size: 1,098 Bytes
c2ec273 a846510 c2ec273 a846510 c2ec273 a846510 c2ec273 a846510 fb33d5d a846510 9bb13ea ce39d2b 8da91a5 c9e16e4 e56723e c9e16e4 a846510 911ff7c 3d038b0 a846510 d776d2f a846510 d776d2f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import subprocess
import threading
import os
import time
import spaces
def setup_mixinputs():
    """Run the ``mixinputs setup`` command and fail loudly if it errors.

    Raises:
        subprocess.CalledProcessError: If the command exits with a
            non-zero status (``check=True``).
    """
    setup_command = ["mixinputs", "setup"]
    subprocess.run(setup_command, check=True)
def launch_vllm_server(beta=1.0):
    """Spawn ``vllm serve`` for ``Qwen/Qwen3-4B`` as a child process.

    The child receives a copy of the current environment with
    ``MIXINPUTS_BETA`` set to ``str(beta)`` and ``VLLM_USE_V1`` set to
    ``"1"``. The function returns as soon as the process has been
    spawned; it does not wait for the server to exit.

    Args:
        beta: Value exported to the child process as ``MIXINPUTS_BETA``.
    """
    # Child environment: everything we currently have plus the two overrides.
    child_env = dict(os.environ, MIXINPUTS_BETA=str(beta), VLLM_USE_V1="1")

    # Command line for the server, assembled in flag groups; the resulting
    # argv order is significant only in that it is passed through verbatim.
    server_argv = ["vllm", "serve", "Qwen/Qwen3-4B"]
    server_argv += ["--tensor-parallel-size", "1"]
    server_argv += ["--enforce-eager"]
    server_argv += ["--max-model-len", "4096"]
    server_argv += ["--max-seq-len-to-capture", "4096"]
    server_argv += ["--max-num-seqs", "36"]
    server_argv += ["--host", "0.0.0.0", "--port", "8000"]
    server_argv += ["--enable-reasoning", "--reasoning-parser", "deepseek_r1"]
    server_argv += ["--api-key", "EMPTY"]

    # The handle stays local and is dropped when the function returns.
    server_handle = subprocess.Popen(server_argv, env=child_env)
# Startup sequence, executed when this module is run or imported.
# First the mixinputs setup, which must succeed before the server is started.
setup_mixinputs()

# Then start the vLLM server from a background daemon thread so this script
# is not blocked by the launch.
# NOTE(review): daemon threads are not joined at interpreter exit; if nothing
# else keeps this process alive the launch may be cut short -- confirm.
_launcher_thread = threading.Thread(target=launch_vllm_server, daemon=True)
_launcher_thread.start()
|