import subprocess
import threading
import os
import time
import spaces

# Handle to the vLLM server child process. Stays None until
# launch_vllm_server() has spawned it; kept at module level so the process
# can still be polled, waited on, or terminated after the launcher returns.
_server_process = None


def setup_mixinputs():
    """Run the ``mixinputs setup`` command.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
        FileNotFoundError: If the ``mixinputs`` executable cannot be found.
    """
    subprocess.run(["mixinputs", "setup"], check=True)


def launch_vllm_server(beta=1.0):
    """Spawn ``vllm serve`` for Qwen/Qwen3-4B as a child process.

    The child inherits a copy of the current environment with
    ``MIXINPUTS_BETA`` set to ``str(beta)`` and ``VLLM_USE_V1`` set to
    ``"1"``. The call does not block: it returns as soon as the process has
    been started, without waiting for the server to become ready.

    Args:
        beta: Value exported to the child as ``MIXINPUTS_BETA``.

    Returns:
        The ``subprocess.Popen`` handle of the server process. The same
        handle is also stored in the module-level ``_server_process``.
    """
    global _server_process

    # Work on a copy so the parent's own environment is left untouched.
    env = os.environ.copy()
    env["MIXINPUTS_BETA"] = str(beta)
    env["VLLM_USE_V1"] = "1"

    cmd = [
        "vllm", "serve",
        "Qwen/Qwen3-4B",
        "--tensor-parallel-size", "1",
        "--enforce-eager",
        "--max-model-len", "4096",
        "--max-seq-len-to-capture", "4096",
        "--max-num-seqs", "36",
        "--host", "0.0.0.0",
        "--port", "8000",
        "--enable-reasoning",
        "--reasoning-parser", "deepseek_r1",
        "--api-key", "EMPTY",
    ]

    # Deliberately no .wait() here: the server is meant to keep running
    # while the caller carries on. Keep the handle instead of dropping it.
    _server_process = subprocess.Popen(cmd, env=env)
    return _server_process


# Step 1: one-time mixinputs setup; must succeed before the server starts.
setup_mixinputs()

# Step 2: start the vLLM server from a background daemon thread.
threading.Thread(target=launch_vllm_server, daemon=True).start()