File size: 1,098 Bytes
c2ec273
 
 
 
 
a846510
 
c2ec273
a846510
 
c2ec273
 
a846510
 
 
 
c2ec273
a846510
 
fb33d5d
a846510
 
9bb13ea
 
ce39d2b
8da91a5
c9e16e4
e56723e
 
c9e16e4
a846510
911ff7c
 
3d038b0
 
a846510
d776d2f
 
a846510
d776d2f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import subprocess
import threading
import os
import time
import spaces

def setup_mixinputs():
    """Run the one-time ``mixinputs setup`` command.

    Raises:
        subprocess.CalledProcessError: if the setup command exits non-zero.
    """
    setup_cmd = ["mixinputs", "setup"]
    subprocess.run(setup_cmd, check=True)

def launch_vllm_server(beta=1.0):
    """Start a vLLM OpenAI-compatible server for Qwen/Qwen3-4B in a subprocess.

    The server is launched with the MIXINPUTS_BETA environment variable set,
    so the mixinputs patch (see ``setup_mixinputs``) picks up the mixing
    strength, and with vLLM's V1 engine enabled.

    Args:
        beta: Mixing-strength value exported as ``MIXINPUTS_BETA``.

    Returns:
        subprocess.Popen: handle to the running server process, so callers
        can ``wait()`` on it or ``terminate()`` it. (Previously the handle
        was assigned to a local and discarded, making the server
        unmanageable.)
    """
    # Inherit the current environment and add the knobs vLLM/mixinputs read.
    env = os.environ.copy()
    env["MIXINPUTS_BETA"] = str(beta)
    env["VLLM_USE_V1"] = "1"

    # Serve on all interfaces, port 8000, with reasoning-token parsing enabled.
    cmd = [
        "vllm", "serve",
        "Qwen/Qwen3-4B",
        "--tensor-parallel-size", "1",
        "--enforce-eager",
        "--max-model-len", "4096",
        "--max-seq-len-to-capture", "4096",
        "--max-num-seqs", "36",
        "--host", "0.0.0.0",
        "--port", "8000",
        "--enable-reasoning",
        "--reasoning-parser", "deepseek_r1",
        "--api-key", "EMPTY",
    ]
    return subprocess.Popen(cmd, env=env)

# One-time mixinputs initialization, then start the vLLM server in a
# background daemon thread so module import is not blocked by the server.
setup_mixinputs()

server_thread = threading.Thread(target=launch_vllm_server, daemon=True)
server_thread.start()