Qwen3 / llm_server.py
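"""Minimal FastAPI wrapper around llama-cpp-python that exposes an
OpenAI-style /v1/chat/completions endpoint, downloading a GGUF model
from the Hugging Face Hub on startup."""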
import os, time, logging
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download, login
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model storage location and the GGUF file to serve; MODEL_FILE (and MODEL_REPO,
# used below) must be provided via environment variables.
MODEL_DIR = Path("/app/pretrained_models/llm")
MODEL_DIR.mkdir(parents=True, exist_ok=True)
MODEL_FILE = os.getenv("MODEL_FILE")
MODEL_PATH = MODEL_DIR / MODEL_FILE

@app.on_event("startup")
async def startup():
    logging.info("Starting LLM service…")
    # Download the GGUF model from the Hugging Face Hub on first start,
    # authenticating with HF_TOKEN if one is provided (needed for gated repos).
    if not MODEL_PATH.exists():
        token = os.getenv("HF_TOKEN")
        if token:
            login(token=token)
        hf_hub_download(
            repo_id=os.getenv("MODEL_REPO"),
            filename=MODEL_FILE,
            local_dir=str(MODEL_DIR),
        )
    # Load the model CPU-only (n_gpu_layers=0) with a small context window and
    # the weights locked in RAM; the handle is kept in a module-level global.
    global llm
    llm = Llama(
        model_path=str(MODEL_PATH),
        n_ctx=1024,
        n_threads=2,
        n_gpu_layers=0,
        use_mlock=True,
        f16_kv=True,
    )
    logging.info("LLM loaded.")

@app.post("/v1/chat/completions")
async def chat(req: dict):
    # Only the advertised model name is accepted, mirroring the OpenAI API;
    # the response below is reshaped into an OpenAI-compatible payload.
    if req.get("model") != "llama-cpp":
        raise HTTPException(status_code=404, detail="Model not found")
    resp = llm.create_chat_completion(
        messages=req["messages"],
        max_tokens=req.get("max_tokens", 256),
        temperature=req.get("temperature", 0.7),
        top_p=req.get("top_p", 1.0),
        stream=False,
    )
    return JSONResponse({
        "id": resp["id"],
        "object": "chat.completion",
        "created": resp.get("created", int(time.time())),
        "model": "llama-cpp",
        "choices": [{
            "index": 0,
            "message": {
                "role": resp["choices"][0]["message"]["role"],
                "content": resp["choices"][0]["message"]["content"],
            },
            "finish_reason": resp["choices"][0].get("finish_reason", "stop"),
        }],
        "usage": resp.get("usage", {}),
    })
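
# Example request (a minimal sketch, not part of the deployed app): assumes the
# server is running locally, e.g. `uvicorn llm_server:app --host 0.0.0.0 --port 8000`,
# and that the `requests` package is available; the host, port, and client
# library are assumptions, not requirements of this file.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/v1/chat/completions",
#       json={
#           "model": "llama-cpp",
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])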