Tim Luka Horstmann committed
Commit 9f40e8d · 1 Parent(s): dcce920

Test first deployment

Files changed (3)
  1. Dockerfile +31 -0
  2. llm_server.py +70 -0
  3. requirements.txt +3 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.10-slim
+
+ ENV DEBIAN_FRONTEND=noninteractive \
+     MODEL_REPO="unsloth/Qwen3-0.6B-GGUF" \
+     MODEL_FILE="Qwen3-0.6B-Q4_K_M.gguf" \
+     HF_HOME=/app/cache
+
+ # system deps + rust for llama-cpp
+ RUN apt-get update && \
+     apt-get install -y --no-install-recommends \
+         build-essential cmake git curl wget libgomp1 ca-certificates && \
+     rm -rf /var/lib/apt/lists/* && \
+     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && \
+     . "$HOME/.cargo/env" && rustup default stable
+
+ WORKDIR /app
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ RUN git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
+     && cd /tmp/llama-cpp-python \
+     && FORCE_CMAKE=1 pip install --no-cache-dir . \
+     && pip install --no-cache-dir "llama-cpp-python[server]" huggingface_hub \
+     && rm -rf /tmp/llama-cpp-python
+
+
+ # Copy the LLM server code
+ COPY llm_server.py /app/llm_server.py
+
+ EXPOSE 7860
+ CMD ["uvicorn", "llm_server:app", "--host", "0.0.0.0", "--port", "7860"]
llm_server.py ADDED
@@ -0,0 +1,70 @@
+ import os, time, logging
+ from pathlib import Path
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download, login
+
+ app = FastAPI()
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ MODEL_DIR = Path("/app/pretrained_models/llm")
+ MODEL_DIR.mkdir(parents=True, exist_ok=True)
+ MODEL_FILE = os.getenv("MODEL_FILE")
+ MODEL_PATH = MODEL_DIR / MODEL_FILE
+
+ @app.on_event("startup")
+ async def startup():
+     logging.info("Starting LLM service…")
+     if not MODEL_PATH.exists():
+         token = os.getenv("HF_TOKEN")
+         if token:
+             login(token=token)
+         hf_hub_download(
+             repo_id=os.getenv("MODEL_REPO"),
+             filename=MODEL_FILE,
+             local_dir=str(MODEL_DIR)
+         )
+     global llm
+     llm = Llama(
+         model_path=str(MODEL_PATH),
+         n_ctx=1024,
+         n_threads=2,
+         n_gpu_layers=0,
+         use_mlock=True,
+         f16_kv=True,
+     )
+     logging.info("LLM loaded.")
+
+ @app.post("/v1/chat/completions")
+ async def chat(req: dict):
+     if req.get("model") != "llama-cpp":
+         raise HTTPException(status_code=404, detail="Model not found")
+     resp = llm.create_chat_completion(
+         messages=req["messages"],
+         max_tokens=req.get("max_tokens", 256),
+         temperature=req.get("temperature", 0.7),
+         top_p=req.get("top_p", 1.0),
+         stream=False
+     )
+     return JSONResponse({
+         "id": resp["id"],
+         "object": "chat.completion",
+         "created": resp.get("created", int(time.time())),
+         "model": "llama-cpp",
+         "choices": [{
+             "index": 0,
+             "message": {
+                 "role": resp["choices"][0]["message"]["role"],
+                 "content": resp["choices"][0]["message"]["content"],
+             },
+             "finish_reason": resp["choices"][0].get("finish_reason", "stop"),
+         }],
+         "usage": resp.get("usage", {}),
+     })
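Because the handler mirrors the OpenAI chat-completions shape, a minimal client call might look like the following standard-library sketch (not part of the commit; it assumes the server is already reachable on localhost:7860):

# Client sketch for the endpoint above (assumes the server runs on localhost:7860)
import json
import urllib.request

body = json.dumps({
    "model": "llama-cpp",  # any other value returns 404 per the handler above
    "messages": [{"role": "user", "content": "Give me one sentence about GGUF models."}],
    "max_tokens": 128,
    "temperature": 0.2,
}).encode()

req = urllib.request.Request(
    "http://localhost:7860/v1/chat/completions",
    data=body,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    answer = json.load(resp)
print(answer["choices"][0]["message"]["content"])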
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ fastapi
+ uvicorn[standard]
+ huggingface_hub