Tim Luka Horstmann committed
Commit · 7e32345 · 1 Parent(s): c6ab136
Tried to copy other approach

- Dockerfile +26 -62
- llm_server.py +127 -34
Dockerfile
CHANGED
@@ -1,81 +1,45 @@
-#
-FROM python:3.10

-#
-ARG HF_TOKEN
-ENV HF_TOKEN=${HF_TOKEN}
-
-# 3) Environment
ENV DEBIAN_FRONTEND=noninteractive \
    RUSTUP_HOME=/root/.rustup \
    CARGO_HOME=/root/.cargo \
    PATH=/root/.cargo/bin:$PATH \
-… (old lines 13-14 not shown)
-    HF_HOME=/app/cache \
-    TRANSFORMERS_CACHE=/app/cache

WORKDIR /app

-#
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
    build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
-    gcc g++ libffi-dev
-    libcurl4-openssl-dev \  # ← add this
    && rm -rf /var/lib/apt/lists/* \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && rustup default stable

-#
-RUN mkdir -p /app/cache /app/
-    chmod -R 777 /app/cache /app/pretrained_models/llm

-#
COPY requirements.txt .
-RUN sed -i '/llama-cpp-python/d' requirements.txt
-… (old lines 37-38 not shown)
-# 7) Build llama-cpp-python from source
-RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
-    cd /tmp/llama-cpp-python && \
-    git submodule update --init --recursive && \
-    python -m pip install --no-cache-dir . && \
-    rm -rf /tmp/llama-cpp-python
-
-# 8) Install huggingface_hub CLI
-RUN pip install --no-cache-dir huggingface_hub

-# 9) Clone llama.cpp and build just the quantize tool
-RUN git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp && \
-    cd /tmp/llama.cpp && \
-    cmake -B build -DLLAMA_BUILD_QUANTIZE=ON . && \
-    cmake --build build --target quantize

-#
-RUN
-… (old lines 57-62 not shown)
-    filename=os.getenv('MODEL_FILE'),
-    local_dir='/app/pretrained_models/llm',
-    token=os.getenv('HF_TOKEN')
-)
-EOF
-
-RUN /tmp/llama.cpp/build/quantize \
-    /app/pretrained_models/llm/${MODEL_FILE} \
-    /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-    q4_K_M && \
-    mv /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-    /app/pretrained_models/llm/${MODEL_FILE} && \
-    rm -rf /tmp/llama.cpp
-
-# 11) Copy server
-COPY llm_server.py /app/llm_server.py

EXPOSE 7860
-

+# Use an official Python runtime as a base image
+FROM python:3.10

+# Set non-interactive for apt
ENV DEBIAN_FRONTEND=noninteractive \
    RUSTUP_HOME=/root/.rustup \
    CARGO_HOME=/root/.cargo \
    PATH=/root/.cargo/bin:$PATH \
+    TRANSFORMERS_CACHE=/app/cache \
+    HF_HOME=/app/cache

+# Set working directory
WORKDIR /app

+# Install system dependencies, Rust, and build tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
+    gcc g++ libffi-dev libgcc-s1 libstdc++6 libopenblas-dev \
    && rm -rf /var/lib/apt/lists/* \
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
    && rustup default stable

+# Prepare cache directory
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache

+# Copy and install Python requirements (excluding llama-cpp-python)
COPY requirements.txt .
+RUN sed -i '/llama-cpp-python/d' requirements.txt \
+    && pip install --no-cache-dir -r requirements.txt


+# Clone & build llama-cpp-python (with its llama.cpp submodule)
+RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
+    && cd /tmp/llama-cpp-python \
+    # ensure we have all submodules
+    && git submodule update --init --recursive \
+    # install from source
+    && python -m pip install --no-cache-dir . \
+    && rm -rf /tmp/llama-cpp-python

+# Expose the port your FastAPI app runs on
EXPOSE 7860
+
+# Launch
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
llm_server.py
CHANGED
@@ -1,44 +1,137 @@
-import
-from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
-from huggingface_hub import login
from llama_cpp import Llama

-#
-logging.basicConfig(level=logging.INFO
-logger = logging.getLogger(

-# ─── FastAPI setup ──────────────────────────────────────────────────────────
app = FastAPI()

-#
-… (old lines 16-41 not shown)

# ─── OpenAI-compatible endpoint ──────────────────────────────────────────────
@app.post("/v1/chat/completions")

+import time
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from llama_cpp import Llama
+from huggingface_hub import login, hf_hub_download
+import logging
+import os
+import asyncio
+import psutil  # Added for RAM tracking

+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

app = FastAPI()

+# Global lock for model access
+model_lock = asyncio.Lock()
+
+# Authenticate with Hugging Face
+hf_token = os.getenv("HF_TOKEN")
+if not hf_token:
+    logger.error("HF_TOKEN environment variable not set.")
+    raise ValueError("HF_TOKEN not set")
+login(token=hf_token)
+
+# Models Configuration
+repo_id = "unsloth/Qwen3-1.7B-GGUF"  # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
+filename = "Qwen3-1.7B-Q4_K_M.gguf"  # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
+
+
+try:
+    # Load the model with optimized parameters
+    logger.info(f"Loading {filename} model")
+    model_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
+        token=hf_token,
+    )
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=3072,
+        n_threads=2,
+        n_batch=64,
+        n_gpu_layers=0,
+        use_mlock=True,
+        f16_kv=True,
+        verbose=True,
+        batch_prefill=True,
+        prefill_logits=False,
+    )
+    logger.info(f"{filename} model loaded")
+
+except Exception as e:
+    logger.error(f"Startup error: {str(e)}", exc_info=True)
+    raise
+
+
+# RAM Usage Tracking Function
+def get_ram_usage():
+    memory = psutil.virtual_memory()
+    total_ram = memory.total / (1024 ** 3)  # Convert to GB
+    used_ram = memory.used / (1024 ** 3)  # Convert to GB
+    free_ram = memory.available / (1024 ** 3)  # Convert to GB
+    percent_used = memory.percent
+    return {
+        "total_ram_gb": round(total_ram, 2),
+        "used_ram_gb": round(used_ram, 2),
+        "free_ram_gb": round(free_ram, 2),
+        "percent_used": percent_used
+    }
+
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
+
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": repo_id,
+        "model_size": "1.7B",
+        "quantization": "Q4_K_M",
+    }
+
+@app.get("/ram_usage")
+async def ram_usage():
+    """Endpoint to get current RAM usage."""
+    try:
+        ram_stats = get_ram_usage()
+        return ram_stats
+    except Exception as e:
+        logger.error(f"Error retrieving RAM usage: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
+
+# @app.on_event("startup")
+# async def warm_up_model():
+#     logger.info("Warming up the model...")
+#     dummy_query = "Hello"
+#     dummy_history = []
+#     async for _ in stream_response(dummy_query, dummy_history):
+#         pass
+#     logger.info("Model warm-up completed.")
+#     # Log initial RAM usage
+#     ram_stats = get_ram_usage()
+#     logger.info(f"Initial RAM usage after startup: {ram_stats}")
+
+# Add a background task to keep the model warm
+@app.on_event("startup")
+async def setup_periodic_tasks():
+    asyncio.create_task(keep_model_warm())
+    logger.info("Periodic model warm-up task scheduled")
+
+async def keep_model_warm():
+    """Background task that keeps the model warm by sending periodic requests"""
+    while True:
+        try:
+            logger.info("Performing periodic model warm-up")
+            dummy_query = "Say only the word 'ok.'"
+            dummy_history = []
+            # Process a dummy query through the generator to keep it warm
+            resp = llm.create_chat_completion(
+                messages=[{"role": "user", "content": dummy_query}],
+                max_tokens=1,
+                temperature=0.0,
+                top_p=1.0,
+                stream=False,
+            )
+            logger.info("Periodic warm-up completed")
+        except Exception as e:
+            logger.error(f"Error in periodic warm-up: {str(e)}")
+
+        # Wait for 13 minutes before the next warm-up
+        await asyncio.sleep(13 * 60)

# ─── OpenAI-compatible endpoint ──────────────────────────────────────────────
@app.post("/v1/chat/completions")
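
For quick manual testing of the rewritten server, a small client sketch is shown below. The /health and /ram_usage routes and port 7860 come from the code above; the body sent to /v1/chat/completions is an assumption, following the usual OpenAI-style chat payload, since the handler itself lies outside this hunk.

    # Hypothetical client sketch: endpoint paths and port are taken from
    # llm_server.py above; the chat payload shape is assumed (OpenAI-style)
    # because the /v1/chat/completions handler body is not shown in this diff.
    import requests

    BASE = "http://localhost:7860"

    print(requests.get(f"{BASE}/health").json())     # -> {"status": "healthy"}
    print(requests.get(f"{BASE}/ram_usage").json())  # -> RAM stats from psutil

    resp = requests.post(
        f"{BASE}/v1/chat/completions",
        json={
            "messages": [{"role": "user", "content": "Hello!"}],
            "max_tokens": 64,
            "temperature": 0.7,
        },
        timeout=120,
    )
    print(resp.json())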