Tim Luka Horstmann committed on
Commit 7e32345 · 1 Parent(s): c6ab136

Tried to copy other approach

Files changed (2):
1. Dockerfile +26 -62
2. llm_server.py +127 -34
Dockerfile CHANGED
@@ -1,81 +1,45 @@
-# 1) Base
-FROM python:3.10-slim
+# Use an official Python runtime as a base image
+FROM python:3.10
 
-# 2) Build‐time HF token
-ARG HF_TOKEN
-ENV HF_TOKEN=${HF_TOKEN}
-
-# 3) Environment
+# Set non-interactive for apt
 ENV DEBIAN_FRONTEND=noninteractive \
     RUSTUP_HOME=/root/.rustup \
     CARGO_HOME=/root/.cargo \
     PATH=/root/.cargo/bin:$PATH \
-    MODEL_REPO="unsloth/Qwen3-0.6B-GGUF" \
-    MODEL_FILE="Qwen3-0.6B-Q4_K_M.gguf" \
-    HF_HOME=/app/cache \
-    TRANSFORMERS_CACHE=/app/cache
+    TRANSFORMERS_CACHE=/app/cache \
+    HF_HOME=/app/cache
 
+# Set working directory
 WORKDIR /app
 
-# 4) System deps + Rust
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
+# Install system dependencies, Rust, and build tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
-    gcc g++ libffi-dev libopenblas-dev libstdc++6 libgcc-s1 \
-    libcurl4-openssl-dev \ # ← add this
+    gcc g++ libffi-dev libgcc-s1 libstdc++6 libopenblas-dev \
     && rm -rf /var/lib/apt/lists/* \
     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
     && rustup default stable
 
-# 5) Prepare directories
-RUN mkdir -p /app/cache /app/pretrained_models/llm && \
-    chmod -R 777 /app/cache /app/pretrained_models/llm
+# Prepare cache directory
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 
-# 6) Install Python deps (except llama-cpp-python)
+# Copy and install Python requirements (excluding llama-cpp-python)
 COPY requirements.txt .
-RUN sed -i '/llama-cpp-python/d' requirements.txt && \
-    pip install --no-cache-dir -r requirements.txt
-
-# 7) Build llama-cpp-python from source
-RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
-    cd /tmp/llama-cpp-python && \
-    git submodule update --init --recursive && \
-    python -m pip install --no-cache-dir . && \
-    rm -rf /tmp/llama-cpp-python
-
-# 8) Install huggingface_hub CLI
-RUN pip install --no-cache-dir huggingface_hub
+RUN sed -i '/llama-cpp-python/d' requirements.txt \
+    && pip install --no-cache-dir -r requirements.txt
 
-# 9) Clone llama.cpp and build just the quantize tool
-RUN git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp && \
-    cd /tmp/llama.cpp && \
-    cmake -B build -DLLAMA_BUILD_QUANTIZE=ON . && \
-    cmake --build build --target quantize
 
-# 10) Download & pre-quantize model (no runtime repack)
-RUN python3 - <<EOF
-import os
-from huggingface_hub import login, hf_hub_download
-if os.getenv('HF_TOKEN'):
-    login(token=os.getenv('HF_TOKEN'))
-hf_hub_download(
-    repo_id=os.getenv('MODEL_REPO'),
-    filename=os.getenv('MODEL_FILE'),
-    local_dir='/app/pretrained_models/llm',
-    token=os.getenv('HF_TOKEN')
-)
-EOF
-
-RUN /tmp/llama.cpp/build/quantize \
-    /app/pretrained_models/llm/${MODEL_FILE} \
-    /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-    q4_K_M && \
-    mv /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-       /app/pretrained_models/llm/${MODEL_FILE} && \
-    rm -rf /tmp/llama.cpp
-
-# 11) Copy server
-COPY llm_server.py /app/llm_server.py
+# Clone & build llama-cpp-python (with its llama.cpp submodule)
+RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
+    && cd /tmp/llama-cpp-python \
+    # ensure we have all submodules
+    && git submodule update --init --recursive \
+    # install from source
+    && python -m pip install --no-cache-dir . \
+    && rm -rf /tmp/llama-cpp-python
 
+# Expose the port your FastAPI app runs on
 EXPOSE 7860
-CMD ["uvicorn", "llm_server:app", "--host", "0.0.0.0", "--port", "7860"]
+
+# Launch
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
llm_server.py CHANGED
@@ -1,44 +1,137 @@
-import os, time, logging, asyncio
-from pathlib import Path
+import time
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
-from huggingface_hub import login
 from llama_cpp import Llama
+from huggingface_hub import login, hf_hub_download
+import logging
+import os
+import asyncio
+import psutil # Added for RAM tracking
 
-# ─── Logging ────────────────────────────────────────────────────────────────
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-logger = logging.getLogger("llm_server")
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# ─── FastAPI setup ─────────────────────────────────────────────────────────
 app = FastAPI()
 
-# ─── Model paths & token ────────────────────────────────────────────────────
-MODEL_DIR = Path("/app/pretrained_models/llm")
-MODEL_DIR.mkdir(exist_ok=True)
-MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")
-MODEL_PATH = MODEL_DIR / MODEL_FILE
-HF_TOKEN = os.getenv("HF_TOKEN")
-
-# ─── Instantiate & warm at import time ─────────────────────────────────────
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-
-logger.info("Loading & warming model (this may take a moment)...")
-llm = Llama(
-    model_path=str(MODEL_PATH),
-    n_ctx=1024,
-    n_threads=os.cpu_count(), # saturate all vCPUs
-    n_gpu_layers=0,
-    use_mlock=True,
-    f16_kv=True,
-)
-# one‐token warm-up to avoid any first‐request penalty
-llm.create_chat_completion(
-    messages=[{"role":"user","content":"/no_think ok"}],
-    max_tokens=1,
-    stream=False,
-)
-logger.info("Model ready")
+# Global lock for model access
+model_lock = asyncio.Lock()
+
+# Authenticate with Hugging Face
+hf_token = os.getenv("HF_TOKEN")
+if not hf_token:
+    logger.error("HF_TOKEN environment variable not set.")
+    raise ValueError("HF_TOKEN not set")
+login(token=hf_token)
+
+# Models Configuration
+repo_id = "unsloth/Qwen3-1.7B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
+filename = "Qwen3-1.7B-Q4_K_M.gguf" # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
+
+
+try:
+    # Load the model with optimized parameters
+    logger.info(f"Loading {filename} model")
+    model_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
+        token=hf_token,
+    )
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=3072,
+        n_threads=2,
+        n_batch=64,
+        n_gpu_layers=0,
+        use_mlock=True,
+        f16_kv=True,
+        verbose=True,
+        batch_prefill=True,
+        prefill_logits=False,
+    )
+    logger.info(f"{filename} model loaded")
+
+except Exception as e:
+    logger.error(f"Startup error: {str(e)}", exc_info=True)
+    raise
+
+
+# RAM Usage Tracking Function
+def get_ram_usage():
+    memory = psutil.virtual_memory()
+    total_ram = memory.total / (1024 ** 3) # Convert to GB
+    used_ram = memory.used / (1024 ** 3) # Convert to GB
+    free_ram = memory.available / (1024 ** 3) # Convert to GB
+    percent_used = memory.percent
+    return {
+        "total_ram_gb": round(total_ram, 2),
+        "used_ram_gb": round(used_ram, 2),
+        "free_ram_gb": round(free_ram, 2),
+        "percent_used": percent_used
+    }
+
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
+
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": repo_id,
+        "model_size": "1.7B",
+        "quantization": "Q4_K_M",
+    }
+
+@app.get("/ram_usage")
+async def ram_usage():
+    """Endpoint to get current RAM usage."""
+    try:
+        ram_stats = get_ram_usage()
+        return ram_stats
+    except Exception as e:
+        logger.error(f"Error retrieving RAM usage: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
+
+# @app.on_event("startup")
+# async def warm_up_model():
+#     logger.info("Warming up the model...")
+#     dummy_query = "Hello"
+#     dummy_history = []
+#     async for _ in stream_response(dummy_query, dummy_history):
+#         pass
+#     logger.info("Model warm-up completed.")
+#     # Log initial RAM usage
+#     ram_stats = get_ram_usage()
+#     logger.info(f"Initial RAM usage after startup: {ram_stats}")
+
+# Add a background task to keep the model warm
+@app.on_event("startup")
+async def setup_periodic_tasks():
+    asyncio.create_task(keep_model_warm())
+    logger.info("Periodic model warm-up task scheduled")
+
+async def keep_model_warm():
+    """Background task that keeps the model warm by sending periodic requests"""
+    while True:
+        try:
+            logger.info("Performing periodic model warm-up")
+            dummy_query = "Say only the word 'ok.'"
+            dummy_history = []
+            # Process a dummy query through the generator to keep it warm
+            resp = llm.create_chat_completion(
+                messages=[{"role": "user", "content": dummy_query}],
+                max_tokens=1,
+                temperature=0.0,
+                top_p=1.0,
+                stream=False,
+            )
+            logger.info("Periodic warm-up completed")
+        except Exception as e:
+            logger.error(f"Error in periodic warm-up: {str(e)}")
+
+        # Wait for 13 minutes before the next warm-up
+        await asyncio.sleep(13 * 60)
 
 # ─── OpenAI‐compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")