Tim Luka Horstmann committed on
Commit 7e32345 · 1 Parent(s): c6ab136

Tried to copy other approach

Files changed (2):
1. Dockerfile +26 -62
2. llm_server.py +127 -34
Dockerfile CHANGED
@@ -1,81 +1,45 @@
-# 1) Base
-FROM python:3.10-slim
+# Use an official Python runtime as a base image
+FROM python:3.10
 
-# 2) Build‐time HF token
-ARG HF_TOKEN
-ENV HF_TOKEN=${HF_TOKEN}
-
-# 3) Environment
+# Set non-interactive for apt
 ENV DEBIAN_FRONTEND=noninteractive \
     RUSTUP_HOME=/root/.rustup \
     CARGO_HOME=/root/.cargo \
     PATH=/root/.cargo/bin:$PATH \
-    MODEL_REPO="unsloth/Qwen3-0.6B-GGUF" \
-    MODEL_FILE="Qwen3-0.6B-Q4_K_M.gguf" \
-    HF_HOME=/app/cache \
-    TRANSFORMERS_CACHE=/app/cache
+    TRANSFORMERS_CACHE=/app/cache \
+    HF_HOME=/app/cache
 
+# Set working directory
 WORKDIR /app
 
-# 4) System deps + Rust
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
+# Install system dependencies, Rust, and build tools
+RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential cmake git curl wget ninja-build libgomp1 ca-certificates \
-    gcc g++ libffi-dev libopenblas-dev libstdc++6 libgcc-s1 \
-    libcurl4-openssl-dev \ # ← add this
+    gcc g++ libffi-dev libgcc-s1 libstdc++6 libopenblas-dev \
     && rm -rf /var/lib/apt/lists/* \
     && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
     && rustup default stable
 
-# 5) Prepare directories
-RUN mkdir -p /app/cache /app/pretrained_models/llm && \
-    chmod -R 777 /app/cache /app/pretrained_models/llm
+# Prepare cache directory
+RUN mkdir -p /app/cache && chmod -R 777 /app/cache
 
-# 6) Install Python deps (except llama-cpp-python)
+# Copy and install Python requirements (excluding llama-cpp-python)
 COPY requirements.txt .
-RUN sed -i '/llama-cpp-python/d' requirements.txt && \
-    pip install --no-cache-dir -r requirements.txt
-
-# 7) Build llama-cpp-python from source
-RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python && \
-    cd /tmp/llama-cpp-python && \
-    git submodule update --init --recursive && \
-    python -m pip install --no-cache-dir . && \
-    rm -rf /tmp/llama-cpp-python
-
-# 8) Install huggingface_hub CLI
-RUN pip install --no-cache-dir huggingface_hub
+RUN sed -i '/llama-cpp-python/d' requirements.txt \
+    && pip install --no-cache-dir -r requirements.txt
 
-# 9) Clone llama.cpp and build just the quantize tool
-RUN git clone https://github.com/ggerganov/llama.cpp.git /tmp/llama.cpp && \
-    cd /tmp/llama.cpp && \
-    cmake -B build -DLLAMA_BUILD_QUANTIZE=ON . && \
-    cmake --build build --target quantize
 
-# 10) Download & pre-quantize model (no runtime repack)
-RUN python3 - <<EOF
-import os
-from huggingface_hub import login, hf_hub_download
-if os.getenv('HF_TOKEN'):
-    login(token=os.getenv('HF_TOKEN'))
-hf_hub_download(
-    repo_id=os.getenv('MODEL_REPO'),
-    filename=os.getenv('MODEL_FILE'),
-    local_dir='/app/pretrained_models/llm',
-    token=os.getenv('HF_TOKEN')
-)
-EOF
-
-RUN /tmp/llama.cpp/build/quantize \
-    /app/pretrained_models/llm/${MODEL_FILE} \
-    /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-    q4_K_M && \
-    mv /app/pretrained_models/llm/${MODEL_FILE}.packed.gguf \
-       /app/pretrained_models/llm/${MODEL_FILE} && \
-    rm -rf /tmp/llama.cpp
-
-# 11) Copy server
-COPY llm_server.py /app/llm_server.py
+# Clone & build llama-cpp-python (with its llama.cpp submodule)
+RUN git clone --recursive https://github.com/abetlen/llama-cpp-python.git /tmp/llama-cpp-python \
+    && cd /tmp/llama-cpp-python \
+    # ensure we have all submodules
+    && git submodule update --init --recursive \
+    # install from source
+    && python -m pip install --no-cache-dir . \
+    && rm -rf /tmp/llama-cpp-python
 
+# Expose the port your FastAPI app runs on
 EXPOSE 7860
-CMD ["uvicorn", "llm_server:app", "--host", "0.0.0.0", "--port", "7860"]
+
+# Launch
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
llm_server.py CHANGED
@@ -1,44 +1,137 @@
-import os, time, logging, asyncio
-from pathlib import Path
+import time
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
-from huggingface_hub import login
 from llama_cpp import Llama
+from huggingface_hub import login, hf_hub_download
+import logging
+import os
+import asyncio
+import psutil # Added for RAM tracking
 
-# ─── Logging ────────────────────────────────────────────────────────────────
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-logger = logging.getLogger("llm_server")
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# ─── FastAPI setup ─────────────────────────────────────────────────────────
 app = FastAPI()
 
-# ─── Model paths & token ────────────────────────────────────────────────────
-MODEL_DIR = Path("/app/pretrained_models/llm")
-MODEL_DIR.mkdir(exist_ok=True)
-MODEL_FILE = os.getenv("MODEL_FILE", "Qwen3-0.6B-Q4_K_M.gguf")
-MODEL_PATH = MODEL_DIR / MODEL_FILE
-HF_TOKEN = os.getenv("HF_TOKEN")
-
-# ─── Instantiate & warm at import time ─────────────────────────────────────
-if HF_TOKEN:
-    login(token=HF_TOKEN)
-
-logger.info("Loading & warming model (this may take a moment)...")
-llm = Llama(
-    model_path=str(MODEL_PATH),
-    n_ctx=1024,
-    n_threads=os.cpu_count(), # saturate all vCPUs
-    n_gpu_layers=0,
-    use_mlock=True,
-    f16_kv=True,
-)
-# one‐token warm-up to avoid any first‐request penalty
-llm.create_chat_completion(
-    messages=[{"role":"user","content":"/no_think ok"}],
-    max_tokens=1,
-    stream=False,
-)
-logger.info("Model ready")
+# Global lock for model access
+model_lock = asyncio.Lock()
+
+# Authenticate with Hugging Face
+hf_token = os.getenv("HF_TOKEN")
+if not hf_token:
+    logger.error("HF_TOKEN environment variable not set.")
+    raise ValueError("HF_TOKEN not set")
+login(token=hf_token)
+
+# Models Configuration
+repo_id = "unsloth/Qwen3-1.7B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF" # "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
+filename = "Qwen3-1.7B-Q4_K_M.gguf" # "deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf"
+
+
+try:
+    # Load the model with optimized parameters
+    logger.info(f"Loading {filename} model")
+    model_path = hf_hub_download(
+        repo_id=repo_id,
+        filename=filename,
+        local_dir="/app/cache" if os.getenv("HF_HOME") else None,
+        token=hf_token,
+    )
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=3072,
+        n_threads=2,
+        n_batch=64,
+        n_gpu_layers=0,
+        use_mlock=True,
+        f16_kv=True,
+        verbose=True,
+        batch_prefill=True,
+        prefill_logits=False,
+    )
+    logger.info(f"{filename} model loaded")
+
+except Exception as e:
+    logger.error(f"Startup error: {str(e)}", exc_info=True)
+    raise
+
+
+# RAM Usage Tracking Function
+def get_ram_usage():
+    memory = psutil.virtual_memory()
+    total_ram = memory.total / (1024 ** 3) # Convert to GB
+    used_ram = memory.used / (1024 ** 3) # Convert to GB
+    free_ram = memory.available / (1024 ** 3) # Convert to GB
+    percent_used = memory.percent
+    return {
+        "total_ram_gb": round(total_ram, 2),
+        "used_ram_gb": round(used_ram, 2),
+        "free_ram_gb": round(free_ram, 2),
+        "percent_used": percent_used
+    }
+
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy"}
+
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": repo_id,
+        "model_size": "1.7B",
+        "quantization": "Q4_K_M",
+    }
+
+@app.get("/ram_usage")
+async def ram_usage():
+    """Endpoint to get current RAM usage."""
+    try:
+        ram_stats = get_ram_usage()
+        return ram_stats
+    except Exception as e:
+        logger.error(f"Error retrieving RAM usage: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
+
+# @app.on_event("startup")
+# async def warm_up_model():
+#     logger.info("Warming up the model...")
+#     dummy_query = "Hello"
+#     dummy_history = []
+#     async for _ in stream_response(dummy_query, dummy_history):
+#         pass
+#     logger.info("Model warm-up completed.")
+#     # Log initial RAM usage
+#     ram_stats = get_ram_usage()
+#     logger.info(f"Initial RAM usage after startup: {ram_stats}")
+
+# Add a background task to keep the model warm
+@app.on_event("startup")
+async def setup_periodic_tasks():
+    asyncio.create_task(keep_model_warm())
+    logger.info("Periodic model warm-up task scheduled")
+
+async def keep_model_warm():
+    """Background task that keeps the model warm by sending periodic requests"""
+    while True:
+        try:
+            logger.info("Performing periodic model warm-up")
+            dummy_query = "Say only the word 'ok.'"
+            dummy_history = []
+            # Process a dummy query through the generator to keep it warm
+            resp = llm.create_chat_completion(
+                messages=[{"role": "user", "content": dummy_query}],
+                max_tokens=1,
+                temperature=0.0,
+                top_p=1.0,
+                stream=False,
+            )
+            logger.info("Periodic warm-up completed")
+        except Exception as e:
+            logger.error(f"Error in periodic warm-up: {str(e)}")
+
+        # Wait for 13 minutes before the next warm-up
+        await asyncio.sleep(13 * 60)
 
 # ─── OpenAI‐compatible endpoint ─────────────────────────────────────────────
 @app.post("/v1/chat/completions")