# syntax=docker/dockerfile:1
# vscode-python312 / Dockerfile
# NOTE: BuildKit (Dockerfile syntax 1.4+) is required for the RUN heredocs used below.
FROM runpod/pytorch:2.8.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04
# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Leave CUDA_VISIBLE_DEVICES unset ("all" is not a valid value for it) so every GPU stays visible
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:$PATH
# Install system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    git \
    vim \
    tmux \
    htop \
    nvtop \
    build-essential \
    software-properties-common \
    ca-certificates \
    gnupg \
    lsb-release \
    sudo \
    openssh-server \
    nginx \
    supervisor \
    && rm -rf /var/lib/apt/lists/*
# Install code-server (VSCode in browser)
RUN curl -fsSL https://code-server.dev/install.sh | sh
# Install Ollama
RUN curl -fsSL https://ollama.com/install.sh | sh
# Upgrade pip and install base Python packages
RUN pip install --upgrade pip setuptools wheel
# Install hf_transfer first for faster downloads
RUN pip install hf_transfer
# Install critical ML infrastructure
RUN pip install \
    accelerate \
    transformers \
    datasets \
    peft \
    bitsandbytes \
    safetensors \
    sentencepiece \
    protobuf \
    scipy \
    einops \
    wandb \
    tensorboard
# Install vLLM with CUDA 12.8 support
RUN pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
# Install Flash Attention 2 (critical for 5090)
RUN pip install flash-attn --no-build-isolation
# Install Unsloth with 5090 patches
# Using the approach from the referenced repo
RUN git clone https://github.com/unslothai/unsloth.git /tmp/unsloth && \
cd /tmp/unsloth && \
pip install -e . && \
cd / && \
rm -rf /tmp/unsloth/.git
# Install Axolotl
RUN git clone https://github.com/axolotl-ai-cloud/axolotl /tmp/axolotl && \
cd /tmp/axolotl && \
pip install -e . && \
cd / && \
rm -rf /tmp/axolotl/.git
# Install Open-WebUI dependencies
RUN apt-get update && apt-get install -y \
    nodejs \
    npm \
    && rm -rf /var/lib/apt/lists/*
# Clone and setup Open-WebUI
RUN git clone https://github.com/open-webui/open-webui.git /opt/open-webui && \
cd /opt/open-webui && \
npm install && \
npm run build
# Create workspace directory
RUN mkdir -p /workspace
# Configure code-server
RUN mkdir -p /root/.config/code-server
RUN echo "bind-addr: 0.0.0.0:8080\nauth: none\ncert: false" > /root/.config/code-server/config.yaml
# Configure SSH (optional but useful)
RUN mkdir /var/run/sshd
RUN echo 'root:runpod' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
# Create supervisor config to run all services
RUN mkdir -p /etc/supervisor/conf.d
RUN cat > /etc/supervisor/conf.d/services.conf << 'EOF'
[supervisord]
nodaemon=true
[program:code-server]
command=code-server --bind-addr 0.0.0.0:8080 --auth none
autostart=true
autorestart=true
stderr_logfile=/var/log/code-server.err.log
stdout_logfile=/var/log/code-server.out.log
[program:ollama]
command=ollama serve
autostart=true
autorestart=true
environment=OLLAMA_HOST="0.0.0.0"
stderr_logfile=/var/log/ollama.err.log
stdout_logfile=/var/log/ollama.out.log
[program:sshd]
command=/usr/sbin/sshd -D
autostart=true
autorestart=true
[program:open-webui]
directory=/opt/open-webui
command=npm start
autostart=true
autorestart=true
environment=PORT="3000",OLLAMA_BASE_URL="http://localhost:11434"
stderr_logfile=/var/log/open-webui.err.log
stdout_logfile=/var/log/open-webui.out.log
EOF
# Create a startup script for vLLM (runs on demand)
RUN cat > /usr/local/bin/start-vllm << 'EOF'
#!/bin/bash
# Usage: start-vllm <model-name>
python -m vllm.entrypoints.openai.api_server \
    --model "$1" \
    --tensor-parallel-size "${CUDA_DEVICE_COUNT:-1}" \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768 \
    --host 0.0.0.0 \
    --port 8000
EOF
RUN chmod +x /usr/local/bin/start-vllm
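# Example usage inside the running container (model name is illustrative):
#   start-vllm meta-llama/Llama-2-7b-hf
#   curl http://localhost:8000/v1/models   # vLLM serves an OpenAI-compatible API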
# Create multi-GPU training helper script
RUN cat > /usr/local/bin/train-multi-gpu << 'EOF'
#!/bin/bash
# Launch a training script with accelerate across all detected GPUs.
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
accelerate launch \
    --num_processes "$GPU_COUNT" \
    --num_machines 1 \
    --mixed_precision bf16 \
    --dynamo_backend no \
    "$@"
EOF
RUN chmod +x /usr/local/bin/train-multi-gpu
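# Example usage inside the running container:
#   train-multi-gpu /workspace/example_multi_gpu_train.py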
# Create accelerate config
RUN cat > /workspace/accelerate_config.yaml << 'EOF'
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
EOF
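# To use this config file directly instead of the train-multi-gpu helper:
#   accelerate launch --config_file /workspace/accelerate_config.yaml your_script.py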
# Create setup script
RUN cat > /workspace/setup.sh << 'EOF'
#!/bin/bash
echo "πŸš€ RunPod ML Stack Setup"
echo "========================"
# Check GPU availability
echo -e "\nπŸ“Š GPU Status:"
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader,nounits | nl -v 0
# Count GPUs
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "\nβœ… Found $GPU_COUNT GPU(s)"
# Update accelerate config with correct GPU count
sed -i "s/num_processes: 8/num_processes: $GPU_COUNT/g" /workspace/accelerate_config.yaml
# Pull a default model for Ollama if not exists
if ! ollama list | grep -q "llama3.2"; then
echo -e "\nπŸ“₯ Pulling default Ollama model (llama3.2)..."
ollama pull llama3.2
fi
echo -e "\nπŸŽ‰ Setup complete! Services available at:"
echo " VSCode: http://localhost:8080"
echo " Ollama: http://localhost:11434"
echo " Open-WebUI: http://localhost:3000"
echo " vLLM: http://localhost:8000 (start with: start-vllm <model>)"
EOF
RUN chmod +x /workspace/setup.sh
# Create example multi-GPU training script
RUN cat > /workspace/example_multi_gpu_train.py << 'EOF'
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm


def main():
    # Setup
    model_name = "meta-llama/Llama-2-7b-hf"
    batch_size = 4
    gradient_accumulation_steps = 4
    learning_rate = 2e-5
    num_epochs = 3

    # Initialize accelerator (let accelerate handle gradient accumulation)
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

    # Print GPU info
    if accelerator.is_main_process:
        print(f"🚀 Training on {accelerator.num_processes} GPU(s)")
        print(f"💾 Total batch size: {batch_size * accelerator.num_processes * gradient_accumulation_steps}")

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        use_cache=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    dataset = load_dataset("imdb", split="train[:1000]")  # Small subset for demo

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    # Create DataLoader
    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Prepare for distributed training
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        if accelerator.is_main_process:
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            progress_bar = tqdm(total=len(dataloader), desc="Training")

        for step, batch in enumerate(dataloader):
            with accelerator.accumulate(model):
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["input_ids"],
                )
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()

            if accelerator.is_main_process:
                progress_bar.update(1)
                if step % 10 == 0:
                    progress_bar.set_postfix({"loss": loss.item()})

        if accelerator.is_main_process:
            progress_bar.close()

    # Save model (unwrap the DDP-wrapped model before saving)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        accelerator.unwrap_model(model).save_pretrained("./trained_model")
        print("✅ Training complete! Model saved to ./trained_model")


if __name__ == "__main__":
    main()
EOF
# Create a helpful README
RUN cat > /workspace/README.md << 'EOF'
# RunPod Multi-GPU ML Stack πŸš€
## Quick Start
Run `/workspace/setup.sh` first to detect GPUs and pull models!
## Services:
- **VSCode**: http://localhost:8080
- **Ollama API**: http://localhost:11434
- **vLLM API**: http://localhost:8000 (start with: `start-vllm <model-name>`)
- **Open-WebUI**: http://localhost:3000
## Multi-GPU Commands:
- Training: `train-multi-gpu your_script.py`
- vLLM: `start-vllm meta-llama/Llama-2-7b-hf`
- Test multi-GPU: `python example_multi_gpu_train.py`
## RTX 5090 Support:
This image includes patched Unsloth and Flash Attention 2 for RTX 5090 compatibility.
## SSH Access:
Default password is `runpod`. Change it with `passwd`.
## Tips:
- Check GPU status: `nvidia-smi`
- Monitor GPUs: `nvtop`
- List Ollama models: `ollama list`
- Pull new models: `ollama pull <model>`
EOF
# Expose all necessary ports
# 22    SSH
# 8080  code-server (VSCode)
# 11434 Ollama API
# 8000  vLLM API
# 3000  Open-WebUI
# 6006  TensorBoard
# 8888  Jupyter (if needed)
# 5000  Flask/FastAPI apps
# 7860  Gradio apps
# 29500 Distributed training master port
EXPOSE 22 8080 11434 8000 3000 6006 8888 5000 7860 29500
# Set working directory
WORKDIR /workspace
# Run setup on each start, then launch supervisord in the foreground
CMD bash -c "/workspace/setup.sh && exec /usr/bin/supervisord -n -c /etc/supervisor/supervisord.conf"