# syntax=docker/dockerfile:1
# The heredoc (<<EOF) blocks below require BuildKit; the directive above makes sure it is used.
FROM runpod/pytorch:2.8.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Note: CUDA_VISIBLE_DEVICES is intentionally left unset ("all" is not a valid value for it
# and would hide every GPU); NVIDIA_VISIBLE_DEVICES=all below is what the container runtime uses.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:$PATH
# Install system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    git \
    vim \
    tmux \
    htop \
    nvtop \
    build-essential \
    software-properties-common \
    ca-certificates \
    gnupg \
    lsb-release \
    sudo \
    openssh-server \
    nginx \
    supervisor \
    && rm -rf /var/lib/apt/lists/*
# Install code-server (VSCode in browser)
RUN curl -fsSL https://code-server.dev/install.sh | sh

# Install Ollama
RUN curl -fsSL https://ollama.com/install.sh | sh
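# Example (runtime, not executed at build): once the Ollama service is running you can
# chat with a pulled model interactively, e.g.
#   ollama run llama3.2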
# Upgrade pip and install base Python packages
RUN pip install --upgrade pip setuptools wheel

# Install hf_transfer first for faster downloads
RUN pip install hf_transfer

# Install critical ML infrastructure
RUN pip install \
    accelerate \
    transformers \
    datasets \
    peft \
    bitsandbytes \
    safetensors \
    sentencepiece \
    protobuf \
    scipy \
    einops \
    wandb \
    tensorboard
# Install vLLM with CUDA 12.8 support
RUN pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128

# Install Flash Attention 2 (critical for 5090)
RUN pip install flash-attn --no-build-isolation
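# Optional sanity check (assumes flash_attn exposes __version__; uncomment to verify at build time):
# RUN python -c "import flash_attn; print(flash_attn.__version__)"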
# Install Unsloth with 5090 patches
# Using the approach from the referenced repo
RUN git clone https://github.com/unslothai/unsloth.git /tmp/unsloth && \
    cd /tmp/unsloth && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/unsloth/.git

# Install Axolotl
RUN git clone https://github.com/axolotl-ai-cloud/axolotl /tmp/axolotl && \
    cd /tmp/axolotl && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/axolotl/.git
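# Example (runtime, not executed at build): one common way to launch an Axolotl fine-tune from a
# YAML config is via accelerate; the config path below is a placeholder.
#   accelerate launch -m axolotl.cli.train /workspace/your_axolotl_config.yml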
# Install Open-WebUI dependencies
RUN apt-get update && apt-get install -y \
    nodejs \
    npm \
    && rm -rf /var/lib/apt/lists/*

# Clone and set up Open-WebUI
RUN git clone https://github.com/open-webui/open-webui.git /opt/open-webui && \
    cd /opt/open-webui && \
    npm install && \
    npm run build
# Create workspace directory
RUN mkdir -p /workspace

# Configure code-server
RUN mkdir -p /root/.config/code-server
RUN printf 'bind-addr: 0.0.0.0:8080\nauth: none\ncert: false\n' > /root/.config/code-server/config.yaml
# Configure SSH (optional but useful)
RUN mkdir /var/run/sshd
RUN echo 'root:runpod' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
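# Example (runtime): connect with the default credentials set above; host and port are
# placeholders for whatever TCP mapping your provider exposes.
#   ssh root@<pod-ip> -p <mapped-port>   # password: runpod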
# Create supervisor config to run all services
RUN mkdir -p /etc/supervisor/conf.d
RUN cat > /etc/supervisor/conf.d/services.conf <<'EOF'
[supervisord]
nodaemon=true

[program:code-server]
command=code-server --bind-addr 0.0.0.0:8080 --auth none
autostart=true
autorestart=true
stderr_logfile=/var/log/code-server.err.log
stdout_logfile=/var/log/code-server.out.log

[program:ollama]
command=ollama serve
autostart=true
autorestart=true
environment=OLLAMA_HOST="0.0.0.0"
stderr_logfile=/var/log/ollama.err.log
stdout_logfile=/var/log/ollama.out.log

[program:sshd]
command=/usr/sbin/sshd -D
autostart=true
autorestart=true
[program:open-webui]
; supervisor does not run commands through a shell, so use directory= instead of "cd ... &&"
directory=/opt/open-webui
command=npm start
autostart=true
autorestart=true
environment=PORT="3000",OLLAMA_BASE_URL="http://localhost:11434"
stderr_logfile=/var/log/open-webui.err.log
stdout_logfile=/var/log/open-webui.out.log
EOF
# Create a startup script for vLLM (runs on demand)
RUN cat > /usr/local/bin/start-vllm <<'EOF'
#!/bin/bash
if [ -z "$1" ]; then
    echo "Usage: start-vllm <model-name-or-path>" >&2
    exit 1
fi
# Default tensor parallelism to the number of visible GPUs; override with CUDA_DEVICE_COUNT.
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
python -m vllm.entrypoints.openai.api_server \
    --model "$1" \
    --tensor-parallel-size "${CUDA_DEVICE_COUNT:-$GPU_COUNT}" \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768 \
    --host 0.0.0.0 \
    --port 8000
EOF
RUN chmod +x /usr/local/bin/start-vllm
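# Example (runtime, not executed at build): serve a model and query the OpenAI-compatible
# endpoint vLLM exposes; the model name is only an illustration.
#   start-vllm meta-llama/Llama-2-7b-hf
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 32}'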
# Create multi-GPU training helper script
RUN cat > /usr/local/bin/train-multi-gpu <<'EOF'
#!/bin/bash
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
accelerate launch \
    --num_processes "$GPU_COUNT" \
    --num_machines 1 \
    --mixed_precision bf16 \
    --dynamo_backend no \
    "$@"
EOF
RUN chmod +x /usr/local/bin/train-multi-gpu
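# Example (runtime): launch any accelerate-compatible script across all detected GPUs, e.g.
#   train-multi-gpu /workspace/example_multi_gpu_train.py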
# Create accelerate config
RUN cat > /workspace/accelerate_config.yaml <<'EOF'
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
EOF
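# To use this config explicitly (setup.sh below rewrites num_processes to the detected GPU count):
#   accelerate launch --config_file /workspace/accelerate_config.yaml your_script.py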
# Create setup script
RUN cat > /workspace/setup.sh <<'EOF'
#!/bin/bash
echo "RunPod ML Stack Setup"
echo "====================="

# Check GPU availability
echo -e "\nGPU Status:"
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader,nounits | nl -v 0

# Count GPUs
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "\nFound $GPU_COUNT GPU(s)"

# Update accelerate config with the correct GPU count
sed -i "s/num_processes: 8/num_processes: $GPU_COUNT/g" /workspace/accelerate_config.yaml

# Pull a default model for Ollama if the server is up and the model is missing
if ollama list >/dev/null 2>&1; then
    if ! ollama list | grep -q "llama3.2"; then
        echo -e "\nPulling default Ollama model (llama3.2)..."
        ollama pull llama3.2
    fi
else
    echo -e "\nOllama server is not running yet; pull models later with: ollama pull llama3.2"
fi

echo -e "\nSetup complete! Services available at:"
echo "  VSCode:     http://localhost:8080"
echo "  Ollama:     http://localhost:11434"
echo "  Open-WebUI: http://localhost:3000"
echo "  vLLM:       http://localhost:8000 (start with: start-vllm <model>)"
EOF
RUN chmod +x /workspace/setup.sh
# Create example multi-GPU training script
RUN cat > /workspace/example_multi_gpu_train.py <<'EOF'
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm


def main():
    # Hyperparameters
    model_name = "meta-llama/Llama-2-7b-hf"
    batch_size = 4
    gradient_accumulation_steps = 4
    learning_rate = 2e-5
    num_epochs = 3

    # Initialize accelerator (pass accumulation steps so accelerator.accumulate() uses them)
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

    # Print GPU info
    if accelerator.is_main_process:
        print(f"Training on {accelerator.num_processes} GPU(s)")
        print(f"Total batch size: {batch_size * accelerator.num_processes * gradient_accumulation_steps}")

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        use_cache=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    dataset = load_dataset("imdb", split="train[:1000]")  # Small subset for demo

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    # Create DataLoader
    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Prepare for distributed training
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        if accelerator.is_main_process:
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            progress_bar = tqdm(total=len(dataloader), desc="Training")
        for step, batch in enumerate(dataloader):
            with accelerator.accumulate(model):
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["input_ids"],
                )
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()
            if accelerator.is_main_process:
                progress_bar.update(1)
                if step % 10 == 0:
                    progress_bar.set_postfix({"loss": loss.item()})
        if accelerator.is_main_process:
            progress_bar.close()

    # Save model (unwrap the DDP wrapper added by accelerator.prepare before saving)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        accelerator.unwrap_model(model).save_pretrained("./trained_model")
        print("Training complete! Model saved to ./trained_model")


if __name__ == "__main__":
    main()
EOF
# Create a helpful README
RUN cat > /workspace/README.md <<'EOF'
# RunPod Multi-GPU ML Stack

## Quick Start
Run `/workspace/setup.sh` first to detect GPUs and pull models!

## Services
- **VSCode**: http://localhost:8080
- **Ollama API**: http://localhost:11434
- **vLLM API**: http://localhost:8000 (start with: `start-vllm <model-name>`)
- **Open-WebUI**: http://localhost:3000

## Multi-GPU Commands
- Training: `train-multi-gpu your_script.py`
- vLLM: `start-vllm meta-llama/Llama-2-7b-hf`
- Test multi-GPU: `train-multi-gpu example_multi_gpu_train.py`

## RTX 5090 Support
This image includes patched Unsloth and Flash Attention 2 for RTX 5090 compatibility.

## SSH Access
The default root password is `runpod`. Change it with `passwd`.

## Tips
- Check GPU status: `nvidia-smi`
- Monitor GPUs: `nvtop`
- List Ollama models: `ollama list`
- Pull new models: `ollama pull <model>`
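
## API Quick Test
Examples for poking the local APIs once the services are up. Adjust the model names:
`llama3.2` is only present if `setup.sh` pulled it, and the vLLM model must match whatever
you passed to `start-vllm`.

```bash
# Ollama
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "prompt": "Hello", "stream": false}'

# vLLM (OpenAI-compatible)
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 32}'
```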
EOF
# Expose all necessary ports:
#   22    SSH
#   8080  code-server (VSCode)
#   11434 Ollama API
#   8000  vLLM API
#   3000  Open-WebUI
#   6006  TensorBoard
#   8888  Jupyter (if needed)
#   5000  Flask/FastAPI apps
#   7860  Gradio apps
#   29500 Distributed training master port
EXPOSE 22 8080 11434 8000 3000 6006 8888 5000 7860 29500
# Set working directory
WORKDIR /workspace

# Run setup on first start, then supervisor in the foreground
CMD bash -c "/workspace/setup.sh && /usr/bin/supervisord -n -c /etc/supervisor/supervisord.conf"