# syntax=docker/dockerfile:1
# The heredoc (<<EOF) blocks below require BuildKit; the directive above makes sure it is used.
FROM runpod/pytorch:2.8.0-py3.11-cuda12.8.1-cudnn-devel-ubuntu22.04

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Note: CUDA_VISIBLE_DEVICES is intentionally left unset ("all" is not a valid value for it
# and would hide every GPU); NVIDIA_VISIBLE_DEVICES=all below is what the container runtime uses.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
ENV PATH=/usr/local/cuda/bin:$PATH
# Install system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    git \
    vim \
    tmux \
    htop \
    nvtop \
    build-essential \
    software-properties-common \
    ca-certificates \
    gnupg \
    lsb-release \
    sudo \
    openssh-server \
    nginx \
    supervisor \
    && rm -rf /var/lib/apt/lists/*
# Install code-server (VSCode in browser)
RUN curl -fsSL https://code-server.dev/install.sh | sh

# Install Ollama
RUN curl -fsSL https://ollama.com/install.sh | sh
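# Example (runtime, not executed at build): once the Ollama service is running you can
# chat with a pulled model interactively, e.g.
#   ollama run llama3.2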
# Upgrade pip and install base Python packages
RUN pip install --upgrade pip setuptools wheel

# Install hf_transfer first for faster downloads
RUN pip install hf_transfer

# Install critical ML infrastructure
RUN pip install \
    accelerate \
    transformers \
    datasets \
    peft \
    bitsandbytes \
    safetensors \
    sentencepiece \
    protobuf \
    scipy \
    einops \
    wandb \
    tensorboard
# Install vLLM with CUDA 12.8 support
RUN pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128

# Install Flash Attention 2 (critical for 5090)
RUN pip install flash-attn --no-build-isolation
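# Optional sanity check (assumes flash_attn exposes __version__; uncomment to verify at build time):
# RUN python -c "import flash_attn; print(flash_attn.__version__)"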
# Install Unsloth with 5090 patches
# Using the approach from the referenced repo
RUN git clone https://github.com/unslothai/unsloth.git /tmp/unsloth && \
    cd /tmp/unsloth && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/unsloth/.git

# Install Axolotl
RUN git clone https://github.com/axolotl-ai-cloud/axolotl /tmp/axolotl && \
    cd /tmp/axolotl && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/axolotl/.git
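# Example (runtime, not executed at build): one common way to launch an Axolotl fine-tune from a
# YAML config is via accelerate; the config path below is a placeholder.
#   accelerate launch -m axolotl.cli.train /workspace/your_axolotl_config.yml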
# Install Open-WebUI dependencies
RUN apt-get update && apt-get install -y \
    nodejs \
    npm \
    && rm -rf /var/lib/apt/lists/*

# Clone and set up Open-WebUI
RUN git clone https://github.com/open-webui/open-webui.git /opt/open-webui && \
    cd /opt/open-webui && \
    npm install && \
    npm run build
# Create workspace directory
RUN mkdir -p /workspace

# Configure code-server
RUN mkdir -p /root/.config/code-server
RUN printf 'bind-addr: 0.0.0.0:8080\nauth: none\ncert: false\n' > /root/.config/code-server/config.yaml
# Configure SSH (optional but useful)
RUN mkdir /var/run/sshd
RUN echo 'root:runpod' | chpasswd
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config
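# Example (runtime): connect with the default credentials set above; host and port are
# placeholders for whatever TCP mapping your provider exposes.
#   ssh root@<pod-ip> -p <mapped-port>   # password: runpod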
# Create supervisor config to run all services
RUN mkdir -p /etc/supervisor/conf.d
RUN cat > /etc/supervisor/conf.d/services.conf <<'EOF'
[supervisord]
nodaemon=true

[program:code-server]
command=code-server --bind-addr 0.0.0.0:8080 --auth none
autostart=true
autorestart=true
stderr_logfile=/var/log/code-server.err.log
stdout_logfile=/var/log/code-server.out.log

[program:ollama]
command=ollama serve
autostart=true
autorestart=true
environment=OLLAMA_HOST="0.0.0.0"
stderr_logfile=/var/log/ollama.err.log
stdout_logfile=/var/log/ollama.out.log

[program:sshd]
command=/usr/sbin/sshd -D
autostart=true
autorestart=true
[program:open-webui]
; supervisor does not run commands through a shell, so use directory= instead of "cd ... &&"
directory=/opt/open-webui
command=npm start
autostart=true
autorestart=true
environment=PORT="3000",OLLAMA_BASE_URL="http://localhost:11434"
stderr_logfile=/var/log/open-webui.err.log
stdout_logfile=/var/log/open-webui.out.log
EOF
# Create a startup script for vLLM (runs on demand)
RUN cat > /usr/local/bin/start-vllm <<'EOF'
#!/bin/bash
if [ -z "$1" ]; then
    echo "Usage: start-vllm <model-name-or-path>" >&2
    exit 1
fi
# Default tensor parallelism to the number of visible GPUs; override with CUDA_DEVICE_COUNT.
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
python -m vllm.entrypoints.openai.api_server \
    --model "$1" \
    --tensor-parallel-size "${CUDA_DEVICE_COUNT:-$GPU_COUNT}" \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768 \
    --host 0.0.0.0 \
    --port 8000
EOF
RUN chmod +x /usr/local/bin/start-vllm
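# Example (runtime, not executed at build): serve a model and query the OpenAI-compatible
# endpoint vLLM exposes; the model name is only an illustration.
#   start-vllm meta-llama/Llama-2-7b-hf
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 32}'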
# Create multi-GPU training helper script
RUN cat > /usr/local/bin/train-multi-gpu <<'EOF'
#!/bin/bash
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
accelerate launch \
    --num_processes "$GPU_COUNT" \
    --num_machines 1 \
    --mixed_precision bf16 \
    --dynamo_backend no \
    "$@"
EOF
RUN chmod +x /usr/local/bin/train-multi-gpu
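# Example (runtime): launch any accelerate-compatible script across all detected GPUs, e.g.
#   train-multi-gpu /workspace/example_multi_gpu_train.py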
# Create accelerate config
RUN cat > /workspace/accelerate_config.yaml <<'EOF'
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
EOF
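# To use this config explicitly (setup.sh below rewrites num_processes to the detected GPU count):
#   accelerate launch --config_file /workspace/accelerate_config.yaml your_script.py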
# Create setup script
RUN cat > /workspace/setup.sh <<'EOF'
#!/bin/bash
echo "RunPod ML Stack Setup"
echo "====================="

# Check GPU availability
echo -e "\nGPU Status:"
nvidia-smi --query-gpu=name,driver_version,memory.total --format=csv,noheader,nounits | nl -v 0

# Count GPUs
GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
echo -e "\nFound $GPU_COUNT GPU(s)"

# Update accelerate config with the correct GPU count
sed -i "s/num_processes: 8/num_processes: $GPU_COUNT/g" /workspace/accelerate_config.yaml

# Pull a default model for Ollama if the server is up and the model is missing
if ollama list >/dev/null 2>&1; then
    if ! ollama list | grep -q "llama3.2"; then
        echo -e "\nPulling default Ollama model (llama3.2)..."
        ollama pull llama3.2
    fi
else
    echo -e "\nOllama server is not running yet; pull models later with: ollama pull llama3.2"
fi

echo -e "\nSetup complete! Services available at:"
echo "  VSCode:     http://localhost:8080"
echo "  Ollama:     http://localhost:11434"
echo "  Open-WebUI: http://localhost:3000"
echo "  vLLM:       http://localhost:8000 (start with: start-vllm <model>)"
EOF
RUN chmod +x /workspace/setup.sh
# Create example multi-GPU training script
RUN cat > /workspace/example_multi_gpu_train.py <<'EOF'
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm


def main():
    # Hyperparameters
    model_name = "meta-llama/Llama-2-7b-hf"
    batch_size = 4
    gradient_accumulation_steps = 4
    learning_rate = 2e-5
    num_epochs = 3

    # Initialize accelerator (pass accumulation steps so accelerator.accumulate() uses them)
    accelerator = Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)

    # Print GPU info
    if accelerator.is_main_process:
        print(f"Training on {accelerator.num_processes} GPU(s)")
        print(f"Total batch size: {batch_size * accelerator.num_processes * gradient_accumulation_steps}")

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        use_cache=False,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load dataset
    dataset = load_dataset("imdb", split="train[:1000]")  # Small subset for demo

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask"])

    # Create DataLoader
    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, shuffle=True)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Prepare for distributed training
    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        if accelerator.is_main_process:
            print(f"\nEpoch {epoch + 1}/{num_epochs}")
            progress_bar = tqdm(total=len(dataloader), desc="Training")
        for step, batch in enumerate(dataloader):
            with accelerator.accumulate(model):
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=batch["input_ids"],
                )
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                optimizer.zero_grad()
            if accelerator.is_main_process:
                progress_bar.update(1)
                if step % 10 == 0:
                    progress_bar.set_postfix({"loss": loss.item()})
        if accelerator.is_main_process:
            progress_bar.close()

    # Save model (unwrap the DDP wrapper added by accelerator.prepare before saving)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        accelerator.unwrap_model(model).save_pretrained("./trained_model")
        print("Training complete! Model saved to ./trained_model")


if __name__ == "__main__":
    main()
EOF
# Create a helpful README
RUN cat > /workspace/README.md <<'EOF'
# RunPod Multi-GPU ML Stack

## Quick Start
Run `/workspace/setup.sh` first to detect GPUs and pull models!

## Services
- **VSCode**: http://localhost:8080
- **Ollama API**: http://localhost:11434
- **vLLM API**: http://localhost:8000 (start with: `start-vllm <model-name>`)
- **Open-WebUI**: http://localhost:3000

## Multi-GPU Commands
- Training: `train-multi-gpu your_script.py`
- vLLM: `start-vllm meta-llama/Llama-2-7b-hf`
- Test multi-GPU: `train-multi-gpu example_multi_gpu_train.py`

## RTX 5090 Support
This image includes patched Unsloth and Flash Attention 2 for RTX 5090 compatibility.

## SSH Access
The default root password is `runpod`. Change it with `passwd`.

## Tips
- Check GPU status: `nvidia-smi`
- Monitor GPUs: `nvtop`
- List Ollama models: `ollama list`
- Pull new models: `ollama pull <model>`
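
## API Quick Test
Examples for poking the local APIs once the services are up. Adjust the model names:
`llama3.2` is only present if `setup.sh` pulled it, and the vLLM model must match whatever
you passed to `start-vllm`.

```bash
# Ollama
curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "prompt": "Hello", "stream": false}'

# vLLM (OpenAI-compatible)
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-2-7b-hf", "prompt": "Hello", "max_tokens": 32}'
```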
EOF
# Expose all necessary ports:
#   22    SSH
#   8080  code-server (VSCode)
#   11434 Ollama API
#   8000  vLLM API
#   3000  Open-WebUI
#   6006  TensorBoard
#   8888  Jupyter (if needed)
#   5000  Flask/FastAPI apps
#   7860  Gradio apps
#   29500 Distributed training master port
EXPOSE 22 8080 11434 8000 3000 6006 8888 5000 7860 29500
# Set working directory
WORKDIR /workspace

# Run setup on first start, then supervisor in the foreground
CMD bash -c "/workspace/setup.sh && /usr/bin/supervisord -n -c /etc/supervisor/supervisord.conf"