FROM ubuntu:22.04

# Prevent interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV CUDA_HOME=/usr/local/cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH}

# Install base system dependencies
RUN apt-get update && apt-get install -y \
    wget \
    curl \
    git \
    vim \
    tmux \
    htop \
    build-essential \
    software-properties-common \
    ca-certificates \
    gnupg \
    lsb-release \
    sudo \
    openssh-server \
    nginx \
    supervisor \
    python3.11 \
    python3.11-dev \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Add NVIDIA package repositories
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    apt-get update

# Install CUDA 12.8
RUN apt-get install -y cuda-toolkit-12-8 && \
    rm -rf /var/lib/apt/lists/*

# Install cuDNN
RUN apt-get update && apt-get install -y \
    libcudnn9-cuda-12 \
    libcudnn9-dev-cuda-12 \
    && rm -rf /var/lib/apt/lists/*

# Set Python 3.11 as default
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1

# Upgrade pip
RUN python -m pip install --upgrade pip setuptools wheel

# Install PyTorch with CUDA 12.8 support
RUN pip install torch==2.5.1+cu128 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

# Create non-root user for HF Spaces compatibility
RUN useradd -m -u 1000 user && \
    echo "user ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Install code-server (VSCode in browser)
RUN curl -fsSL https://code-server.dev/install.sh | sh

# Install Ollama
RUN curl -fsSL https://ollama.com/install.sh | sh

# Install hf_transfer first
RUN pip install hf_transfer

# Install core ML packages
RUN pip install \
    accelerate \
    transformers \
    datasets \
    peft \
    bitsandbytes \
    safetensors \
    sentencepiece \
    protobuf \
    scipy \
    einops \
    wandb \
    tensorboard \
    gradio \
    streamlit

# Install vLLM
RUN pip install vllm

# Install Flash Attention 2
RUN pip install ninja packaging && \
    pip install flash-attn --no-build-isolation

# Install Triton for 5090 support
RUN pip install triton

# Clone and install Unsloth with patches
RUN git clone https://github.com/unslothai/unsloth.git /tmp/unsloth && \
    cd /tmp/unsloth && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/unsloth/.git

# Clone and install Axolotl
RUN git clone https://github.com/axolotl-ai-cloud/axolotl /tmp/axolotl && \
    cd /tmp/axolotl && \
    pip install -e . && \
    cd / && \
    rm -rf /tmp/axolotl/.git

# Install Node.js for Open-WebUI
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
    apt-get install -y nodejs && \
    rm -rf /var/lib/apt/lists/*

# Clone and setup Open-WebUI
RUN git clone https://github.com/open-webui/open-webui.git /opt/open-webui && \
    cd /opt/open-webui && \
    npm install && \
    npm run build

# Create directories with proper permissions
RUN mkdir -p /home/user/app /home/user/.cache /home/user/.config && \
    chown -R user:user /home/user

# Configure code-server for user
RUN mkdir -p /home/user/.config/code-server && \
    echo "bind-addr: 0.0.0.0:8080\nauth: none\ncert: false" > /home/user/.config/code-server/config.yaml && \
    chown -R user:user /home/user/.config

# Setup SSH
RUN mkdir /var/run/sshd && \
    echo 'user:spaces' | chpasswd && \
    sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin no/' /etc/ssh/sshd_config

# Create supervisor config
RUN mkdir -p /etc/supervisor/conf.d
RUN cat > /etc/supervisor/conf.d/services.conf << 'EOF'
[supervisord]
nodaemon=true
user=root

[program:code-server]
command=sudo -u user code-server --bind-addr 0.0.0.0:8080 --auth none
autostart=true
autorestart=true
stderr_logfile=/var/log/code-server.err.log
stdout_logfile=/var/log/code-server.out.log

[program:ollama]
command=ollama serve
autostart=true
autorestart=true
environment=OLLAMA_HOST="0.0.0.0",HOME="/home/user"
stderr_logfile=/var/log/ollama.err.log
stdout_logfile=/var/log/ollama.out.log

[program:open-webui]
command=cd /opt/open-webui && npm start
autostart=true
autorestart=true
environment=PORT="3000",OLLAMA_BASE_URL="http://localhost:11434"
stderr_logfile=/var/log/open-webui.err.log
stdout_logfile=/var/log/open-webui.out.log
user=user
EOF

# Create Gradio app for HF Spaces
RUN cat > /home/user/app/app.py << 'EOF'
import gradio as gr
import subprocess
import os

def get_services_status():
    services = {
        "VSCode": "http://localhost:8080",
        "Ollama API": "http://localhost:11434", 
        "Open-WebUI": "http://localhost:3000",
        "vLLM": "http://localhost:8000"
    }
    
    status = "# 🚀 ML Stack Services Status\n\n"
    for service, url in services.items():
        status += f"- **{service}**: {url}\n"
    
    # Get GPU info
    try:
        gpu_info = subprocess.check_output(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'], text=True)
        status += f"\n## 🎮 GPU Status\n```\n{gpu_info}```"
    except:
        status += "\n## ⚠️ No GPUs detected"
    
    return status

def launch_vllm(model_name):
    try:
        cmd = f"python -m vllm.entrypoints.openai.api_server --model {model_name} --host 0.0.0.0 --port 8000"
        subprocess.Popen(cmd, shell=True)
        return f"✅ Launching vLLM with model: {model_name}"
    except Exception as e:
        return f"❌ Error: {str(e)}"

def pull_ollama_model(model_name):
    try:
        result = subprocess.run(['ollama', 'pull', model_name], capture_output=True, text=True)
        return f"✅ {result.stdout}\n{result.stderr}"
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="ML Stack Control Panel") as demo:
    gr.Markdown("# 🎮 RunPod ML Stack Control Panel")
    
    with gr.Tab("Status"):
        status_btn = gr.Button("🔄 Refresh Status")
        status_output = gr.Markdown()
        status_btn.click(get_services_status, outputs=status_output)
    
    with gr.Tab("vLLM"):
        model_input = gr.Textbox(label="Model Name", value="meta-llama/Llama-2-7b-hf")
        vllm_btn = gr.Button("🚀 Launch vLLM")
        vllm_output = gr.Textbox(label="Output")
        vllm_btn.click(launch_vllm, inputs=model_input, outputs=vllm_output)
    
    with gr.Tab("Ollama"):
        ollama_model = gr.Textbox(label="Model Name", value="llama3.2")
        ollama_btn = gr.Button("📥 Pull Model")
        ollama_output = gr.Textbox(label="Output")
        ollama_btn.click(pull_ollama_model, inputs=ollama_model, outputs=ollama_output)
    
    # Load initial status
    demo.load(get_services_status, outputs=status_output)

if __name__ == "__main__":
    # Start supervisor in background
    subprocess.Popen(["/usr/bin/supervisord", "-c", "/etc/supervisor/supervisord.conf"])
    
    # Launch Gradio
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
EOF

# Create startup script
RUN cat > /home/user/app/start.sh << 'EOF'
#!/bin/bash
cd /home/user/app
python app.py
EOF
RUN chmod +x /home/user/app/start.sh

# Fix permissions
RUN chown -R user:user /home/user /opt/open-webui

# Expose ports
EXPOSE 22      # SSH
EXPOSE 7860    # Gradio (HF Spaces default)
EXPOSE 8080    # Code-server
EXPOSE 11434   # Ollama
EXPOSE 8000    # vLLM
EXPOSE 3000    # Open-WebUI

# Switch to user
USER user
WORKDIR /home/user/app

# Set the entrypoint for HF Spaces
CMD ["python", "app.py"]