"""
Hugging Face Space App for Free H200 Training
This app runs nano-coder training on HF's free H200 GPU (4 minutes daily)
"""

import os
import subprocess
import time
import gradio as gr
from datetime import datetime

# Configuration
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 minutes to stay under the 4-minute daily quota
TRAINING_SCRIPT = "hf_free_training.py"
DATA_PREP_SCRIPT = "prepare_code_dataset.py"
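# Both scripts are invoked with relative paths below, so they are expected to
# sit next to this app in the Space repository.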

def check_daily_limit():
    """Check if we've used today's free H200 time."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    # The limit file name is date-stamped, so its presence means today's quota
    # was already used; read its contents anyway for the debug log.
    if os.path.exists(limit_file):
        try:
            with open(limit_file, 'r') as f:
                last_run = f.read().strip()
            print(f"Debug: Found limit file with content: '{last_run}' for date: {today}")
            if last_run == str(today):
                return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})"
        except Exception as e:
            print(f"Debug: Error reading limit file: {e}")
            # If there's an error reading the file, let's allow training
            return True, "Ready to train! (Limit file error, allowing training)"
    else:
        print(f"Debug: No limit file found for today: {today}")
    
    return True, "Ready to train!"

def mark_daily_usage():
    """Mark that we've used today's free time."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    with open(limit_file, 'w') as f:
        f.write(str(today))
    print(f"Debug: Marked daily usage for {today}")

def reset_daily_limit():
    """Reset the daily limit (for testing)."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    if os.path.exists(limit_file):
        os.remove(limit_file)
        return f"βœ… Daily limit reset for {today}"
    else:
        return f"ℹ️ No limit file found for {today}"

def run_training():
    """Run the free H200 training."""
    
    # Check daily limit
    can_run, message = check_daily_limit()
    if not can_run:
        return message
    
    try:
        # Mark today's usage up front, so the quota is consumed even if training fails partway
        mark_daily_usage()
        
        # Prepare dataset if not already done
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)
        
        # Run training
        print("Starting free H200 training...")
        start_time = time.time()
        
        # Pass the environment through; HF Spaces injects HF_TOKEN automatically,
        # so only fall back to an empty string if it is genuinely absent
        env = os.environ.copy()
        env.setdefault('HF_TOKEN', '')
        
        # Run training with timeout
        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            env=env
        )
        
        output_lines = []
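        # Stream the trainer's stdout line by line while enforcing the wall-clock
        # limit. Note that readline() can block if the script goes quiet, so the
        # limit is only rechecked when a new line arrives or the process exits.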
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
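                # Stop the trainer (SIGTERM); wait() below then records a
                # non-zero return code, so the run is reported as interrupted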
                process.terminate()
                output_lines.append(f"\n⏰ Time limit reached ({elapsed/60:.1f} minutes)")
                break
            
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break
            
            if line:
                output_lines.append(line.strip())
                print(line.strip())
        
        # Wait for process to finish
        process.wait()
        
        # Check if training completed successfully
        if process.returncode == 0:
            result = "βœ… Training completed successfully!\n\n" + "\n".join(output_lines[-20:])  # Last 20 lines
        else:
            result = "❌ Training failed or was interrupted.\n\n" + "\n".join(output_lines[-20:])
        
        return result
        
    except Exception as e:
        return f"❌ Error during training: {str(e)}"

def check_model_status():
    """Check if trained model exists."""
    model_path = "out-nano-coder-free/ckpt.pt"
    if os.path.exists(model_path):
        # Get file size
        size = os.path.getsize(model_path) / (1024 * 1024)  # MB
        return f"βœ… Model found! Size: {size:.1f} MB"
    else:
        return "❌ No trained model found. Run training first."

def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate code using the trained model."""
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "❌ No trained model found. Please run training first."
    
    try:
        # Import lazily so the sampling code and its dependencies are only loaded when generation is requested
        from sample_nano_coder import load_model, load_vocab, generate_code
        
        model, checkpoint = load_model()
        stoi, itos = load_vocab()
        
        # Generate code
        completion = generate_code(model, stoi, itos, prompt, max_tokens, temperature, 200)
        
        return f"Generated code:\n\n{completion}"
        
    except Exception as e:
        return f"❌ Error generating code: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# πŸš€ Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎯 Training Control")
            train_button = gr.Button("πŸš€ Start Free H200 Training", variant="primary")
            reset_button = gr.Button("πŸ”„ Reset Daily Limit", variant="secondary")
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
            
        with gr.Column():
            gr.Markdown("### πŸ“Š Model Status")
            model_status_button = gr.Button("πŸ” Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎨 Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt", 
                placeholder="def fibonacci(n):\n    ",
                lines=3
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("✨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)
    
    # Event handlers
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )
    
    gr.Markdown("""
    ### πŸ“‹ Instructions
    
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code
    
    ### βš™οΈ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12) 
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)
    
    ### πŸ’‘ Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
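    - Example prompt: `def fibonacci(n):` followed by an indented new line (matches the placeholder above)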
    """)

if __name__ == "__main__":
    demo.launch()