"""
Hugging Face Space App for Free H200 Training
This app runs nano-coder training on HF's free H200 GPU (4 minutes daily)
"""

import os
import subprocess
import time
import gradio as gr
from datetime import datetime

# Configuration
MAX_TRAINING_TIME = 3.5 * 60  # seconds; stop at 3.5 minutes to stay under the 4-minute daily quota
TRAINING_SCRIPT = "hf_free_training.py"
DATA_PREP_SCRIPT = "prepare_code_dataset.py"
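# Both scripts are invoked with relative paths below, so they are expected to
# sit next to this app in the Space repository.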

def check_daily_limit():
    """Check if we've used today's free H200 time."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    # The limit file name is date-stamped, so its presence means today's quota
    # was already used; read its contents anyway for the debug log.
    if os.path.exists(limit_file):
        try:
            with open(limit_file, 'r') as f:
                last_run = f.read().strip()
            print(f"Debug: Found limit file with content: '{last_run}' for date: {today}")
            if last_run == str(today):
                return False, f"Daily H200 limit reached. Try again tomorrow! (Last run: {last_run})"
        except Exception as e:
            print(f"Debug: Error reading limit file: {e}")
            # If there's an error reading the file, let's allow training
            return True, "Ready to train! (Limit file error, allowing training)"
    else:
        print(f"Debug: No limit file found for today: {today}")
    
    return True, "Ready to train!"

def mark_daily_usage():
    """Mark that we've used today's free time."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    with open(limit_file, 'w') as f:
        f.write(str(today))
    print(f"Debug: Marked daily usage for {today}")

def reset_daily_limit():
    """Reset the daily limit (for testing)."""
    today = datetime.now().date()
    limit_file = f"daily_limit_{today}.txt"
    
    if os.path.exists(limit_file):
        os.remove(limit_file)
        return f"βœ… Daily limit reset for {today}"
    else:
        return f"ℹ️ No limit file found for {today}"

def run_training():
    """Run the free H200 training."""
    
    # Check daily limit
    can_run, message = check_daily_limit()
    if not can_run:
        return message
    
    try:
        # Mark today's usage up front, so the quota is consumed even if training fails partway
        mark_daily_usage()
        
        # Prepare dataset if not already done
        if not os.path.exists("data/python-codes-25k/train.bin"):
            print("Preparing dataset...")
            subprocess.run(["python", DATA_PREP_SCRIPT], check=True)
        
        # Run training
        print("Starting free H200 training...")
        start_time = time.time()
        
        # Pass the environment through; HF Spaces injects HF_TOKEN automatically,
        # so only fall back to an empty string if it is genuinely absent
        env = os.environ.copy()
        env.setdefault('HF_TOKEN', '')
        
        # Run training with timeout
        process = subprocess.Popen(
            ["python", TRAINING_SCRIPT],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
            env=env
        )
        
        output_lines = []
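        # Stream the trainer's stdout line by line while enforcing the wall-clock
        # limit. Note that readline() can block if the script goes quiet, so the
        # limit is only rechecked when a new line arrives or the process exits.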
        while True:
            elapsed = time.time() - start_time
            if elapsed > MAX_TRAINING_TIME:
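                # Stop the trainer (SIGTERM); wait() below then records a
                # non-zero return code, so the run is reported as interrupted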
                process.terminate()
                output_lines.append(f"\n⏰ Time limit reached ({elapsed/60:.1f} minutes)")
                break
            
            line = process.stdout.readline()
            if not line and process.poll() is not None:
                break
            
            if line:
                output_lines.append(line.strip())
                print(line.strip())
        
        # Wait for process to finish
        process.wait()
        
        # Check if training completed successfully
        if process.returncode == 0:
            result = "βœ… Training completed successfully!\n\n" + "\n".join(output_lines[-20:])  # Last 20 lines
        else:
            result = "❌ Training failed or was interrupted.\n\n" + "\n".join(output_lines[-20:])
        
        return result
        
    except Exception as e:
        return f"❌ Error during training: {str(e)}"

def check_model_status():
    """Check if trained model exists."""
    model_path = "out-nano-coder-free/ckpt.pt"
    if os.path.exists(model_path):
        # Get file size
        size = os.path.getsize(model_path) / (1024 * 1024)  # MB
        return f"βœ… Model found! Size: {size:.1f} MB"
    else:
        return "❌ No trained model found. Run training first."

def generate_sample_code(prompt, max_tokens=100, temperature=0.8):
    """Generate code using the trained model."""
    if not os.path.exists("out-nano-coder-free/ckpt.pt"):
        return "❌ No trained model found. Please run training first."
    
    try:
        # Import lazily so the sampling code and its dependencies are only loaded when generation is requested
        from sample_nano_coder import load_model, load_vocab, generate_code
        
        model, checkpoint = load_model()
        stoi, itos = load_vocab()
        
        # Generate code
        completion = generate_code(model, stoi, itos, prompt, max_tokens, temperature, 200)
        
        return f"Generated code:\n\n{completion}"
        
    except Exception as e:
        return f"❌ Error generating code: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="Nano-Coder Free H200 Training") as demo:
    gr.Markdown("# πŸš€ Nano-Coder Free H200 Training")
    gr.Markdown("Train a nanoGPT model for Python code generation using Hugging Face's free H200 GPU (4 minutes daily)")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎯 Training Control")
            train_button = gr.Button("πŸš€ Start Free H200 Training", variant="primary")
            reset_button = gr.Button("πŸ”„ Reset Daily Limit", variant="secondary")
            status_text = gr.Textbox(label="Training Status", lines=10, interactive=False)
            
        with gr.Column():
            gr.Markdown("### πŸ“Š Model Status")
            model_status_button = gr.Button("πŸ” Check Model Status")
            model_status_text = gr.Textbox(label="Model Status", lines=2, interactive=False)
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎨 Code Generation")
            code_prompt = gr.Textbox(
                label="Code Prompt", 
                placeholder="def fibonacci(n):\n    ",
                lines=3
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 500, 100, label="Max Tokens")
                temperature = gr.Slider(0.1, 2.0, 0.8, label="Temperature")
            generate_button = gr.Button("✨ Generate Code")
            generated_code = gr.Textbox(label="Generated Code", lines=10, interactive=False)
    
    # Event handlers
    train_button.click(
        fn=run_training,
        outputs=status_text
    )
    
    reset_button.click(
        fn=reset_daily_limit,
        outputs=status_text
    )
    
    model_status_button.click(
        fn=check_model_status,
        outputs=model_status_text
    )
    
    generate_button.click(
        fn=generate_sample_code,
        inputs=[code_prompt, max_tokens, temperature],
        outputs=generated_code
    )
    
    gr.Markdown("""
    ### πŸ“‹ Instructions
    
    1. **Daily Limit**: You get 4 minutes of free H200 GPU time per day
    2. **Training**: Click "Start Free H200 Training" to begin
    3. **Model**: Check model status after training
    4. **Generation**: Use the trained model to generate Python code
    
    ### βš™οΈ Model Configuration (Free Tier)
    - **Layers**: 6 (reduced from 12)
    - **Heads**: 6 (reduced from 12) 
    - **Embedding**: 384 (reduced from 768)
    - **Context**: 512 tokens
    - **Parameters**: ~15M (vs 124M full model)
    
    ### πŸ’‘ Tips
    - Training automatically stops at 3.5 minutes to be safe
    - Model checkpoints are saved to HF Hub
    - Use shorter prompts for better results
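    - Example prompt: `def fibonacci(n):` followed by an indented new line (matches the placeholder above)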
    """)

if __name__ == "__main__":
    demo.launch()