mlopez6132 committed
Commit cc8cb1e · verified · 1 Parent(s): 9d2bb4c

Upload hf_free_training.py with huggingface_hub

Files changed (1)
  1. hf_free_training.py +378 -0
hf_free_training.py ADDED
@@ -0,0 +1,378 @@
+ """
+ Free H200 Training Script for Nano-Coder
+ Optimized for HF's free 4-minute daily H200 access
+ """
+
+ import os
+ import time
+ import math
+ import pickle
+ from contextlib import nullcontext
+
+ import numpy as np
+ import torch
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ from torch.distributed import init_process_group, destroy_process_group
+
+ from model import GPTConfig, GPT
+
+ # Hugging Face specific imports
+ from huggingface_hub import HfApi, login
+ import wandb
+
+ # -----------------------------------------------------------------------------
+ # Configuration optimized for FREE H200 (4 minutes daily)
+ # I/O
+ out_dir = 'out-nano-coder-free'
+ eval_interval = 50 # Very frequent evaluation for short runs
+ log_interval = 2
+ eval_iters = 10 # Fewer eval iterations
+ eval_only = False
+ always_save_checkpoint = True
+ init_from = 'scratch'
+
+ # wandb logging - enabled for HF
+ wandb_log = True
+ wandb_project = 'nano-coder-free'
+ wandb_run_name = 'nano-coder-h200-free'
+
+ # data
+ dataset = 'python-codes-25k'
+ gradient_accumulation_steps = 1 * 8 # Minimal for H200
+ batch_size = 64 # Larger batch size for H200 efficiency
+ block_size = 512 # Smaller context for faster training
+
+ # model - smaller for free tier
+ n_layer = 6 # Reduced from 12
+ n_head = 6 # Reduced from 12
+ n_embd = 384 # Reduced from 768
+ dropout = 0.1
+ bias = False
+
+ # optimizer - optimized for H200
+ learning_rate = 1e-3 # Higher learning rate for faster convergence
+ max_iters = 1000 # Limited iterations for 4-minute runs
+ weight_decay = 1e-1
+ beta1 = 0.9
+ beta2 = 0.95
+ grad_clip = 1.0
+
+ # learning rate decay - faster for short runs
+ decay_lr = True
+ warmup_iters = 100 # Shorter warmup
+ lr_decay_iters = 1000
+ min_lr = 1e-4
+
+ # DDP settings
+ backend = 'nccl'
+
+ # system
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
+ compile = True
+
+ # HF specific
+ hf_repo_id = "mlopez6132/nano-coder-free" # Free tier repo
+ push_to_hub = True
+
+ # Time tracking for 4-minute limit
+ start_time = time.time()
+ MAX_TRAINING_TIME = 3.5 * 60 # 3.5 minutes to be safe
+
+ # -----------------------------------------------------------------------------
+ config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
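+ # allow overrides of any of the simple settings above from the command line or a config file
+ # (handled by the nanoGPT-style configurator.py that the next line exec()s)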
+ exec(open('configurator.py').read())
+ config = {k: globals()[k] for k in config_keys}
+
+ # -----------------------------------------------------------------------------
+
+ # HF setup
+ if push_to_hub:
+     login() # Will use HF_TOKEN environment variable
+     api = HfApi()
+
+ # various inits, derived attributes, I/O setup
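+ # torchrun/DDP launches set RANK; its absence means a plain single-GPU (or CPU) run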
+ ddp = int(os.environ.get('RANK', -1)) != -1
+ if ddp:
+     init_process_group(backend=backend)
+     ddp_rank = int(os.environ['RANK'])
+     ddp_local_rank = int(os.environ['LOCAL_RANK'])
+     ddp_world_size = int(os.environ['WORLD_SIZE'])
+     device = f'cuda:{ddp_local_rank}'
+     torch.cuda.set_device(device)
+     master_process = ddp_rank == 0
+     seed_offset = ddp_rank
+     assert gradient_accumulation_steps % ddp_world_size == 0
+     gradient_accumulation_steps //= ddp_world_size
+ else:
+     master_process = True
+     seed_offset = 0
+     ddp_world_size = 1
+
+ tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+ print(f"tokens per iteration will be: {tokens_per_iter:,}")
+ print(f"FREE H200 TRAINING - MAX TIME: {MAX_TRAINING_TIME/60:.1f} minutes")
+
+ if master_process:
+     os.makedirs(out_dir, exist_ok=True)
+
+ torch.manual_seed(1337 + seed_offset)
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+ device_type = 'cuda' if 'cuda' in device else 'cpu'
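+ # map the dtype string to a torch dtype and set up a mixed-precision autocast context (a no-op on CPU)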
+ ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
+ ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
+
+ # data loader
+ data_dir = os.path.join('data', dataset)
+ def get_batch(split):
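+     # sample batch_size random windows of block_size tokens from the memory-mapped .bin files;
+     # x is the input window and y is the same window shifted one token to the right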
+     if split == 'train':
+         data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+     else:
+         data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
+     ix = torch.randint(len(data) - block_size, (batch_size,))
+     x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+     y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+     if device_type == 'cuda':
+         x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+     else:
+         x, y = x.to(device), y.to(device)
+     return x, y
+
+ # init these up here, can override if init_from='resume'
+ iter_num = 0
+ best_val_loss = 1e9
+
+ # attempt to derive vocab_size from the dataset
+ meta_path = os.path.join(data_dir, 'meta.pkl')
+ meta_vocab_size = None
+ if os.path.exists(meta_path):
+     with open(meta_path, 'rb') as f:
+         meta = pickle.load(f)
+     meta_vocab_size = meta['vocab_size']
+     print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
+
+ # model init
+ model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
+                   bias=bias, vocab_size=None, dropout=dropout)
+
+ if init_from == 'scratch':
+     print("Initializing a new nano-coder model from scratch (FREE TIER)")
+     if meta_vocab_size is None:
+         print("defaulting to GPT-2 vocab_size of 50304 (50257 rounded up for efficiency)")
+     model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
+     gptconf = GPTConfig(**model_args)
+     model = GPT(gptconf)
+ elif init_from == 'resume':
+     print(f"Resuming training from {out_dir}")
+     ckpt_path = os.path.join(out_dir, 'ckpt.pt')
+     checkpoint = torch.load(ckpt_path, map_location=device)
+     checkpoint_model_args = checkpoint['model_args']
+     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+         model_args[k] = checkpoint_model_args[k]
+     gptconf = GPTConfig(**model_args)
+     model = GPT(gptconf)
+     state_dict = checkpoint['model']
+     unwanted_prefix = '_orig_mod.'
+     for k,v in list(state_dict.items()):
+         if k.startswith(unwanted_prefix):
+             state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+     model.load_state_dict(state_dict)
+     iter_num = checkpoint['iter_num']
+     best_val_loss = checkpoint['best_val_loss']
+ elif init_from.startswith('gpt2'):
+     print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
+     override_args = dict(dropout=dropout)
+     model = GPT.from_pretrained(init_from, override_args)
+     for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
+         model_args[k] = getattr(model.config, k)
+
+ if block_size < model.config.block_size:
+     model.crop_block_size(block_size)
+     model_args['block_size'] = block_size
+
+ model.to(device)
+
+ # initialize a GradScaler
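+ # (with bfloat16 or float32 the scaler is constructed with enabled=False and behaves as a no-op)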
+ scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
+
+ # optimizer
+ optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
+ if init_from == 'resume':
+     optimizer.load_state_dict(checkpoint['optimizer'])
+ checkpoint = None
+
+ # compile the model
+ if compile:
+     print("compiling the model... (takes a ~minute)")
+     unoptimized_model = model
+     model = torch.compile(model)
+
+ # wrap model into DDP container
+ if ddp:
+     model = DDP(model, device_ids=[ddp_local_rank])
+
+ # helps estimate an arbitrarily accurate loss over either split using many batches
+ @torch.no_grad()
+ def estimate_loss():
+     out = {}
+     model.eval()
+     for split in ['train', 'val']:
+         losses = torch.zeros(eval_iters)
+         for k in range(eval_iters):
+             X, Y = get_batch(split)
+             with ctx:
+                 logits, loss = model(X, Y)
+             losses[k] = loss.item()
+         out[split] = losses.mean()
+     model.train()
+     return out
+
+ # learning rate decay scheduler (cosine with warmup)
+ def get_lr(it):
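+     # linear warmup for warmup_iters steps, then cosine decay from learning_rate down to min_lr
+     # over lr_decay_iters, after which the rate is held at min_lr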
+     if it < warmup_iters:
+         return learning_rate * (it + 1) / (warmup_iters + 1)
+     if it > lr_decay_iters:
+         return min_lr
+     decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
+     assert 0 <= decay_ratio <= 1
+     coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
+     return min_lr + coeff * (learning_rate - min_lr)
+
+ # logging
+ if wandb_log and master_process:
+     wandb.init(project=wandb_project, name=wandb_run_name, config=config)
+
+ # HF checkpoint upload function
+ def upload_checkpoint_to_hf(checkpoint_path, iter_num):
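+     # copy the latest checkpoint under an iteration-tagged name, push it to the Hub repo,
+     # then delete the local copy; any failure is caught and logged rather than crashing the run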
+     if push_to_hub and master_process:
+         try:
+             # Create a unique filename
+             filename = f"checkpoint_iter_{iter_num}.pt"
+             file_path = os.path.join(out_dir, filename)
+
+             # Copy checkpoint with new name
+             import shutil
+             shutil.copy2(checkpoint_path, file_path)
+
+             # Upload to HF
+             api.upload_file(
+                 path_or_fileobj=file_path,
+                 path_in_repo=filename,
+                 repo_id=hf_repo_id,
+                 repo_type="model"
+             )
+             print(f"Uploaded checkpoint to HF: {filename}")
+
+             # Clean up local copy
+             os.remove(file_path)
+         except Exception as e:
+             print(f"Failed to upload checkpoint: {e}")
+
+ # training loop
+ print("Starting FREE H200 nano-coder training...")
+ X, Y = get_batch('train')
+ t0 = time.time()
+ local_iter_num = 0
+ raw_model = model.module if ddp else model
+ running_mfu = -1.0
+
+ while True:
+     # Check time limit
+     elapsed_time = time.time() - start_time
+     if elapsed_time > MAX_TRAINING_TIME:
+         print(f"\n⏰ TIME LIMIT REACHED! Training stopped after {elapsed_time/60:.1f} minutes")
+         break
+
+     # determine and set the learning rate for this iteration
+     lr = get_lr(iter_num) if decay_lr else learning_rate
+     for param_group in optimizer.param_groups:
+         param_group['lr'] = lr
+
+     # evaluate the loss on train/val sets and write checkpoints
+     if iter_num % eval_interval == 0 and master_process:
+         losses = estimate_loss()
+         remaining_time = MAX_TRAINING_TIME - elapsed_time
+         print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, time left: {remaining_time/60:.1f}min")
+         if wandb_log:
+             wandb.log({
+                 "iter": iter_num,
+                 "train/loss": losses['train'],
+                 "val/loss": losses['val'],
+                 "lr": lr,
+                 "mfu": running_mfu*100,
+                 "elapsed_time": elapsed_time,
+                 "remaining_time": remaining_time,
+             })
+         if losses['val'] < best_val_loss or always_save_checkpoint:
+             best_val_loss = losses['val']
+             if iter_num > 0:
+                 checkpoint = {
+                     'model': raw_model.state_dict(),
+                     'optimizer': optimizer.state_dict(),
+                     'model_args': model_args,
+                     'iter_num': iter_num,
+                     'best_val_loss': best_val_loss,
+                     'config': config,
+                 }
+                 checkpoint_path = os.path.join(out_dir, 'ckpt.pt')
+                 print(f"saving checkpoint to {out_dir}")
+                 torch.save(checkpoint, checkpoint_path)
+
+                 # Upload to HF every 200 iterations (frequent for short runs)
+                 if iter_num % 200 == 0:
+                     upload_checkpoint_to_hf(checkpoint_path, iter_num)
+     if iter_num == 0 and eval_only:
+         break
+
+     # forward backward update
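+     # with optional gradient accumulation to simulate a larger batch size,
+     # using the GradScaler when training in float16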
+     for micro_step in range(gradient_accumulation_steps):
+         if ddp:
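+             # only sync gradients across DDP ranks on the last micro step
+             # (toggling this flag is what DDP's no_sync() context manager does internally)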
+             model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
+         with ctx:
+             logits, loss = model(X, Y)
+             loss = loss / gradient_accumulation_steps
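+         # the loss is scaled by 1/gradient_accumulation_steps so the accumulated gradients match one large batch;
+         # prefetch the next batch immediately so it overlaps with the forward pass still running on the GPU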
+         X, Y = get_batch('train')
+         scaler.scale(loss).backward()
+
+     # clip the gradient
+     if grad_clip != 0.0:
+         scaler.unscale_(optimizer)
+         torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
+
+     # step the optimizer and scaler
+     scaler.step(optimizer)
+     scaler.update()
+     optimizer.zero_grad(set_to_none=True)
+
+     # timing and logging
+     t1 = time.time()
+     dt = t1 - t0
+     t0 = t1
+     if iter_num % log_interval == 0 and master_process:
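+         # loss.item() is a CPU-GPU sync point; multiply back by gradient_accumulation_steps
+         # to report the unscaled per-iteration loss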
+         lossf = loss.item() * gradient_accumulation_steps
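+         # skip MFU estimation for the first few iterations to let timings settle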
+         if local_iter_num >= 5:
+             mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
+             running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
+         remaining_time = MAX_TRAINING_TIME - elapsed_time
+         print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%, remaining: {remaining_time/60:.1f}min")
+     iter_num += 1
+     local_iter_num += 1
+
+     # termination conditions
+     if iter_num > max_iters:
+         break
+
+ if ddp:
+     destroy_process_group()
+
+ # Final upload
+ if push_to_hub and master_process:
+     upload_checkpoint_to_hf(os.path.join(out_dir, 'ckpt.pt'), 'final')
+
+ total_time = time.time() - start_time
+ print(f"\n🎉 FREE H200 TRAINING COMPLETED!")
+ print(f"Total training time: {total_time/60:.1f} minutes")
+ print(f"Total iterations: {iter_num}")
+ print(f"Final validation loss: {best_val_loss:.4f}")
+ print(f"Model saved to: {out_dir}")