import time
import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm
import model
import preprocess
import dataset
# MLflow server
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")
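# Assumes a local MLflow tracking server is already running,
# e.g. started with `mlflow server --port 8080`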
start_time = time.time()
# best learning rate found per batch size:
#   batch_size  lr
#   8           1e-3
#   16          5e-3
# hyperparameters
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
preprocess.hflogin()
# Prepare datasets
custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
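# ConcatDataset chains the 'train' split and the custom chainsaw recordings into one dataset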
val_dataset = preprocess.get_dataset('test', device)
train_dataloader = preprocess.get_dataloader(train_dataset, batch_size=hp['batch_size'], shuffle=True)
val_dataloader = preprocess.get_dataloader(val_dataset, batch_size=hp['batch_size'], shuffle=False)
# Load model
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
model.load_state_dict(torch.load('backups/final-bf16.pth', weights_only=True), strict=True)
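# torch.compile wraps the network in an OptimizedModule whose state_dict keys get an
# `_orig_mod.` prefix, so strict loading assumes this checkpoint was also saved from a
# compiled model (as is done at the end of this script)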
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")
loss_fn = nn.BCELoss()
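# nn.BCELoss expects probabilities in [0, 1], so ChainsawDetector is assumed to end in a sigmoid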
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])
num_train_samples = len(train_dataset)
steps_per_epoch = num_train_samples // hp['batch_size']
total_steps = steps_per_epoch * hp['num_epochs']  # optimizer steps over the whole run
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{num_train_samples=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(optimizer, max_lr=hp['learning_rate'], steps_per_epoch=steps_per_epoch, epochs=hp['num_epochs'])
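# OneCycleLR budgets exactly steps_per_epoch * epochs calls to scheduler.step() and raises a
# ValueError beyond that; the floor division above assumes the train dataloader drops the last
# incomplete batch, otherwise len(train_dataloader) is the safer steps_per_epoch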
hp["optimizer"] = 'AdamW'
metric_fn = BinaryAccuracy(threshold=0.5).to(device)  # torchmetrics needs the metric on the same device as its inputs
def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)
        optimizer.zero_grad()
        # Forward pass: compute the model output
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)
            loss = loss_fn(predictions, targets)
        # Backward pass: compute the gradients
        loss.backward()
        # Optimization step: update the model parameters
        optimizer.step()
        lrscheduler.step()
        if batch_index % 100 == 0:
            # mlflow.log_metric expects a float, so log raw values rather than formatted strings
            loss_value = loss.item()
            accuracy = metric_fn(predictions, targets).item()
            step = batch_index + epoch * steps_per_epoch
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss_value, step=step)
            mlflow.log_metric("train_accuracy", accuracy, step=step)
# Thresholding is done with tensor ops inside evaluate(), since Tensor.apply_ only works on CPU tensors
MAE = torch.nn.L1Loss()  # mean distance between hard 0/1 decisions and raw predictions
def evaluate(loader, model, epoch, loss_fn=loss_fn):
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0
    confidence = 0
    model.eval()
    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.bfloat16)
            predictions = model(X)
            # Hard 0/1 decisions at the 0.5 threshold
            decisions = (predictions >= 0.5).to(predictions.dtype)
            confidence += MAE(decisions, predictions).item()
            loss += loss_fn(predictions, y).item()  # loss on raw probabilities, as in training
            num_correct += (decisions == y).sum().item()  # Count correct predictions
            num_samples += decisions.size(0)  # Count total samples
            num_batches += 1
    # Calculate metrics
    accuracy = float(num_correct) / float(num_samples) * 100
    loss /= num_batches
    confidence /= num_batches
    confidence = 1 - confidence  # near 1 when outputs saturate at 0/1, 0.5 when they hover at the threshold
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    model.train()
with mlflow.start_run():
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)
model.eval()
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")
torch.save(model.state_dict(), 'backups/name.pth')
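# To reuse this checkpoint without torch.compile, the `_orig_mod.` key prefix added by
# the compiled wrapper must be stripped first. A minimal sketch, assuming a fresh process
# where `model` is still the imported module and the architecture is unchanged:
#   state = torch.load('backups/name.pth', weights_only=True)
#   state = {k.removeprefix('_orig_mod.'): v for k, v in state.items()}
#   plain = model.ChainsawDetector(hp['batch_size'])
#   plain.load_state_dict(state, strict=True)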