import time
import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm
import model
import preprocess
import dataset
# MLflow server
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")
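# Assumes a local MLflow tracking server is already running,
# e.g. started with `mlflow server --port 8080`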
start_time = time.time()
# best learning rate found per batch size:
#   batch_size  lr
#   8           1e-3
#   16          5e-3
# hyperparameters
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}
device = 'cuda' if torch.cuda.is_available() else 'cpu'
preprocess.hflogin()
# Prepare datasets
custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
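# ConcatDataset chains the 'train' split and the custom chainsaw recordings into one dataset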
val_dataset = preprocess.get_dataset('test', device)
train_dataloader = preprocess.get_dataloader(train_dataset, batch_size=hp['batch_size'], shuffle=True)
val_dataloader = preprocess.get_dataloader(val_dataset, batch_size=hp['batch_size'], shuffle=False)
# Load model
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
model.load_state_dict(torch.load('backups/final-bf16.pth', weights_only=True), strict=True)
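# torch.compile wraps the network in an OptimizedModule whose state_dict keys get an
# `_orig_mod.` prefix, so strict loading assumes this checkpoint was also saved from a
# compiled model (as is done at the end of this script)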
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")
loss_fn = nn.BCELoss()
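# nn.BCELoss expects probabilities in [0, 1], so ChainsawDetector is assumed to end in a sigmoid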
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])
num_train_samples = len(train_dataset)
steps_per_epoch = num_train_samples // hp['batch_size']
total_steps = steps_per_epoch * hp['num_epochs']  # optimizer steps over the whole run
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{num_train_samples=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(optimizer, max_lr=hp['learning_rate'], steps_per_epoch=steps_per_epoch, epochs=hp['num_epochs'])
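# OneCycleLR budgets exactly steps_per_epoch * epochs calls to scheduler.step() and raises a
# ValueError beyond that; the floor division above assumes the train dataloader drops the last
# incomplete batch, otherwise len(train_dataloader) is the safer steps_per_epoch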
hp["optimizer"] = 'AdamW'
metric_fn = BinaryAccuracy(threshold=0.5).to(device)  # torchmetrics needs the metric on the same device as its inputs
def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)
        optimizer.zero_grad()
        # Forward pass: compute the model output
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)
            loss = loss_fn(predictions, targets)
        # Backward pass: compute the gradients
        loss.backward()
        # Optimization step: update the model parameters
        optimizer.step()
        lrscheduler.step()
        if batch_index % 100 == 0:
            # mlflow.log_metric expects a float, so log raw values rather than formatted strings
            loss_value = loss.item()
            accuracy = metric_fn(predictions, targets).item()
            step = batch_index + epoch * steps_per_epoch
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss_value, step=step)
            mlflow.log_metric("train_accuracy", accuracy, step=step)
# Thresholding is done with tensor ops inside evaluate(), since Tensor.apply_ only works on CPU tensors
MAE = torch.nn.L1Loss()  # mean distance between hard 0/1 decisions and raw predictions
def evaluate(loader, model, epoch, loss_fn=loss_fn):
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0
    confidence = 0
    model.eval()
    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.bfloat16)
            predictions = model(X)
            # Hard 0/1 decisions at the 0.5 threshold
            decisions = (predictions >= 0.5).to(predictions.dtype)
            confidence += MAE(decisions, predictions).item()
            loss += loss_fn(predictions, y).item()  # loss on raw probabilities, as in training
            num_correct += (decisions == y).sum().item()  # Count correct predictions
            num_samples += decisions.size(0)  # Count total samples
            num_batches += 1
    # Calculate metrics
    accuracy = float(num_correct) / float(num_samples) * 100
    loss /= num_batches
    confidence /= num_batches
    confidence = 1 - confidence  # near 1 when outputs saturate at 0/1, 0.5 when they hover at the threshold
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    model.train()
with mlflow.start_run():
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)
model.eval()
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")
torch.save(model.state_dict(), 'backups/name.pth')
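# To reuse this checkpoint without torch.compile, the `_orig_mod.` key prefix added by
# the compiled wrapper must be stripped first. A minimal sketch, assuming a fresh process
# where `model` is still the imported module and the architecture is unchanged:
#   state = torch.load('backups/name.pth', weights_only=True)
#   state = {k.removeprefix('_orig_mod.'): v for k, v in state.items()}
#   plain = model.ChainsawDetector(hp['batch_size'])
#   plain.load_state_dict(state, strict=True)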