"""Train the chainsaw detector from a bf16 checkpoint and log the run to MLflow.

The script builds the train/validation loaders, restores the model from
``backups/final-bf16.pth``, trains for ``hp['num_epochs']`` epochs with AdamW
and a one-cycle learning-rate schedule, evaluates after every epoch and
finally writes the weights to ``backups/name.pth``.
"""
import time

import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm

import model
import preprocess
import dataset

# MLflow server
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")

start_time = time.time()

# batch  best lr
#   8    1e-3
#  16    5e-3

# hyperparameters
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

preprocess.hflogin()

# Prepare datasets: the 'train' split is extended with the custom dataset,
# the 'test' split is used for validation.
custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
val_dataset = preprocess.get_dataset('test', device)
train_dataloader = preprocess.get_dataloader(
    train_dataset, batch_size=hp['batch_size'], shuffle=True
)
val_dataloader = preprocess.get_dataloader(
    val_dataset, batch_size=hp['batch_size'], shuffle=False
)

# Load model.
# NOTE: from here on the name `model` refers to the network, not the imported
# `model` module (kept as in the original script).
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
# NOTE(review): the weights are loaded into the compiled wrapper, so the
# checkpoint keys presumably match a compiled module's state dict -- confirm.
# map_location lets a checkpoint saved on another device load on this one.
model.load_state_dict(
    torch.load('backups/final-bf16.pth', weights_only=True, map_location=device),
    strict=True,
)
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")

loss_fn = nn.BCELoss()
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])

total_iterations = len(train_dataset)
# Ask the loader how many batches it yields per epoch: a floor division of the
# dataset size under-counts when the last, partial batch is kept, and
# OneCycleLR raises once it is stepped more often than it was configured for.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * hp['num_epochs']
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{total_iterations=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(
    optimizer,
    max_lr=hp['learning_rate'],
    steps_per_epoch=steps_per_epoch,
    epochs=hp['num_epochs'],
)
hp["optimizer"] = 'AdamW'

# The metric keeps internal state tensors, so it has to live on the same
# device as the predictions it receives.
metric_fn = BinaryAccuracy(threshold=0.5).to(device)


def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    """Train ``model`` for one pass over ``loader``.

    Every 100th batch the current learning rate, the batch loss and the batch
    accuracy are logged to MLflow.

    Args:
        loader: iterable of ``(data, targets)`` batches; must support ``len()``.
        model: network being trained.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar.
        metric_fn: callable ``metric_fn(predictions, targets)`` returning the
            accuracy of the batch.
        optimizer: optimizer updating ``model``'s parameters.
        lrscheduler: learning-rate scheduler, stepped once per batch.
        epoch: zero-based epoch index, used to build the global logging step.
    """
    batches_per_epoch = len(loader)
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)

        optimizer.zero_grad()

        # Forward pass: compute the model output
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)
        # BCELoss is documented as unsafe inside autocast regions, so the loss
        # is evaluated outside of it, in float32.
        loss = loss_fn(predictions.float(), targets.float())

        # Backward pass: compute the gradients
        loss.backward()

        # Optimization step: update the model parameters
        optimizer.step()
        lrscheduler.step()

        if batch_index % 100 == 0:
            accuracy = metric_fn(predictions.detach(), targets)
            step = batch_index + epoch * batches_per_epoch
            # Log plain floats rather than pre-formatted strings.
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss.item(), step=step)
            mlflow.log_metric("train_accuracy", float(accuracy), step=step)


def decide(x):
    """Return 1 when ``x`` is at least 0.5, otherwise 0."""
    return 1 if x >= 0.5 else 0


MAE = torch.nn.L1Loss()


def evaluate(loader, model, epoch, loss_fn=loss_fn):
    """Evaluate ``model`` on ``loader`` and log validation metrics to MLflow.

    Logged metrics (with ``step=epoch``):
        * ``val_loss``: mean of ``loss_fn(predictions, targets)`` over batches.
        * ``val_accuracy``: percentage of predictions that, thresholded at 0.5,
          equal the target.
        * ``val_confidence``: one minus the mean absolute distance between the
          predictions and their thresholded decisions.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. Nothing is logged when ``loader`` yields no batches.

    Args:
        loader: iterable of ``(X, y)`` batches.
        model: network to evaluate.
        epoch: epoch index used as the MLflow step.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar.
    """
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0.0
    confidence = 0.0

    model.eval()
    with torch.no_grad():
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.bfloat16)

            # Same autocast dtype as in train().
            with autocast(device_type=device, dtype=torch.bfloat16):
                predictions = model(X)

            # Metrics are computed outside autocast, in float32.
            predictions = predictions.float()
            y = y.float()

            # Vectorized threshold; Tensor.apply_ only works on CPU tensors.
            decisions = (predictions >= 0.5).to(predictions.dtype)

            confidence += MAE(decisions, predictions).item()
            # The loss is taken on the raw predictions: on hard 0/1 decisions
            # binary cross-entropy saturates and carries no information.
            loss += loss_fn(predictions, y).item()
            num_correct += int((decisions == y).sum().item())  # Count correct predictions
            num_samples += decisions.numel()  # Count total samples
            num_batches += 1
    model.train()

    if num_batches == 0:
        print("Validation loader is empty, skipping validation metrics")
        return

    # Calculate metrics
    accuracy = num_correct / num_samples * 100
    loss /= num_batches
    confidence = 1 - confidence / num_batches

    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")


with mlflow.start_run() as run:
    mlflow.log_params(hp)
    for epoch in range(0, hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)

model.eval()
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")
torch.save(model.state_dict(), 'backups/name.pth')