import time
import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm
import model
import preprocess
import dataset
# MLflow server
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")
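# Assumes an MLflow tracking server is already running locally,
# e.g. started with: mlflow server --host 127.0.0.1 --port 8080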
start_time = time.time()
# LR sweep results (batch size -> best learning rate):
#   8  -> 1e-3
#   16 -> 5e-3
# hyperparameters
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}
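# Non-tunable run metadata (param count, loss/optimizer names) is added to hp
# below so everything is logged together via mlflow.log_params.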
device = 'cuda' if torch.cuda.is_available() else 'cpu'
preprocess.hflogin()
# Prepare datasets
custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
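# The custom recordings are appended to the base training split only;
# the validation split stays untouched.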
val_dataset = preprocess.get_dataset('test', device)
train_dataloader = preprocess.get_dataloader(train_dataset, batch_size=hp['batch_size'], shuffle=True)
val_dataloader = preprocess.get_dataloader(val_dataset, batch_size=hp['batch_size'], shuffle=False)
# Load model
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
model.load_state_dict(torch.load('backups/final-bf16.pth', weights_only=True), strict=True)
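# Note: state_dicts saved from a torch.compile-wrapped model carry an
# `_orig_mod.` key prefix, so this checkpoint is assumed to have been saved
# from a compiled model as well (strict=True would fail otherwise).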
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")
loss_fn = nn.BCELoss()
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])
total_iterations = len(train_dataset)
steps_per_epoch = len(train_dataloader)  # actual number of batches per epoch
total_steps = steps_per_epoch * hp['num_epochs']
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{total_iterations=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(optimizer, max_lr=hp['learning_rate'], steps_per_epoch=steps_per_epoch, epochs=hp['num_epochs'])
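# OneCycleLR expects one scheduler.step() per optimizer step (i.e. per batch),
# which train() below does; its schedule spans steps_per_epoch * epochs steps.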
hp["optimizer"] = 'AdamW'
metric_fn = BinaryAccuracy(threshold=0.5).to(device)
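# torchmetrics metrics hold internal state tensors, so the metric must be
# moved to the same device as the predictions it scores.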
def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)
        optimizer.zero_grad()
        # Forward pass: compute the model output
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)
            loss = loss_fn(predictions, targets)
        # Backward pass: compute the gradients
        loss.backward()
        # Optimization step: update the model parameters
        optimizer.step()
        lrscheduler.step()
        if batch_index % 100 == 0:
            accuracy = metric_fn(predictions, targets)
            step = batch_index + epoch * steps_per_epoch
            # mlflow.log_metric expects numeric values, not formatted strings
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss.item(), step=step)
            mlflow.log_metric("train_accuracy", accuracy.item(), step=step)
def decide(x):
    # Hard decision: 1 if probability >= 0.5 else 0.
    # Vectorized so it also works on GPU tensors (Tensor.apply_ is CPU-only).
    return (x >= 0.5).to(x.dtype)
MAE = torch.nn.L1Loss()
def evaluate(loader, model, epoch, loss_fn=loss_fn):
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0
    confidence = 0
    model.eval()
    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.bfloat16)
            predictions = model(X)
            decisions = decide(predictions)
            confidence += MAE(decisions, predictions).item()
            loss += loss_fn(predictions, y).item()
            num_correct += (decisions == y).sum().item()  # Count correct predictions
            num_samples += decisions.size(0)              # Count total samples
            num_batches += 1
    # Calculate metrics
    accuracy = float(num_correct) / float(num_samples) * 100
    loss /= num_batches
    # Confidence = 1 - mean |p - round(p)|: how far predictions sit, on
    # average, from the 0.5 decision boundary
    confidence = 1 - confidence / num_batches
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    model.train()
with mlflow.start_run() as run:
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)
model.eval()
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")
torch.save(model.state_dict(), 'backups/name.pth')
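# To reload this checkpoint later, rebuild and torch.compile the model the
# same way as above before calling load_state_dict, so the compiled wrapper's
# `_orig_mod.` key prefix matches.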