import time

import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm

import model
import preprocess
import dataset


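# Assumes an MLflow tracking server is already running locally,
# e.g. `mlflow server --host 127.0.0.1 --port 8080`.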
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")

start_time = time.time()


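# Hyperparameters; this dict doubles as the MLflow params payload and picks up
# more keys (param count, loss, optimizer) further down.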
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}

device = 'cuda' if torch.cuda.is_available() else 'cpu'


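# hflogin() presumably authenticates with the Hugging Face Hub before the download.
# The locally recorded chainsaw clips are appended to the training split only,
# so the validation split stays comparable across runs.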
preprocess.hflogin()

custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
val_dataset = preprocess.get_dataset('test', device)
train_dataloader = preprocess.get_dataloader(train_dataset, batch_size=hp['batch_size'], shuffle=True)
val_dataloader = preprocess.get_dataloader(val_dataset, batch_size=hp['batch_size'], shuffle=False)


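# The whole model runs in bfloat16. torch.compile wraps the module and prefixes
# state_dict keys with `_orig_mod.`, so the checkpoint is assumed to have been
# saved from a compiled model (as done at the bottom of this script).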
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
model.load_state_dict(torch.load('backups/final-bf16.pth', weights_only=True), strict=True)

hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")


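# nn.BCELoss expects probabilities in [0, 1], so ChainsawDetector is assumed to
# end in a sigmoid. OneCycleLR is stepped once per batch, hence it is sized in
# optimizer steps rather than samples.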
loss_fn = nn.BCELoss()
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])

total_iterations = len(train_dataset)
steps_per_epoch = total_iterations // hp['batch_size']
total_steps = steps_per_epoch * hp['num_epochs']  # optimizer steps, not samples
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{total_iterations=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(optimizer, max_lr=hp['learning_rate'], steps_per_epoch=steps_per_epoch, epochs=hp['num_epochs'])
hp["optimizer"] = 'AdamW'
metric_fn = BinaryAccuracy(threshold=0.5).to(device)  # metric must live on the same device as its inputs


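# One full pass over the training data. preprocess.augment is assumed to take
# and return batched, on-device tensors.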
def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)

        optimizer.zero_grad()

        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)
            loss = loss_fn(predictions, targets)

        loss.backward()
        optimizer.step()
        lrscheduler.step()  # OneCycleLR advances once per batch, not per epoch

        if batch_index % 100 == 0:
            accuracy = metric_fn(predictions, targets)
            step = batch_index + epoch * steps_per_epoch
            # metric values are logged as plain floats
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss.item(), step=step)
            mlflow.log_metric("train_accuracy", accuracy.item(), step=step)


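# Validation: accuracy from thresholded decisions, plus a "confidence" score
# defined as 1 - mean |probability - decision| (1.0 = outputs saturated at 0/1).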
MAE = torch.nn.L1Loss()


def evaluate(loader, model, epoch, loss_fn=loss_fn):
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0
    confidence = 0
    model.eval()

    with torch.no_grad(), autocast(device_type=device, dtype=torch.bfloat16):
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.bfloat16)

            predictions = model(X)

            # Hard 0/1 decisions. Tensor.apply_ only works on CPU tensors,
            # so threshold with a vectorized comparison instead.
            decisions = (predictions >= 0.5).to(predictions.dtype)

            # Mean distance between raw probabilities and hard decisions;
            # inverted below so 1.0 means maximally confident outputs.
            confidence += MAE(decisions, predictions).item()
            # validation loss is computed on the raw probabilities
            loss += loss_fn(predictions, y).item()
            num_correct += (decisions == y).sum().item()
            num_samples += decisions.size(0)
            num_batches += 1

    accuracy = float(num_correct) / float(num_samples) * 100
    loss /= num_batches
    confidence /= num_batches
    confidence = 1 - confidence
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    model.train()


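# Everything inside this context is grouped under one MLflow run: params once,
# train metrics every 100 batches, validation metrics once per epoch.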
with mlflow.start_run() as run:
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)

model.eval()

elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")

torch.save(model.state_dict(), 'backups/name.pth')