# File size: 5,009 Bytes
# 5ad4868
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import time
import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm
import model
import preprocess
import dataset

# Point MLflow at the local tracking server and select the experiment
# that every run started by this script is recorded under.
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")

# Wall-clock reference for the elapsed-time report printed at the end of the script.
start_time = time.time()

# Best learning rate per batch size:
#   batch_size   learning_rate
#   8            1e-3
#   16           5e-3
# Hyperparameters; the whole dict (including entries added further down)
# is sent to MLflow with mlflow.log_params when the run starts.
hp = {
    'batch_size': 16,
    'learning_rate': 5e-3,
    'num_epochs': 10,
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device ='cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): presumably authenticates with the Hugging Face Hub so the
# datasets below can be fetched - confirm in preprocess.hflogin.
preprocess.hflogin()

# Prepare datasets: training data is the 'train' split concatenated with the
# project's ChainsawDataset; validation uses the 'test' split on its own.
custom_dataset = dataset.ChainsawDataset()
train_dataset = preprocess.get_dataset('train', device)
train_dataset = torch.utils.data.ConcatDataset([train_dataset, custom_dataset])
val_dataset = preprocess.get_dataset('test', device)
# Only the training loader is shuffled.
train_dataloader = preprocess.get_dataloader(train_dataset, batch_size=hp['batch_size'], shuffle=True)
val_dataloader = preprocess.get_dataloader(val_dataset, batch_size=hp['batch_size'], shuffle=False)

# Load model. From this line on the name `model` refers to the network
# instance and no longer to the imported `model` module.
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
# NOTE(review): the checkpoint is loaded *after* torch.compile with strict=True;
# a compiled module exposes its parameters under an `_orig_mod.` key prefix, so
# this presumably expects a checkpoint saved from a compiled model - confirm.
model = torch.compile(model)
model.load_state_dict(torch.load('backups/final-bf16.pth', weights_only=True), strict=True)


# Record the parameter count alongside the other hyperparameters.
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")

# Binary cross-entropy on the model outputs, optimised with AdamW.
# NOTE(review): BCELoss expects inputs in [0, 1], so the model presumably
# ends in a sigmoid - confirm in model.ChainsawDetector.
loss_fn = nn.BCELoss()
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])

# Size the one-cycle schedule from the number of optimizer steps that are
# actually taken. OneCycleLR raises once it is stepped past its total, which
# is what a floor division of samples by batch size leads to whenever the
# loader also yields a partial last batch.
# NOTE(review): assumes preprocess.get_dataloader returns a sized loader
# (e.g. torch.utils.data.DataLoader) so that len() is the batch count - confirm.
total_iterations = len(train_dataset)  # number of training samples
steps_per_epoch = len(train_dataloader)  # optimizer steps (batches) per epoch
total_steps = steps_per_epoch * hp['num_epochs']  # optimizer steps over the whole run
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{total_iterations=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(
    optimizer,
    max_lr=hp['learning_rate'],
    steps_per_epoch=steps_per_epoch,
    epochs=hp['num_epochs'],
)
hp["optimizer"] = 'AdamW'
# Batch accuracy with the same 0.5 decision threshold used during validation.
metric_fn = BinaryAccuracy(threshold=0.5)

def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    """Run one training epoch and log progress to MLflow every 100 batches.

    Args:
        loader: Iterable yielding ``(data, targets)`` batches.
        model: Network being trained; called on the augmented batch.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor to backpropagate.
        metric_fn: Callable ``metric_fn(predictions, targets)`` returning the
            batch accuracy.
        optimizer: Optimizer stepped once per batch.
        lrscheduler: Learning-rate scheduler stepped once per batch, right
            after the optimizer.
        epoch: Zero-based epoch index, used to compute the global MLflow step.

    Relies on the module-level ``device`` and ``steps_per_epoch``.
    """
    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)

        optimizer.zero_grad()
        # Forward pass: only the model call runs under autocast
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)

        # The loss is evaluated outside the autocast region and in float32:
        # PyTorch's AMP documentation states that BCELoss (the loss this
        # script passes in) raises inside CUDA autocast-enabled regions, and
        # float32 keeps the log terms accurate.
        loss = loss_fn(predictions.float(), targets.float())

        # Backward pass: compute the gradients
        loss.backward()

        # Optimization step: update the model parameters
        optimizer.step()
        lrscheduler.step()

        if batch_index % 100 == 0:
            # MLflow metrics are numeric, so log plain floats rather than
            # pre-formatted strings.
            loss_value = loss.item()
            # NOTE(review): metric_fn is used as passed in; confirm it lives
            # on the same device as the predictions.
            accuracy = float(metric_fn(predictions, targets))
            step = batch_index + epoch * steps_per_epoch
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss_value, step=step)
            mlflow.log_metric("train_accuracy", accuracy, step=step)

def decide(x):
    """Turn a score into a hard binary label: 1 if ``x`` is at least 0.5, else 0."""
    if x >= 0.5:
        return 1
    return 0

# Mean absolute error (L1 loss averaged over all elements); used during
# validation to measure how far predictions sit from their hard decisions.
MAE = torch.nn.L1Loss(reduction='mean')

def evaluate(loader, model, epoch, loss_fn=loss_fn):
    """Run one validation pass and log loss, accuracy and confidence to MLflow.

    Predictions are thresholded at 0.5 to obtain hard decisions. Accuracy is
    the percentage of decisions equal to the targets; confidence is one minus
    the mean absolute distance between predictions and their decisions,
    averaged over batches; loss is ``loss_fn`` on the raw predictions,
    averaged over batches.

    Args:
        loader: Iterable yielding ``(X, y)`` batches.
        model: Network to evaluate. It is switched to eval mode for the pass
            and put back into train mode afterwards, even if the pass raises.
        epoch: Epoch index, used as the MLflow step.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor. Defaults to the module-level ``loss_fn`` as bound when
            this function was defined.

    Returns:
        None. Metrics are logged to MLflow and a summary line is printed. An
        empty ``loader`` logs nothing and prints a notice instead.

    Relies on the module-level ``device`` and ``MAE``.
    """
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0.0
    uncertainty = 0.0

    model.eval()
    try:
        with torch.no_grad():
            for X, y in loader:
                X = X.to(device, dtype=torch.bfloat16)
                y = y.to(device, dtype=torch.bfloat16)

                # Same autocast dtype as the training loop; only the forward
                # pass runs under it, because PyTorch's AMP documentation
                # states that BCELoss raises inside CUDA autocast regions.
                with autocast(device_type=device, dtype=torch.bfloat16):
                    predictions = model(X)

                # Accumulate the metrics in float32 (bfloat16 -> float32 is exact).
                predictions = predictions.float()
                y = y.float()

                # Vectorised form of decide(); Tensor.apply_ only works on CPU
                # tensors and would fail for predictions living on the GPU.
                decisions = (predictions >= 0.5).to(predictions.dtype)

                uncertainty += MAE(decisions, predictions).item()
                # Loss is taken on the probabilities: on hard 0/1 decisions
                # BCELoss saturates and only mirrors the error rate.
                loss += loss_fn(predictions, y).item()
                num_correct += int((decisions == y).sum().item())  # Count correct predictions
                num_samples += decisions.size(0)  # Count total samples
                num_batches += 1
    finally:
        model.train()

    if num_batches == 0 or num_samples == 0:
        # Nothing to average over; avoid a division by zero.
        print("Validation loader is empty, skipping validation metrics")
        return

    # Calculate metrics
    accuracy = num_correct / num_samples * 100
    loss /= num_batches
    confidence = 1 - uncertainty / num_batches
    # MLflow metrics are numeric, so log plain floats rather than strings.
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    

# Train and validate once per epoch inside a single MLflow run; the
# hyperparameter dict is logged before the first epoch starts.
with mlflow.start_run() as run:
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(
            loader=train_dataloader,
            model=model,
            loss_fn=loss_fn,
            metric_fn=metric_fn,
            optimizer=optimizer,
            lrscheduler=lrscheduler,
            epoch=epoch,
        )
        evaluate(loader=val_dataloader, model=model, epoch=epoch)

# Leave the network in inference mode before it is written to disk.
model.eval()

# Report the wall-clock time since start_time was taken.
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")

# Persist the trained weights.
torch.save(model.state_dict(), 'backups/name.pth')