import time
import torch
from torch import optim
from torch import nn
from torchmetrics.classification import BinaryAccuracy
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast
import mlflow
from tqdm import tqdm
import model
import preprocess
import dataset

# Point MLflow at the local tracking server and pick the experiment to log into
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("Optimizations")

# Reference timestamp; the total run time is reported once training has finished
start_time = time.time()

# Best learning rate noted per batch size:
#   batch   best lr
#   8       1e-3
#   16      5e-3
# Hyperparameters (logged to MLflow as run parameters)
hp = dict(
    batch_size=16,
    learning_rate=5e-3,
    num_epochs=10,
)

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

preprocess.hflogin()

# Datasets: the 'train' split is concatenated with the custom ChainsawDataset,
# the 'test' split serves as the validation set
custom_dataset = dataset.ChainsawDataset()
train_dataset = torch.utils.data.ConcatDataset(
    [preprocess.get_dataset('train', device), custom_dataset]
)
val_dataset = preprocess.get_dataset('test', device)

# Loaders: only the training data is shuffled
train_dataloader = preprocess.get_dataloader(
    train_dataset,
    batch_size=hp['batch_size'],
    shuffle=True,
)
val_dataloader = preprocess.get_dataloader(
    val_dataset,
    batch_size=hp['batch_size'],
    shuffle=False,
)

# Build the detector in bfloat16 on the target device, compile it, then restore
# the checkpoint.
# NOTE: from this line on `model` names the network instance and no longer the
# imported `model` module.
model = model.ChainsawDetector(hp['batch_size']).to(device, dtype=torch.bfloat16)
model = torch.compile(model)
# The weights are loaded into the compiled wrapper with strict=True, so the
# checkpoint keys must match the wrapper's state-dict keys exactly.
# map_location makes a checkpoint that was saved from a GPU loadable when the
# script falls back to the CPU.
model.load_state_dict(
    torch.load('backups/final-bf16.pth', weights_only=True, map_location=device),
    strict=True,
)

# Record the parameter count alongside the other run parameters
hp['total_params'] = sum(p.numel() for p in model.parameters())
print(f"model ready, {hp['total_params']} parameters")

# Loss and optimizer (their names are stored in hp so they show up as run parameters)
loss_fn = nn.BCELoss()
hp["loss_fn"] = 'BinaryCrossEntropyLoss'
optimizer = optim.AdamW(model.parameters(), lr=hp['learning_rate'])
hp["optimizer"] = 'AdamW'

# total_iterations is the number of training samples.
total_iterations = len(train_dataset)
# The scheduler is stepped once per batch, so its schedule length must equal the
# number of batches the loader actually yields. Deriving it from the sample count
# with floor division under-counts when the last batch is partial, which makes
# OneCycleLR raise once it is stepped beyond its total number of steps.
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * hp['num_epochs']
print(f"batch_size = {hp['batch_size']}, num_epochs = {hp['num_epochs']}")
print(f'{total_iterations=}, {steps_per_epoch=}, {total_steps=}')
lrscheduler = OneCycleLR(
    optimizer,
    max_lr=hp['learning_rate'],
    steps_per_epoch=steps_per_epoch,
    epochs=hp['num_epochs'],
)

# Metric state must live on the same device as the tensors passed to the metric
metric_fn = BinaryAccuracy(threshold=0.5).to(device)

def train(loader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch):
    """Run one training epoch over ``loader`` and periodically log to MLflow.

    Every batch is moved to the module-level ``device`` as bfloat16, augmented
    with ``preprocess.augment``, passed through ``model`` under bfloat16
    autocast and used for one optimizer step followed by one scheduler step.
    On every 100th batch the learning rate, the batch loss and the value
    returned by ``metric_fn`` are logged.

    Args:
        loader: Iterable of ``(data, targets)`` batches. Must support ``len()``,
            which is used to compute the global logging step.
        model: Network called as ``model(data)``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor that supports ``backward()``.
        metric_fn: Callable ``metric_fn(predictions, targets)`` returning a
            scalar convertible with ``float()``.
        optimizer: Optimizer stepped once per batch.
        lrscheduler: Learning-rate scheduler stepped once per batch.
        epoch: Zero-based epoch index, used to offset the logging step.
    """
    batches_per_epoch = len(loader)

    for batch_index, (data, targets) in enumerate(tqdm(loader)):
        # Move data and targets to the device (GPU/CPU)
        data = data.to(device, dtype=torch.bfloat16)
        data = preprocess.augment(data)
        targets = targets.to(device, dtype=torch.bfloat16)

        optimizer.zero_grad()

        # Forward pass: only the model call runs under autocast
        with autocast(device_type=device, dtype=torch.bfloat16):
            predictions = model(data)

        # The loss is computed outside the autocast region and in float32:
        # BCELoss raises inside CUDA autocast regions, and float32 keeps the
        # log terms of the loss precise.
        loss = loss_fn(predictions.float(), targets.float())

        # Backward pass: compute the gradients
        loss.backward()

        # Optimization step: update the model parameters, then the schedule
        optimizer.step()
        lrscheduler.step()

        if batch_index % 100 == 0:
            step = batch_index + epoch * batches_per_epoch
            # Log plain numbers rather than pre-formatted strings
            mlflow.log_metric("lr", lrscheduler.get_last_lr()[0], step=step)
            mlflow.log_metric("train_loss", loss.item(), step=step)
            mlflow.log_metric(
                "train_accuracy", float(metric_fn(predictions, targets)), step=step
            )

def decide(x):
    """Threshold a score at 0.5: return 1 when ``x >= 0.5``, otherwise 0."""
    if x >= 0.5:
        return 1
    return 0

# Mean absolute error, averaged over all elements (the default reduction)
MAE = nn.L1Loss(reduction="mean")

def evaluate(loader, model, epoch, loss_fn=loss_fn):
    """Evaluate ``model`` on ``loader`` and log validation metrics to MLflow.

    Logs ``val_loss`` (mean per-batch loss on the predicted probabilities),
    ``val_accuracy`` (percentage of predictions that match the labels after
    thresholding at 0.5) and ``val_confidence`` (one minus the mean absolute
    distance between each probability and its thresholded decision), then
    prints a summary line. The model is put in eval mode for the pass and
    switched back to train mode afterwards.

    Args:
        loader: Iterable of ``(X, y)`` batches.
        model: Network called as ``model(X)``; its output is thresholded at 0.5,
            so it is treated as probabilities with the same shape as ``y``.
        epoch: Step index under which the metrics are logged.
        loss_fn: Callable ``loss_fn(probabilities, y)`` returning a scalar
            tensor. Defaults to the module-level ``loss_fn`` as bound when this
            function was defined.
    """
    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss = 0.0
    confidence = 0.0
    model.eval()

    with torch.no_grad():
        for X, y in loader:
            X = X.to(device, dtype=torch.bfloat16)
            y = y.to(device, dtype=torch.float32)

            # Same autocast dtype as training; only the forward pass is autocast
            with autocast(device_type=device, dtype=torch.bfloat16):
                predictions = model(X)

            # Metrics are computed in float32, outside the autocast region
            probabilities = predictions.float()
            # Vectorised threshold; Tensor.apply_ only works on CPU tensors
            decisions = (probabilities >= 0.5).float()

            confidence += MAE(decisions, probabilities).item()
            # Loss on the probabilities, not on the hard 0/1 decisions
            loss += loss_fn(probabilities, y).item()
            num_correct += (decisions == y).sum().item()  # Count correct predictions
            num_samples += decisions.size(0)  # Count total samples
            num_batches += 1

    model.train()

    if num_batches == 0 or num_samples == 0:
        # Nothing to average over; avoid dividing by zero
        print("Validation loader produced no samples; skipping validation metrics")
        return

    # Calculate metrics
    accuracy = num_correct / num_samples * 100
    loss /= num_batches
    confidence = 1 - confidence / num_batches

    # Log plain numbers rather than pre-formatted strings
    mlflow.log_metric("val_loss", loss, step=epoch)
    mlflow.log_metric("val_accuracy", accuracy, step=epoch)
    mlflow.log_metric("val_confidence", confidence, step=epoch)
    print(f"Got {num_correct}/{num_samples} with accuracy {accuracy:.2f}% and confidence {confidence:.2f}")
    

# Train and validate inside one MLflow run; hyperparameters are logged up front
with mlflow.start_run() as run:
    mlflow.log_params(hp)
    for epoch in range(hp['num_epochs']):
        print(f"Epoch [{epoch+1}/{hp['num_epochs']}]")
        train(train_dataloader, model, loss_fn, metric_fn, optimizer, lrscheduler, epoch)
        evaluate(val_dataloader, model, epoch)

# Leave the network in inference mode once training is done
model.eval()

# Report the wall-clock time elapsed since start_time was taken
elapsed = time.time() - start_time
print(f"--- {elapsed:.2f} seconds ---")

# Persist the final weights
torch.save(model.state_dict(), 'backups/name.pth')