AdrianM0 committed
Commit aaafea4 · verified · Parent: 31e7bf8

Upload folder using huggingface_hub
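For reference, an upload like this is typically produced with the huggingface_hub upload API. A minimal sketch follows; the Space id and folder path are assumptions for illustration, not values taken from this commit:

from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path=".",  # local folder containing app.py, enhanced_trainer.py, requirements.txt (assumed layout)
    repo_id="AdrianM0/smi2iupac",  # hypothetical Space id; replace with the actual Space
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)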

Files changed (4)
  1. README.md +2 -8
  2. app.py +503 -0
  3. enhanced_trainer.py +1421 -0
  4. requirements.txt +16 -0
README.md CHANGED
@@ -1,12 +1,6 @@
 ---
-title: Smi2iupac
-emoji: 👁
-colorFrom: purple
-colorTo: yellow
+title: smi2iupac
+app_file: app.py
 sdk: gradio
 sdk_version: 5.25.2
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,503 @@
1
+ # app.py
2
+ import gradio as gr
3
+ import torch
4
+ import torch.nn.functional as F # <--- Added import
5
+ import pytorch_lightning as pl # <--- Added import (needed for type hints, model access)
6
+ import os
7
+ import json
8
+ import logging
9
+ from tokenizers import Tokenizer
10
+ from huggingface_hub import hf_hub_download
11
+ import gc # For garbage collection on potential OOM
12
+ import math # Needed for PositionalEncoding if moved here (or keep in enhanced_trainer)
13
+
14
+ # --- Configuration ---
15
+ MODEL_REPO_ID = "AdrianM0/smiles-to-iupac-translator"
16
+ CHECKPOINT_FILENAME = "last.ckpt"
17
+ SMILES_TOKENIZER_FILENAME = "smiles_bytelevel_bpe_tokenizer_scaled.json"
18
+ IUPAC_TOKENIZER_FILENAME = "iupac_unigram_tokenizer_scaled.json"
19
+ CONFIG_FILENAME = "config.json"
20
+ # --- End Configuration ---
21
+
22
+ # --- Logging ---
23
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
24
+
25
+ # --- Load Helper Code (Only Model Definition Needed) ---
26
+ try:
27
+ # We only need the LightningModule definition and the mask function now
28
+ from enhanced_trainer import SmilesIupacLitModule, generate_square_subsequent_mask
29
+ logging.info("Successfully imported from enhanced_trainer.py.")
30
+
31
+ # We will define beam_search_decode and translate locally in this file
32
+ # REMOVED: from test_ckpt import beam_search_decode, translate
33
+
34
+ except ImportError as e:
35
+ logging.error(f"Failed to import helper code from enhanced_trainer.py: {e}. Make sure enhanced_trainer.py is in the root of the Hugging Face repo '{MODEL_REPO_ID}'.")
36
+ gr.Error(f"Initialization Error: Could not load necessary Python modules (enhanced_trainer.py). Check Space logs. Error: {e}")
37
+ exit()
38
+ except Exception as e:
39
+ logging.error(f"An unexpected error occurred during helper code import: {e}", exc_info=True)
40
+ gr.Error(f"Initialization Error: An unexpected error occurred loading helper modules. Check Space logs. Error: {e}")
41
+ exit()
42
+
43
+ # --- Global Variables (Load Model Once) ---
44
+ model: pl.LightningModule | None = None # Added type hint
45
+ smiles_tokenizer: Tokenizer | None = None
46
+ iupac_tokenizer: Tokenizer | None = None
47
+ device: torch.device | None = None
48
+ config: dict | None = None
49
+
50
+ # --- Beam Search Decoding Logic (Moved from test_ckpt.py) ---
51
+
52
+ def beam_search_decode(
53
+ model: pl.LightningModule,
54
+ src: torch.Tensor,
55
+ src_padding_mask: torch.Tensor,
56
+ max_len: int,
57
+ sos_idx: int,
58
+ eos_idx: int,
59
+ pad_idx: int, # Needed for padding mask check if src has padding
60
+ device: torch.device,
61
+ beam_width: int = 5,
62
+ n_best: int = 5, # Number of top sequences to return
63
+ length_penalty: float = 0.6 # Alpha for length normalization (0=no penalty, 1=full penalty)
64
+ ) -> list[torch.Tensor]:
65
+ """
66
+ Performs beam search decoding using the LightningModule's model.
67
+ (Code copied and pasted from test_ckpt.py)
68
+ """
69
+ # Ensure model is in eval mode (redundant if called after model.eval(), but safe)
70
+ model.eval()
71
+ transformer_model = model.model # Access the underlying Seq2SeqTransformer
72
+ n_best = min(n_best, beam_width) # Cannot return more than beam_width sequences
73
+
74
+ try:
75
+ with torch.no_grad():
76
+ # --- Encode Source ---
77
+ memory = transformer_model.encode(src, src_padding_mask) # [1, src_len, emb_size]
78
+ memory = memory.to(device)
79
+ # Ensure memory_key_padding_mask is also on the correct device for decode
80
+ memory_key_padding_mask = src_padding_mask.to(memory.device) # [1, src_len]
81
+
82
+ # --- Initialize Beams ---
83
+ initial_beam_seq = torch.ones(1, 1, dtype=torch.long, device=device).fill_(sos_idx) # [1, 1]
84
+ initial_beam_score = torch.zeros(1, dtype=torch.float, device=device) # [1]
85
+ active_beams = [(initial_beam_seq, initial_beam_score)]
86
+ finished_beams = []
87
+
88
+ # --- Decoding Loop ---
89
+ for step in range(max_len - 1):
90
+ if not active_beams:
91
+ break
92
+
93
+ potential_next_beams = []
94
+ for current_seq, current_score in active_beams:
95
+ if current_seq[0, -1].item() == eos_idx:
96
+ finished_beams.append((current_seq, current_score))
97
+ continue
98
+
99
+ tgt_input = current_seq # [1, current_len]
100
+ tgt_seq_len = tgt_input.shape[1]
101
+ tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device).to(device) # [curr_len, curr_len]
102
+ tgt_padding_mask = torch.zeros(tgt_input.shape, dtype=torch.bool, device=device) # [1, curr_len]
103
+
104
+ decoder_output = transformer_model.decode(
105
+ tgt=tgt_input,
106
+ memory=memory,
107
+ tgt_mask=tgt_mask,
108
+ tgt_padding_mask=tgt_padding_mask,
109
+ memory_key_padding_mask=memory_key_padding_mask
110
+ ) # [1, curr_len, emb_size]
111
+
112
+ next_token_logits = transformer_model.generator(decoder_output[:, -1, :]) # [1, tgt_vocab_size]
113
+ log_probs = F.log_softmax(next_token_logits, dim=-1) # [1, tgt_vocab_size]
114
+
115
+ topk_log_probs, topk_indices = torch.topk(log_probs + current_score, beam_width, dim=-1)
116
+
117
+ for i in range(beam_width):
118
+ next_token_id = topk_indices[0, i].item()
119
+ next_score = topk_log_probs[0, i].reshape(1) # Keep as tensor [1]
120
+ next_token_tensor = torch.tensor([[next_token_id]], dtype=torch.long, device=device) # [1, 1]
121
+ new_seq = torch.cat([current_seq, next_token_tensor], dim=1) # [1, current_len + 1]
122
+ potential_next_beams.append((new_seq, next_score))
123
+
124
+ potential_next_beams.sort(key=lambda x: x[1].item(), reverse=True)
125
+
126
+ active_beams = []
127
+ added_count = 0
128
+ for seq, score in potential_next_beams:
129
+ is_finished = seq[0, -1].item() == eos_idx
130
+ if is_finished:
131
+ finished_beams.append((seq, score))
132
+ elif added_count < beam_width:
133
+ active_beams.append((seq, score))
134
+ added_count += 1
135
+ elif added_count >= beam_width:
136
+ break
137
+
138
+ finished_beams.extend(active_beams)
139
+
140
+ # Apply length penalty and sort
141
+ # Handle potential division by zero if sequence length is 1 (or 0?)
142
+ def get_score(beam_tuple):
143
+ seq, score = beam_tuple
144
+ seq_len = seq.shape[1]
145
+ if length_penalty == 0.0 or seq_len <= 1:
146
+ return score.item()
147
+ else:
148
+ # Ensure seq_len is float for pow
149
+ return score.item() / (float(seq_len) ** length_penalty)
150
+
151
+ finished_beams.sort(key=get_score, reverse=True) # Higher score is better
152
+
153
+ top_sequences = [seq[:, 1:] for seq, score in finished_beams[:n_best]] # seq shape [1, len] -> [1, len-1]
154
+ return top_sequences
155
+
156
+ except RuntimeError as e:
157
+ logging.error(f"Runtime error during beam search decode: {e}")
158
+ if "CUDA out of memory" in str(e):
159
+ gc.collect(); torch.cuda.empty_cache()
160
+ return [] # Return empty list on error
161
+ except Exception as e:
162
+ logging.error(f"Unexpected error during beam search decode: {e}", exc_info=True)
163
+ return []
164
+
165
+ # --- Translation Function (Moved from test_ckpt.py) ---
166
+
167
+ def translate(
168
+ model: pl.LightningModule,
169
+ src_sentence: str,
170
+ smiles_tokenizer: Tokenizer,
171
+ iupac_tokenizer: Tokenizer,
172
+ device: torch.device,
173
+ max_len: int,
174
+ sos_idx: int,
175
+ eos_idx: int,
176
+ pad_idx: int,
177
+ beam_width: int = 5,
178
+ n_best: int = 5,
179
+ length_penalty: float = 0.6
180
+ ) -> list[str]:
181
+ """
182
+ Translates a single SMILES string using beam search.
183
+ (Code copied and pasted from test_ckpt.py)
184
+ """
185
+ model.eval() # Ensure model is in eval mode
186
+ translations = []
187
+
188
+ # --- Tokenize Source ---
189
+ try:
190
+ src_encoded = smiles_tokenizer.encode(src_sentence)
191
+ if not src_encoded or not src_encoded.ids:
192
+ logging.warning(f"Encoding failed or empty for SMILES: {src_sentence}")
193
+ return ["[Encoding Error]"] * n_best
194
+ src_ids = src_encoded.ids[:max_len] # Truncate source
195
+ if not src_ids:
196
+ logging.warning(f"Source empty after truncation: {src_sentence}")
197
+ return ["[Encoding Error - Empty Src]"] * n_best
198
+ except Exception as e:
199
+ logging.error(f"Error tokenizing SMILES '{src_sentence}': {e}")
200
+ return ["[Encoding Error]"] * n_best
201
+
202
+ # --- Prepare Input Tensor and Mask ---
203
+ src = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device) # [1, src_len]
204
+ src_padding_mask = (src == pad_idx).to(device) # [1, src_len]
205
+
206
+ # --- Perform Beam Search Decoding ---
207
+ # Calls the beam_search_decode function defined above in this file
208
+ tgt_tokens_list = beam_search_decode(
209
+ model=model,
210
+ src=src,
211
+ src_padding_mask=src_padding_mask,
212
+ max_len=max_len,
213
+ sos_idx=sos_idx,
214
+ eos_idx=eos_idx,
215
+ pad_idx=pad_idx,
216
+ device=device,
217
+ beam_width=beam_width,
218
+ n_best=n_best,
219
+ length_penalty=length_penalty
220
+ ) # Returns list of tensors
221
+
222
+ # --- Decode Generated Tokens ---
223
+ if not tgt_tokens_list:
224
+ logging.warning(f"Beam search returned empty list for SMILES: {src_sentence}")
225
+ return ["[Decoding Error - Empty Output]"] * n_best
226
+
227
+ for tgt_tokens_tensor in tgt_tokens_list:
228
+ if tgt_tokens_tensor.numel() > 0:
229
+ tgt_tokens = tgt_tokens_tensor.flatten().cpu().numpy().tolist()
230
+ try:
231
+ translation = iupac_tokenizer.decode(tgt_tokens, skip_special_tokens=True)
232
+ translations.append(translation)
233
+ except Exception as e:
234
+ logging.error(f"Error decoding target tokens {tgt_tokens}: {e}")
235
+ translations.append("[Decoding Error]")
236
+ else:
237
+ translations.append("[Decoding Error - Empty Tensor]")
238
+
239
+ # Pad with error messages if fewer than n_best results were generated
240
+ while len(translations) < n_best:
241
+ translations.append("[Decoding Error - Fewer Results]")
242
+
243
+ return translations
244
+
245
+
246
+ # --- Model/Tokenizer Loading Function (Unchanged) ---
247
+ def load_model_and_tokenizers():
248
+ """Loads tokenizers, config, and model from Hugging Face Hub."""
249
+ global model, smiles_tokenizer, iupac_tokenizer, device, config
250
+ if model is not None: # Already loaded
251
+ logging.info("Model and tokenizers already loaded.")
252
+ return
253
+
254
+ logging.info(f"Starting model and tokenizer loading from {MODEL_REPO_ID}...")
255
+ try:
256
+ device = torch.device("cpu")
257
+ logging.info(f"Using device: {device}")
258
+
259
+ # Download files from HF Hub
260
+ logging.info("Downloading files from Hugging Face Hub...")
261
+ try:
262
+ checkpoint_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=CHECKPOINT_FILENAME)
263
+ smiles_tokenizer_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=SMILES_TOKENIZER_FILENAME)
264
+ iupac_tokenizer_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=IUPAC_TOKENIZER_FILENAME)
265
+ config_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=CONFIG_FILENAME)
266
+ logging.info("Files downloaded successfully.")
267
+ except Exception as e:
268
+ logging.error(f"Failed to download files from {MODEL_REPO_ID}. Check filenames and repo status. Error: {e}", exc_info=True)
269
+ raise gr.Error(f"Download Error: Could not download required files from {MODEL_REPO_ID}. Check Space logs. Error: {e}")
270
+
271
+ # Load config
272
+ logging.info("Loading configuration...")
273
+ try:
274
+ with open(config_path, 'r') as f:
275
+ config = json.load(f)
276
+ logging.info("Configuration loaded.")
277
+ # --- Validate essential config keys ---
278
+ required_keys = [
279
+ 'src_vocab_size', 'tgt_vocab_size', 'emb_size', 'nhead',
280
+ 'ffn_hid_dim', 'num_encoder_layers', 'num_decoder_layers',
281
+ 'dropout', 'max_len', 'bos_token_id', 'eos_token_id', 'pad_token_id'
282
+ ]
283
+ missing_keys = [key for key in required_keys if key not in config]
284
+ if missing_keys:
285
+ raise ValueError(f"Config file '{CONFIG_FILENAME}' is missing required keys: {missing_keys}")
286
+ # --- End Validation ---
287
+ except FileNotFoundError:
288
+ logging.error(f"Config file not found locally after download attempt: {config_path}")
289
+ raise gr.Error(f"Config Error: Config file '{CONFIG_FILENAME}' not found. Check file exists in repo.")
290
+ except json.JSONDecodeError as e:
291
+ logging.error(f"Error decoding JSON from config file {config_path}: {e}")
292
+ raise gr.Error(f"Config Error: Could not parse '{CONFIG_FILENAME}'. Check its format. Error: {e}")
293
+ except ValueError as e:
294
+ logging.error(f"Config validation error: {e}")
295
+ raise gr.Error(f"Config Error: {e}")
296
+
297
+
298
+ # Load tokenizers
299
+ logging.info("Loading tokenizers...")
300
+ try:
301
+ smiles_tokenizer = Tokenizer.from_file(smiles_tokenizer_path)
302
+ iupac_tokenizer = Tokenizer.from_file(iupac_tokenizer_path)
303
+ logging.info("Tokenizers loaded.")
304
+ # --- Validate Tokenizer Special Tokens ---
305
+ # Add more robust checks if necessary
306
+ if smiles_tokenizer.token_to_id("<pad>") != config['pad_token_id'] or \
307
+ smiles_tokenizer.token_to_id("<unk>") is None:
308
+ logging.warning("SMILES tokenizer special tokens might not match config or are missing.")
309
+ if iupac_tokenizer.token_to_id("<pad>") != config['pad_token_id'] or \
310
+ iupac_tokenizer.token_to_id("<sos>") != config['bos_token_id'] or \
311
+ iupac_tokenizer.token_to_id("<eos>") != config['eos_token_id'] or \
312
+ iupac_tokenizer.token_to_id("<unk>") is None:
313
+ logging.warning("IUPAC tokenizer special tokens might not match config or are missing.")
314
+ # --- End Validation ---
315
+ except Exception as e:
316
+ logging.error(f"Failed to load tokenizers from {smiles_tokenizer_path} or {iupac_tokenizer_path}: {e}", exc_info=True)
317
+ raise gr.Error(f"Tokenizer Error: Could not load tokenizer files. Check Space logs. Error: {e}")
318
+
319
+ # Load model
320
+ logging.info("Loading model from checkpoint...")
321
+ try:
322
+ model = SmilesIupacLitModule.load_from_checkpoint(
323
+ checkpoint_path,
324
+ src_vocab_size=config['src_vocab_size'],
325
+ tgt_vocab_size=config['tgt_vocab_size'],
326
+ map_location=device,
327
+ hparams_dict=config,
328
+ strict=False,
329
+ device="cpu"
330
+ )
331
+ model.to(device)
332
+ model.eval()
333
+ model.freeze()
334
+ logging.info("Model loaded successfully, set to eval mode, frozen, and moved to device.")
335
+
336
+ except FileNotFoundError:
337
+ logging.error(f"Checkpoint file not found locally after download attempt: {checkpoint_path}")
338
+ raise gr.Error(f"Model Error: Checkpoint file '{CHECKPOINT_FILENAME}' not found.")
339
+ except Exception as e:
340
+ logging.error(f"Error loading model from checkpoint {checkpoint_path}: {e}", exc_info=True)
341
+ if "memory" in str(e).lower():
342
+ gc.collect()
343
+ if device == torch.device("cuda"):
344
+ torch.cuda.empty_cache()
345
+ raise gr.Error(f"Model Error: Failed to load model checkpoint. Check Space logs. Error: {e}")
346
+
347
+ except gr.Error:
348
+ raise
349
+ except Exception as e:
350
+ logging.error(f"Unexpected error during model/tokenizer loading: {e}", exc_info=True)
351
+ raise gr.Error(f"Initialization Error: An unexpected error occurred. Check Space logs. Error: {e}")
352
+
353
+
354
+ # --- Inference Function for Gradio (Unchanged, calls local translate) ---
355
+ def predict_iupac(smiles_string, beam_width, n_best, length_penalty):
356
+ """
357
+ Performs SMILES to IUPAC translation using the loaded model and beam search.
358
+ """
359
+ global model, smiles_tokenizer, iupac_tokenizer, device, config
360
+
361
+ if not all([model, smiles_tokenizer, iupac_tokenizer, device, config]):
362
+ error_msg = "Error: Model or tokenizers not loaded properly. Check Space logs."
363
+ # Ensure n_best is int for range, default to 1 if conversion fails early
364
+ try: n_best_int = int(n_best)
365
+ except: n_best_int = 1
366
+ return "\n".join([f"{i+1}. {error_msg}" for i in range(n_best_int)])
367
+
368
+ if not smiles_string or not smiles_string.strip():
369
+ error_msg = "Error: Please enter a valid SMILES string."
370
+ try: n_best_int = int(n_best)
371
+ except: n_best_int = 1
372
+ return "\n".join([f"{i+1}. {error_msg}" for i in range(n_best_int)])
373
+
374
+ smiles_input = smiles_string.strip()
375
+ try:
376
+ beam_width = int(beam_width)
377
+ n_best = int(n_best)
378
+ length_penalty = float(length_penalty)
379
+ except ValueError as e:
380
+ error_msg = f"Error: Invalid input parameter type ({e})."
381
+ return f"1. {error_msg}" # Cannot determine n_best here
382
+
383
+ logging.info(f"Translating SMILES: '{smiles_input}' (Beam={beam_width}, N={n_best}, Penalty={length_penalty})")
384
+
385
+ try:
386
+ # Calls the translate function defined *above in this file*
387
+ predicted_names = translate(
388
+ model=model,
389
+ src_sentence=smiles_input,
390
+ smiles_tokenizer=smiles_tokenizer,
391
+ iupac_tokenizer=iupac_tokenizer,
392
+ device=device,
393
+ max_len=config['max_len'],
394
+ sos_idx=config['bos_token_id'],
395
+ eos_idx=config['eos_token_id'],
396
+ pad_idx=config['pad_token_id'],
397
+ beam_width=beam_width,
398
+ n_best=n_best,
399
+ length_penalty=length_penalty
400
+ )
401
+ logging.info(f"Predictions returned: {predicted_names}")
402
+
403
+ if not predicted_names:
404
+ output_text = f"Input SMILES: {smiles_input}\n\nNo predictions generated."
405
+ else:
406
+ output_text = f"Input SMILES: {smiles_input}\n\nTop {len(predicted_names)} Predictions (Beam Width={beam_width}, Length Penalty={length_penalty:.2f}):\n"
407
+ output_text += "\n".join([f"{i+1}. {name}" for i, name in enumerate(predicted_names)])
408
+
409
+ return output_text
410
+
411
+ except RuntimeError as e:
412
+ logging.error(f"Runtime error during translation: {e}", exc_info=True)
413
+ error_msg = f"Runtime Error during translation: {e}"
414
+ if "memory" in str(e).lower():
415
+ gc.collect()
416
+ if device == torch.device("cuda"):
417
+ torch.cuda.empty_cache()
418
+ error_msg += " (Potential OOM)"
419
+ return "\n".join([f"{i+1}. {error_msg}" for i in range(n_best)])
420
+
421
+ except Exception as e:
422
+ logging.error(f"Unexpected error during translation: {e}", exc_info=True)
423
+ error_msg = f"Unexpected Error during translation: {e}"
424
+ return "\n".join([f"{i+1}. {error_msg}" for i in range(n_best)])
425
+
426
+
427
+ # --- Load Model on App Start (Unchanged) ---
428
+ try:
429
+ load_model_and_tokenizers()
430
+ except gr.Error:
431
+ pass # Error already raised for Gradio UI
432
+ except Exception as e:
433
+ logging.error(f"Critical error during initial model loading sequence: {e}", exc_info=True)
434
+ gr.Error(f"Fatal Initialization Error: {e}. Check Space logs.")
435
+
436
+
437
+ # --- Create Gradio Interface (Unchanged) ---
438
+ title = "SMILES to IUPAC Name Translator"
439
+ description = f"""
440
+ Enter a SMILES string to translate it into its IUPAC chemical name using a Transformer model and beam search decoding.
441
+ Model repository: <a href='https://huggingface.co/{MODEL_REPO_ID}' target='_blank'>{MODEL_REPO_ID}</a>.
442
+ Adjust beam search parameters below. Higher beam width explores more possibilities but is slower. Length penalty influences the preference for shorter/longer names.
443
+ """
444
+
445
+ examples = [
446
+ ["CCO", 5, 3, 0.6],
447
+ ["C1=CC=CC=C1", 5, 3, 0.6],
448
+ ["CC(=O)Oc1ccccc1C(=O)O", 5, 3, 0.6], # Aspirin
449
+ ["CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 5, 3, 0.6], # Ibuprofen
450
+ ["CC(=O)O[C@@H]1C[C@@H]2[C@]3(CCCC([C@@H]3CC[C@]2([C@H]4[C@]1([C@H]5[C@@H](OC(=O)C5=CC4)OC)C)C)(C)C)C", 5, 1, 0.6], # Complex example
451
+ ["INVALID_SMILES", 5, 1, 0.6],
452
+ ]
453
+
454
+ smiles_input = gr.Textbox(
455
+ label="SMILES String",
456
+ placeholder="Enter SMILES string here (e.g., CCO for Ethanol)",
457
+ lines=1
458
+ )
459
+ beam_width_input = gr.Slider(
460
+ minimum=1,
461
+ maximum=10,
462
+ value=5,
463
+ step=1,
464
+ label="Beam Width (k)",
465
+ info="Number of sequences to keep at each decoding step (higher = more exploration, slower)."
466
+ )
467
+ n_best_input = gr.Slider(
468
+ minimum=1,
469
+ maximum=10,
470
+ value=3,
471
+ step=1,
472
+ label="Number of Results (n_best)",
473
+ info="How many top-scoring sequences to return (must be <= Beam Width)."
474
+ )
475
+ length_penalty_input = gr.Slider(
476
+ minimum=0.0,
477
+ maximum=2.0,
478
+ value=0.6,
479
+ step=0.1,
480
+ label="Length Penalty (alpha)",
481
+ info="Controls preference for sequence length. >1 prefers longer, <1 prefers shorter, 0 no penalty."
482
+ )
483
+ output_text = gr.Textbox(
484
+ label="Predicted IUPAC Name(s)",
485
+ lines=5,
486
+ show_copy_button=True
487
+ )
488
+
489
+ iface = gr.Interface(
490
+ fn=predict_iupac,
491
+ inputs=[smiles_input, beam_width_input, n_best_input, length_penalty_input],
492
+ outputs=output_text,
493
+ title=title,
494
+ description=description,
495
+ examples=examples,
496
+ allow_flagging="never",
497
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan"),
498
+ article="Note: Translation quality depends on the training data and model size. Complex molecules might yield less accurate results."
499
+ )
500
+
501
+ # --- Launch the App (Unchanged) ---
502
+ if __name__ == "__main__":
503
+ iface.launch(share=True)
enhanced_trainer.py ADDED
@@ -0,0 +1,1421 @@
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import Transformer
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from torch.nn.utils.rnn import pad_sequence
7
+ import pytorch_lightning as pl # Import PyTorch Lightning
8
+ from pytorch_lightning.loggers import WandbLogger # Import WandbLogger
9
+ from pytorch_lightning.callbacks import (
10
+ ModelCheckpoint,
11
+ EarlyStopping,
12
+ ) # Import Callbacks
13
+ import math
14
+ import os
15
+ import pandas as pd
16
+ from sklearn.model_selection import train_test_split
17
+ import time
18
+ import wandb # Import wandb
19
+
20
+
21
+ from tokenizers import (
22
+ Tokenizer,
23
+ models,
24
+ pre_tokenizers,
25
+ decoders,
26
+ trainers,
27
+ )
28
+
29
+ import logging
30
+ import gc
31
+
32
+ # --- Basic Logging Setup ---
33
+ logging.basicConfig(
34
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
35
+ )
36
+
37
+ # --- 1. Configuration & Hyperparameters ---
38
+
39
+ # Model Hyperparameters (Scaled up for H100s - ADJUST AS NEEDED based on memory)
40
+ # Note: BPE might benefit from a slightly larger vocab size than the regex approach
41
+ SRC_VOCAB_SIZE_ESTIMATE = 10000 # Increased estimate for SMILES BPE
42
+ TGT_VOCAB_SIZE_ESTIMATE = 14938 # Increased estimate for IUPAC
43
+ EMB_SIZE = 2048 # Embedding dimension (d_model) - Increased significantly
44
+ NHEAD = 8 # Number of attention heads (must divide EMB_SIZE) - Increased
45
+ FFN_HID_DIM = (
46
+ 4096 # Feedforward network hidden dimension (e.g., 4 * EMB_SIZE) - Increased
47
+ )
48
+ NUM_ENCODER_LAYERS = 12 # Number of layers in Encoder - Increased
49
+ NUM_DECODER_LAYERS = 12 # Number of layers in Decoder - Increased
50
+ DROPOUT = 0.1 # Dropout rate (can sometimes be reduced slightly for larger models)
51
+ MAX_LEN = 384 # Maximum sequence length (consider increasing if needed/possible)
52
+
53
+ # Training Hyperparameters
54
+ ACCELERATOR = "gpu"
55
+ DEVICES = 6 # Number of H100 GPUs to use
56
+ STRATEGY = "ddp" # Distributed Data Parallel Strategy
57
+ PRECISION = "16-mixed" # Use mixed precision for speed and memory saving on H100s
58
+ BATCH_SIZE_PER_GPU = 48 # Adjust based on H100 GPU memory (e.g., 32, 48, 64) - Effective BS = BATCH_SIZE_PER_GPU * DEVICES
59
+ ACCUMULATE_GRAD_BATCHES = (
60
+ 1 # Increase if BATCH_SIZE_PER_GPU needs to be smaller due to memory
61
+ )
62
+ NUM_EPOCHS = 50 # Increase for potentially longer training needed for larger models
63
+ LEARNING_RATE = 5e-5 # Might need adjustment for larger models/batch sizes
64
+ WEIGHT_DECAY = 1e-2
65
+ GRAD_CLIP_NORM = 1.0
66
+ VALIDATION_SPLIT = 0.05 # Use a smaller validation split if the dataset is huge
67
+ RANDOM_SEED = 42
68
+ PATIENCE = 5 # Early stopping patience
69
+ NUM_WORKERS = 8 # Adjust based on CPU cores and system capabilities
70
+
71
+ # Special Token Indices
72
+ PAD_IDX = 0
73
+ SOS_IDX = 1
74
+ EOS_IDX = 2
75
+ UNK_IDX = 3
76
+
77
+ # File Paths
78
+ # *** CHANGED SMILES TOKENIZER FILENAME ***
79
+ SMILES_TOKENIZER_FILE = "smiles_bytelevel_bpe_tokenizer_scaled.json"
80
+ IUPAC_TOKENIZER_FILE = "iupac_unigram_tokenizer_scaled.json"
81
+ INPUT_CSV_FILE = "data_clean.csv" # <--- Your input CSV file path
82
+
83
+ # Output files for data splits
84
+ TRAIN_SMILES_FILE = "train.smi"
85
+ TRAIN_IUPAC_FILE = "train.iupac"
86
+ VAL_SMILES_FILE = "val.smi"
87
+ VAL_IUPAC_FILE = "val.iupac"
88
+ CHECKPOINT_DIR = "checkpoints" # Directory to save model checkpoints
89
+ BEST_MODEL_FILENAME = (
90
+ "smiles-to-iupac-transformer-best" # Filename format for checkpoints
91
+ )
92
+
93
+ # WandB Configuration
94
+ WANDB_PROJECT = "SMILES-to-IUPAC-Large-BPE" # Updated project name slightly
95
+ WANDB_ENTITY = (
96
+ "adrianmirza" # Replace with your WandB entity (username or team name) if desired
97
+ )
98
+ WANDB_RUN_NAME = f"transformer_BPE_E{EMB_SIZE}_H{NHEAD}_L{NUM_ENCODER_LAYERS}_BS{BATCH_SIZE_PER_GPU * DEVICES}_LR{LEARNING_RATE}"
99
+
100
+ # Store hparams for logging
101
+ hparams = {
102
+ "src_tokenizer_type": "ByteLevelBPE", # Added tokenizer type info
103
+ "tgt_tokenizer_type": "Unigram",
104
+ "src_vocab_size_estimate": SRC_VOCAB_SIZE_ESTIMATE,
105
+ "tgt_vocab_size_estimate": TGT_VOCAB_SIZE_ESTIMATE,
106
+ "emb_size": EMB_SIZE,
107
+ "nhead": NHEAD,
108
+ "ffn_hid_dim": FFN_HID_DIM,
109
+ "num_encoder_layers": NUM_ENCODER_LAYERS,
110
+ "num_decoder_layers": NUM_DECODER_LAYERS,
111
+ "dropout": DROPOUT,
112
+ "max_len": MAX_LEN,
113
+ "batch_size_per_gpu": BATCH_SIZE_PER_GPU,
114
+ "effective_batch_size": BATCH_SIZE_PER_GPU * DEVICES * ACCUMULATE_GRAD_BATCHES,
115
+ "num_epochs": NUM_EPOCHS,
116
+ "learning_rate": LEARNING_RATE,
117
+ "weight_decay": WEIGHT_DECAY,
118
+ "grad_clip_norm": GRAD_CLIP_NORM,
119
+ "validation_split": VALIDATION_SPLIT,
120
+ "random_seed": RANDOM_SEED,
121
+ "patience": PATIENCE,
122
+ "precision": PRECISION,
123
+ "gpus": DEVICES,
124
+ "strategy": STRATEGY,
125
+ "num_workers": NUM_WORKERS,
126
+ }
127
+
128
+ # --- 2. Tokenizers (Modified SMILES Tokenizer) ---
129
+
130
+
131
+ # --- 2.a SMILES ByteLevel BPE Tokenizer (Replaced WordLevel Regex) ---
132
+ def get_smiles_tokenizer(
133
+ train_files=None,
134
+ vocab_size=30000,
135
+ min_frequency=2,
136
+ tokenizer_path=SMILES_TOKENIZER_FILE,
137
+ ):
138
+ """Creates or loads a Byte-Level BPE tokenizer for SMILES."""
139
+ if os.path.exists(tokenizer_path):
140
+ logging.info(f"Loading existing SMILES tokenizer from {tokenizer_path}")
141
+ try:
142
+ tokenizer = Tokenizer.from_file(tokenizer_path)
143
+ # Verify special tokens after loading
144
+ if (
145
+ tokenizer.token_to_id("<pad>") != PAD_IDX
146
+ or tokenizer.token_to_id("<sos>") != SOS_IDX
147
+ or tokenizer.token_to_id("<eos>") != EOS_IDX
148
+ or tokenizer.token_to_id("<unk>") != UNK_IDX
149
+ ):
150
+ logging.warning(
151
+ "Special token ID mismatch after loading SMILES tokenizer. Re-check config."
152
+ )
153
+ # Check if it's actually a BPE model (basic check)
154
+ if not isinstance(tokenizer.model, models.BPE):
155
+ logging.warning(
156
+ f"Loaded tokenizer from {tokenizer_path} is not a BPE model. Retraining."
157
+ )
158
+ raise TypeError("Incorrect tokenizer model type loaded.")
159
+ return tokenizer
160
+ except Exception as e:
161
+ logging.error(f"Failed to load SMILES tokenizer: {e}. Retraining...")
162
+
163
+ logging.info("Creating and training SMILES Byte-Level BPE tokenizer...")
164
+ # Use BPE model
165
+ tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
166
+
167
+ # Use ByteLevel pre-tokenizer - this handles any character sequence
168
+ # add_prefix_space=False is generally suitable for SMILES as it doesn't rely on spaces
169
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
170
+ # Use ByteLevel decoder
171
+ tokenizer.decoder = decoders.ByteLevel()
172
+
173
+ special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
174
+ # Use BpeTrainer
175
+ trainer = trainers.BpeTrainer(
176
+ vocab_size=vocab_size,
177
+ min_frequency=min_frequency,
178
+ special_tokens=special_tokens,
179
+ # BPE specific options can be added here if needed, e.g.:
180
+ # initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), # Usually inferred
181
+ # show_progress=True,
182
+ )
183
+
184
+ if train_files and all(os.path.exists(f) for f in train_files):
185
+ logging.info(f"Training SMILES BPE tokenizer on: {train_files}")
186
+ tokenizer.train(files=train_files, trainer=trainer)
187
+ logging.info(
188
+ f"SMILES BPE tokenizer trained. Final Vocab size: {tokenizer.get_vocab_size()}"
189
+ )
190
+ # Verify special token IDs after training
191
+ if (
192
+ tokenizer.token_to_id("<pad>") != PAD_IDX
193
+ or tokenizer.token_to_id("<sos>") != SOS_IDX
194
+ or tokenizer.token_to_id("<eos>") != EOS_IDX
195
+ or tokenizer.token_to_id("<unk>") != UNK_IDX
196
+ ):
197
+ logging.warning(
198
+ "Special token ID mismatch after training SMILES BPE tokenizer. Check trainer setup."
199
+ )
200
+ try:
201
+ tokenizer.save(tokenizer_path)
202
+ logging.info(f"SMILES BPE tokenizer saved to {tokenizer_path}")
203
+ except Exception as e:
204
+ logging.error(f"Failed to save SMILES BPE tokenizer: {e}")
205
+ else:
206
+ logging.error(
207
+ "Training files not provided or not found for SMILES tokenizer. Cannot train."
208
+ )
209
+ # Manually add special tokens if training fails, so basic encoding/decoding might work
210
+ tokenizer.add_special_tokens(special_tokens)
211
+
212
+ return tokenizer
213
+
214
+
215
+ # --- 2.b IUPAC Unigram Tokenizer (No changes needed here) ---
216
+ def get_iupac_tokenizer(
217
+ train_files=None,
218
+ vocab_size=30000,
219
+ min_frequency=2,
220
+ tokenizer_path=IUPAC_TOKENIZER_FILE,
221
+ ):
222
+ """Creates or loads a Unigram tokenizer for IUPAC names."""
223
+ if os.path.exists(tokenizer_path):
224
+ logging.info(f"Loading existing IUPAC tokenizer from {tokenizer_path}")
225
+ try:
226
+ tokenizer = Tokenizer.from_file(tokenizer_path)
227
+ if (
228
+ tokenizer.token_to_id("<pad>") != PAD_IDX
229
+ or tokenizer.token_to_id("<sos>") != SOS_IDX
230
+ or tokenizer.token_to_id("<eos>") != EOS_IDX
231
+ or tokenizer.token_to_id("<unk>") != UNK_IDX
232
+ ):
233
+ logging.warning(
234
+ "Special token ID mismatch after loading IUPAC tokenizer. Re-check config."
235
+ )
236
+ return tokenizer
237
+ except Exception as e:
238
+ logging.error(f"Failed to load IUPAC tokenizer: {e}. Retraining...")
239
+
240
+ logging.info("Creating and training IUPAC Unigram tokenizer...")
241
+ tokenizer = Tokenizer(models.Unigram())
242
+ # Using Sequence of pre-tokenizers for IUPAC is reasonable
243
+ pre_tokenizer_list = [
244
+ pre_tokenizers.WhitespaceSplit(), # Split by whitespace first
245
+ pre_tokenizers.Punctuation(), # Split punctuation
246
+ pre_tokenizers.Digits(individual_digits=True), # Split digits
247
+ ]
248
+ # Consider adding Metaspace if Unigram struggles with word boundaries after splits
249
+ # tokenizer.pre_tokenizer = pre_tokenizers.Metaspace() # Alternative
250
+ tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pre_tokenizer_list)
251
+ tokenizer.decoder = (
252
+ decoders.Metaspace()
253
+ ) # Metaspace decoder often works well with Unigram/BPE
254
+ special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
255
+ trainer = trainers.UnigramTrainer(
256
+ vocab_size=vocab_size,
257
+ special_tokens=special_tokens,
258
+ unk_token="<unk>",
259
+ # Unigram specific options can be added here
260
+ # shrinking_factor=0.75,
261
+ # n_sub_iterations=2,
262
+ )
263
+
264
+ if train_files and all(os.path.exists(f) for f in train_files):
265
+ logging.info(f"Training IUPAC tokenizer on: {train_files}")
266
+ tokenizer.train(files=train_files, trainer=trainer)
267
+ logging.info(
268
+ f"IUPAC tokenizer trained. Final Vocab size: {tokenizer.get_vocab_size()}"
269
+ )
270
+ # Verify special token IDs after training
271
+ if (
272
+ tokenizer.token_to_id("<pad>") != PAD_IDX
273
+ or tokenizer.token_to_id("<sos>") != SOS_IDX
274
+ or tokenizer.token_to_id("<eos>") != EOS_IDX
275
+ or tokenizer.token_to_id("<unk>") != UNK_IDX
276
+ ):
277
+ logging.warning(
278
+ "Special token ID mismatch after training IUPAC tokenizer. Check trainer setup."
279
+ )
280
+ try:
281
+ tokenizer.save(tokenizer_path)
282
+ logging.info(f"IUPAC tokenizer saved to {tokenizer_path}")
283
+ except Exception as e:
284
+ logging.error(f"Failed to save IUPAC tokenizer: {e}")
285
+ else:
286
+ logging.error(
287
+ "Training files not provided or not found for IUPAC tokenizer. Cannot train."
288
+ )
289
+ tokenizer.add_special_tokens(special_tokens)
290
+
291
+ return tokenizer
292
+
293
+
294
+ # --- 3. Model Definition (No changes needed) ---
295
+ class PositionalEncoding(nn.Module):
296
+ """Injects positional information into the input embeddings."""
297
+
298
+ def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
299
+ super().__init__()
300
+ den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
301
+ pos = torch.arange(0, maxlen).reshape(maxlen, 1)
302
+ pos_embedding = torch.zeros((maxlen, emb_size))
303
+ pos_embedding[:, 0::2] = torch.sin(pos * den)
304
+ pos_embedding[:, 1::2] = torch.cos(pos * den)
305
+ pos_embedding = pos_embedding.unsqueeze(
306
+ 0
307
+ ) # Add batch dimension for broadcasting
308
+ self.dropout = nn.Dropout(dropout)
309
+ self.register_buffer(
310
+ "pos_embedding", pos_embedding
311
+ ) # Shape [1, maxlen, emb_size]
312
+
313
+ def forward(self, token_embedding: torch.Tensor):
314
+ # token_embedding: Expected shape [batch_size, seq_len, emb_size]
315
+ seq_len = token_embedding.size(1)
316
+ # Slicing pos_embedding: [1, seq_len, emb_size]
317
+ # Handle cases where seq_len might exceed buffer's maxlen during inference/edge cases
318
+ if seq_len > self.pos_embedding.size(1):
319
+ logging.warning(
320
+ f"Input sequence length ({seq_len}) exceeds PositionalEncoding maxlen ({self.pos_embedding.size(1)}). Truncating positional encoding."
321
+ )
322
+ pos_to_add = self.pos_embedding[:, : self.pos_embedding.size(1), :]
323
+ # Pad token_embedding if needed? Or error out? For now, just use available encoding.
324
+ # This scenario shouldn't happen if MAX_LEN config is respected.
325
+ output = token_embedding[:, : self.pos_embedding.size(1), :] + pos_to_add
326
+ else:
327
+ pos_to_add = self.pos_embedding[:, :seq_len, :]
328
+ output = token_embedding + pos_to_add
329
+
330
+ return self.dropout(output)
331
+
332
+
333
+ class TokenEmbedding(nn.Module):
334
+ """Converts token indices to embeddings."""
335
+
336
+ def __init__(self, vocab_size: int, emb_size):
337
+ super().__init__()
338
+ self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
339
+ self.emb_size = emb_size
340
+
341
+ def forward(self, tokens: torch.Tensor):
342
+ return self.embedding(tokens.long()) * math.sqrt(self.emb_size)
343
+
344
+
345
+ class Seq2SeqTransformer(nn.Module):
346
+ """The main Encoder-Decoder Transformer model."""
347
+
348
+ def __init__(
349
+ self,
350
+ num_encoder_layers: int,
351
+ num_decoder_layers: int,
352
+ emb_size: int,
353
+ nhead: int,
354
+ src_vocab_size: int,
355
+ tgt_vocab_size: int,
356
+ dim_feedforward: int,
357
+ dropout: float = 0.1,
358
+ max_len: int = MAX_LEN,
359
+ ): # Use MAX_LEN from config
360
+ super().__init__()
361
+
362
+ if emb_size % nhead != 0:
363
+ raise ValueError(
364
+ f"Embedding size ({emb_size}) must be divisible by the number of heads ({nhead})"
365
+ )
366
+
367
+ self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
368
+ self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
369
+
370
+ # Ensure PositionalEncoding maxlen is sufficient
371
+ pe_maxlen = max(
372
+ max_len, 5000
373
+ ) # Use config MAX_LEN or default 5000, whichever is larger
374
+ self.positional_encoding = PositionalEncoding(
375
+ emb_size, dropout=dropout, maxlen=pe_maxlen
376
+ )
377
+
378
+ self.transformer = Transformer(
379
+ d_model=emb_size,
380
+ nhead=nhead,
381
+ num_encoder_layers=num_encoder_layers,
382
+ num_decoder_layers=num_decoder_layers,
383
+ dim_feedforward=dim_feedforward,
384
+ dropout=dropout,
385
+ batch_first=True,
386
+ ) # Use batch_first=True
387
+
388
+ self.generator = nn.Linear(emb_size, tgt_vocab_size)
389
+ self._init_weights()
390
+
391
+ def _init_weights(self):
392
+ for p in self.parameters():
393
+ if p.dim() > 1:
394
+ nn.init.xavier_uniform_(p)
395
+
396
+ def forward(
397
+ self,
398
+ src: torch.Tensor, # Input sequence (batch_size, src_len)
399
+ trg: torch.Tensor, # Target sequence (batch_size, tgt_len)
400
+ tgt_mask: torch.Tensor, # Target causal mask (tgt_len, tgt_len)
401
+ src_padding_mask: torch.Tensor, # Source padding mask (batch_size, src_len)
402
+ tgt_padding_mask: torch.Tensor, # Target padding mask (batch_size, tgt_len)
403
+ memory_key_padding_mask: torch.Tensor,
404
+ ): # Memory padding mask (batch_size, src_len)
405
+ # --- Ensure masks have correct dtype and device ---
406
+ # Pytorch Transformer expects boolean masks where True indicates masking
407
+ src_padding_mask = src_padding_mask.to(src.device)
408
+ tgt_padding_mask = tgt_padding_mask.to(trg.device)
409
+ memory_key_padding_mask = memory_key_padding_mask.to(src.device)
410
+ # tgt_mask needs to be float for '-inf' filling, keep on target device
411
+ tgt_mask = tgt_mask.to(trg.device)
412
+
413
+ src_emb = self.positional_encoding(
414
+ self.src_tok_emb(src)
415
+ ) # [batch, src_len, dim]
416
+ tgt_emb = self.positional_encoding(
417
+ self.tgt_tok_emb(trg)
418
+ ) # [batch, tgt_len, dim]
419
+
420
+ outs = self.transformer(
421
+ src=src_emb,
422
+ tgt=tgt_emb,
423
+ src_mask=None, # Not typically needed for encoder unless custom masking
424
+ tgt_mask=tgt_mask, # Causal mask for decoder self-attn
425
+ memory_mask=None, # Not typically needed unless masking specific memory parts
426
+ src_key_padding_mask=src_padding_mask, # Mask padding in src K,V
427
+ tgt_key_padding_mask=tgt_padding_mask, # Mask padding in tgt Q
428
+ memory_key_padding_mask=memory_key_padding_mask,
429
+ ) # Mask padding in memory K,V for cross-attn
430
+ # outs: [batch_size, tgt_len, emb_size]
431
+ return self.generator(outs) # [batch_size, tgt_len, tgt_vocab_size]
432
+
433
+ def encode(self, src: torch.Tensor, src_padding_mask: torch.Tensor):
434
+ src_padding_mask = src_padding_mask.to(
435
+ src.device
436
+ ) # Ensure mask is on correct device
437
+ src_emb = self.positional_encoding(
438
+ self.src_tok_emb(src)
439
+ ) # [batch, src_len, dim]
440
+ memory = self.transformer.encoder(
441
+ src_emb, mask=None, src_key_padding_mask=src_padding_mask
442
+ )
443
+ return memory # Returns memory: [batch_size, src_len, emb_size]
444
+
445
+ def decode(
446
+ self,
447
+ tgt: torch.Tensor,
448
+ memory: torch.Tensor,
449
+ tgt_mask: torch.Tensor,
450
+ tgt_padding_mask: torch.Tensor,
451
+ memory_key_padding_mask: torch.Tensor,
452
+ ):
453
+ # Ensure masks are on correct device
454
+ tgt_mask = tgt_mask.to(tgt.device)
455
+ tgt_padding_mask = tgt_padding_mask.to(tgt.device)
456
+ memory_key_padding_mask = memory_key_padding_mask.to(memory.device)
457
+
458
+ tgt_emb = self.positional_encoding(
459
+ self.tgt_tok_emb(tgt)
460
+ ) # [batch, tgt_len, dim]
461
+ output = self.transformer.decoder(
462
+ tgt=tgt_emb,
463
+ memory=memory,
464
+ tgt_mask=tgt_mask,
465
+ memory_mask=None,
466
+ tgt_key_padding_mask=tgt_padding_mask,
467
+ memory_key_padding_mask=memory_key_padding_mask,
468
+ )
469
+ return output # Returns decoder output: [batch_size, tgt_len, emb_size]
470
+
471
+
472
+ # --- Helper function for mask creation (No changes needed) ---
473
+ def generate_square_subsequent_mask(sz: int, device: torch.device) -> torch.Tensor:
474
+ """Generates an upper-triangular matrix for causal masking."""
475
+ mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
476
+ mask = (
477
+ mask.float()
478
+ .masked_fill(mask == 0, float("-inf"))
479
+ .masked_fill(mask == 1, float(0.0))
480
+ )
481
+ return mask # Shape [sz, sz]
482
+
483
+
484
+ def create_masks(
485
+ src: torch.Tensor, tgt: torch.Tensor, pad_idx: int, device: torch.device
486
+ ):
487
+ """
488
+ Creates all necessary masks for the Transformer model.
489
+ Assumes src and tgt are inputs to the forward pass (tgt includes SOS, excludes EOS).
490
+ Returns boolean masks where True indicates the position should be masked (ignored).
491
+ """
492
+ src_seq_len = src.shape[1]
493
+ tgt_seq_len = tgt.shape[1]
494
+
495
+ # Causal mask for decoder self-attention (float mask for PyTorch Transformer)
496
+ tgt_mask = generate_square_subsequent_mask(
497
+ tgt_seq_len, device
498
+ ) # [tgt_len, tgt_len]
499
+
500
+ # Padding masks (boolean, True where padded)
501
+ src_padding_mask = src == pad_idx # [batch_size, src_len]
502
+ tgt_padding_mask = tgt == pad_idx # [batch_size, tgt_len]
503
+ memory_key_padding_mask = (
504
+ src_padding_mask # Used in decoder cross-attention [batch_size, src_len]
505
+ )
506
+
507
+ return tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask
508
+
509
+
510
+ # --- 4. Data Handling (Dataset and Collate Function - No changes needed) ---
511
+ class SmilesIupacDataset(Dataset):
512
+ """Dataset class for SMILES-IUPAC pairs, reading from pre-split files."""
513
+
514
+ def __init__(self, smiles_file: str, iupac_file: str):
515
+ logging.info(f"Loading data from {smiles_file} and {iupac_file}")
516
+ try:
517
+ with open(smiles_file, "r", encoding="utf-8") as f_smi:
518
+ self.smiles = [line.strip() for line in f_smi if line.strip()]
519
+ with open(iupac_file, "r", encoding="utf-8") as f_iupac:
520
+ self.iupac = [line.strip() for line in f_iupac if line.strip()]
521
+
522
+ if len(self.smiles) != len(self.iupac):
523
+ logging.warning(
524
+ f"Mismatch in number of lines: {smiles_file} ({len(self.smiles)}) vs {iupac_file} ({len(self.iupac)}). Trimming."
525
+ )
526
+ min_len = min(len(self.smiles), len(self.iupac))
527
+ self.smiles = self.smiles[:min_len]
528
+ self.iupac = self.iupac[:min_len]
529
+
530
+ logging.info(
531
+ f"Loaded {len(self.smiles)} pairs from {smiles_file}/{iupac_file}."
532
+ )
533
+ if len(self.smiles) == 0:
534
+ logging.warning(f"Loaded 0 data pairs. Check files.")
535
+
536
+ except FileNotFoundError:
537
+ logging.error(
538
+ f"Error: One or both files not found: {smiles_file}, {iupac_file}"
539
+ )
540
+ raise
541
+ except Exception as e:
542
+ logging.error(f"Error loading data: {e}")
543
+ raise
544
+
545
+ def __len__(self):
546
+ return len(self.smiles)
547
+
548
+ def __getitem__(self, idx):
549
+ return self.smiles[idx], self.iupac[idx]
550
+
551
+
552
+ def collate_fn(
553
+ batch, smiles_tokenizer, iupac_tokenizer, pad_idx, sos_idx, eos_idx, max_len
554
+ ):
555
+ """Collates data samples into batches."""
556
+ src_batch, tgt_batch = [], []
557
+ skipped_count = 0
558
+ for src_sample, tgt_sample in batch:
559
+ try:
560
+ # Encode source (SMILES)
561
+ src_encoded = smiles_tokenizer.encode(src_sample)
562
+ # Truncate source if needed (including potential special tokens if added by encode)
563
+ src_ids = src_encoded.ids[:max_len]
564
+ if not src_ids: # Skip if encoding results in empty sequence
565
+ skipped_count += 1
566
+ continue
567
+ src_tensor = torch.tensor(src_ids, dtype=torch.long)
568
+
569
+ # Encode target (IUPAC)
570
+ tgt_encoded = iupac_tokenizer.encode(tgt_sample)
571
+ # Truncate target allowing space for SOS and EOS
572
+ tgt_ids = tgt_encoded.ids[: max_len - 2]
573
+ if (
574
+ not tgt_ids
575
+ ): # Skip if encoding results in empty sequence (after truncation)
576
+ skipped_count += 1
577
+ continue
578
+ # Add SOS and EOS tokens
579
+ tgt_tensor = torch.tensor([sos_idx] + tgt_ids + [eos_idx], dtype=torch.long)
580
+
581
+ src_batch.append(src_tensor)
582
+ tgt_batch.append(tgt_tensor)
583
+ except Exception as e:
584
+ # Log infrequent warnings for skipping
585
+ # if skipped_count < 5: # Log only the first few skips per batch
586
+ # logging.warning(f"Skipping sample due to error during tokenization/tensor creation: {e}. SMILES: '{src_sample[:50]}...', IUPAC: '{tgt_sample[:50]}...'")
587
+ skipped_count += 1
588
+ continue
589
+
590
+ # if skipped_count > 0:
591
+ # logging.debug(f"Skipped {skipped_count} samples in this batch during collation.")
592
+
593
+ if not src_batch or not tgt_batch:
594
+ # Return empty tensors if the whole batch was skipped
595
+ return torch.tensor([]), torch.tensor([])
596
+
597
+ try:
598
+ # Pad sequences
599
+ src_batch_padded = pad_sequence(
600
+ src_batch, batch_first=True, padding_value=pad_idx
601
+ )
602
+ tgt_batch_padded = pad_sequence(
603
+ tgt_batch, batch_first=True, padding_value=pad_idx
604
+ )
605
+ except Exception as e:
606
+ logging.error(
607
+ f"Error during padding: {e}. Src lengths: {[len(s) for s in src_batch]}, Tgt lengths: {[len(t) for t in tgt_batch]}"
608
+ )
609
+ # Return empty tensors on padding error
610
+ return torch.tensor([]), torch.tensor([])
611
+
612
+ return src_batch_padded, tgt_batch_padded
613
+
614
+
615
+ # --- 5. PyTorch Lightning Module (No changes needed) ---
616
+ class SmilesIupacLitModule(pl.LightningModule):
617
+ def __init__(
618
+ self, src_vocab_size: int, tgt_vocab_size: int, hparams_dict: dict
619
+ ): # Pass hparams dictionary
620
+ super().__init__()
621
+ # Use save_hyperparameters() to automatically save args to self.hparams
622
+ # and make them accessible in checkpoints and loggers
623
+ self.save_hyperparameters(hparams_dict)
624
+
625
+ self.model = Seq2SeqTransformer(
626
+ num_encoder_layers=self.hparams.num_encoder_layers,
627
+ num_decoder_layers=self.hparams.num_decoder_layers,
628
+ emb_size=self.hparams.emb_size,
629
+ nhead=self.hparams.nhead,
630
+ src_vocab_size=src_vocab_size, # Pass actual vocab size
631
+ tgt_vocab_size=tgt_vocab_size, # Pass actual vocab size
632
+ dim_feedforward=self.hparams.ffn_hid_dim,
633
+ dropout=self.hparams.dropout,
634
+ max_len=self.hparams.max_len, # Pass max_len here
635
+ )
636
+
637
+ self.criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
638
+
639
+ # --- Count Parameters --- (Done once at initialization)
640
+ total_params = sum(p.numel() for p in self.model.parameters())
641
+ trainable_params = sum(
642
+ p.numel() for p in self.model.parameters() if p.requires_grad
643
+ )
644
+ logging.info(f"Model Initialized:")
645
+ logging.info(f" Total Parameters: {total_params / 1_000_000:.2f} M")
646
+ logging.info(f" Trainable Parameters: {trainable_params / 1_000_000:.2f} M")
647
+ # Log params to wandb hparams if logger is available
648
+ # self.hparams are automatically logged by WandbLogger if passed to Trainer
649
+ # We can add them explicitly if needed, but save_hyperparameters usually handles it.
650
+ self.hparams.total_params_M = round(total_params / 1_000_000, 2)
651
+ self.hparams.trainable_params_M = round(trainable_params / 1_000_000, 2)
652
+
653
+ def forward(self, src, tgt):
654
+ # This is the main forward pass used for inference/prediction if needed
655
+ # For training/validation, we call the model directly in step methods
656
+ # to handle mask creation explicitly.
657
+ tgt_input = tgt[:, :-1] # Prepare target input (remove EOS)
658
+ tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask = (
659
+ create_masks(
660
+ src,
661
+ tgt_input,
662
+ PAD_IDX,
663
+ self.device, # Use self.device provided by Lightning
664
+ )
665
+ )
666
+ logits = self.model(
667
+ src,
668
+ tgt_input,
669
+ tgt_mask,
670
+ src_padding_mask,
671
+ tgt_padding_mask,
672
+ memory_key_padding_mask,
673
+ )
674
+ return logits
675
+
676
+ def training_step(self, batch, batch_idx):
677
+ src, tgt = batch
678
+ if src.numel() == 0 or tgt.numel() == 0:
679
+ # logging.debug(f"Skipping empty batch {batch_idx} in training.")
680
+ return None # Skip empty batches
681
+
682
+ tgt_input = tgt[:, :-1] # Exclude EOS for input
683
+ tgt_out = tgt[:, 1:] # Exclude SOS for target labels
684
+
685
+ # Create masks on the current device
686
+ tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask = (
687
+ create_masks(src, tgt_input, PAD_IDX, self.device)
688
+ )
689
+
690
+ try:
691
+ logits = self.model(
692
+ src=src,
693
+ trg=tgt_input,
694
+ tgt_mask=tgt_mask,
695
+ src_padding_mask=src_padding_mask,
696
+ tgt_padding_mask=tgt_padding_mask,
697
+ memory_key_padding_mask=memory_key_padding_mask,
698
+ )
699
+ # logits: [batch_size, tgt_len-1, tgt_vocab_size]
700
+
701
+ # Calculate loss
702
+ # Reshape logits to [batch_size * (tgt_len-1), tgt_vocab_size]
703
+ # Reshape tgt_out to [batch_size * (tgt_len-1)]
704
+ loss = self.criterion(
705
+ logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)
706
+ )
707
+
708
+ # Check for NaN/Inf loss (important with mixed precision)
709
+ if not torch.isfinite(loss):
710
+ logging.warning(
711
+ f"Non-finite loss encountered in training step {batch_idx}: {loss.item()}. Skipping update."
712
+ )
713
+ # Manually skip optimizer step if using manual optimization,
714
+ # otherwise returning None might be sufficient for automatic opt.
715
+ return None # Returning None should prevent optimizer step
716
+
717
+ # Log training loss
718
+ # sync_dist=True is important for DDP to average loss across GPUs
719
+ self.log(
720
+ "train_loss",
721
+ loss,
722
+ on_step=True,
723
+ on_epoch=True,
724
+ prog_bar=True,
725
+ logger=True,
726
+ sync_dist=True,
727
+ batch_size=src.size(0),
728
+ )
729
+
730
+ return loss
731
+
732
+ except RuntimeError as e:
733
+ if "CUDA out of memory" in str(e):
734
+ logging.warning(
735
+ f"CUDA OOM error during training step {batch_idx} with shape src: {src.shape}, tgt: {tgt.shape}. Skipping batch."
736
+ )
737
+ gc.collect()
738
+ torch.cuda.empty_cache()
739
+ return None # Skip update
740
+ else:
741
+ logging.error(f"Runtime error during training step {batch_idx}: {e}")
742
+ # Optionally log shapes for debugging other runtime errors
743
+ logging.error(f"Shapes - src: {src.shape}, tgt: {tgt.shape}")
744
+ return None # Skip update
745
+
746
+ def validation_step(self, batch, batch_idx):
747
+ src, tgt = batch
748
+ if src.numel() == 0 or tgt.numel() == 0:
749
+ # logging.debug(f"Skipping empty batch {batch_idx} in validation.")
750
+ return None
751
+
752
+ tgt_input = tgt[:, :-1]
753
+ tgt_out = tgt[:, 1:]
754
+
755
+ tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask = (
756
+ create_masks(src, tgt_input, PAD_IDX, self.device)
757
+ )
758
+
759
+ try:
760
+ logits = self.model(
761
+ src,
762
+ tgt_input,
763
+ tgt_mask,
764
+ src_padding_mask,
765
+ tgt_padding_mask,
766
+ memory_key_padding_mask,
767
+ )
768
+ loss = self.criterion(
769
+ logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1)
770
+ )
771
+
772
+ if torch.isfinite(loss):
773
+ # Log validation loss (accumulated across batches and synced across GPUs at epoch end)
774
+ # sync_dist=True ensures correct aggregation in DDP
775
+ self.log(
776
+ "val_loss",
777
+ loss,
778
+ on_step=False,
779
+ on_epoch=True,
780
+ prog_bar=True,
781
+ logger=True,
782
+ sync_dist=True,
783
+ batch_size=src.size(0),
784
+ )
785
+ else:
786
+ logging.warning(
787
+ f"Non-finite loss encountered during validation step {batch_idx}: {loss.item()}."
788
+ )
789
+ # PTL aggregates logged values automatically for the epoch
790
+ # Returning the loss value itself isn't strictly necessary when using self.log
791
+ # return loss
792
+
793
+ except RuntimeError as e:
794
+ # Don't crash validation if one batch fails (e.g., OOM on a particularly long sequence)
795
+ logging.error(f"Runtime error during validation step {batch_idx}: {e}")
796
+ if "CUDA out of memory" in str(e):
797
+ logging.warning(
798
+ f"CUDA OOM error during validation step {batch_idx} with shape src: {src.shape}, tgt: {tgt.shape}. Skipping batch."
799
+ )
800
+ gc.collect()
801
+ torch.cuda.empty_cache()
802
+ else:
803
+ logging.error(f"Shapes - src: {src.shape}, tgt: {tgt.shape}")
804
+ # Return None or a placeholder if needed by some aggregation logic,
805
+ # but self.log should handle the metric correctly even if some steps fail.
806
+ return None
807
+
808
+ def configure_optimizers(self):
809
+ optimizer = torch.optim.AdamW(
810
+ self.parameters(), # self.parameters() includes all model parameters
811
+ lr=self.hparams.learning_rate,
812
+ weight_decay=self.hparams.weight_decay,
813
+ )
814
+
815
+ # --- Add Learning Rate Scheduler ---
816
+ # Use linear warmup followed by linear decay (common for transformers)
817
+ # Requires the 'transformers' library: pip install transformers
818
+ try:
819
+ from transformers import get_linear_schedule_with_warmup
820
+
821
+ # Estimate the total number of optimizer steps so the scheduler can decay over training.
822
+ # trainer.estimated_stepping_batches accounts for epochs, gradient accumulation and the
823
+ # number of devices, so it approximates how many times the optimizer will step.
824
+ # However, self.trainer may not be fully set up when configure_optimizers runs,
825
+ # in which case accessing it raises AttributeError.
826
+ # Alternatives are to derive the count from the dataset size, effective batch size and
827
+ # epoch count, or to fall back to a large fixed number if only eventual decay matters.
828
+ # Try the trainer estimate first; fall back below if it is unavailable.
829
+ try:
830
+ # This attribute is available after trainer setup, might work here.
831
+ num_training_steps = self.trainer.estimated_stepping_batches
832
+ logging.info(
833
+ f"Estimated stepping batches for LR schedule: {num_training_steps}"
834
+ )
835
+ if num_training_steps is None or num_training_steps <= 0:
836
+ logging.warning(
837
+ "Could not estimate stepping batches, using fallback for LR schedule."
838
+ )
839
+ # Fallback: the exact count would be ceil(dataset_size / effective_batch_size) * num_epochs,
840
+ # but the dataset size is not available inside this module and is not stored in hparams.
841
+ # Default to a large fixed step count instead: warmup still behaves as intended and the
842
+ # learning rate simply decays more slowly than it would with the true total.
843
+ # (See the sketch after configure_optimizers for computing the exact count externally.)
844
+ num_training_steps = 1_000_000 # Adjust this large number if needed
845
+ except AttributeError:
846
+ logging.warning(
847
+ "self.trainer not available yet in configure_optimizers. Using fallback step count for LR schedule."
848
+ )
849
+ num_training_steps = 1_000_000 # Adjust this large number if needed
850
+
851
+ # Set warmup steps (e.g., 5% of total steps)
852
+ num_warmup_steps = int(0.05 * num_training_steps)
853
+ logging.info(
854
+ f"LR Scheduler: Total steps ~{num_training_steps}, Warmup steps: {num_warmup_steps}"
855
+ )
856
+
857
+ scheduler = get_linear_schedule_with_warmup(
858
+ optimizer,
859
+ num_warmup_steps=num_warmup_steps,
860
+ num_training_steps=num_training_steps,
861
+ )
862
+
863
+ lr_scheduler_config = {
864
+ "scheduler": scheduler,
865
+ "interval": "step", # Call scheduler after each training step
866
+ "frequency": 1,
867
+ "name": "linear_warmup_decay_lr", # Optional: Name for logging
868
+ }
869
+ logging.info("Using Linear Warmup/Decay LR Scheduler.")
870
+ return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}
871
+
872
+ except ImportError:
873
+ logging.warning(
874
+ "'transformers' library not found. Cannot create linear warmup scheduler. Using constant LR."
875
+ )
876
+ return optimizer
877
+ except Exception as e:
878
+ logging.error(
879
+ f"Error setting up LR scheduler: {e}. Using constant LR.", exc_info=True
880
+ )
881
+ return optimizer
882
+
883
+
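For reference, the exact step count described in the fallback comments above could be computed outside the module, where the dataset size is known. A minimal sketch, assuming the number of training samples, the effective batch size, and the epoch count are available at the call site (the names below are illustrative and not defined in this file):

import math

def estimate_total_steps(num_train_samples: int, effective_batch_size: int, num_epochs: int) -> int:
    # One optimizer step per effective batch; partial batches round up.
    steps_per_epoch = math.ceil(num_train_samples / effective_batch_size)
    return steps_per_epoch * num_epochs

# Example: 1_000_000 samples, effective batch size 256, 10 epochs -> 39_070 steps.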
884
+ # --- 6. Inference (Translation) ---
885
+ # These functions take the LightningModule instance and use its underlying transformer.
886
+
887
+
888
+ def greedy_decode(
889
+ model: pl.LightningModule, # Takes the LightningModule
890
+ src: torch.Tensor,
891
+ src_padding_mask: torch.Tensor,
892
+ max_len: int,
893
+ sos_idx: int,
894
+ eos_idx: int,
895
+ device: torch.device,
896
+ ) -> torch.Tensor:
897
+ """Performs greedy decoding using the LightningModule's model."""
898
+ # model.eval() # Lightning handles eval mode during inference/testing
899
+ transformer_model = model.model # Access the underlying Seq2SeqTransformer
900
+
901
+ try:
902
+ with torch.no_grad():
903
+ # Use the model's encode/decode methods
904
+ memory = transformer_model.encode(
905
+ src, src_padding_mask
906
+ ) # [1, src_len, emb_size]
907
+ memory = memory.to(device)
908
+ # Ensure memory_key_padding_mask is also on the correct device for decode
909
+ memory_key_padding_mask = src_padding_mask.to(memory.device) # [1, src_len]
910
+
911
+ ys = (
912
+ torch.ones(1, 1).fill_(sos_idx).type(torch.long).to(device)
913
+ ) # [1, 1] (Batch size 1)
914
+
915
+ for i in range(max_len - 1):
916
+ tgt_seq_len = ys.shape[1]
917
+ # Create masks for the current decoded sequence length
918
+ tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device).to(
919
+ device
920
+ ) # [curr_len, curr_len]
921
+ # No padding in target during greedy decode yet
922
+ tgt_padding_mask = torch.zeros(ys.shape, dtype=torch.bool).to(
923
+ device
924
+ ) # [1, curr_len]
925
+
926
+ # Use the model's decode method
927
+ out = transformer_model.decode(
928
+ ys, memory, tgt_mask, tgt_padding_mask, memory_key_padding_mask
929
+ )
930
+ # out: [1, curr_len, emb_size]
931
+
932
+ # Get the logits for the last token generated
933
+ last_token_logits = transformer_model.generator(
934
+ out[:, -1, :]
935
+ ) # [1, tgt_vocab_size]
936
+ prob = last_token_logits # Use logits directly for argmax
937
+ _, next_word = torch.max(prob, dim=1)
938
+ next_word = next_word.item()
939
+
940
+ # Append the predicted token ID
941
+ ys = torch.cat(
942
+ [ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1
943
+ )
944
+
945
+ # Stop if EOS token is generated
946
+ if next_word == eos_idx:
947
+ break
948
+ # Return the generated sequence, excluding the initial SOS token
949
+ return ys[:, 1:]
950
+
951
+ except RuntimeError as e:
952
+ logging.error(f"Runtime error during greedy decode: {e}")
953
+ if "CUDA out of memory" in str(e):
954
+ gc.collect()
955
+ torch.cuda.empty_cache()
956
+ # Return an empty tensor on error
957
+ return torch.tensor([[]], dtype=torch.long, device=device)
958
+
959
+
960
+ def translate(
961
+ model: pl.LightningModule, # Takes the LightningModule
962
+ src_sentence: str,
963
+ smiles_tokenizer,
964
+ iupac_tokenizer,
965
+ device: torch.device,
966
+ max_len: int,
967
+ sos_idx: int,
968
+ eos_idx: int,
969
+ pad_idx: int,
970
+ ) -> str:
971
+ """Translates a single SMILES string using the LightningModule."""
972
+ model.eval() # Ensure model is in eval mode for inference
973
+
974
+ try:
975
+ src_encoded = smiles_tokenizer.encode(src_sentence)
976
+ if not src_encoded or len(src_encoded.ids) == 0:
977
+ logging.warning(f"Encoding failed for SMILES: {src_sentence}")
978
+ return "[Encoding Error]"
979
+ # Truncate source sequence if needed before creating tensor
980
+ src_ids = src_encoded.ids[:max_len]
981
+ if not src_ids:
982
+ logging.warning(
983
+ f"Source sequence empty after truncation for SMILES: {src_sentence}"
984
+ )
985
+ return "[Encoding Error - Empty Src]"
986
+
987
+ except Exception as e:
988
+ logging.error(f"Error tokenizing SMILES '{src_sentence}': {e}")
989
+ return "[Encoding Error]"
990
+
991
+ # Create tensor and move to device
992
+ src = (
993
+ torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
994
+ ) # Add batch dimension
995
+ # Create padding mask (boolean, True where padded)
996
+ # For single-sentence inference the source tensor is built without padding, so this
997
+ # mask is normally all False; it is created anyway because the model's encode/decode
998
+ # methods expect a key-padding mask of shape [1, src_len].
999
+ src_padding_mask = src == pad_idx # [1, src_len]
1000
+
1001
+ # Perform greedy decoding
1002
+ tgt_tokens_tensor = greedy_decode(
1003
+ model=model, # Pass the LightningModule
1004
+ src=src,
1005
+ src_padding_mask=src_padding_mask,
1006
+ max_len=max_len, # Use the configured max_len for generation limit
1007
+ sos_idx=sos_idx,
1008
+ eos_idx=eos_idx,
1009
+ device=device,
1010
+ )
1011
+
1012
+ # Decode the generated token IDs
1013
+ if tgt_tokens_tensor.numel() > 0:
1014
+ tgt_tokens = tgt_tokens_tensor.flatten().cpu().numpy().tolist()
1015
+ try:
1016
+ # Decode using the target tokenizer, skipping special tokens like <pad>, <sos>, <eos>
1017
+ translation = iupac_tokenizer.decode(tgt_tokens, skip_special_tokens=True)
1018
+ return translation
1019
+ except Exception as e:
1020
+ logging.error(f"Error decoding target tokens {tgt_tokens}: {e}")
1021
+ return "[Decoding Error]"
1022
+ else:
1023
+ # Log if decoding returned an empty tensor (might happen on error in greedy_decode)
1024
+ # logging.warning(f"Greedy decode returned empty tensor for SMILES: {src_sentence}")
1025
+ return "[Decoding Error - Empty Output]"
1026
+
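A minimal usage sketch for translate, assuming a trained SmilesIupacLitModule and the two trained tokenizers have already been loaded (the variable names and the max_len value below are placeholders; the script in section 7 wires this up with its own objects):

# Illustrative only; lit_model, smiles_tok and iupac_tok are assumed to exist.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
iupac_name = translate(
    model=lit_model,
    src_sentence="CCO",  # ethanol
    smiles_tokenizer=smiles_tok,
    iupac_tokenizer=iupac_tok,
    device=device,
    max_len=256,  # placeholder; the script uses hparams["max_len"]
    sos_idx=SOS_IDX,
    eos_idx=EOS_IDX,
    pad_idx=PAD_IDX,
)
print(iupac_name)  # a well-trained model is expected to print something like "ethanol"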
1027
+
1028
+ # --- 7. Main Execution Script (Minor updates for clarity) ---
1029
+ if __name__ == "__main__":
1030
+ pl.seed_everything(RANDOM_SEED, workers=True) # Seed everything for reproducibility
1031
+
1032
+ # --- Create Checkpoint Directory ---
1033
+ os.makedirs(CHECKPOINT_DIR, exist_ok=True)
1034
+
1035
+ # --- Load Data from CSV and Split ---
1036
+ # (Keep this data preparation step outside the Lightning Module)
1037
+ logging.info(f"Loading and splitting data from {INPUT_CSV_FILE}...")
1038
+ # (Re-using the data loading and splitting logic from the original script)
1039
+ try:
1040
+ # Load with dtype specification for potentially large files
1041
+ df = pd.read_csv(INPUT_CSV_FILE, dtype={"SMILES": str, "Systematic": str})
1042
+ logging.info(f"Initial rows loaded: {len(df)}")
1043
+ if "SMILES" not in df.columns:
1044
+ raise ValueError("CSV must contain 'SMILES' column.")
1045
+ if "Systematic" not in df.columns:
1046
+ raise ValueError("CSV must contain 'Systematic' (IUPAC name) column.")
1047
+ df.rename(columns={"Systematic": "IUPAC"}, inplace=True)
1048
+
1049
+ initial_rows = len(df)
1050
+ df.dropna(subset=["SMILES", "IUPAC"], inplace=True)
1051
+ rows_after_na = len(df)
1052
+ if initial_rows > rows_after_na:
1053
+ logging.info(
1054
+ f"Dropped {initial_rows - rows_after_na} rows with missing values."
1055
+ )
1056
+ # Strip whitespace and filter empty strings more efficiently
1057
+ df = df[df["SMILES"].str.strip().astype(bool)]
1058
+ df = df[df["IUPAC"].str.strip().astype(bool)]
1059
+ df["SMILES"] = df["SMILES"].str.strip()
1060
+ df["IUPAC"] = df["IUPAC"].str.strip()
1061
+ rows_after_empty = len(df)
1062
+ if rows_after_na > rows_after_empty:
1063
+ logging.info(
1064
+ f"Dropped {rows_after_na - rows_after_empty} rows with empty strings after stripping."
1065
+ )
1066
+
1067
+ smiles_data = df["SMILES"].tolist()
1068
+ iupac_data = df["IUPAC"].tolist()
1069
+ logging.info(f"Loaded {len(smiles_data)} valid pairs from CSV.")
1070
+ del df
1071
+ gc.collect() # Free memory
1072
+
1073
+ if len(smiles_data) < 10:
1074
+ raise ValueError(
1075
+ f"Not enough valid data ({len(smiles_data)}) for split. Need at least 10."
1076
+ )
1077
+
1078
+ train_smi, val_smi, train_iupac, val_iupac = train_test_split(
1079
+ smiles_data,
1080
+ iupac_data,
1081
+ test_size=VALIDATION_SPLIT,
1082
+ random_state=RANDOM_SEED,
1083
+ )
1084
+ logging.info(f"Split: {len(train_smi)} train, {len(val_smi)} validation.")
1085
+ del smiles_data, iupac_data
1086
+ gc.collect() # Free memory
1087
+
1088
+ logging.info("Writing split data to files...")
1089
+ with open(TRAIN_SMILES_FILE, "w", encoding="utf-8") as f:
1090
+ f.write("\n".join(train_smi))
1091
+ with open(TRAIN_IUPAC_FILE, "w", encoding="utf-8") as f:
1092
+ f.write("\n".join(train_iupac))
1093
+ with open(VAL_SMILES_FILE, "w", encoding="utf-8") as f:
1094
+ f.write("\n".join(val_smi))
1095
+ with open(VAL_IUPAC_FILE, "w", encoding="utf-8") as f:
1096
+ f.write("\n".join(val_iupac))
1097
+ logging.info(
1098
+ f"Split files written: {TRAIN_SMILES_FILE}, {TRAIN_IUPAC_FILE}, {VAL_SMILES_FILE}, {VAL_IUPAC_FILE}"
1099
+ )
1100
+ del train_smi, val_smi, train_iupac, val_iupac
1101
+ gc.collect() # Free memory
1102
+
1103
+ except FileNotFoundError:
1104
+ logging.error(f"Fatal error: Input CSV file not found at {INPUT_CSV_FILE}")
1105
+ exit(1)
1106
+ except ValueError as ve:
1107
+ logging.error(f"Fatal error during data preparation: {ve}")
1108
+ exit(1)
1109
+ except Exception as e:
1110
+ logging.error(f"Fatal error during data preparation: {e}", exc_info=True)
1111
+ exit(1)
1112
+ # --- End Data Preparation ---
1113
+
1114
+ # --- Initialize Tokenizers ---
1115
+ logging.info("Initializing Tokenizers...")
1116
+ # Ensure training files exist before attempting to train tokenizers
1117
+ if not os.path.exists(TRAIN_SMILES_FILE) or not os.path.exists(TRAIN_IUPAC_FILE):
1118
+ logging.error(
1119
+ f"Training files ({TRAIN_SMILES_FILE}, {TRAIN_IUPAC_FILE}) not found. Cannot train tokenizers."
1120
+ )
1121
+ exit(1)
1122
+
1123
+ smiles_tokenizer = get_smiles_tokenizer(
1124
+ train_files=[TRAIN_SMILES_FILE],
1125
+ vocab_size=SRC_VOCAB_SIZE_ESTIMATE,
1126
+ tokenizer_path=SMILES_TOKENIZER_FILE,
1127
+ )
1128
+ iupac_tokenizer = get_iupac_tokenizer(
1129
+ train_files=[TRAIN_IUPAC_FILE],
1130
+ vocab_size=TGT_VOCAB_SIZE_ESTIMATE,
1131
+ tokenizer_path=IUPAC_TOKENIZER_FILE,
1132
+ )
1133
+
1134
+ ACTUAL_SRC_VOCAB_SIZE = smiles_tokenizer.get_vocab_size()
1135
+ ACTUAL_TGT_VOCAB_SIZE = iupac_tokenizer.get_vocab_size()
1136
+ logging.info(f"Actual SMILES Vocab Size: {ACTUAL_SRC_VOCAB_SIZE}")
1137
+ logging.info(f"Actual IUPAC Vocab Size: {ACTUAL_TGT_VOCAB_SIZE}")
1138
+ # Update hparams with actual sizes (will be logged by WandbLogger)
1139
+ hparams["actual_src_vocab_size"] = ACTUAL_SRC_VOCAB_SIZE
1140
+ hparams["actual_tgt_vocab_size"] = ACTUAL_TGT_VOCAB_SIZE
1141
+
1142
+ # --- Setup WandB Logger ---
1143
+ # Ensure WANDB_ENTITY is set if required, otherwise it uses default
1144
+ if WANDB_ENTITY is None:
1145
+ logging.warning(
1146
+ "WANDB_ENTITY not set. WandB will log to your default entity. Set WANDB_ENTITY='your_username_or_team' to specify."
1147
+ )
1148
+
1149
+ wandb_logger = WandbLogger(
1150
+ project=WANDB_PROJECT,
1151
+ entity=WANDB_ENTITY, # Set your entity here or leave as None
1152
+ name=WANDB_RUN_NAME,
1153
+ config=hparams, # Log hyperparameters defined above
1154
+ # log_model='all' # Log model checkpoints to WandB (can consume significant storage)
1155
+ # log_model=True # Log best model checkpoint based on monitor
1156
+ )
1157
+
1158
+ # --- Initialize Datasets and DataLoaders ---
1159
+ logging.info("Creating Datasets and DataLoaders...")
1160
+ try:
1161
+ train_dataset = SmilesIupacDataset(TRAIN_SMILES_FILE, TRAIN_IUPAC_FILE)
1162
+ val_dataset = SmilesIupacDataset(VAL_SMILES_FILE, VAL_IUPAC_FILE)
1163
+ if len(train_dataset) == 0 or len(val_dataset) == 0:
1164
+ logging.error(
1165
+ "Training or validation dataset is empty. Check data splitting and file content."
1166
+ )
1167
+ exit(1)
1168
+ except Exception as e:
1169
+ logging.error(f"Error creating Datasets: {e}", exc_info=True)
1170
+ exit(1)
1171
+
1172
+ # Create partial function for collate_fn to pass tokenizers and params
1173
+ def collate_fn_partial(batch):
1174
+ return collate_fn(
1175
+ batch,
1176
+ smiles_tokenizer,
1177
+ iupac_tokenizer,
1178
+ PAD_IDX,
1179
+ SOS_IDX,
1180
+ EOS_IDX,
1181
+ hparams["max_len"],
1182
+ )
1183
+
1184
+ # Use persistent_workers=True if num_workers > 0 for efficiency, especially with DDP
1185
+ persistent_workers = NUM_WORKERS > 0 and STRATEGY == "ddp" # Recommended for DDP
1186
+
1187
+ train_dataloader = DataLoader(
1188
+ train_dataset,
1189
+ batch_size=BATCH_SIZE_PER_GPU,
1190
+ shuffle=True,
1191
+ collate_fn=collate_fn_partial,
1192
+ num_workers=NUM_WORKERS,
1193
+ pin_memory=True,
1194
+ persistent_workers=persistent_workers,
1195
+ drop_last=True,
1196
+ ) # Drop last incomplete batch in training for DDP consistency
1197
+ val_dataloader = DataLoader(
1198
+ val_dataset,
1199
+ batch_size=BATCH_SIZE_PER_GPU, # Use same batch size for validation
1200
+ shuffle=False,
1201
+ collate_fn=collate_fn_partial,
1202
+ num_workers=NUM_WORKERS,
1203
+ pin_memory=True,
1204
+ persistent_workers=persistent_workers,
1205
+ drop_last=False,
1206
+ ) # Keep all validation batches
1207
+
1208
+ # --- Initialize Model ---
1209
+ logging.info("Initializing Lightning Module...")
1210
+ # Pass hparams dictionary directly, PTL handles it via save_hyperparameters
1211
+ model = SmilesIupacLitModule(
1212
+ src_vocab_size=ACTUAL_SRC_VOCAB_SIZE,
1213
+ tgt_vocab_size=ACTUAL_TGT_VOCAB_SIZE,
1214
+ hparams_dict=hparams,
1215
+ )
1216
+
1217
+ # Optional: Log model topology to WandB (do this after model init, before training)
1218
+ # Note: watch can sometimes slow down training start, especially with large models
1219
+ # wandb_logger.watch(model, log='all', log_freq=100) # Log gradients and parameters
1220
+
1221
+ # --- Define Callbacks ---
1222
+ checkpoint_callback = ModelCheckpoint(
1223
+ dirpath=CHECKPOINT_DIR,
1224
+ filename=BEST_MODEL_FILENAME + "-{epoch:02d}-{val_loss:.4f}",
1225
+ save_top_k=1, # Save only the best model
1226
+ verbose=True,
1227
+ monitor="val_loss", # Monitor validation loss
1228
+ mode="min", # Save the model with the minimum validation loss
1229
+ save_last=True, # Optionally save the last checkpoint as well
1230
+ )
1231
+ early_stopping_callback = EarlyStopping(
1232
+ monitor="val_loss",
1233
+ patience=PATIENCE, # Number of epochs with no improvement after which training will be stopped
1234
+ verbose=True,
1235
+ mode="min",
1236
+ )
1237
+
1238
+ # --- Initialize PyTorch Lightning Trainer ---
1239
+ logging.info(
1240
+ f"Initializing PyTorch Lightning Trainer (GPUs={DEVICES}, Strategy='{STRATEGY}', Precision='{PRECISION}')..."
1241
+ )
1242
+ trainer = pl.Trainer(
1243
+ accelerator=ACCELERATOR,
1244
+ devices=DEVICES,
1245
+ strategy=STRATEGY,
1246
+ precision=PRECISION,
1247
+ max_epochs=NUM_EPOCHS,
1248
+ logger=wandb_logger, # Use WandbLogger
1249
+ callbacks=[checkpoint_callback, early_stopping_callback],
1250
+ gradient_clip_val=GRAD_CLIP_NORM, # Gradient clipping
1251
+ accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES, # Gradient accumulation
1252
+ log_every_n_steps=50, # How often to log metrics (steps across all GPUs)
1253
+ # deterministic=True, # Might slow down training, use for debugging reproducibility if needed
1254
+ # profiler="simple", # Optional: Add profiler ("simple", "advanced", "pytorch") for performance analysis
1255
+ # Checkpointing behavior is controlled by ModelCheckpoint callback
1256
+ # enable_checkpointing=True, # Default is True if callbacks has ModelCheckpoint
1257
+ )
1258
+
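One note on the number reported in the next log line: with gradient accumulation and data parallelism, the effective batch size is presumably the per-GPU batch size times the accumulation factor times the number of devices. Illustrative arithmetic only, since the hparams definition lives earlier in the file:

# e.g. BATCH_SIZE_PER_GPU = 32, ACCUMULATE_GRAD_BATCHES = 4, 2 GPUs
#   -> 32 * 4 * 2 = 256 samples contribute to each optimizer step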
1259
+ # --- Start Training ---
1260
+ logging.info(
1261
+ f"Starting training with Effective Batch Size: {hparams['effective_batch_size']}..."
1262
+ )
1263
+ start_time = time.time()
1264
+ try:
1265
+ trainer.fit(model, train_dataloader, val_dataloader)
1266
+ training_duration = time.time() - start_time
1267
+ logging.info(
1268
+ f"Training finished in {training_duration / 3600:.2f} hours ({training_duration:.2f} seconds)."
1269
+ )
1270
+
1271
+ # Log best model path and score
1272
+ best_path = checkpoint_callback.best_model_path
1273
+ best_score = checkpoint_callback.best_model_score # This is a tensor, get value
1274
+ if best_score is not None:
1275
+ logging.info(
1276
+ f"Best model checkpoint saved at: {best_path} with val_loss: {best_score.item():.4f}"
1277
+ )
1278
+ # Log best score to wandb summary
1279
+ wandb_logger.experiment.summary["best_val_loss"] = best_score.item()
1280
+ wandb_logger.experiment.summary["best_model_path"] = best_path
1281
+ else:
1282
+ logging.warning(
1283
+ "Could not retrieve best model score from checkpoint callback."
1284
+ )
1285
+
1286
+ except Exception as e:
1287
+ logging.error(f"Fatal error during training: {e}", exc_info=True)
1288
+ # Ensure wandb run is finished even on error
1289
+ if wandb.run is not None:
1290
+ wandb.finish(exit_code=1) # Mark as failed run
1291
+ exit(1)
1292
+
1293
+ # --- Load Best Model for Final Translation Examples ---
1294
+ best_model_path_to_load = checkpoint_callback.best_model_path
1295
+ logging.info(
1296
+ f"\nLoading best model from {best_model_path_to_load} for translation examples..."
1297
+ )
1298
+ final_model = None
1299
+ if best_model_path_to_load and os.path.exists(best_model_path_to_load):
1300
+ try:
1301
+ # Load the model using the Lightning checkpoint loading mechanism
1302
+ # Pass hparams_dict again in case it's needed and not perfectly saved/loaded
1303
+ final_model = SmilesIupacLitModule.load_from_checkpoint(
1304
+ best_model_path_to_load,
1305
+ # Provide necessary args again if they weren't saved in hparams properly
1306
+ # (though save_hyperparameters should handle this)
1307
+ src_vocab_size=ACTUAL_SRC_VOCAB_SIZE,
1308
+ tgt_vocab_size=ACTUAL_TGT_VOCAB_SIZE,
1309
+ hparams_dict=hparams, # Pass the original hparams
1310
+ )
1311
+ # Determine device for inference (use the first GPU if available)
1312
+ inference_device = torch.device(
1313
+ f"{ACCELERATOR}:0"
1314
+ if ACCELERATOR == "gpu" and torch.cuda.is_available()
1315
+ else "cpu"
1316
+ )
1317
+ final_model = final_model.to(inference_device)
1318
+ final_model.eval() # Set to evaluation mode
1319
+ final_model.freeze() # Freeze weights for inference
1320
+ logging.info(
1321
+ f"Best model loaded successfully to {inference_device} for final translation."
1322
+ )
1323
+ except Exception as e:
1324
+ logging.error(
1325
+ f"Error loading saved model from {best_model_path_to_load}: {e}",
1326
+ exc_info=True,
1327
+ )
1328
+ final_model = None # Ensure final_model is None if loading fails
1329
+ else:
1330
+ logging.error(
1331
+ f"Error: Best model checkpoint path not found or invalid: '{best_model_path_to_load}'. Cannot perform final translation."
1332
+ )
1333
+
1334
+ # --- Example Translation (using some validation samples) ---
1335
+ if final_model:
1336
+ logging.info("\n--- Example Translations (using validation data) ---")
1337
+ num_examples = 20 # Show more examples
1338
+ try:
1339
+ # Load validation samples directly from the files
1340
+ val_smi_examples = []
1341
+ val_iupac_examples = []
1342
+ if os.path.exists(VAL_SMILES_FILE) and os.path.exists(VAL_IUPAC_FILE):
1343
+ with (
1344
+ open(VAL_SMILES_FILE, "r", encoding="utf-8") as f_smi,
1345
+ open(VAL_IUPAC_FILE, "r", encoding="utf-8") as f_iupac,
1346
+ ):
1347
+ for i, (smi_line, iupac_line) in enumerate(zip(f_smi, f_iupac)):
1348
+ if i >= num_examples:
1349
+ break
1350
+ val_smi_examples.append(smi_line.strip())
1351
+ val_iupac_examples.append(iupac_line.strip())
1352
+ else:
1353
+ logging.warning(
1354
+ f"Validation files ({VAL_SMILES_FILE}, {VAL_IUPAC_FILE}) not found. Cannot show examples."
1355
+ )
1356
+
1357
+ if len(val_smi_examples) > 0:
1358
+ print("\n" + "=" * 40)
1359
+ print(
1360
+ f"Example Translations (First {len(val_smi_examples)} Validation Samples)"
1361
+ )
1362
+ print("=" * 40)
1363
+ # Use the device the model was loaded onto
1364
+ inference_device = next(final_model.parameters()).device
1365
+ translation_examples = [] # For potential logging to wandb
1366
+ for i in range(len(val_smi_examples)):
1367
+ smi = val_smi_examples[i]
1368
+ true_iupac = val_iupac_examples[i]
1369
+ predicted_iupac = translate(
1370
+ model=final_model, # Use the loaded best model
1371
+ src_sentence=smi,
1372
+ smiles_tokenizer=smiles_tokenizer,
1373
+ iupac_tokenizer=iupac_tokenizer,
1374
+ device=inference_device, # Use model's device
1375
+ max_len=hparams["max_len"],
1376
+ sos_idx=SOS_IDX,
1377
+ eos_idx=EOS_IDX,
1378
+ pad_idx=PAD_IDX,
1379
+ )
1380
+ print(f"\nExample {i + 1}:")
1381
+ print(f" SMILES: {smi}")
1382
+ print(f" True IUPAC: {true_iupac}")
1383
+ print(f" Predicted IUPAC: {predicted_iupac}")
1384
+ print("-" * 30)
1385
+ # Prepare data for wandb table
1386
+ translation_examples.append([smi, true_iupac, predicted_iupac])
1387
+
1388
+ print("=" * 40 + "\n")
1389
+
1390
+ # Log examples to a WandB Table
1391
+ try:
1392
+ columns = ["SMILES", "True IUPAC", "Predicted IUPAC"]
1393
+ wandb_table = wandb.Table(
1394
+ data=translation_examples, columns=columns
1395
+ )
1396
+ wandb_logger.experiment.log(
1397
+ {"validation_translations": wandb_table}
1398
+ )
1399
+ logging.info("Logged translation examples to WandB Table.")
1400
+ except Exception as wb_err:
1401
+ logging.error(
1402
+ f"Failed to log translation examples to WandB: {wb_err}"
1403
+ )
1404
+
1405
+ else:
1406
+ logging.warning("Could not load validation samples for examples.")
1407
+ except Exception as e:
1408
+ logging.error(f"Error during example translation phase: {e}", exc_info=True)
1409
+ else:
1410
+ logging.warning(
1411
+ "Skipping final translation examples as the best model could not be loaded."
1412
+ )
1413
+
1414
+ # --- Finish WandB Run ---
1415
+ if wandb.run is not None:
1416
+ wandb.finish()
1417
+ logging.info("WandB run finished.")
1418
+ else:
1419
+ logging.info("No active WandB run to finish.")
1420
+
1421
+ logging.info("Script finished.")
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torch
2
+ pytorch_lightning
3
+ tokenizers
4
+ transformers
5
+ gradio
6
+ huggingface_hub
7
+ pandas
8
+ scikit-learn
9
+ wandb