Upload folder using huggingface_hub
app.py CHANGED
@@ -1,6 +1,7 @@
 # app.py
 import gradio as gr
 import torch
+
 # import torch.nn.functional as F # No longer needed for greedy decode directly
 import pytorch_lightning as pl
 import os
@@ -28,6 +29,7 @@ logging.basicConfig(
 # --- Load Helper Code (Only Model Definition and Mask Function Needed) ---
 try:
     from enhanced_trainer import SmilesIupacLitModule, generate_square_subsequent_mask
+
     logging.info("Successfully imported from enhanced_trainer.py.")
 except ImportError as e:
     logging.error(
@@ -80,10 +82,12 @@ def greedy_decode(
 
         # --- Initialize Target Sequence ---
         # Start with the SOS token
-        ys = torch.ones(1, 1, dtype=torch.long, device=device).fill_(sos_idx)  # [1, 1]
+        ys = torch.ones(1, 1, dtype=torch.long, device=device).fill_(
+            sos_idx
+        )  # [1, 1]
 
         # --- Decoding Loop ---
-        for _ in range(max_len - 1):
+        for _ in range(max_len - 1):  # Max length limit
             tgt_seq_len = ys.shape[1]
             tgt_mask = generate_square_subsequent_mask(tgt_seq_len, device).to(
                 device
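For reference, `generate_square_subsequent_mask` (imported from `enhanced_trainer` above) supplies the causal mask used here. Its exact body lives in that repo; the sketch below is only the conventional PyTorch recipe such a helper usually follows, not the repo's code:

```python
import torch

def square_subsequent_mask(size: int, device: torch.device) -> torch.Tensor:
    # True above the diagonal marks positions a decoding step must NOT attend to;
    # converted to additive form: 0.0 where attention is allowed, -inf where masked.
    blocked = torch.triu(torch.ones(size, size, device=device), diagonal=1).bool()
    return torch.zeros(size, size, device=device).masked_fill(blocked, float("-inf"))
```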
@@ -104,34 +108,43 @@ def greedy_decode(
 
             # Get logits for the *next* token prediction
             next_token_logits = transformer_model.generator(
-                decoder_output[:, -1, :]  # Use output corresponding to the last input token
+                decoder_output[
+                    :, -1, :
+                ]  # Use output corresponding to the last input token
             )  # [1, tgt_vocab_size]
 
             # Find the most likely next token (greedy choice)
             # prob = F.log_softmax(next_token_logits, dim=-1) # Not needed for argmax
             # _, next_word_id_tensor = torch.max(prob, dim=1)
-            next_word_id_tensor = torch.argmax(next_token_logits, dim=1)
+            next_word_id_tensor = torch.argmax(next_token_logits, dim=1)  # [1]
            next_word_id = next_word_id_tensor.item()
 
             # Append the chosen token to the sequence
             ys = torch.cat(
-                [
-                    ys, torch.ones(1, 1, dtype=torch.long, device=device).fill_(next_word_id)
-                ], dim=1)  # [1, current_len + 1]
+                [
+                    ys,
+                    torch.ones(1, 1, dtype=torch.long, device=device).fill_(
+                        next_word_id
+                    ),
+                ],
+                dim=1,
+            )  # [1, current_len + 1]
 
             # Stop if EOS token is generated
             if next_word_id == eos_idx:
                 break
 
         # Return the generated sequence (excluding the initial SOS token)
-        return ys[:, 1:]
+        return ys[:, 1:]  # Shape [1, generated_len]
 
     except RuntimeError as e:
         logging.error(f"Runtime error during greedy decode: {e}", exc_info=True)
         if "CUDA out of memory" in str(e) and device.type == "cuda":
             gc.collect()
             torch.cuda.empty_cache()
-        return torch.empty((1, 0), dtype=torch.long, device=device)
+        return torch.empty(
+            (1, 0), dtype=torch.long, device=device
+        )  # Return empty tensor on error
     except Exception as e:
         logging.error(f"Unexpected error during greedy decode: {e}", exc_info=True)
         return torch.empty((1, 0), dtype=torch.long, device=device)
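The loop above is a textbook greedy decoder: score, take the argmax, append, stop at EOS. The same control flow in a self-contained sketch, with a stub `next_logits` standing in for the model's decoder-plus-generator (everything here is illustrative, not the app's code):

```python
import torch

SOS, EOS, VOCAB, MAX_LEN = 1, 2, 10, 8

def next_logits(ys: torch.Tensor) -> torch.Tensor:
    # Stub scorer; a real model would run its decoder on ys here.
    torch.manual_seed(ys.shape[1])
    return torch.randn(1, VOCAB)

ys = torch.full((1, 1), SOS, dtype=torch.long)
for _ in range(MAX_LEN - 1):
    next_id = int(torch.argmax(next_logits(ys), dim=1).item())
    ys = torch.cat([ys, torch.tensor([[next_id]])], dim=1)
    if next_id == EOS:  # stop as soon as EOS is chosen
        break

print(ys[:, 1:])  # generated ids with the SOS stripped, as greedy_decode returns
```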
@@ -148,7 +161,7 @@ def translate(
     sos_idx: int,
     eos_idx: int,
     pad_idx: int,
-) -> str:
+) -> str:  # Returns a single string
     """
     Translates a single SMILES string using greedy decoding.
     """
@@ -157,7 +170,9 @@ def translate(
     # --- Tokenize Source ---
     try:
         # Ensure tokenizer has truncation/padding configured if needed, or handle manually
-        smiles_tokenizer.enable_truncation(max_length=max_len)  # Use max_len for source truncation too
+        smiles_tokenizer.enable_truncation(
+            max_length=max_len
+        )  # Use max_len for source truncation too
         src_encoded = smiles_tokenizer.encode(src_sentence)
         if not src_encoded or not src_encoded.ids:
             logging.warning(f"Encoding failed or empty for SMILES: {src_sentence}")
@@ -194,15 +209,15 @@ def translate(
 
     # --- Decode Generated Tokens ---
     if tgt_tokens_tensor is None or tgt_tokens_tensor.numel() == 0:
-        logging.warning(f"Greedy decode returned empty tensor for SMILES: {src_sentence}")
+        logging.warning(
+            f"Greedy decode returned empty tensor for SMILES: {src_sentence}"
+        )
         return "[Decoding Error - Empty Output]"
 
     tgt_tokens = tgt_tokens_tensor.flatten().cpu().numpy().tolist()
     try:
         # Decode using the target tokenizer, skipping special tokens
-        translation = iupac_tokenizer.decode(
-            tgt_tokens, skip_special_tokens=True
-        )
+        translation = iupac_tokenizer.decode(tgt_tokens, skip_special_tokens=True)
         return translation
     except Exception as e:
         logging.error(
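Both tokenizers behave like Hugging Face `tokenizers.Tokenizer` objects: `enable_truncation`, `encode(...).ids`, and `decode(..., skip_special_tokens=True)` are that library's API. A minimal round trip, assuming a trained tokenizer file (the file name is hypothetical):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("smiles_tokenizer.json")  # hypothetical file name
tok.enable_truncation(max_length=128)  # mirrors the max_len guard above

ids = tok.encode("CCO").ids                       # token ids fed to the model
text = tok.decode(ids, skip_special_tokens=True)  # back to plain text
```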
@@ -275,23 +290,34 @@ def load_model_and_tokenizers():
         logging.info("Configuration loaded.")
         # --- Validate essential config keys ---
         required_keys = [
-            "src_vocab_size",
-            "tgt_vocab_size",
-            "emb_size",
-            "
-            "
+            "src_vocab_size",  # Use the key saved in config
+            "tgt_vocab_size",  # Use the key saved in config
+            "emb_size",
+            "nhead",
+            "ffn_hid_dim",
+            "num_encoder_layers",
+            "num_decoder_layers",
+            "dropout",
+            "max_len",
+            "pad_token_id",
+            "bos_token_id",
+            "eos_token_id",
         ]
         # Remap if needed (example shown, adjust if your keys differ)
         config_key_mapping = {
-            "src_vocab_size": config.get(
-                "src_vocab_size", config.get("actual_src_vocab_size")),
+            "src_vocab_size": config.get(
+                "src_vocab_size", config.get("actual_src_vocab_size")
+            ),
+            "tgt_vocab_size": config.get(
+                "tgt_vocab_size", config.get("actual_tgt_vocab_size")
+            ),
             # Add other mappings if necessary
         }
         config.update(config_key_mapping)
 
         missing_keys = [key for key in required_keys if config.get(key) is None]
         if missing_keys:
-
+            raise ValueError(
                 f"Config file '{CONFIG_FILENAME}' is missing required keys: {missing_keys}. "
                 f"Ensure these were saved in the hyperparameters during training."
             )
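The pattern above (remap alternate key names, then fail fast on anything missing) is easy to exercise in isolation. A condensed sketch using the same key names the diff references; the local `config.json` path is an assumption:

```python
import json

with open("config.json") as fh:  # hypothetical local copy of the Space's config
    config = json.load(fh)

# Prefer the canonical key, fall back to the alternate name saved at train time.
config["src_vocab_size"] = config.get("src_vocab_size", config.get("actual_src_vocab_size"))
config["tgt_vocab_size"] = config.get("tgt_vocab_size", config.get("actual_tgt_vocab_size"))

required = ["src_vocab_size", "tgt_vocab_size", "emb_size", "max_len", "pad_token_id"]
missing = [k for k in required if config.get(k) is None]
if missing:
    raise ValueError(f"config.json is missing required keys: {missing}")
```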
@@ -307,7 +333,9 @@ def load_model_and_tokenizers():
         raise gr.Error(f"Config Error: Config file '{CONFIG_FILENAME}' not found.")
     except json.JSONDecodeError as e:
         logging.error(f"Error decoding JSON from config file {config_path}: {e}")
-        raise gr.Error(f"Config Error: Could not parse '{CONFIG_FILENAME}'. Error: {e}")
+        raise gr.Error(
+            f"Config Error: Could not parse '{CONFIG_FILENAME}'. Error: {e}"
+        )
     except ValueError as e:
         logging.error(f"Config validation error: {e}")
         raise gr.Error(f"Config Error: {e}")
@@ -329,17 +357,26 @@ def load_model_and_tokenizers():
         unk_token = "<unk>"
         issues = []
         # ... (keep the validation checks as in the original code) ...
-        if smiles_tokenizer.token_to_id(pad_token) != config["pad_token_id"]:
-
-        if
-
-        if iupac_tokenizer.token_to_id(
-
-        if
+        if smiles_tokenizer.token_to_id(pad_token) != config["pad_token_id"]:
+            issues.append(f"SMILES PAD ID mismatch")
+        if smiles_tokenizer.token_to_id(unk_token) is None:
+            issues.append("SMILES UNK token not found")
+        if iupac_tokenizer.token_to_id(pad_token) != config["pad_token_id"]:
+            issues.append(f"IUPAC PAD ID mismatch")
+        if iupac_tokenizer.token_to_id(sos_token) != config["bos_token_id"]:
+            issues.append(f"IUPAC SOS ID mismatch")
+        if iupac_tokenizer.token_to_id(eos_token) != config["eos_token_id"]:
+            issues.append(f"IUPAC EOS ID mismatch")
+        if iupac_tokenizer.token_to_id(unk_token) is None:
+            issues.append("IUPAC UNK token not found")
+        if issues:
+            logging.warning("Tokenizer validation issues: " + "; ".join(issues))
 
     except Exception as e:
         logging.error(f"Failed to load tokenizers: {e}", exc_info=True)
-        raise gr.Error(f"Tokenizer Error: Could not load tokenizers. Check logs. Error: {e}")
+        raise gr.Error(
+            f"Tokenizer Error: Could not load tokenizers. Check logs. Error: {e}"
+        )
 
     # Load model
     logging.info("Loading model from checkpoint...")
@@ -352,9 +389,9 @@ def load_model_and_tokenizers():
         tgt_vocab_size=config["tgt_vocab_size"],
         # Pass the whole config dict, load_from_checkpoint will pick what it needs
         # if the keys match the __init__ args or are in hparams
-        **config,
+        # Drop the keys already passed explicitly so they are not supplied twice
+        **{k: v for k, v in config.items()
+           if k not in ("src_vocab_size", "tgt_vocab_size")},
         map_location=device,
-        strict=True,
+        strict=True,  # Start strict, set to False if encountering key errors
     )
 
     model.to(device)
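The filtering above matters because Python rejects a keyword supplied both explicitly and via `**`: the call would fail with `TypeError: ... got multiple values for keyword argument` before `load_from_checkpoint` ever ran. A two-line demonstration of the pitfall, with generic names:

```python
def load(a, **kwargs):
    return a

cfg = {"a": 1, "b": 2}
# load(a=cfg["a"], **cfg)  # TypeError: got multiple values for keyword argument 'a'
load(a=cfg["a"], **{k: v for k, v in cfg.items() if k != "a"})  # OK
```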
@@ -366,24 +403,36 @@ def load_model_and_tokenizers():
 
     except FileNotFoundError:
         logging.error(f"Checkpoint file not found: {checkpoint_path}")
-        raise gr.Error(f"Model Error: Checkpoint file '{CHECKPOINT_FILENAME}' not found.")
+        raise gr.Error(
+            f"Model Error: Checkpoint file '{CHECKPOINT_FILENAME}' not found."
+        )
     except Exception as e:
-        logging.error(f"Error loading model checkpoint {checkpoint_path}: {e}", exc_info=True)
+        logging.error(
+            f"Error loading model checkpoint {checkpoint_path}: {e}", exc_info=True
+        )
         if "size mismatch" in str(e):
-            error_detail =
+            error_detail = f"Potential size mismatch. Check vocab sizes in config.json (src={config.get('src_vocab_size')}, tgt={config.get('tgt_vocab_size')}) vs checkpoint."
             logging.error(error_detail)
             raise gr.Error(f"Model Error: {error_detail} Original error: {e}")
         elif "memory" in str(e).lower():
             logging.warning("Potential OOM error during model loading.")
-            gc.collect()
-
+            gc.collect()
+            torch.cuda.empty_cache() if device.type == "cuda" else None
+            raise gr.Error(
+                f"Model Error: OOM loading model. Check Space resources. Error: {e}"
+            )
         else:
-            raise gr.Error(f"Model Error: Failed to load checkpoint. Check logs. Error: {e}")
+            raise gr.Error(
+                f"Model Error: Failed to load checkpoint. Check logs. Error: {e}"
+            )
 
-    except gr.Error:
+    except gr.Error:
+        raise
     except Exception as e:
         logging.error(f"Unexpected error during loading: {e}", exc_info=True)
-        raise gr.Error(
+        raise gr.Error(
+            f"Initialization Error: Unexpected error. Check logs. Error: {e}"
+        )
 
 
 # --- Inference Function for Gradio (Simplified) ---
@@ -396,11 +445,11 @@ def predict_iupac(smiles_string):
     if not all([model, smiles_tokenizer, iupac_tokenizer, device, config]):
         error_msg = "Error: Model or tokenizers not loaded properly. App initialization might have failed. Check Space logs."
         logging.error(error_msg)
-        return f"Error: {error_msg}"
+        return f"Error: {error_msg}"  # Return single error string
 
     if not smiles_string or not smiles_string.strip():
         error_msg = "Error: Please enter a valid SMILES string."
-        return f"Error: {error_msg}"
+        return f"Error: {error_msg}"  # Return single error string
 
     smiles_input = smiles_string.strip()
+    try:
+        smiles_input = CanonSmiles(smiles_input)  # canonicalize the input SMILES
+    except Exception:
+        logging.warning(f"Could not canonicalize SMILES, using raw input: {smiles_input}")
 
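The canonicalization above is guarded because `CanonSmiles` (RDKit's parse-and-reserialize canonicalizer) fails on unparseable input rather than returning the string unchanged. A quick illustration:

```python
from rdkit.Chem import CanonSmiles

print(CanonSmiles("OCC"))  # -> "CCO": same molecule, canonical spelling
```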
@@ -411,7 +460,7 @@ def predict_iupac(smiles_string):
         pad_idx = config["pad_token_id"]
         gen_max_len = config["max_len"]
 
-        predicted_name = translate(
+        predicted_name = translate(  # Returns a single string now
             model=model,
             src_sentence=smiles_input,
             smiles_tokenizer=smiles_tokenizer,
|
|
425 |
logging.info(f"Prediction returned: {predicted_name}")
|
426 |
|
427 |
# --- Format Output ---
|
428 |
-
if "[Error]" in predicted_name:
|
429 |
-
|
|
|
|
|
430 |
elif not predicted_name:
|
431 |
-
|
432 |
else:
|
433 |
output_text = (
|
434 |
f"Input SMILES: {smiles_input}\n\n"
|
@@ -439,12 +490,12 @@ def predict_iupac(smiles_string):
 
     except RuntimeError as e:
         logging.error(f"Runtime error during translation: {e}", exc_info=True)
-        return f"Error: {error_msg}"
+        error_msg = f"Runtime Error during translation: {e}"
+        return f"Error: {error_msg}"  # Return single error string
 
     except Exception as e:
         logging.error(f"Unexpected error during translation: {e}", exc_info=True)
         error_msg = f"Unexpected Error during translation: {e}"
-        return f"Error: {error_msg}"
+        return f"Error: {error_msg}"  # Return single error string
 
 
 # --- Load Model on App Start ---
@@ -452,7 +503,7 @@ try:
     load_model_and_tokenizers()
 except gr.Error as ge:
     logging.error(f"Gradio Initialization Error: {ge}")
-    pass
+    pass  # Allow Gradio to potentially start with an error message
 except Exception as e:
     logging.error(f"Critical error during initial model loading: {e}", exc_info=True)
     # Optionally raise gr.Error here too
@@ -463,7 +514,7 @@ title = "SMILES to IUPAC Name Translator (Greedy Decoding)"
 description = f"""
 Enter a SMILES string to translate it into its IUPAC chemical name using a Transformer model ({MODEL_REPO_ID}) trained via PyTorch Lightning.
 Translation uses **greedy decoding** (picks the most likely next word at each step).
-**Note:** Model loaded on **{str(device).upper() if device else
+**Note:** Model loaded on **{str(device).upper() if device else "N/A"}**. Performance may vary. Check `config.json` in the repo for model details.
 """
 
 # Define examples - remove beam search parameters
@@ -481,16 +532,19 @@ smiles_input = gr.Textbox(
     placeholder="Enter SMILES string here (e.g., CCO for Ethanol)",
     lines=1,
 )
-
+from rdkit.Chem import CanonSmiles  # used inside predict_iupac to canonicalize input
 # Output component
 output_text = gr.Textbox(
-    label="Predicted IUPAC Name",
+    label="Predicted IUPAC Name",
+    lines=3,  # Reduced lines slightly
+    show_copy_button=True,
 )
 
 # Create the interface instance
 iface = gr.Interface(
     fn=predict_iupac,  # The function to call
-    inputs=smiles_input,
+    inputs=smiles_input,  # Single input component
     outputs=output_text,  # Output component
     title=title,
     description=description,
|
|
505 |
|
506 |
# --- Launch the App ---
|
507 |
if __name__ == "__main__":
|
508 |
-
iface.launch()
|
|
|