Upload 111 files

This view is limited to 50 files because it contains too many changes. See raw diff.
- chemprop-updated/chemprop/__init__.py +5 -0
- chemprop-updated/chemprop/__pycache__/__init__.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/args.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/constants.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/hyperopt_utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/hyperparameter_optimization.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/interpret.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/multitask_utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/nn_utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/rdkit.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/sklearn_predict.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/sklearn_train.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/spectra_utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/__pycache__/utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/cli/common.py +216 -0
- chemprop-updated/chemprop/cli/conf.py +9 -0
- chemprop-updated/chemprop/cli/convert.py +55 -0
- chemprop-updated/chemprop/cli/fingerprint.py +185 -0
- chemprop-updated/chemprop/cli/hpopt.py +540 -0
- chemprop-updated/chemprop/cli/main.py +85 -0
- chemprop-updated/chemprop/cli/predict.py +447 -0
- chemprop-updated/chemprop/cli/train.py +1343 -0
- chemprop-updated/chemprop/cli/utils/__init__.py +30 -0
- chemprop-updated/chemprop/cli/utils/actions.py +19 -0
- chemprop-updated/chemprop/cli/utils/args.py +34 -0
- chemprop-updated/chemprop/cli/utils/command.py +24 -0
- chemprop-updated/chemprop/cli/utils/parsing.py +457 -0
- chemprop-updated/chemprop/cli/utils/utils.py +31 -0
- chemprop-updated/chemprop/conf.py +6 -0
- chemprop-updated/chemprop/data/__init__.py +41 -0
- chemprop-updated/chemprop/data/__pycache__/__init__.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/data/__pycache__/data.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/data/__pycache__/scaffold.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/data/__pycache__/scaler.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/data/__pycache__/utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/data/collate.py +123 -0
- chemprop-updated/chemprop/data/dataloader.py +71 -0
- chemprop-updated/chemprop/data/datapoints.py +150 -0
- chemprop-updated/chemprop/data/datasets.py +475 -0
- chemprop-updated/chemprop/data/molgraph.py +17 -0
- chemprop-updated/chemprop/data/samplers.py +66 -0
- chemprop-updated/chemprop/data/splitting.py +225 -0
- chemprop-updated/chemprop/exceptions.py +12 -0
- chemprop-updated/chemprop/features/__pycache__/__init__.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/features/__pycache__/features_generators.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/features/__pycache__/featurization.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/features/__pycache__/utils.cpython-37.pyc +0 -0
- chemprop-updated/chemprop/featurizers/__init__.py +52 -0
- chemprop-updated/chemprop/featurizers/atom.py +281 -0
- chemprop-updated/chemprop/featurizers/base.py +30 -0
chemprop-updated/chemprop/__init__.py
ADDED
@@ -0,0 +1,5 @@
from . import data, exceptions, featurizers, models, nn, schedulers, utils

__all__ = ["data", "featurizers", "models", "nn", "utils", "exceptions", "schedulers"]

__version__ = "2.1.2"
chemprop-updated/chemprop/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (743 Bytes).

chemprop-updated/chemprop/__pycache__/args.cpython-37.pyc
ADDED
Binary file (33.7 kB).

chemprop-updated/chemprop/__pycache__/constants.cpython-37.pyc
ADDED
Binary file (430 Bytes).

chemprop-updated/chemprop/__pycache__/hyperopt_utils.cpython-37.pyc
ADDED
Binary file (11.1 kB).

chemprop-updated/chemprop/__pycache__/hyperparameter_optimization.cpython-37.pyc
ADDED
Binary file (6.15 kB).

chemprop-updated/chemprop/__pycache__/interpret.cpython-37.pyc
ADDED
Binary file (14.2 kB).

chemprop-updated/chemprop/__pycache__/multitask_utils.cpython-37.pyc
ADDED
Binary file (3.12 kB).

chemprop-updated/chemprop/__pycache__/nn_utils.cpython-37.pyc
ADDED
Binary file (8.13 kB).

chemprop-updated/chemprop/__pycache__/rdkit.cpython-37.pyc
ADDED
Binary file (1.43 kB).

chemprop-updated/chemprop/__pycache__/sklearn_predict.cpython-37.pyc
ADDED
Binary file (2.82 kB).

chemprop-updated/chemprop/__pycache__/sklearn_train.cpython-37.pyc
ADDED
Binary file (11.4 kB).

chemprop-updated/chemprop/__pycache__/spectra_utils.cpython-37.pyc
ADDED
Binary file (5.1 kB).

chemprop-updated/chemprop/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (26.5 kB).
chemprop-updated/chemprop/cli/common.py
ADDED
@@ -0,0 +1,216 @@
from argparse import ArgumentError, ArgumentParser, Namespace
import logging
from pathlib import Path

from chemprop.cli.utils import LookupAction
from chemprop.cli.utils.args import uppercase
from chemprop.featurizers import AtomFeatureMode, MoleculeFeaturizerRegistry, RxnMode

logger = logging.getLogger(__name__)


def add_common_args(parser: ArgumentParser) -> ArgumentParser:
    data_args = parser.add_argument_group("Shared input data args")
    data_args.add_argument(
        "-s",
        "--smiles-columns",
        nargs="+",
        help="Column names in the input CSV containing SMILES strings (uses the 0th column by default)",
    )
    data_args.add_argument(
        "-r",
        "--reaction-columns",
        nargs="+",
        help="Column names in the input CSV containing reaction SMILES in the format ``REACTANT>AGENT>PRODUCT``, where 'AGENT' is optional",
    )
    data_args.add_argument(
        "--no-header-row",
        action="store_true",
        help="Turn off using the first row in the input CSV as column names",
    )

    dataloader_args = parser.add_argument_group("Dataloader args")
    dataloader_args.add_argument(
        "-n",
        "--num-workers",
        type=int,
        default=0,
        help="""Number of workers for parallel data loading where 0 means sequential
(Warning: setting ``num_workers`` to a value greater than 0 can cause hangs on Windows and MacOS)""",
    )
    dataloader_args.add_argument("-b", "--batch-size", type=int, default=64, help="Batch size")

    parser.add_argument(
        "--accelerator", default="auto", help="Passed directly to the lightning ``Trainer()``"
    )
    parser.add_argument(
        "--devices",
        default="auto",
        help="Passed directly to the lightning ``Trainer()`` (must be a single string of comma separated devices, e.g. '1, 2' if specifying multiple devices)",
    )

    featurization_args = parser.add_argument_group("Featurization args")
    featurization_args.add_argument(
        "--rxn-mode",
        "--reaction-mode",
        type=uppercase,
        default="REAC_DIFF",
        choices=list(RxnMode.keys()),
        help="""Choices for construction of atom and bond features for reactions (case insensitive):

- ``REAC_PROD``: concatenates the reactants feature with the products feature
- ``REAC_DIFF``: concatenates the reactants feature with the difference in features between reactants and products (Default)
- ``PROD_DIFF``: concatenates the products feature with the difference in features between reactants and products
- ``REAC_PROD_BALANCE``: concatenates the reactants feature with the products feature, balances imbalanced reactions
- ``REAC_DIFF_BALANCE``: concatenates the reactants feature with the difference in features between reactants and products, balances imbalanced reactions
- ``PROD_DIFF_BALANCE``: concatenates the products feature with the difference in features between reactants and products, balances imbalanced reactions""",
    )
    # TODO: Update documentation for multi_hot_atom_featurizer_mode
    featurization_args.add_argument(
        "--multi-hot-atom-featurizer-mode",
        type=uppercase,
        default="V2",
        choices=list(AtomFeatureMode.keys()),
        help="""Choices for multi-hot atom featurization scheme. This will affect both non-reaction and reaction featurization (case insensitive):

- ``V1``: Corresponds to the original configuration employed in Chemprop V1
- ``V2``: Tailored for a broad range of molecules, this configuration encompasses all elements in the first four rows of the periodic table, along with iodine. It is the default in Chemprop V2.
- ``ORGANIC``: Designed specifically for organic molecules in drug research and development; includes a subset of elements most common in organic chemistry, including H, B, C, N, O, F, Si, P, S, Cl, Br, and I.
- ``RIGR``: Modified V2 (default) featurizer using only the resonance-invariant atom and bond features.""",
    )
    featurization_args.add_argument(
        "--keep-h",
        action="store_true",
        help="Whether hydrogens explicitly specified in input should be kept in the mol graph",
    )
    featurization_args.add_argument(
        "--add-h", action="store_true", help="Whether hydrogens should be added to the mol graph"
    )
    data_args.add_argument(
        "--ignore-chirality",
        action="store_true",
        help="Ignore chirality information in the input SMILES",
    )
    featurization_args.add_argument(
        "--molecule-featurizers",
        "--features-generators",
        nargs="+",
        action=LookupAction(MoleculeFeaturizerRegistry),
        help="Method(s) of generating molecule features to use as extra descriptors",
    )
    # TODO: add in v2.1 to deprecate features-generators and then remove in v2.2
    # featurization_args.add_argument(
    #     "--features-generators", nargs="+", help="Renamed to `--molecule-featurizers`."
    # )
    featurization_args.add_argument(
        "--descriptors-path",
        type=Path,
        help="Path to extra descriptors to concatenate to learned representation",
    )
    # TODO: Add in v2.1
    # featurization_args.add_argument(
    #     "--phase-features-path",
    #     help="Path to features used to indicate the phase of the data in one-hot vector form. Used in spectra datatype.",
    # )
    featurization_args.add_argument(
        "--no-descriptor-scaling", action="store_true", help="Turn off extra descriptor scaling"
    )
    featurization_args.add_argument(
        "--no-atom-feature-scaling", action="store_true", help="Turn off extra atom feature scaling"
    )
    featurization_args.add_argument(
        "--no-atom-descriptor-scaling",
        action="store_true",
        help="Turn off extra atom descriptor scaling",
    )
    featurization_args.add_argument(
        "--no-bond-feature-scaling", action="store_true", help="Turn off extra bond feature scaling"
    )
    featurization_args.add_argument(
        "--atom-features-path",
        nargs="+",
        action="append",
        help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional atom features to supply before message passing (e.g., ``--atom-features-path 0 /path/to/features_0.npz`` indicates that the features at the given path should be supplied to the 0-th component). To supply additional features for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--atom-features-path [...] --atom-features-path [...]``).",
    )
    featurization_args.add_argument(
        "--atom-descriptors-path",
        nargs="+",
        action="append",
        help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional atom descriptors to supply after message passing (e.g., ``--atom-descriptors-path 0 /path/to/descriptors_0.npz`` indicates that the descriptors at the given path should be supplied to the 0-th component). To supply additional descriptors for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--atom-descriptors-path [...] --atom-descriptors-path [...]``).",
    )
    featurization_args.add_argument(
        "--bond-features-path",
        nargs="+",
        action="append",
        help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional bond features to supply before message passing (e.g., ``--bond-features-path 0 /path/to/features_0.npz`` indicates that the features at the given path should be supplied to the 0-th component). To supply additional features for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--bond-features-path [...] --bond-features-path [...]``).",
    )
    # TODO: Add in v2.2
    # parser.add_argument(
    #     "--constraints-path",
    #     help="Path to constraints applied to atomic/bond properties prediction.",
    # )

    return parser


def process_common_args(args: Namespace) -> Namespace:
    # TODO: add in v2.1 to deprecate features-generators and then remove in v2.2
    # if args.features_generators is not None:
    #     raise ArgumentError(
    #         argument=None,
    #         message="`--features-generators` has been renamed to `--molecule-featurizers`.",
    #     )

    for key in ["atom_features_path", "atom_descriptors_path", "bond_features_path"]:
        inds_paths = getattr(args, key)

        if not inds_paths:
            continue

        ind_path_dict = {}

        for ind_path in inds_paths:
            if len(ind_path) > 2:
                raise ArgumentError(
                    argument=None,
                    message="Too many arguments were given for atom features/descriptors or bond features. It should be either a two-tuple of molecule index and a path, or a single path (assumed to be the 0-th molecule).",
                )

            if len(ind_path) == 1:
                ind = 0
                path = ind_path[0]
            else:
                ind, path = ind_path

            if ind_path_dict.get(int(ind), None):
                raise ArgumentError(
                    argument=None,
                    message=f"Duplicate atom features/descriptors or bond features given for molecule index {ind}",
                )

            ind_path_dict[int(ind)] = Path(path)

        setattr(args, key, ind_path_dict)

    return args


def validate_common_args(args):
    pass


def find_models(model_paths: list[Path]):
    collected_model_paths = []

    for model_path in model_paths:
        if model_path.suffix in [".ckpt", ".pt"]:
            collected_model_paths.append(model_path)
        elif model_path.is_dir():
            collected_model_paths.extend(list(model_path.rglob("*.pt")))
        else:
            raise ArgumentError(
                argument=None,
                message=f"Expected a .ckpt or .pt file, or a directory. Got {model_path}",
            )

    return collected_model_paths
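A minimal sketch of how the helpers in this file compose, assuming chemprop is installed so that `chemprop.cli.common` is importable; the file name `feats_0.npz` is hypothetical:

# `--atom-features-path 0 feats_0.npz` is parsed as a (molecule index, path)
# pair; process_common_args converts the parsed pairs into a {index: Path}
# dict, matching the logic shown above.
from argparse import ArgumentParser

from chemprop.cli.common import add_common_args, process_common_args

parser = add_common_args(ArgumentParser())
args = parser.parse_args(["-s", "smiles", "--atom-features-path", "0", "feats_0.npz"])
args = process_common_args(args)
print(args.atom_features_path)  # {0: PosixPath('feats_0.npz')}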
chemprop-updated/chemprop/cli/conf.py
ADDED
@@ -0,0 +1,9 @@
from datetime import datetime
import logging
import os
from pathlib import Path

LOG_DIR = Path(os.getenv("CHEMPROP_LOG_DIR", "chemprop_logs"))
LOG_LEVELS = {0: logging.INFO, 1: logging.DEBUG, -1: logging.WARNING, -2: logging.ERROR}
NOW = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
CHEMPROP_TRAIN_DIR = Path(os.getenv("CHEMPROP_TRAIN_DIR", "chemprop_training"))
chemprop-updated/chemprop/cli/convert.py
ADDED
@@ -0,0 +1,55 @@
from argparse import ArgumentError, ArgumentParser, Namespace
import logging
from pathlib import Path
import sys

from chemprop.cli.utils import Subcommand
from chemprop.utils.v1_to_v2 import convert_model_file_v1_to_v2

logger = logging.getLogger(__name__)


class ConvertSubcommand(Subcommand):
    COMMAND = "convert"
    HELP = "Convert a v1 model checkpoint (.pt) to a v2 model checkpoint (.pt)."

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser.add_argument(
            "-i",
            "--input-path",
            required=True,
            type=Path,
            help="Path to a v1 model .pt checkpoint file",
        )
        parser.add_argument(
            "-o",
            "--output-path",
            type=Path,
            help="Path to which the converted model will be saved (``CURRENT_DIRECTORY/STEM_OF_INPUT_v2.pt`` by default)",
        )
        return parser

    @classmethod
    def func(cls, args: Namespace):
        if args.output_path is None:
            args.output_path = Path(args.input_path.stem + "_v2.pt")
        if args.output_path.suffix != ".pt":
            raise ArgumentError(
                argument=None, message=f"Output must be a `.pt` file. Got {args.output_path}"
            )

        logger.info(
            f"Converting v1 model checkpoint '{args.input_path}' to v2 model checkpoint '{args.output_path}'..."
        )
        convert_model_file_v1_to_v2(args.input_path, args.output_path)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser = ConvertSubcommand.add_args(parser)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

    args = parser.parse_args()
    ConvertSubcommand.func(args)
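A minimal sketch of the default-output behavior of this subcommand, assuming a hypothetical v1 checkpoint file `model_v1.pt` exists in the working directory:

from argparse import ArgumentParser

from chemprop.cli.convert import ConvertSubcommand

parser = ConvertSubcommand.add_args(ArgumentParser())
args = parser.parse_args(["-i", "model_v1.pt"])
# With no -o given, func() defaults the output to Path("model_v1_v2.pt")
# before calling convert_model_file_v1_to_v2, per the logic above.
ConvertSubcommand.func(args)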
chemprop-updated/chemprop/cli/fingerprint.py
ADDED
@@ -0,0 +1,185 @@
from argparse import ArgumentError, ArgumentParser, Namespace
import logging
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import torch

from chemprop import data
from chemprop.cli.common import add_common_args, process_common_args, validate_common_args
from chemprop.cli.predict import find_models
from chemprop.cli.utils import Subcommand, build_data_from_files, make_dataset
from chemprop.models import load_model
from chemprop.nn.metrics import LossFunctionRegistry

logger = logging.getLogger(__name__)


class FingerprintSubcommand(Subcommand):
    COMMAND = "fingerprint"
    HELP = "Use a pretrained chemprop model to calculate learned representations."

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = add_common_args(parser)
        parser.add_argument(
            "-i",
            "--test-path",
            required=True,
            type=Path,
            help="Path to an input CSV file containing SMILES",
        )
        parser.add_argument(
            "-o",
            "--output",
            "--preds-path",
            type=Path,
            help="Specify the path where predictions will be saved. If the file extension is .npz, they will be saved as a npz file. Otherwise, the predictions will be saved as a CSV. The index of the model will be appended to the filename's stem. By default, predictions will be saved to the same location as ``--test-path`` with '_fps' appended (e.g., 'PATH/TO/TEST_PATH_fps_0.csv').",
        )
        parser.add_argument(
            "--model-paths",
            "--model-path",
            required=True,
            type=Path,
            nargs="+",
            help="Specify location of checkpoint(s) or model file(s) to use for prediction. It can be a path to either a single pretrained model checkpoint (.ckpt) or single pretrained model file (.pt), a directory that contains these files, or a list of path(s) and directory(s). If a directory, chemprop will recursively search and predict on all found (.pt) models.",
        )
        parser.add_argument(
            "--ffn-block-index",
            required=True,
            type=int,
            default=-1,
            help="The index indicates which linear layer returns the encoding in the FFN. An index of 0 denotes the post-aggregation representation through a 0-layer MLP, while an index of 1 represents the output from the first linear layer in the FFN, and so forth.",
        )

        return parser

    @classmethod
    def func(cls, args: Namespace):
        args = process_common_args(args)
        validate_common_args(args)
        args = process_fingerprint_args(args)
        main(args)


def process_fingerprint_args(args: Namespace) -> Namespace:
    if args.test_path.suffix not in [".csv"]:
        raise ArgumentError(
            argument=None, message=f"Input data must be a CSV file. Got {args.test_path}"
        )
    if args.output is None:
        args.output = args.test_path.parent / (args.test_path.stem + "_fps.csv")
    if args.output.suffix not in [".csv", ".npz"]:
        raise ArgumentError(
            argument=None, message=f"Output must be a CSV or NPZ file. Got '{args.output}'."
        )
    return args


def make_fingerprint_for_model(
    args: Namespace, model_path: Path, multicomponent: bool, output_path: Path
):
    model = load_model(model_path, multicomponent)
    model.eval()

    bounded = any(
        isinstance(model.criterion, LossFunctionRegistry[loss_function])
        for loss_function in LossFunctionRegistry.keys()
        if "bounded" in loss_function
    )

    format_kwargs = dict(
        no_header_row=args.no_header_row,
        smiles_cols=args.smiles_columns,
        rxn_cols=args.reaction_columns,
        target_cols=[],
        ignore_cols=None,
        splits_col=None,
        weight_col=None,
        bounded=bounded,
    )

    featurization_kwargs = dict(
        molecule_featurizers=args.molecule_featurizers,
        keep_h=args.keep_h,
        add_h=args.add_h,
        ignore_chirality=args.ignore_chirality,
    )

    test_data = build_data_from_files(
        args.test_path,
        **format_kwargs,
        p_descriptors=args.descriptors_path,
        p_atom_feats=args.atom_features_path,
        p_bond_feats=args.bond_features_path,
        p_atom_descs=args.atom_descriptors_path,
        **featurization_kwargs,
    )
    logger.info(f"test size: {len(test_data[0])}")
    test_dsets = [
        make_dataset(d, args.rxn_mode, args.multi_hot_atom_featurizer_mode) for d in test_data
    ]

    if multicomponent:
        test_dset = data.MulticomponentDataset(test_dsets)
    else:
        test_dset = test_dsets[0]

    test_loader = data.build_dataloader(test_dset, args.batch_size, args.num_workers, shuffle=False)

    logger.info(model)

    with torch.no_grad():
        if multicomponent:
            encodings = [
                model.encoding(batch.bmgs, batch.V_ds, batch.X_d, args.ffn_block_index)
                for batch in test_loader
            ]
        else:
            encodings = [
                model.encoding(batch.bmg, batch.V_d, batch.X_d, args.ffn_block_index)
                for batch in test_loader
            ]
        H = torch.cat(encodings, 0).numpy()

    if output_path.suffix in [".npz"]:
        np.savez(output_path, H=H)
    elif output_path.suffix == ".csv":
        fingerprint_columns = [f"fp_{i}" for i in range(H.shape[1])]
        df_fingerprints = pd.DataFrame(H, columns=fingerprint_columns)
        df_fingerprints.to_csv(output_path, index=False)
    else:
        raise ArgumentError(
            argument=None, message=f"Output must be a CSV or npz file. Got {args.output}."
        )
    logger.info(f"Fingerprints saved to '{output_path}'")


def main(args):
    match (args.smiles_columns, args.reaction_columns):
        case [None, None]:
            n_components = 1
        case [_, None]:
            n_components = len(args.smiles_columns)
        case [None, _]:
            n_components = len(args.reaction_columns)
        case _:
            n_components = len(args.smiles_columns) + len(args.reaction_columns)

    multicomponent = n_components > 1

    for i, model_path in enumerate(find_models(args.model_paths)):
        logger.info(f"Fingerprints with model {i} at '{model_path}'")
        output_path = args.output.parent / f"{args.output.stem}_{i}{args.output.suffix}"
        make_fingerprint_for_model(args, model_path, multicomponent, output_path)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser = FingerprintSubcommand.add_args(parser)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
    args = parser.parse_args()
    FingerprintSubcommand.func(args)
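A minimal sketch of consuming the saved representations, with hypothetical output file names; a .npz output stores the encoding matrix under the key "H" (see the np.savez call above), while a .csv output uses columns fp_0 through fp_{d-1}:

import numpy as np
import pandas as pd

H = np.load("test_fps_0.npz")["H"]  # shape: (n_molecules, encoding_dim)
df = pd.read_csv("test_fps_0.csv")  # the same values as a DataFrame
print(H.shape, df.columns[:3].tolist())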
chemprop-updated/chemprop/cli/hpopt.py
ADDED
@@ -0,0 +1,540 @@
from copy import deepcopy
import logging
from pathlib import Path
import shutil
import sys

from configargparse import ArgumentParser, Namespace
from lightning import pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping
import numpy as np
import torch

from chemprop.cli.common import add_common_args, process_common_args, validate_common_args
from chemprop.cli.train import (
    TrainSubcommand,
    add_train_args,
    build_datasets,
    build_model,
    build_splits,
    normalize_inputs,
    process_train_args,
    save_config,
    validate_train_args,
)
from chemprop.cli.utils.command import Subcommand
from chemprop.data import build_dataloader
from chemprop.nn import AggregationRegistry, MetricRegistry
from chemprop.nn.transforms import UnscaleTransform
from chemprop.nn.utils import Activation

NO_RAY = False
DEFAULT_SEARCH_SPACE = {
    "activation": None,
    "aggregation": None,
    "aggregation_norm": None,
    "batch_size": None,
    "depth": None,
    "dropout": None,
    "ffn_hidden_dim": None,
    "ffn_num_layers": None,
    "final_lr_ratio": None,
    "message_hidden_dim": None,
    "init_lr_ratio": None,
    "max_lr": None,
    "warmup_epochs": None,
}

try:
    import ray
    from ray import tune
    from ray.train import CheckpointConfig, RunConfig, ScalingConfig
    from ray.train.lightning import (
        RayDDPStrategy,
        RayLightningEnvironment,
        RayTrainReportCallback,
        prepare_trainer,
    )
    from ray.train.torch import TorchTrainer
    from ray.tune.schedulers import ASHAScheduler, FIFOScheduler

    DEFAULT_SEARCH_SPACE = {
        "activation": tune.choice(categories=list(Activation.keys())),
        "aggregation": tune.choice(categories=list(AggregationRegistry.keys())),
        "aggregation_norm": tune.quniform(lower=1, upper=200, q=1),
        "batch_size": tune.choice([16, 32, 64, 128, 256]),
        "depth": tune.qrandint(lower=2, upper=6, q=1),
        "dropout": tune.choice([0.0] * 8 + list(np.arange(0.05, 0.45, 0.05))),
        "ffn_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
        "ffn_num_layers": tune.qrandint(lower=1, upper=3, q=1),
        "final_lr_ratio": tune.loguniform(lower=1e-2, upper=1),
        "message_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
        "init_lr_ratio": tune.loguniform(lower=1e-2, upper=1),
        "max_lr": tune.loguniform(lower=1e-4, upper=1e-2),
        "warmup_epochs": None,
    }
except ImportError:
    NO_RAY = True

NO_HYPEROPT = False
try:
    from ray.tune.search.hyperopt import HyperOptSearch
except ImportError:
    NO_HYPEROPT = True

NO_OPTUNA = False
try:
    from ray.tune.search.optuna import OptunaSearch
except ImportError:
    NO_OPTUNA = True


logger = logging.getLogger(__name__)

SEARCH_SPACE = DEFAULT_SEARCH_SPACE

SEARCH_PARAM_KEYWORDS_MAP = {
    "basic": ["depth", "ffn_num_layers", "dropout", "ffn_hidden_dim", "message_hidden_dim"],
    "learning_rate": ["max_lr", "init_lr_ratio", "final_lr_ratio", "warmup_epochs"],
    "all": list(DEFAULT_SEARCH_SPACE.keys()),
    "init_lr": ["init_lr_ratio"],
    "final_lr": ["final_lr_ratio"],
}


class HpoptSubcommand(Subcommand):
    COMMAND = "hpopt"
    HELP = "Perform hyperparameter optimization on the given task."

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = add_common_args(parser)
        parser = add_train_args(parser)
        return add_hpopt_args(parser)

    @classmethod
    def func(cls, args: Namespace):
        args = process_common_args(args)
        args = process_train_args(args)
        args = process_hpopt_args(args)
        validate_common_args(args)
        validate_train_args(args)
        main(args)


def add_hpopt_args(parser: ArgumentParser) -> ArgumentParser:
    hpopt_args = parser.add_argument_group("Chemprop hyperparameter optimization arguments")

    hpopt_args.add_argument(
        "--search-parameter-keywords",
        type=str,
        nargs="+",
        default=["basic"],
        help=f"""The model parameters over which to search for an optimal hyperparameter configuration. Some options are bundles of parameters or otherwise special parameter operations. Special keywords include:
- ``basic``: Default set of hyperparameters for search (depth, ffn_num_layers, dropout, message_hidden_dim, and ffn_hidden_dim)
- ``learning_rate``: Search for max_lr, init_lr_ratio, final_lr_ratio, and warmup_epochs. The searches for init_lr and final_lr values are defined as fractions of the max_lr value. The search for warmup_epochs is as a fraction of the total epochs used.
- ``all``: Include search for all 13 individual keyword options (including activation, aggregation, aggregation_norm, and batch_size, which aren't included in the other two keywords).
Individual supported parameters:
{list(DEFAULT_SEARCH_SPACE.keys())}
""",
    )

    hpopt_args.add_argument(
        "--hpopt-save-dir",
        type=Path,
        help="Directory to save the hyperparameter optimization results",
    )

    raytune_args = parser.add_argument_group("Ray Tune arguments")

    raytune_args.add_argument(
        "--raytune-num-samples",
        type=int,
        default=10,
        help="Passed directly to Ray Tune ``TuneConfig`` to control number of trials to run",
    )

    raytune_args.add_argument(
        "--raytune-search-algorithm",
        choices=["random", "hyperopt", "optuna"],
        default="hyperopt",
        help="Passed to Ray Tune ``TuneConfig`` to control search algorithm",
    )

    raytune_args.add_argument(
        "--raytune-trial-scheduler",
        choices=["FIFO", "AsyncHyperBand"],
        default="FIFO",
        help="Passed to Ray Tune ``TuneConfig`` to control trial scheduler",
    )

    raytune_args.add_argument(
        "--raytune-num-workers",
        type=int,
        default=1,
        help="Passed directly to Ray Tune ``ScalingConfig`` to control number of workers to use",
    )

    raytune_args.add_argument(
        "--raytune-use-gpu",
        action="store_true",
        help="Passed directly to Ray Tune ``ScalingConfig`` to control whether to use GPUs",
    )

    raytune_args.add_argument(
        "--raytune-num-checkpoints-to-keep",
        type=int,
        default=1,
        help="Passed directly to Ray Tune ``CheckpointConfig`` to control number of checkpoints to keep",
    )

    raytune_args.add_argument(
        "--raytune-grace-period",
        type=int,
        default=10,
        help="Passed directly to Ray Tune ``ASHAScheduler`` to control grace period",
    )

    raytune_args.add_argument(
        "--raytune-reduction-factor",
        type=int,
        default=2,
        help="Passed directly to Ray Tune ``ASHAScheduler`` to control reduction factor",
    )

    raytune_args.add_argument(
        "--raytune-temp-dir", help="Passed directly to Ray Tune init to control temporary directory"
    )

    raytune_args.add_argument(
        "--raytune-num-cpus",
        type=int,
        help="Passed directly to Ray Tune init to control number of CPUs to use",
    )

    raytune_args.add_argument(
        "--raytune-num-gpus",
        type=int,
        help="Passed directly to Ray Tune init to control number of GPUs to use",
    )

    raytune_args.add_argument(
        "--raytune-max-concurrent-trials",
        type=int,
        help="Passed directly to Ray Tune ``TuneConfig`` to control maximum concurrent trials",
    )

    hyperopt_args = parser.add_argument_group("Hyperopt arguments")

    hyperopt_args.add_argument(
        "--hyperopt-n-initial-points",
        type=int,
        help="Passed directly to ``HyperOptSearch`` to control number of initial points to sample",
    )

    hyperopt_args.add_argument(
        "--hyperopt-random-state-seed",
        type=int,
        default=None,
        help="Passed directly to ``HyperOptSearch`` to control random state seed",
    )

    return parser


def process_hpopt_args(args: Namespace) -> Namespace:
    if args.hpopt_save_dir is None:
        args.hpopt_save_dir = Path(f"chemprop_hpopt/{args.data_path.stem}")

    args.hpopt_save_dir.mkdir(exist_ok=True, parents=True)

    search_parameters = set()

    available_search_parameters = list(SEARCH_SPACE.keys()) + list(SEARCH_PARAM_KEYWORDS_MAP.keys())

    for keyword in args.search_parameter_keywords:
        if keyword not in available_search_parameters:
            raise ValueError(
                f"Search parameter keyword: {keyword} not in available options: {available_search_parameters}."
            )

        search_parameters.update(
            SEARCH_PARAM_KEYWORDS_MAP[keyword]
            if keyword in SEARCH_PARAM_KEYWORDS_MAP
            else [keyword]
        )

    args.search_parameter_keywords = list(search_parameters)

    if not args.hyperopt_n_initial_points:
        args.hyperopt_n_initial_points = args.raytune_num_samples // 2

    return args


def build_search_space(search_parameters: list[str], train_epochs: int) -> dict:
    if "warmup_epochs" in search_parameters and SEARCH_SPACE.get("warmup_epochs", None) is None:
        assert (
            train_epochs >= 6
        ), "Training epochs must be at least 6 to perform hyperparameter optimization for warmup_epochs."
        SEARCH_SPACE["warmup_epochs"] = tune.qrandint(lower=1, upper=train_epochs // 2, q=1)

    return {param: SEARCH_SPACE[param] for param in search_parameters}


def update_args_with_config(args: Namespace, config: dict) -> Namespace:
    args = deepcopy(args)

    for key, value in config.items():
        match key:
            case "final_lr_ratio":
                setattr(args, "final_lr", value * config.get("max_lr", args.max_lr))

            case "init_lr_ratio":
                setattr(args, "init_lr", value * config.get("max_lr", args.max_lr))

            case _:
                assert key in args, f"Key: {key} not found in args."
                setattr(args, key, value)

    return args


def train_model(config, args, train_dset, val_dset, logger, output_transform, input_transforms):
    args = update_args_with_config(args, config)

    train_loader = build_dataloader(
        train_dset, args.batch_size, args.num_workers, seed=args.data_seed
    )
    val_loader = build_dataloader(val_dset, args.batch_size, args.num_workers, shuffle=False)

    seed = args.pytorch_seed if args.pytorch_seed is not None else torch.seed()

    torch.manual_seed(seed)

    model = build_model(args, train_loader.dataset, output_transform, input_transforms)
    logger.info(model)

    if args.tracking_metric == "val_loss":
        T_tracking_metric = model.criterion.__class__
    else:
        T_tracking_metric = MetricRegistry[args.tracking_metric]
        args.tracking_metric = "val/" + args.tracking_metric

    monitor_mode = "max" if T_tracking_metric.higher_is_better else "min"
    logger.debug(f"Evaluation metric: '{T_tracking_metric.alias}', mode: '{monitor_mode}'")

    patience = args.patience if args.patience is not None else args.epochs
    early_stopping = EarlyStopping(args.tracking_metric, patience=patience, mode=monitor_mode)

    trainer = pl.Trainer(
        accelerator=args.accelerator,
        devices=args.devices,
        max_epochs=args.epochs,
        gradient_clip_val=args.grad_clip,
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback(), early_stopping],
        plugins=[RayLightningEnvironment()],
        deterministic=args.pytorch_seed is not None,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, train_loader, val_loader)


def tune_model(
    args, train_dset, val_dset, logger, monitor_mode, output_transform, input_transforms
):
    match args.raytune_trial_scheduler:
        case "FIFO":
            scheduler = FIFOScheduler()
        case "AsyncHyperBand":
            scheduler = ASHAScheduler(
                max_t=args.epochs,
                grace_period=min(args.raytune_grace_period, args.epochs),
                reduction_factor=args.raytune_reduction_factor,
            )
        case _:
            raise ValueError(f"Invalid trial scheduler! got: {args.raytune_trial_scheduler}.")

    resources_per_worker = {}
    if args.raytune_num_cpus and args.raytune_max_concurrent_trials:
        resources_per_worker["CPU"] = args.raytune_num_cpus / args.raytune_max_concurrent_trials
    if args.raytune_num_gpus and args.raytune_max_concurrent_trials:
        resources_per_worker["GPU"] = args.raytune_num_gpus / args.raytune_max_concurrent_trials
    if not resources_per_worker:
        resources_per_worker = None

    if args.raytune_num_gpus:
        use_gpu = True
    else:
        use_gpu = args.raytune_use_gpu

    scaling_config = ScalingConfig(
        num_workers=args.raytune_num_workers,
        use_gpu=use_gpu,
        resources_per_worker=resources_per_worker,
        trainer_resources={"CPU": 0},
    )

    checkpoint_config = CheckpointConfig(
        num_to_keep=args.raytune_num_checkpoints_to_keep,
        checkpoint_score_attribute=args.tracking_metric,
        checkpoint_score_order=monitor_mode,
    )

    run_config = RunConfig(
        checkpoint_config=checkpoint_config,
        storage_path=args.hpopt_save_dir.absolute() / "ray_results",
    )

    ray_trainer = TorchTrainer(
        lambda config: train_model(
            config, args, train_dset, val_dset, logger, output_transform, input_transforms
        ),
        scaling_config=scaling_config,
        run_config=run_config,
    )

    match args.raytune_search_algorithm:
        case "random":
            search_alg = None
        case "hyperopt":
            if NO_HYPEROPT:
                raise ImportError(
                    "HyperOptSearch requires hyperopt to be installed. Use 'pip install -U hyperopt' to install, or use 'pip install -e .[hpopt]' in the chemprop folder if you installed from source to install all hpopt-relevant packages."
                )

            search_alg = HyperOptSearch(
                n_initial_points=args.hyperopt_n_initial_points,
                random_state_seed=args.hyperopt_random_state_seed,
            )
        case "optuna":
            if NO_OPTUNA:
                raise ImportError(
                    "OptunaSearch requires optuna to be installed. Use 'pip install -U optuna' to install, or use 'pip install -e .[hpopt]' in the chemprop folder if you installed from source to install all hpopt-relevant packages."
                )

            search_alg = OptunaSearch()

    tune_config = tune.TuneConfig(
        metric=args.tracking_metric,
        mode=monitor_mode,
        num_samples=args.raytune_num_samples,
        scheduler=scheduler,
        search_alg=search_alg,
        trial_dirname_creator=lambda trial: str(trial.trial_id),
    )

    tuner = tune.Tuner(
        ray_trainer,
        param_space={
            "train_loop_config": build_search_space(args.search_parameter_keywords, args.epochs)
        },
        tune_config=tune_config,
    )

    return tuner.fit()


def main(args: Namespace):
    if NO_RAY:
        raise ImportError(
            "Ray Tune requires ray to be installed. If you installed Chemprop from PyPI, run 'pip install -U ray[tune]' to install ray. If you installed from source, use 'pip install -e .[hpopt]' in the Chemprop folder to install all hpopt-relevant packages."
        )

    if not ray.is_initialized():
        try:
            ray.init(
                _temp_dir=args.raytune_temp_dir,
                num_cpus=args.raytune_num_cpus,
                num_gpus=args.raytune_num_gpus,
            )
        except OSError as e:
            if "AF_UNIX path length cannot exceed 107 bytes" in str(e):
                raise OSError(
                    f"Ray Tune fails due to: {e}. This can sometimes be solved by providing a temporary directory, num_cpus, and num_gpus to Ray Tune via the CLI: --raytune-temp-dir <absolute_path> --raytune-num-cpus <int> --raytune-num-gpus <int>."
                )
            else:
                raise e
    else:
        logger.info("Ray is already initialized.")

    format_kwargs = dict(
        no_header_row=args.no_header_row,
        smiles_cols=args.smiles_columns,
        rxn_cols=args.reaction_columns,
        target_cols=args.target_columns,
        ignore_cols=args.ignore_columns,
        splits_col=args.splits_column,
        weight_col=args.weight_column,
        bounded=args.loss_function is not None and "bounded" in args.loss_function,
    )

    featurization_kwargs = dict(
        molecule_featurizers=args.molecule_featurizers,
        keep_h=args.keep_h,
        add_h=args.add_h,
        ignore_chirality=args.ignore_chirality,
    )

    train_data, val_data, test_data = build_splits(args, format_kwargs, featurization_kwargs)
    train_dset, val_dset, test_dset = build_datasets(args, train_data[0], val_data[0], test_data[0])

    input_transforms = normalize_inputs(train_dset, val_dset, args)

    if "regression" in args.task_type:
        output_scaler = train_dset.normalize_targets()
        val_dset.normalize_targets(output_scaler)
        logger.info(f"Train data: mean = {output_scaler.mean_} | std = {output_scaler.scale_}")
        output_transform = UnscaleTransform.from_standard_scaler(output_scaler)
    else:
        output_transform = None

    train_loader = build_dataloader(
        train_dset, args.batch_size, args.num_workers, seed=args.data_seed
    )

    model = build_model(args, train_loader.dataset, output_transform, input_transforms)
    monitor_mode = "max" if model.metrics[0].higher_is_better else "min"

    results = tune_model(
        args, train_dset, val_dset, logger, monitor_mode, output_transform, input_transforms
    )

    best_result = results.get_best_result()
    best_config = best_result.config["train_loop_config"]
    best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"

    best_config_save_path = args.hpopt_save_dir / "best_config.toml"
    best_checkpoint_save_path = args.hpopt_save_dir / "best_checkpoint.ckpt"
    all_progress_save_path = args.hpopt_save_dir / "all_progress.csv"

    logger.info(f"Best hyperparameters saved to: '{best_config_save_path}'")

    args = update_args_with_config(args, best_config)

    args = TrainSubcommand.parser.parse_known_args(namespace=args)[0]
    save_config(TrainSubcommand.parser, args, best_config_save_path)

    logger.info(
        f"Best hyperparameter configuration checkpoint saved to '{best_checkpoint_save_path}'"
    )

    shutil.copyfile(best_checkpoint_path, best_checkpoint_save_path)

    logger.info(f"Hyperparameter optimization results saved to '{all_progress_save_path}'")

    result_df = results.get_dataframe()

    result_df.to_csv(all_progress_save_path, index=False)

    ray.shutdown()


if __name__ == "__main__":
    parser = ArgumentParser()
    parser = HpoptSubcommand.add_args(parser)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
    args = parser.parse_args()
    HpoptSubcommand.func(args)
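A minimal sketch of how update_args_with_config resolves the sampled learning-rate ratios (the numbers are hypothetical): ratios are multiplied by the sampled (or existing) max_lr before being written back as init_lr and final_lr, per the match statement above.

from argparse import Namespace

from chemprop.cli.hpopt import update_args_with_config

args = Namespace(max_lr=1e-3, init_lr=1e-4, final_lr=1e-4)
config = {"max_lr": 1e-3, "init_lr_ratio": 0.1, "final_lr_ratio": 0.01}
args = update_args_with_config(args, config)
print(args.init_lr, args.final_lr)  # 0.0001 1e-05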
chemprop-updated/chemprop/cli/main.py
ADDED
@@ -0,0 +1,85 @@
import logging
from pathlib import Path
import sys

from configargparse import ArgumentParser

from chemprop.cli.conf import LOG_DIR, LOG_LEVELS, NOW
from chemprop.cli.convert import ConvertSubcommand
from chemprop.cli.fingerprint import FingerprintSubcommand
from chemprop.cli.hpopt import HpoptSubcommand
from chemprop.cli.predict import PredictSubcommand
from chemprop.cli.train import TrainSubcommand
from chemprop.cli.utils import pop_attr

logger = logging.getLogger(__name__)

SUBCOMMANDS = [
    TrainSubcommand,
    PredictSubcommand,
    ConvertSubcommand,
    FingerprintSubcommand,
    HpoptSubcommand,
]


def construct_parser():
    parser = ArgumentParser()
    subparsers = parser.add_subparsers(title="mode", dest="mode", required=True)

    parent = ArgumentParser(add_help=False)
    parent.add_argument(
        "--logfile",
        "--log",
        nargs="?",
        const="default",
        help=f"Path to which the log file should be written (specifying just the flag alone will automatically log to a file ``{LOG_DIR}/MODE/TIMESTAMP.log``, where 'MODE' is the CLI mode chosen, e.g., ``{LOG_DIR}/MODE/{NOW}.log``)",
    )
    parent.add_argument("-v", action="store_true", help="Increase verbosity level to DEBUG")
    parent.add_argument(
        "-q",
        action="count",
        default=0,
        help="Decrease verbosity level to WARNING, or ERROR if specified twice",
    )

    parents = [parent]
    for subcommand in SUBCOMMANDS:
        subcommand.add(subparsers, parents)

    return parser


def main():
    parser = construct_parser()
    args = parser.parse_args()
    logfile, v_flag, q_count, mode, func = (
        pop_attr(args, attr) for attr in ["logfile", "v", "q", "mode", "func"]
    )

    if v_flag and q_count:
        parser.error("The -v and -q options cannot be used together.")

    match logfile:
        case None:
            handler = logging.StreamHandler(sys.stderr)
        case "default":
            (LOG_DIR / mode).mkdir(parents=True, exist_ok=True)
            handler = logging.FileHandler(str(LOG_DIR / mode / f"{NOW}.log"))
        case _:
            Path(logfile).parent.mkdir(parents=True, exist_ok=True)
            handler = logging.FileHandler(logfile)

    verbosity = q_count * -1 if q_count else (1 if v_flag else 0)
    logging_level = LOG_LEVELS.get(verbosity, logging.ERROR)
    logging.basicConfig(
        handlers=[handler],
        format="%(asctime)s - %(levelname)s:%(name)s - %(message)s",
        level=logging_level,
        datefmt="%Y-%m-%dT%H:%M:%S",
        force=True,
    )

    logger.info(f"Running in mode '{mode}' with args: {vars(args)}")

    func(args)
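A minimal, self-contained sketch of the verbosity mapping used in main() above: no flag selects INFO, -v selects DEBUG, -q selects WARNING, -qq selects ERROR, and anything lower falls back to ERROR via LOG_LEVELS.get.

import logging

LOG_LEVELS = {0: logging.INFO, 1: logging.DEBUG, -1: logging.WARNING, -2: logging.ERROR}

for v_flag, q_count in [(False, 0), (True, 0), (False, 1), (False, 2)]:
    verbosity = q_count * -1 if q_count else (1 if v_flag else 0)
    print(verbosity, logging.getLevelName(LOG_LEVELS.get(verbosity, logging.ERROR)))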
chemprop-updated/chemprop/cli/predict.py
ADDED
@@ -0,0 +1,447 @@
from argparse import ArgumentError, ArgumentParser, Namespace
import logging
from pathlib import Path
import sys
from typing import Iterator

from lightning import pytorch as pl
import numpy as np
import pandas as pd
import torch

from chemprop import data
from chemprop.cli.common import (
    add_common_args,
    find_models,
    process_common_args,
    validate_common_args,
)
from chemprop.cli.utils import LookupAction, Subcommand, build_data_from_files, make_dataset
from chemprop.models.utils import load_model, load_output_columns
from chemprop.nn.metrics import LossFunctionRegistry
from chemprop.nn.predictors import EvidentialFFN, MulticlassClassificationFFN, MveFFN
from chemprop.uncertainty import (
    MVEWeightingCalibrator,
    NoUncertaintyEstimator,
    RegressionCalibrator,
    RegressionEvaluator,
    UncertaintyCalibratorRegistry,
    UncertaintyEstimatorRegistry,
    UncertaintyEvaluatorRegistry,
)
from chemprop.utils import Factory

logger = logging.getLogger(__name__)


class PredictSubcommand(Subcommand):
    COMMAND = "predict"
    HELP = "use a pretrained chemprop model for prediction"

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = add_common_args(parser)
        return add_predict_args(parser)

    @classmethod
    def func(cls, args: Namespace):
        args = process_common_args(args)
        validate_common_args(args)
        args = process_predict_args(args)
        main(args)


def add_predict_args(parser: ArgumentParser) -> ArgumentParser:
    parser.add_argument(
        "-i",
        "--test-path",
        required=True,
        type=Path,
        help="Path to an input CSV file containing SMILES",
    )
    parser.add_argument(
        "-o",
        "--output",
        "--preds-path",
        type=Path,
        help="Specify path to which predictions will be saved. If the file extension is .pkl, it will be saved as a pickle file. Otherwise, chemprop will save predictions as a CSV. If multiple models are used to make predictions, the average predictions will be saved in the file, and another file ending in '_individual' with the same file extension will save the predictions for each individual model, with the column names being the target names appended with the model index (e.g., '_model_<index>').",
    )
    parser.add_argument(
        "--drop-extra-columns",
        action="store_true",
        help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns",
    )
    parser.add_argument(
        "--model-paths",
        "--model-path",
        required=True,
        type=Path,
        nargs="+",
        help="Location of checkpoint(s) or model file(s) to use for prediction. It can be a path to either a single pretrained model checkpoint (.ckpt) or single pretrained model file (.pt), a directory that contains these files, or a list of path(s) and directory(s). If a directory, will recursively search and predict on all found (.pt) models.",
    )

    unc_args = parser.add_argument_group("Uncertainty and calibration args")
    unc_args.add_argument(
        "--cal-path", type=Path, help="Path to data file to be used for uncertainty calibration."
    )
    unc_args.add_argument(
        "--uncertainty-method",
        default="none",
        action=LookupAction(UncertaintyEstimatorRegistry),
        help="The method of calculating uncertainty.",
    )
    unc_args.add_argument(
        "--calibration-method",
        action=LookupAction(UncertaintyCalibratorRegistry),
        help="The method used for calibrating the uncertainty calculated with the uncertainty method.",
    )
    unc_args.add_argument(
        "--evaluation-methods",
        "--evaluation-method",
        nargs="+",
        action=LookupAction(UncertaintyEvaluatorRegistry),
        help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
    )
    # unc_args.add_argument(
    #     "--evaluation-scores-path", help="Location to save the results of uncertainty evaluations."
    # )
    unc_args.add_argument(
        "--uncertainty-dropout-p",
        type=float,
        default=0.1,
        help="The probability to use for Monte Carlo dropout uncertainty estimation.",
    )
    unc_args.add_argument(
        "--dropout-sampling-size",
        type=int,
        default=10,
        help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.",
    )
    unc_args.add_argument(
        "--calibration-interval-percentile",
        type=float,
        default=95,
        help="Sets the percentile used in the calibration methods. Must be in the range (1, 100).",
    )
    unc_args.add_argument(
        "--conformal-alpha",
        type=float,
        default=0.1,
        help="Target error rate for conformal prediction. Must be in the range (0, 1).",
    )
    # TODO: Decide if we want to implement this in v2.1.x
    # unc_args.add_argument(
    #     "--regression-calibrator-metric",
    #     choices=["stdev", "interval"],
    #     help="Regression calibrators can output either a stdev or an interval.",
    # )
    unc_args.add_argument(
        "--cal-descriptors-path",
        nargs="+",
        action="append",
        help="Path to extra descriptors to concatenate to learned representation in calibration dataset.",
    )
    # TODO: Add in v2.1.x
    # unc_args.add_argument(
    #     "--calibration-phase-features-path",
    #     help=" ",
    # )
    unc_args.add_argument(
        "--cal-atom-features-path",
        nargs="+",
        action="append",
        help="Path to the extra atom features in calibration dataset.",
    )
    unc_args.add_argument(
        "--cal-atom-descriptors-path",
        nargs="+",
        action="append",
        help="Path to the extra atom descriptors in calibration dataset.",
    )
    unc_args.add_argument(
        "--cal-bond-features-path",
        nargs="+",
        action="append",
        help="Path to the extra bond features in calibration dataset.",
    )

    return parser


def process_predict_args(args: Namespace) -> Namespace:
    if args.test_path.suffix not in [".csv"]:
        raise ArgumentError(
            argument=None, message=f"Input data must be a CSV file. Got {args.test_path}"
        )
    if args.output is None:
        args.output = args.test_path.parent / (args.test_path.stem + "_preds.csv")
    if args.output.suffix not in [".csv", ".pkl"]:
        raise ArgumentError(
            argument=None, message=f"Output must be a CSV or Pickle file. Got {args.output}"
        )
    return args

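# For example (hypothetical paths): `chemprop predict -i data/test.csv --model-paths model.pt`
# with no -o flag falls through the `args.output is None` branch above and writes its
# averaged predictions to data/test_preds.csv next to the input file.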

def prepare_data_loader(
    args: Namespace, multicomponent: bool, is_calibration: bool, format_kwargs: dict
):
    data_path = args.cal_path if is_calibration else args.test_path
    descriptors_path = args.cal_descriptors_path if is_calibration else args.descriptors_path
    atom_feats_path = args.cal_atom_features_path if is_calibration else args.atom_features_path
    bond_feats_path = args.cal_bond_features_path if is_calibration else args.bond_features_path
    atom_descs_path = (
        args.cal_atom_descriptors_path if is_calibration else args.atom_descriptors_path
    )

    featurization_kwargs = dict(
        molecule_featurizers=args.molecule_featurizers,
        keep_h=args.keep_h,
        add_h=args.add_h,
        ignore_chirality=args.ignore_chirality,
    )

    datas = build_data_from_files(
        data_path,
        **format_kwargs,
        p_descriptors=descriptors_path,
        p_atom_feats=atom_feats_path,
        p_bond_feats=bond_feats_path,
        p_atom_descs=atom_descs_path,
        **featurization_kwargs,
    )

    dsets = [make_dataset(d, args.rxn_mode, args.multi_hot_atom_featurizer_mode) for d in datas]
    dset = data.MulticomponentDataset(dsets) if multicomponent else dsets[0]

    return data.build_dataloader(dset, args.batch_size, args.num_workers, shuffle=False)


def make_prediction_for_models(
    args: Namespace, model_paths: Iterator[Path], multicomponent: bool, output_path: Path
):
    model = load_model(model_paths[0], multicomponent)
    output_columns = load_output_columns(model_paths[0])
    bounded = any(
        isinstance(model.criterion, LossFunctionRegistry[loss_function])
        for loss_function in LossFunctionRegistry.keys()
        if "bounded" in loss_function
    )
    format_kwargs = dict(
        no_header_row=args.no_header_row,
        smiles_cols=args.smiles_columns,
        rxn_cols=args.reaction_columns,
        ignore_cols=None,
        splits_col=None,
        weight_col=None,
        bounded=bounded,
    )
    format_kwargs["target_cols"] = output_columns if args.evaluation_methods is not None else []
    test_loader = prepare_data_loader(args, multicomponent, False, format_kwargs)
    logger.info(f"test size: {len(test_loader.dataset)}")
    if args.cal_path is not None:
        format_kwargs["target_cols"] = output_columns
        cal_loader = prepare_data_loader(args, multicomponent, True, format_kwargs)
        logger.info(f"calibration size: {len(cal_loader.dataset)}")

    uncertainty_estimator = Factory.build(
        UncertaintyEstimatorRegistry[args.uncertainty_method],
        ensemble_size=args.dropout_sampling_size,
        dropout=args.uncertainty_dropout_p,
    )

    models = [load_model(model_path, multicomponent) for model_path in model_paths]
    trainer = pl.Trainer(
        logger=False, enable_progress_bar=True, accelerator=args.accelerator, devices=args.devices
    )
    test_individual_preds, test_individual_uncs = uncertainty_estimator(
        test_loader, models, trainer
    )
    test_preds = torch.mean(test_individual_preds, dim=0)
    if not isinstance(uncertainty_estimator, NoUncertaintyEstimator):
        test_uncs = torch.mean(test_individual_uncs, dim=0)
    else:
        test_uncs = None

    if args.calibration_method is not None:
        uncertainty_calibrator = Factory.build(
            UncertaintyCalibratorRegistry[args.calibration_method],
            p=args.calibration_interval_percentile / 100,
            alpha=args.conformal_alpha,
        )
        cal_targets = cal_loader.dataset.Y
        cal_mask = torch.from_numpy(np.isfinite(cal_targets))
        cal_targets = np.nan_to_num(cal_targets, nan=0.0)
        cal_targets = torch.from_numpy(cal_targets)
        cal_individual_preds, cal_individual_uncs = uncertainty_estimator(
            cal_loader, models, trainer
        )
        cal_preds = torch.mean(cal_individual_preds, dim=0)
        cal_uncs = torch.mean(cal_individual_uncs, dim=0)
        if isinstance(uncertainty_calibrator, MVEWeightingCalibrator):
            uncertainty_calibrator.fit(cal_preds, cal_individual_uncs, cal_targets, cal_mask)
            test_uncs = uncertainty_calibrator.apply(test_individual_uncs)
        else:
            if isinstance(uncertainty_calibrator, RegressionCalibrator):
                uncertainty_calibrator.fit(cal_preds, cal_uncs, cal_targets, cal_mask)
            else:
                uncertainty_calibrator.fit(cal_uncs, cal_targets, cal_mask)
            test_uncs = uncertainty_calibrator.apply(test_uncs)
            for i in range(test_individual_uncs.shape[0]):
                test_individual_uncs[i] = uncertainty_calibrator.apply(test_individual_uncs[i])

    if args.evaluation_methods is not None:
        uncertainty_evaluators = [
            Factory.build(UncertaintyEvaluatorRegistry[method])
            for method in args.evaluation_methods
        ]
        logger.info("Uncertainty evaluation metric:")
        for evaluator in uncertainty_evaluators:
            test_targets = test_loader.dataset.Y
            test_mask = torch.from_numpy(np.isfinite(test_targets))
            test_targets = np.nan_to_num(test_targets, nan=0.0)
            test_targets = torch.from_numpy(test_targets)
            if isinstance(evaluator, RegressionEvaluator):
                metric_value = evaluator.evaluate(test_preds, test_uncs, test_targets, test_mask)
            else:
                metric_value = evaluator.evaluate(test_uncs, test_targets, test_mask)
            logger.info(f"{evaluator.alias}: {metric_value.tolist()}")

    if args.uncertainty_method == "none" and (
        isinstance(model.predictor, MveFFN) or isinstance(model.predictor, EvidentialFFN)
    ):
        test_preds = test_preds[..., 0]
        test_individual_preds = test_individual_preds[..., 0]

    if output_columns is None:
        output_columns = [
            f"pred_{i}" for i in range(test_preds.shape[1])
        ]  # TODO: need to improve this for cases like multi-task MVE and multi-task multiclass

    save_predictions(args, model, output_columns, test_preds, test_uncs, output_path)

    if len(model_paths) > 1:
        save_individual_predictions(
            args,
            model,
            model_paths,
            output_columns,
            test_individual_preds,
            test_individual_uncs,
            output_path,
        )


def save_predictions(args, model, output_columns, test_preds, test_uncs, output_path):
    unc_columns = [f"{col}_unc" for col in output_columns]

    if isinstance(model.predictor, MulticlassClassificationFFN):
        output_columns = output_columns + [f"{col}_prob" for col in output_columns]
        predicted_class_labels = test_preds.argmax(axis=-1)
        formatted_probability_strings = np.apply_along_axis(
            lambda x: ",".join(map(str, x)), 2, test_preds.numpy()
        )
        test_preds = np.concatenate(
            (predicted_class_labels, formatted_probability_strings), axis=-1
        )

    df_test = pd.read_csv(
        args.test_path, header=None if args.no_header_row else "infer", index_col=False
    )
    df_test[output_columns] = test_preds

    if args.uncertainty_method not in ["none", "classification"]:
        df_test[unc_columns] = np.round(test_uncs, 6)

    if output_path.suffix == ".pkl":
        df_test = df_test.reset_index(drop=True)
        df_test.to_pickle(output_path)
    else:
        df_test.to_csv(output_path, index=False)
    logger.info(f"Predictions saved to '{output_path}'")


def save_individual_predictions(
    args,
    model,
    model_paths,
    output_columns,
    test_individual_preds,
    test_individual_uncs,
    output_path,
):
    unc_columns = [
        f"{col}_unc_model_{i}" for i in range(len(model_paths)) for col in output_columns
    ]

    if isinstance(model.predictor, MulticlassClassificationFFN):
        output_columns = [
            item
            for i in range(len(model_paths))
            for col in output_columns
            for item in (f"{col}_model_{i}", f"{col}_prob_model_{i}")
        ]

        predicted_class_labels = test_individual_preds.argmax(axis=-1)
        formatted_probability_strings = np.apply_along_axis(
            lambda x: ",".join(map(str, x)), 3, test_individual_preds.numpy()
        )
        test_individual_preds = np.concatenate(
            (predicted_class_labels, formatted_probability_strings), axis=-1
        )
    else:
        output_columns = [
            f"{col}_model_{i}" for i in range(len(model_paths)) for col in output_columns
        ]

    m, n, t = test_individual_preds.shape
    test_individual_preds = np.transpose(test_individual_preds, (1, 0, 2)).reshape(n, m * t)
    df_test = pd.read_csv(
        args.test_path, header=None if args.no_header_row else "infer", index_col=False
    )
    df_test[output_columns] = test_individual_preds

    if args.uncertainty_method not in ["none", "classification", "ensemble"]:
        m, n, t = test_individual_uncs.shape
        test_individual_uncs = np.transpose(test_individual_uncs, (1, 0, 2)).reshape(n, m * t)
        df_test[unc_columns] = np.round(test_individual_uncs, 6)

    output_path = output_path.parent / Path(
        str(args.output.stem) + "_individual" + str(output_path.suffix)
    )
    if output_path.suffix == ".pkl":
        df_test = df_test.reset_index(drop=True)
        df_test.to_pickle(output_path)
    else:
        df_test.to_csv(output_path, index=False)
    logger.info(f"Individual predictions saved to '{output_path}'")
    for i, model_path in enumerate(model_paths):
        logger.info(
            f"Results from model path {model_path} are saved under the column name ending with 'model_{i}'"
        )


def main(args):
    match (args.smiles_columns, args.reaction_columns):
        case [None, None]:
            n_components = 1
        case [_, None]:
            n_components = len(args.smiles_columns)
        case [None, _]:
            n_components = len(args.reaction_columns)
        case _:
            n_components = len(args.smiles_columns) + len(args.reaction_columns)

    multicomponent = n_components > 1

    model_paths = find_models(args.model_paths)

    make_prediction_for_models(args, model_paths, multicomponent, output_path=args.output)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser = PredictSubcommand.add_args(parser)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
    args = parser.parse_args()
    args = PredictSubcommand.func(args)
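For reference, the subcommand can be driven from Python the same way the `__main__` block above does it; a minimal sketch with hypothetical input and checkpoint paths (all flags registered by `add_common_args` keep their defaults):

    from argparse import ArgumentParser

    # Build the predict parser, parse an explicit argv list, and run prediction.
    parser = PredictSubcommand.add_args(ArgumentParser())
    args = parser.parse_args(
        ["-i", "test_smiles.csv", "--model-paths", "model_0/best.pt", "-o", "preds.csv"]
    )
    PredictSubcommand.func(args)  # writes averaged predictions to preds.csv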
chemprop-updated/chemprop/cli/train.py
ADDED
@@ -0,0 +1,1343 @@
from copy import deepcopy
from io import StringIO
import json
import logging
from pathlib import Path
import sys
from tempfile import TemporaryDirectory

from configargparse import ArgumentError, ArgumentParser, Namespace
from lightning import pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
from lightning.pytorch.strategies import DDPStrategy
import numpy as np
import pandas as pd
from rich.console import Console
from rich.table import Column, Table
import torch
import torch.nn as nn

from chemprop.cli.common import (
    add_common_args,
    find_models,
    process_common_args,
    validate_common_args,
)
from chemprop.cli.conf import CHEMPROP_TRAIN_DIR, NOW
from chemprop.cli.utils import (
    LookupAction,
    Subcommand,
    build_data_from_files,
    get_column_names,
    make_dataset,
    parse_indices,
)
from chemprop.cli.utils.args import uppercase
from chemprop.data import (
    MoleculeDataset,
    MolGraphDataset,
    MulticomponentDataset,
    ReactionDatapoint,
    SplitType,
    build_dataloader,
    make_split_indices,
    split_data_by_indices,
)
from chemprop.data.datasets import _MolGraphDatasetMixin
from chemprop.models import MPNN, MulticomponentMPNN, save_model
from chemprop.nn import AggregationRegistry, LossFunctionRegistry, MetricRegistry, PredictorRegistry
from chemprop.nn.message_passing import (
    AtomMessagePassing,
    BondMessagePassing,
    MulticomponentMessagePassing,
)
from chemprop.nn.transforms import GraphTransform, ScaleTransform, UnscaleTransform
from chemprop.nn.utils import Activation
from chemprop.utils import Factory

logger = logging.getLogger(__name__)


_CV_REMOVAL_ERROR = (
    "The -k/--num-folds argument was removed in v2.1.0 - use --num-replicates instead."
)


class TrainSubcommand(Subcommand):
    COMMAND = "train"
    HELP = "Train a chemprop model."
    parser = None

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser = add_common_args(parser)
        parser = add_train_args(parser)
        cls.parser = parser
        return parser

    @classmethod
    def func(cls, args: Namespace):
        args = process_common_args(args)
        validate_common_args(args)
        args = process_train_args(args)
        validate_train_args(args)

        args.output_dir.mkdir(exist_ok=True, parents=True)
        config_path = args.output_dir / "config.toml"
        save_config(cls.parser, args, config_path)
        main(args)


def add_train_args(parser: ArgumentParser) -> ArgumentParser:
    parser.add_argument(
        "--config-path",
        type=Path,
        is_config_file=True,
        help="Path to a configuration file (command line arguments override values in the configuration file)",
    )
    parser.add_argument(
        "-i",
        "--data-path",
        type=Path,
        help="Path to an input CSV file containing SMILES and the associated target values",
    )
    parser.add_argument(
        "-o",
        "--output-dir",
        "--save-dir",
        type=Path,
        help="Directory where training outputs will be saved (defaults to ``CURRENT_DIRECTORY/chemprop_training/STEM_OF_INPUT/TIME_STAMP``)",
    )
    parser.add_argument(
        "--remove-checkpoints",
        action="store_true",
        help="Remove intermediate checkpoint files after training is complete.",
    )

    # TODO: Add in v2.1; see if we can tell lightning how often to log training loss
    # parser.add_argument(
    #     "--log-frequency",
    #     type=int,
    #     default=10,
    #     help="The number of batches between each logging of the training loss.",
    # )

    transfer_args = parser.add_argument_group("transfer learning args")
    transfer_args.add_argument(
        "--checkpoint",
        type=Path,
        nargs="+",
        help="Path to checkpoint(s) or model file(s) for loading and overwriting weights. Accepts a single pre-trained model checkpoint (.ckpt), a single model file (.pt), a directory containing such files, or a list of paths and directories. If a directory is provided, it will recursively search for and use all (.pt) files found for prediction.",
    )
    transfer_args.add_argument(
        "--freeze-encoder",
        action="store_true",
        help="Freeze the message passing layer from the checkpoint model (specified by ``--checkpoint``).",
    )
    transfer_args.add_argument(
        "--model-frzn",
        help="Path to model checkpoint file to be loaded for overwriting and freezing weights. By default, all MPNN weights are frozen with this option.",
    )
    transfer_args.add_argument(
        "--frzn-ffn-layers",
        type=int,
        default=0,
        help="Freeze the first ``n`` layers of the FFN from the checkpoint model (specified by ``--checkpoint``). The message passing layer should also be frozen with ``--freeze-encoder``.",
    )
    # transfer_args.add_argument(
    #     "--freeze-first-only",
    #     action="store_true",
    #     help="Determines whether or not to use checkpoint_frzn for just the first encoder. Default (False) is to use the checkpoint to freeze all encoders. (only relevant for number_of_molecules > 1, where checkpoint model has number_of_molecules = 1)",
    # )

    # TODO: Add in v2.1
    # parser.add_argument(
    #     "--resume-experiment",
    #     action="store_true",
    #     help="Whether to resume the experiment. Loads test results from any folds that have already been completed and skips training those folds.",
    # )
    # parser.add_argument(
    #     "--config-path",
    #     help="Path to a :code:`.json` file containing arguments. Any arguments present in the config file will override arguments specified via the command line or by the defaults.",
    # )
    parser.add_argument(
        "--ensemble-size",
        type=int,
        default=1,
        help="Number of models in ensemble for each splitting of data",
    )

    # TODO: Add in v2.2
    # abt_args = parser.add_argument_group("atom/bond target args")
    # abt_args.add_argument(
    #     "--is-atom-bond-targets",
    #     action="store_true",
    #     help="Whether this is atomic/bond properties prediction.",
    # )
    # abt_args.add_argument(
    #     "--no-adding-bond-types",
    #     action="store_true",
    #     help="Whether the bond types determined by RDKit molecules are added to the output of bond targets. This option is intended to be used with :code:`is_atom_bond_targets`.",
    # )
    # abt_args.add_argument(
    #     "--keeping-atom-map",
    #     action="store_true",
    #     help="Whether RDKit molecules keep the original atom mapping. This option is intended to be used when providing atom-mapped SMILES with :code:`is_atom_bond_targets`.",
    # )
    # abt_args.add_argument(
    #     "--no-shared-atom-bond-ffn",
    #     action="store_true",
    #     help="Whether the FFN weights for atom and bond targets should be independent between tasks.",
    # )
    # abt_args.add_argument(
    #     "--weights-ffn-num-layers",
    #     type=int,
    #     default=2,
    #     help="Number of layers in FFN for determining weights used in constrained targets.",
    # )

    mp_args = parser.add_argument_group("message passing")
    mp_args.add_argument(
        "--message-hidden-dim", type=int, default=300, help="Hidden dimension of the messages"
    )
    mp_args.add_argument(
        "--message-bias", action="store_true", help="Add bias to the message passing layers"
    )
    mp_args.add_argument("--depth", type=int, default=3, help="Number of message passing steps")
    mp_args.add_argument(
        "--undirected",
        action="store_true",
        help="Pass messages on undirected bonds/edges (always sum the two relevant bond vectors)",
    )
    mp_args.add_argument(
        "--dropout",
        type=float,
        default=0.0,
        help="Dropout probability in message passing/FFN layers",
    )
    mp_args.add_argument(
        "--mpn-shared",
        action="store_true",
        help="Whether to use the same message passing neural network for all input molecules (only relevant if ``number_of_molecules`` > 1)",
    )
    mp_args.add_argument(
        "--activation",
        type=uppercase,
        default="RELU",
        choices=list(Activation.keys()),
        help="Activation function in message passing/FFN layers",
    )
    mp_args.add_argument(
        "--aggregation",
        "--agg",
        default="norm",
        action=LookupAction(AggregationRegistry),
        help="Aggregation mode to use in the graph predictor",
    )
    mp_args.add_argument(
        "--aggregation-norm",
        type=float,
        default=100,
        help="Normalization factor by which to divide summed up atomic features for ``norm`` aggregation",
    )
    mp_args.add_argument(
        "--atom-messages", action="store_true", help="Pass messages on atoms rather than bonds."
    )

    # TODO: Add in v2.1
    # mpsolv_args = parser.add_argument_group("message passing with solvent")
    # mpsolv_args.add_argument(
    #     "--reaction-solvent",
    #     action="store_true",
    #     help="Whether to adjust the MPNN layer to take as input a reaction and a molecule, and to encode them with separate MPNNs.",
    # )
    # mpsolv_args.add_argument(
    #     "--bias-solvent",
    #     action="store_true",
    #     help="Whether to add bias to linear layers for solvent MPN if :code:`reaction_solvent` is True.",
    # )
    # mpsolv_args.add_argument(
    #     "--hidden-size-solvent",
    #     type=int,
    #     default=300,
    #     help="Dimensionality of hidden layers in solvent MPN if :code:`reaction_solvent` is True.",
    # )
    # mpsolv_args.add_argument(
    #     "--depth-solvent",
    #     type=int,
    #     default=3,
    #     help="Number of message passing steps for solvent if :code:`reaction_solvent` is True.",
    # )

    ffn_args = parser.add_argument_group("FFN args")
    ffn_args.add_argument(
        "--ffn-hidden-dim", type=int, default=300, help="Hidden dimension in the FFN top model"
    )
    ffn_args.add_argument(  # TODO: the default in v1 was 2. (see weights_ffn_num_layers option) Do we really want the default to now be 1?
        "--ffn-num-layers", type=int, default=1, help="Number of layers in FFN top model"
    )
    # TODO: Decide if we want to implement this in v2
    # ffn_args.add_argument(
    #     "--features-only",
    #     action="store_true",
    #     help="Use only the additional features in an FFN, no graph network.",
    # )

    extra_mpnn_args = parser.add_argument_group("extra MPNN args")
    extra_mpnn_args.add_argument(
        "--batch-norm", action="store_true", help="Turn on batch normalization after aggregation"
    )
    extra_mpnn_args.add_argument(
        "--multiclass-num-classes",
        type=int,
        default=3,
        help="Number of classes when running multiclass classification",
    )
    # TODO: Add in v2.1
    # extra_mpnn_args.add_argument(
    #     "--spectral-activation",
    #     default="exp",
    #     choices=["softplus", "exp"],
    #     help="Indicates which function to use in task_type spectra training to constrain outputs to be positive.",
    # )

    train_data_args = parser.add_argument_group("training input data args")
    train_data_args.add_argument(
        "-w",
        "--weight-column",
        help="Name of the column in the input CSV containing individual data weights",
    )
    train_data_args.add_argument(
        "--target-columns",
        nargs="+",
        help="Name of the columns containing target values (by default, uses all columns except the SMILES column and the ``ignore_columns``)",
    )
    train_data_args.add_argument(
        "--ignore-columns",
        nargs="+",
        help="Name of the columns to ignore when ``target_columns`` is not provided",
    )
    train_data_args.add_argument(
        "--no-cache",
        action="store_true",
        help="Turn off caching the featurized ``MolGraph`` objects at the beginning of training",
    )
    train_data_args.add_argument(
        "--splits-column",
        help="Name of the column in the input CSV file containing 'train', 'val', or 'test' for each row.",
    )
    # TODO: Add in v2.1
    # train_data_args.add_argument(
    #     "--spectra-phase-mask-path",
    #     help="Path to a file containing a phase mask array, used for excluding particular regions in spectra predictions.",
    # )

    train_args = parser.add_argument_group("training args")
    train_args.add_argument(
        "-t",
        "--task-type",
        default="regression",
        action=LookupAction(PredictorRegistry),
        help="Type of dataset (determines the default loss function used during training, defaults to ``regression``)",
    )
    train_args.add_argument(
        "-l",
        "--loss-function",
        action=LookupAction(LossFunctionRegistry),
        help="Loss function to use during training (will use the default loss function for the given task type if not specified)",
    )
    train_args.add_argument(
        "--v-kl",
        "--evidential-regularization",
        type=float,
        default=0.0,
        help="Specify the value used in regularization for evidential loss function. The default value recommended by Soleimany et al. (2021) is 0.2. However, the optimal value is dataset-dependent, so it is recommended that users test different values to find the best value for their model.",
    )

    train_args.add_argument(
        "--eps", type=float, default=1e-8, help="Evidential regularization epsilon"
    )
    train_args.add_argument(
        "--alpha", type=float, default=0.1, help="Target error bounds for quantile interval loss"
    )
    # TODO: Add in v2.1
    # train_args.add_argument(  # TODO: Is threshold the same thing as the spectra target floor? I'm not sure but combined them.
    #     "-T",
    #     "--threshold",
    #     "--spectra-target-floor",
    #     type=float,
    #     default=1e-8,
    #     help="spectral threshold limit. v1 help string: Values in targets for dataset type spectra are replaced with this value, intended to be a small positive number used to enforce positive values.",
    # )
    train_args.add_argument(
        "--metrics",
        "--metric",
        nargs="+",
        action=LookupAction(MetricRegistry),
        help="Specify the evaluation metrics. If unspecified, chemprop will use the following metrics for given dataset types: regression -> ``rmse``, classification -> ``roc``, multiclass -> ``ce`` ('cross entropy'), spectral -> ``sid``. If multiple metrics are provided, the 0-th one will be used for early stopping and checkpointing.",
    )
    train_args.add_argument(
        "--tracking-metric",
        default="val_loss",
        help="The metric to track for early stopping and checkpointing. Defaults to the criterion used during training.",
    )
    train_args.add_argument(
        "--show-individual-scores",
        action="store_true",
        help="Show all scores for individual targets, not just average, at the end.",
    )
    train_args.add_argument(
        "--task-weights",
        nargs="+",
        type=float,
        help="Weights to apply for whole tasks in the loss function",
    )
    train_args.add_argument(
        "--warmup-epochs",
        type=int,
        default=2,
        help="Number of epochs during which learning rate increases linearly from ``init_lr`` to ``max_lr`` (afterwards, learning rate decreases exponentially from ``max_lr`` to ``final_lr``)",
    )

    train_args.add_argument("--init-lr", type=float, default=1e-4, help="Initial learning rate.")
    train_args.add_argument("--max-lr", type=float, default=1e-3, help="Maximum learning rate.")
    train_args.add_argument("--final-lr", type=float, default=1e-4, help="Final learning rate.")
    train_args.add_argument("--epochs", type=int, default=50, help="Number of epochs to train over")
    train_args.add_argument(
        "--patience",
        type=int,
        default=None,
        help="Number of epochs to wait for improvement before early stopping",
    )
    train_args.add_argument(
        "--grad-clip",
        type=float,
        help="Passed directly to the lightning trainer which controls grad clipping (see the ``Trainer()`` docstring for details)",
    )
    train_args.add_argument(
        "--class-balance",
        action="store_true",
        help="Ensures each training batch contains an equal number of positive and negative samples.",
    )

    split_args = parser.add_argument_group("split args")
    split_args.add_argument(
        "--split",
        "--split-type",
        type=uppercase,
        default="RANDOM",
        choices=list(SplitType.keys()),
        help="Method of splitting the data into train/val/test (case insensitive)",
    )
    split_args.add_argument(
        "--split-sizes",
        type=float,
        nargs=3,
        default=[0.8, 0.1, 0.1],
        help="Split proportions for train/validation/test sets",
    )
    split_args.add_argument(
        "--split-key-molecule",
        type=int,
        default=0,
        help="Specify the index of the key molecule used for splitting when multiple molecules are present and a constrained split_type is used (e.g., ``scaffold_balanced`` or ``random_with_repeated_smiles``). Note that this index begins with zero for the first molecule.",
    )
    split_args.add_argument("--num-replicates", type=int, default=1, help="Number of replicates.")
    split_args.add_argument("-k", "--num-folds", help=_CV_REMOVAL_ERROR)
    split_args.add_argument(
        "--save-smiles-splits",
        action="store_true",
        help="Whether to store the SMILES in each train/val/test split",
    )
    split_args.add_argument(
        "--splits-file",
        type=Path,
        help="Path to a JSON file containing pre-defined splits for the input data, formatted as a list of dictionaries with keys ``train``, ``val``, and ``test`` and values as lists of indices or formatted strings (e.g. [0, 1, 2, 4] or '0-2,4')",
    )
    split_args.add_argument(
        "--data-seed",
        type=int,
        default=0,
        help="Specify the random seed to use when splitting data into train/val/test sets. When ``--num-replicates`` > 1, the first replicate uses this seed and all subsequent replicates add 1 to the seed (also used for shuffling data in ``build_dataloader`` when ``shuffle`` is True).",
    )

    parser.add_argument(
        "--pytorch-seed",
        type=int,
        default=None,
        help="Seed for PyTorch randomness (e.g., random initial weights)",
    )

    return parser


def process_train_args(args: Namespace) -> Namespace:
    if args.output_dir is None:
        args.output_dir = CHEMPROP_TRAIN_DIR / args.data_path.stem / NOW

    return args

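# For example (hypothetical path): `chemprop train -i data/tox.csv` with no -o flag
# resolves output_dir above to something like chemprop_training/tox/<timestamp> under
# the current working directory, per the --output-dir help text.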
481 |
+
|
482 |
+
def validate_train_args(args):
|
483 |
+
if args.config_path is None and args.data_path is None:
|
484 |
+
raise ArgumentError(argument=None, message="Data path must be provided for training.")
|
485 |
+
|
486 |
+
if args.num_folds is not None: # i.e. user-specified
|
487 |
+
raise ArgumentError(argument=None, message=_CV_REMOVAL_ERROR)
|
488 |
+
|
489 |
+
if args.data_path.suffix not in [".csv"]:
|
490 |
+
raise ArgumentError(
|
491 |
+
argument=None, message=f"Input data must be a CSV file. Got {args.data_path}"
|
492 |
+
)
|
493 |
+
|
494 |
+
if args.epochs != -1 and args.epochs <= args.warmup_epochs:
|
495 |
+
raise ArgumentError(
|
496 |
+
argument=None,
|
497 |
+
message=f"The number of epochs should be higher than the number of epochs during warmup. Got {args.epochs} epochs and {args.warmup_epochs} warmup epochs",
|
498 |
+
)
|
499 |
+
|
500 |
+
# TODO: model_frzn is deprecated and then remove in v2.2
|
501 |
+
if args.checkpoint is not None and args.model_frzn is not None:
|
502 |
+
raise ArgumentError(
|
503 |
+
argument=None,
|
504 |
+
message="`--checkpoint` and `--model-frzn` cannot be used at the same time.",
|
505 |
+
)
|
506 |
+
|
507 |
+
if "--model-frzn" in sys.argv:
|
508 |
+
logger.warning(
|
509 |
+
"`--model-frzn` is deprecated and will be removed in v2.2. "
|
510 |
+
"Please use `--checkpoint` with `--freeze-encoder` instead."
|
511 |
+
)
|
512 |
+
|
513 |
+
if args.freeze_encoder and args.checkpoint is None:
|
514 |
+
raise ArgumentError(
|
515 |
+
argument=None,
|
516 |
+
message="`--freeze-encoder` can only be used when `--checkpoint` is used.",
|
517 |
+
)
|
518 |
+
|
519 |
+
if args.frzn_ffn_layers > 0:
|
520 |
+
if args.checkpoint is None and args.model_frzn is None:
|
521 |
+
raise ArgumentError(
|
522 |
+
argument=None,
|
523 |
+
message="`--frzn-ffn-layers` can only be used when `--checkpoint` or `--model-frzn` (depreciated in v2.1) is used.",
|
524 |
+
)
|
525 |
+
if args.checkpoint is not None and not args.freeze_encoder:
|
526 |
+
raise ArgumentError(
|
527 |
+
argument=None,
|
528 |
+
message="To freeze the first `n` layers of the FFN via `--frzn-ffn-layers`. The message passing layer should also be frozen with `--freeze-encoder`.",
|
529 |
+
)
|
530 |
+
|
531 |
+
if args.class_balance and args.task_type != "classification":
|
532 |
+
raise ArgumentError(
|
533 |
+
argument=None, message="Class balance is only applicable for classification tasks."
|
534 |
+
)
|
535 |
+
|
536 |
+
valid_tracking_metrics = (
|
537 |
+
args.metrics or [PredictorRegistry[args.task_type]._T_default_metric.alias]
|
538 |
+
) + ["val_loss"]
|
539 |
+
if args.tracking_metric not in valid_tracking_metrics:
|
540 |
+
raise ArgumentError(
|
541 |
+
argument=None,
|
542 |
+
message=f"Tracking metric must be one of {','.join(valid_tracking_metrics)}. "
|
543 |
+
f"Got {args.tracking_metric}. Additional tracking metric options can be specified with "
|
544 |
+
"the `--metrics` flag.",
|
545 |
+
)
|
546 |
+
|
547 |
+
input_cols, target_cols = get_column_names(
|
548 |
+
args.data_path,
|
549 |
+
args.smiles_columns,
|
550 |
+
args.reaction_columns,
|
551 |
+
args.target_columns,
|
552 |
+
args.ignore_columns,
|
553 |
+
args.splits_column,
|
554 |
+
args.weight_column,
|
555 |
+
args.no_header_row,
|
556 |
+
)
|
557 |
+
|
558 |
+
args.input_columns = input_cols
|
559 |
+
args.target_columns = target_cols
|
560 |
+
|
561 |
+
return args
|
562 |
+
|
563 |
+
|
564 |
+
def normalize_inputs(train_dset, val_dset, args):
|
565 |
+
multicomponent = isinstance(train_dset, MulticomponentDataset)
|
566 |
+
num_components = train_dset.n_components if multicomponent else 1
|
567 |
+
|
568 |
+
X_d_transform = None
|
569 |
+
V_f_transforms = [nn.Identity()] * num_components
|
570 |
+
E_f_transforms = [nn.Identity()] * num_components
|
571 |
+
V_d_transforms = [None] * num_components
|
572 |
+
graph_transforms = []
|
573 |
+
|
574 |
+
d_xd = train_dset.d_xd
|
575 |
+
d_vf = train_dset.d_vf
|
576 |
+
d_ef = train_dset.d_ef
|
577 |
+
d_vd = train_dset.d_vd
|
578 |
+
|
579 |
+
if d_xd > 0 and not args.no_descriptor_scaling:
|
580 |
+
scaler = train_dset.normalize_inputs("X_d")
|
581 |
+
val_dset.normalize_inputs("X_d", scaler)
|
582 |
+
|
583 |
+
scaler = scaler if not isinstance(scaler, list) else scaler[0]
|
584 |
+
|
585 |
+
if scaler is not None:
|
586 |
+
logger.info(
|
587 |
+
f"Descriptors: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
|
588 |
+
)
|
589 |
+
X_d_transform = ScaleTransform.from_standard_scaler(scaler)
|
590 |
+
|
591 |
+
if d_vf > 0 and not args.no_atom_feature_scaling:
|
592 |
+
scaler = train_dset.normalize_inputs("V_f")
|
593 |
+
val_dset.normalize_inputs("V_f", scaler)
|
594 |
+
|
595 |
+
scalers = [scaler] if not isinstance(scaler, list) else scaler
|
596 |
+
|
597 |
+
for i, scaler in enumerate(scalers):
|
598 |
+
if scaler is None:
|
599 |
+
continue
|
600 |
+
|
601 |
+
logger.info(
|
602 |
+
f"Atom features for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
|
603 |
+
)
|
604 |
+
featurizer = (
|
605 |
+
train_dset.datasets[i].featurizer if multicomponent else train_dset.featurizer
|
606 |
+
)
|
607 |
+
V_f_transforms[i] = ScaleTransform.from_standard_scaler(
|
608 |
+
scaler, pad=featurizer.atom_fdim - featurizer.extra_atom_fdim
|
609 |
+
)
|
610 |
+
|
611 |
+
if d_ef > 0 and not args.no_bond_feature_scaling:
|
612 |
+
scaler = train_dset.normalize_inputs("E_f")
|
613 |
+
val_dset.normalize_inputs("E_f", scaler)
|
614 |
+
|
615 |
+
scalers = [scaler] if not isinstance(scaler, list) else scaler
|
616 |
+
|
617 |
+
for i, scaler in enumerate(scalers):
|
618 |
+
if scaler is None:
|
619 |
+
continue
|
620 |
+
|
621 |
+
logger.info(
|
622 |
+
f"Bond features for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
|
623 |
+
)
|
624 |
+
featurizer = (
|
625 |
+
train_dset.datasets[i].featurizer if multicomponent else train_dset.featurizer
|
626 |
+
)
|
627 |
+
E_f_transforms[i] = ScaleTransform.from_standard_scaler(
|
628 |
+
scaler, pad=featurizer.bond_fdim - featurizer.extra_bond_fdim
|
629 |
+
)
|
630 |
+
|
631 |
+
for V_f_transform, E_f_transform in zip(V_f_transforms, E_f_transforms):
|
632 |
+
graph_transforms.append(GraphTransform(V_f_transform, E_f_transform))
|
633 |
+
|
634 |
+
if d_vd > 0 and not args.no_atom_descriptor_scaling:
|
635 |
+
scaler = train_dset.normalize_inputs("V_d")
|
636 |
+
val_dset.normalize_inputs("V_d", scaler)
|
637 |
+
|
638 |
+
scalers = [scaler] if not isinstance(scaler, list) else scaler
|
639 |
+
|
640 |
+
for i, scaler in enumerate(scalers):
|
641 |
+
if scaler is None:
|
642 |
+
continue
|
643 |
+
|
644 |
+
logger.info(
|
645 |
+
f"Atom descriptors for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
|
646 |
+
)
|
647 |
+
V_d_transforms[i] = ScaleTransform.from_standard_scaler(scaler)
|
648 |
+
|
649 |
+
return X_d_transform, graph_transforms, V_d_transforms
|
650 |
+
|
651 |
+
|
652 |
+
def load_and_use_pretrained_model_scalers(model_path: Path, train_dset, val_dset) -> None:
|
653 |
+
if isinstance(train_dset, MulticomponentDataset):
|
654 |
+
_model = MulticomponentMPNN.load_from_file(model_path)
|
655 |
+
blocks = _model.message_passing.blocks
|
656 |
+
train_dsets = train_dset.datasets
|
657 |
+
val_dsets = val_dset.datasets
|
658 |
+
else:
|
659 |
+
_model = MPNN.load_from_file(model_path)
|
660 |
+
blocks = [_model.message_passing]
|
661 |
+
train_dsets = [train_dset]
|
662 |
+
val_dsets = [val_dset]
|
663 |
+
|
664 |
+
for i in range(len(blocks)):
|
665 |
+
if isinstance(_model.X_d_transform, ScaleTransform):
|
666 |
+
scaler = _model.X_d_transform.to_standard_scaler()
|
667 |
+
train_dsets[i].normalize_inputs("X_d", scaler)
|
668 |
+
val_dsets[i].normalize_inputs("X_d", scaler)
|
669 |
+
|
670 |
+
if isinstance(blocks[i].graph_transform, GraphTransform):
|
671 |
+
if isinstance(blocks[i].graph_transform.V_transform, ScaleTransform):
|
672 |
+
V_anti_pad = (
|
673 |
+
train_dsets[i].featurizer.atom_fdim - train_dsets[i].featurizer.extra_atom_fdim
|
674 |
+
)
|
675 |
+
scaler = blocks[i].graph_transform.V_transform.to_standard_scaler(
|
676 |
+
anti_pad=V_anti_pad
|
677 |
+
)
|
678 |
+
train_dsets[i].normalize_inputs("V_f", scaler)
|
679 |
+
val_dsets[i].normalize_inputs("V_f", scaler)
|
680 |
+
if isinstance(blocks[i].graph_transform.E_transform, ScaleTransform):
|
681 |
+
E_anti_pad = (
|
682 |
+
train_dsets[i].featurizer.bond_fdim - train_dsets[i].featurizer.extra_bond_fdim
|
683 |
+
)
|
684 |
+
scaler = blocks[i].graph_transform.E_transform.to_standard_scaler(
|
685 |
+
anti_pad=E_anti_pad
|
686 |
+
)
|
687 |
+
train_dsets[i].normalize_inputs("E_f", scaler)
|
688 |
+
val_dsets[i].normalize_inputs("E_f", scaler)
|
689 |
+
|
690 |
+
if isinstance(blocks[i].V_d_transform, ScaleTransform):
|
691 |
+
scaler = blocks[i].V_d_transform.to_standard_scaler()
|
692 |
+
train_dsets[i].normalize_inputs("V_d", scaler)
|
693 |
+
val_dsets[i].normalize_inputs("V_d", scaler)
|
694 |
+
|
695 |
+
if isinstance(_model.predictor.output_transform, UnscaleTransform):
|
696 |
+
scaler = _model.predictor.output_transform.to_standard_scaler()
|
697 |
+
train_dset.normalize_targets(scaler)
|
698 |
+
val_dset.normalize_targets(scaler)
|
699 |
+
|
700 |
+
|
701 |
+
def save_config(parser: ArgumentParser, args: Namespace, config_path: Path):
    config_args = deepcopy(args)
    for key, value in vars(config_args).items():
        if isinstance(value, Path):
            setattr(config_args, key, str(value))

    for key in ["atom_features_path", "atom_descriptors_path", "bond_features_path"]:
        if getattr(config_args, key) is not None:
            for index, path in getattr(config_args, key).items():
                getattr(config_args, key)[index] = str(path)

    parser.write_config_file(parsed_namespace=config_args, output_file_paths=[str(config_path)])


def save_smiles_splits(args: Namespace, output_dir, train_dset, val_dset, test_dset):
    match (args.smiles_columns, args.reaction_columns):
        case [_, None]:
            column_labels = deepcopy(args.smiles_columns)
        case [None, _]:
            column_labels = deepcopy(args.reaction_columns)
        case _:
            column_labels = deepcopy(args.smiles_columns)
            column_labels.extend(args.reaction_columns)

    train_smis = train_dset.names
    df_train = pd.DataFrame(train_smis, columns=column_labels)
    df_train.to_csv(output_dir / "train_smiles.csv", index=False)

    val_smis = val_dset.names
    df_val = pd.DataFrame(val_smis, columns=column_labels)
    df_val.to_csv(output_dir / "val_smiles.csv", index=False)

    if test_dset is not None:
        test_smis = test_dset.names
        df_test = pd.DataFrame(test_smis, columns=column_labels)
        df_test.to_csv(output_dir / "test_smiles.csv", index=False)


def build_splits(args, format_kwargs, featurization_kwargs):
    """build the train/val/test splits"""
    logger.info(f"Pulling data from file: {args.data_path}")
    all_data = build_data_from_files(
        args.data_path,
        p_descriptors=args.descriptors_path,
        p_atom_feats=args.atom_features_path,
        p_bond_feats=args.bond_features_path,
        p_atom_descs=args.atom_descriptors_path,
        **format_kwargs,
        **featurization_kwargs,
    )

    if args.splits_column is not None:
        df = pd.read_csv(
            args.data_path, header=None if args.no_header_row else "infer", index_col=False
        )
        grouped = df.groupby(df[args.splits_column].str.lower())
        train_indices = grouped.groups.get("train", pd.Index([])).tolist()
        val_indices = grouped.groups.get("val", pd.Index([])).tolist()
        test_indices = grouped.groups.get("test", pd.Index([])).tolist()
        train_indices, val_indices, test_indices = [train_indices], [val_indices], [test_indices]

    elif args.splits_file is not None:
        with open(args.splits_file, "rb") as json_file:
            split_idxss = json.load(json_file)
        train_indices = [parse_indices(d["train"]) for d in split_idxss]
        val_indices = [parse_indices(d["val"]) for d in split_idxss]
        test_indices = [parse_indices(d["test"]) for d in split_idxss]
        args.num_replicates = len(split_idxss)

    else:
        splitting_data = all_data[args.split_key_molecule]
        if isinstance(splitting_data[0], ReactionDatapoint):
            splitting_mols = [datapoint.rct for datapoint in splitting_data]
        else:
            splitting_mols = [datapoint.mol for datapoint in splitting_data]
        train_indices, val_indices, test_indices = make_split_indices(
            splitting_mols, args.split, args.split_sizes, args.data_seed, args.num_replicates
        )

    train_data, val_data, test_data = split_data_by_indices(
        all_data, train_indices, val_indices, test_indices
    )
    for i_split in range(len(train_data)):
        sizes = [len(train_data[i_split][0]), len(val_data[i_split][0]), len(test_data[i_split][0])]
        logger.info(f"train/val/test split_{i_split} sizes: {sizes}")

    return train_data, val_data, test_data

def summarize(
    target_cols: list[str], task_type: str, dataset: _MolGraphDatasetMixin
) -> tuple[list, list]:
    if task_type in [
        "regression",
        "regression-mve",
        "regression-evidential",
        "regression-quantile",
    ]:
        if isinstance(dataset, MulticomponentDataset):
            y = dataset.datasets[0].Y
        else:
            y = dataset.Y
        y_mean = np.nanmean(y, axis=0)
        y_std = np.nanstd(y, axis=0)
        y_median = np.nanmedian(y, axis=0)
        mean_dev_abs = np.abs(y - y_mean)
        num_targets = np.sum(~np.isnan(y), axis=0)
        frac_1_sigma = np.sum((mean_dev_abs < y_std), axis=0) / num_targets
        frac_2_sigma = np.sum((mean_dev_abs < 2 * y_std), axis=0) / num_targets

        column_headers = ["Statistic"] + [f"Value ({target_cols[i]})" for i in range(y.shape[1])]
        table_rows = [
            ["Num. smiles"] + [f"{len(y)}" for i in range(y.shape[1])],
            ["Num. targets"] + [f"{num_targets[i]}" for i in range(y.shape[1])],
            ["Num. NaN"] + [f"{len(y) - num_targets[i]}" for i in range(y.shape[1])],
            ["Mean"] + [f"{mean:0.3g}" for mean in y_mean],
            ["Std. dev."] + [f"{std:0.3g}" for std in y_std],
            ["Median"] + [f"{median:0.3g}" for median in y_median],
            ["% within 1 s.d."] + [f"{sigma:0.0%}" for sigma in frac_1_sigma],
            ["% within 2 s.d."] + [f"{sigma:0.0%}" for sigma in frac_2_sigma],
        ]
        return (column_headers, table_rows)
    elif task_type in [
        "classification",
        "classification-dirichlet",
        "multiclass",
        "multiclass-dirichlet",
    ]:
        if isinstance(dataset, MulticomponentDataset):
            y = dataset.datasets[0].Y
        else:
            y = dataset.Y

        mask = np.isnan(y)
        classes = np.sort(np.unique(y[~mask]))

        class_counts = np.stack([(classes[:, None] == y[:, i]).sum(1) for i in range(y.shape[1])])
        class_fracs = class_counts / y.shape[0]
        nan_count = np.nansum(mask, axis=0)
        nan_frac = nan_count / y.shape[0]

        column_headers = ["Class"] + [f"Count/Percent {target_cols[i]}" for i in range(y.shape[1])]

        table_rows = [
            [f"{k}"] + [f"{class_counts[j, i]}/{class_fracs[j, i]:0.0%}" for j in range(y.shape[1])]
            for i, k in enumerate(classes)
        ]

        nan_row = ["NaN"] + [f"{nan_count[i]}/{nan_frac[i]:0.0%}" for i in range(y.shape[1])]
        table_rows.append(nan_row)

        total_row = ["Total"] + [f"{y.shape[0]}/{100.00}%" for i in range(y.shape[1])]
        table_rows.append(total_row)

        return (column_headers, table_rows)
    else:
        raise ValueError(f"unsupported task type! Task type '{task_type}' was not recognized.")


def build_table(column_headers: list[str], table_rows: list[str], title: str | None = None) -> str:
    right_justified_columns = [
        Column(header=column_header, justify="right") for column_header in column_headers
    ]
    table = Table(*right_justified_columns, title=title)
    for row in table_rows:
        table.add_row(*row)

    console = Console(record=True, file=StringIO(), width=200)
    console.print(table)
    return console.export_text()


def build_datasets(args, train_data, val_data, test_data):
    """build the train/val/test datasets, where :attr:`test_data` may be None"""
    multicomponent = len(train_data) > 1
    if multicomponent:
        train_dsets = [
            make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
            for data in train_data
        ]
        val_dsets = [
            make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
            for data in val_data
        ]
        train_dset = MulticomponentDataset(train_dsets)
        val_dset = MulticomponentDataset(val_dsets)
        if len(test_data[0]) > 0:
            test_dsets = [
                make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
                for data in test_data
            ]
            test_dset = MulticomponentDataset(test_dsets)
        else:
            test_dset = None
    else:
        train_data = train_data[0]
        val_data = val_data[0]
        test_data = test_data[0]
        train_dset = make_dataset(train_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
        val_dset = make_dataset(val_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
        if len(test_data) > 0:
            test_dset = make_dataset(test_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
        else:
            test_dset = None
    if args.task_type != "spectral":
        for dataset, label in zip(
            [train_dset, val_dset, test_dset], ["Training", "Validation", "Test"]
        ):
            column_headers, table_rows = summarize(args.target_columns, args.task_type, dataset)
            output = build_table(column_headers, table_rows, f"Summary of {label} Data")
            logger.info("\n" + output)

    return train_dset, val_dset, test_dset

def build_model(
    args,
    train_dset: MolGraphDataset | MulticomponentDataset,
    output_transform: UnscaleTransform,
    input_transforms: tuple[ScaleTransform, list[GraphTransform], list[ScaleTransform]],
) -> MPNN:
    mp_cls = AtomMessagePassing if args.atom_messages else BondMessagePassing

    X_d_transform, graph_transforms, V_d_transforms = input_transforms
    if isinstance(train_dset, MulticomponentDataset):
        mp_blocks = [
            mp_cls(
                train_dset.datasets[i].featurizer.atom_fdim,
                train_dset.datasets[i].featurizer.bond_fdim,
                d_h=args.message_hidden_dim,
                d_vd=(
                    train_dset.datasets[i].d_vd
                    if isinstance(train_dset.datasets[i], MoleculeDataset)
                    else 0
                ),
                bias=args.message_bias,
                depth=args.depth,
                undirected=args.undirected,
                dropout=args.dropout,
                activation=args.activation,
                V_d_transform=V_d_transforms[i],
                graph_transform=graph_transforms[i],
            )
            for i in range(train_dset.n_components)
        ]
        if args.mpn_shared:
            if args.reaction_columns is not None and args.smiles_columns is not None:
                raise ArgumentError(
                    argument=None,
                    message="Cannot use shared MPNN with both molecule and reaction data.",
                )

        mp_block = MulticomponentMessagePassing(mp_blocks, train_dset.n_components, args.mpn_shared)
        # NOTE(degraff): this if/else block should be handled by the init of MulticomponentMessagePassing
        # if args.mpn_shared:
        #     mp_block = MulticomponentMessagePassing(mp_blocks[0], n_components, args.mpn_shared)
        # else:
        d_xd = train_dset.datasets[0].d_xd
        n_tasks = train_dset.datasets[0].Y.shape[1]
        mpnn_cls = MulticomponentMPNN
    else:
        mp_block = mp_cls(
            train_dset.featurizer.atom_fdim,
            train_dset.featurizer.bond_fdim,
            d_h=args.message_hidden_dim,
            d_vd=train_dset.d_vd if isinstance(train_dset, MoleculeDataset) else 0,
            bias=args.message_bias,
            depth=args.depth,
            undirected=args.undirected,
            dropout=args.dropout,
            activation=args.activation,
            V_d_transform=V_d_transforms[0],
            graph_transform=graph_transforms[0],
        )
        d_xd = train_dset.d_xd
        n_tasks = train_dset.Y.shape[1]
        mpnn_cls = MPNN

    agg = Factory.build(AggregationRegistry[args.aggregation], norm=args.aggregation_norm)
    predictor_cls = PredictorRegistry[args.task_type]
    if args.loss_function is not None:
        task_weights = torch.ones(n_tasks) if args.task_weights is None else args.task_weights
        criterion = Factory.build(
            LossFunctionRegistry[args.loss_function],
            task_weights=task_weights,
            v_kl=args.v_kl,
            # threshold=args.threshold, TODO: Add in v2.1
            eps=args.eps,
            alpha=args.alpha,
        )
    else:
        criterion = None
    if args.metrics is not None:
        metrics = [Factory.build(MetricRegistry[metric]) for metric in args.metrics]
    else:
        metrics = None

    predictor = Factory.build(
        predictor_cls,
        input_dim=mp_block.output_dim + d_xd,
        n_tasks=n_tasks,
        hidden_dim=args.ffn_hidden_dim,
        n_layers=args.ffn_num_layers,
        dropout=args.dropout,
        activation=args.activation,
        criterion=criterion,
        task_weights=args.task_weights,
        n_classes=args.multiclass_num_classes,
        output_transform=output_transform,
        # spectral_activation=args.spectral_activation, TODO: Add in v2.1
    )

    if args.loss_function is None:
        logger.info(
            f"No loss function was specified! Using class default: {predictor_cls._T_default_criterion}"
        )

    return mpnn_cls(
        mp_block,
        agg,
        predictor,
        args.batch_norm,
        metrics,
        args.warmup_epochs,
        args.init_lr,
        args.max_lr,
        args.final_lr,
        X_d_transform=X_d_transform,
    )

def train_model(
    args, train_loader, val_loader, test_loader, output_dir, output_transform, input_transforms
):
    if args.checkpoint is not None:
        model_paths = find_models(args.checkpoint)
        if args.ensemble_size != len(model_paths):
            logger.warning(
                f"The number of models in the ensemble for each data split is set to {len(model_paths)}."
            )
            args.ensemble_size = len(model_paths)

    for model_idx in range(args.ensemble_size):
        model_output_dir = output_dir / f"model_{model_idx}"
        model_output_dir.mkdir(exist_ok=True, parents=True)

        if args.pytorch_seed is None:
            seed = torch.seed()
            deterministic = False
        else:
            seed = args.pytorch_seed + model_idx
            deterministic = True

        torch.manual_seed(seed)

        if args.checkpoint or args.model_frzn is not None:
            mpnn_cls = (
                MulticomponentMPNN
                if isinstance(train_loader.dataset, MulticomponentDataset)
                else MPNN
            )
            model_path = model_paths[model_idx] if args.checkpoint else args.model_frzn
            model = mpnn_cls.load_from_file(model_path)

            if args.checkpoint:
                model.apply(
                    lambda m: setattr(m, "p", args.dropout)
                    if isinstance(m, torch.nn.Dropout)
                    else None
                )

            # TODO: model_frzn is deprecated and will be removed in v2.2
            if args.model_frzn or args.freeze_encoder:
                model.message_passing.apply(lambda module: module.requires_grad_(False))
                model.message_passing.eval()
                model.bn.apply(lambda module: module.requires_grad_(False))
                model.bn.eval()
                for idx in range(args.frzn_ffn_layers):
                    model.predictor.ffn[idx].requires_grad_(False)
                    model.predictor.ffn[idx + 1].eval()
        else:
            model = build_model(args, train_loader.dataset, output_transform, input_transforms)
        logger.info(model)

        try:
            trainer_logger = TensorBoardLogger(
                model_output_dir, "trainer_logs", default_hp_metric=False
            )
        except ModuleNotFoundError as e:
            logger.warning(
                f"Unable to import TensorBoardLogger, reverting to CSVLogger (original error: {e})."
            )
            trainer_logger = CSVLogger(model_output_dir, "trainer_logs")

        if args.tracking_metric == "val_loss":
            T_tracking_metric = model.criterion.__class__
            tracking_metric = args.tracking_metric
        else:
            T_tracking_metric = MetricRegistry[args.tracking_metric]
            tracking_metric = "val/" + args.tracking_metric

        monitor_mode = "max" if T_tracking_metric.higher_is_better else "min"
        logger.debug(f"Evaluation metric: '{T_tracking_metric.alias}', mode: '{monitor_mode}'")

        if args.remove_checkpoints:
            temp_dir = TemporaryDirectory()
            checkpoint_dir = Path(temp_dir.name)
        else:
            checkpoint_dir = model_output_dir

        checkpoint_filename = (
            f"best-epoch={{epoch}}-{tracking_metric.replace('/', '_')}="
            f"{{{tracking_metric}:.2f}}"
        )
        checkpointing = ModelCheckpoint(
            checkpoint_dir / "checkpoints",
            checkpoint_filename,
            tracking_metric,
            mode=monitor_mode,
            save_last=True,
            auto_insert_metric_name=False,
        )

        if args.epochs != -1:
            patience = args.patience if args.patience is not None else args.epochs
            early_stopping = EarlyStopping(tracking_metric, patience=patience, mode=monitor_mode)
            callbacks = [checkpointing, early_stopping]
        else:
            callbacks = [checkpointing]

        trainer = pl.Trainer(
            logger=trainer_logger,
            enable_progress_bar=True,
            accelerator=args.accelerator,
            devices=args.devices,
            max_epochs=args.epochs,
            callbacks=callbacks,
            gradient_clip_val=args.grad_clip,
            deterministic=deterministic,
        )
        trainer.fit(model, train_loader, val_loader)

        if test_loader is not None:
            if isinstance(trainer.strategy, DDPStrategy):
                torch.distributed.destroy_process_group()

                best_ckpt_path = trainer.checkpoint_callback.best_model_path
                trainer = pl.Trainer(
                    logger=trainer_logger,
                    enable_progress_bar=True,
                    accelerator=args.accelerator,
                    devices=1,
                )
                model = model.load_from_checkpoint(best_ckpt_path)
                predss = trainer.predict(model, dataloaders=test_loader)
            else:
                predss = trainer.predict(dataloaders=test_loader)

            preds = torch.concat(predss, 0)
            if model.predictor.n_targets > 1:
                preds = preds[..., 0]
            preds = preds.numpy()

            evaluate_and_save_predictions(
                preds, test_loader, model.metrics[:-1], model_output_dir, args
            )

        best_model_path = checkpointing.best_model_path
        model = model.__class__.load_from_checkpoint(best_model_path)
        p_model = model_output_dir / "best.pt"
        save_model(p_model, model, args.target_columns)
        logger.info(f"Best model saved to '{p_model}'")

        if args.remove_checkpoints:
            temp_dir.cleanup()

def evaluate_and_save_predictions(preds, test_loader, metrics, model_output_dir, args):
    if isinstance(test_loader.dataset, MulticomponentDataset):
        test_dset = test_loader.dataset.datasets[0]
    else:
        test_dset = test_loader.dataset
    targets = test_dset.Y
    mask = torch.from_numpy(np.isfinite(targets))
    targets = np.nan_to_num(targets, nan=0.0)
    weights = torch.ones(len(test_dset))
    lt_mask = torch.from_numpy(test_dset.lt_mask) if test_dset.lt_mask[0] is not None else None
    gt_mask = torch.from_numpy(test_dset.gt_mask) if test_dset.gt_mask[0] is not None else None

    individual_scores = dict()
    for metric in metrics:
        individual_scores[metric.alias] = []
        for i, col in enumerate(args.target_columns):
            if "multiclass" in args.task_type:
                preds_slice = torch.from_numpy(preds[:, i : i + 1, :])
                targets_slice = torch.from_numpy(targets[:, i : i + 1])
            else:
                preds_slice = torch.from_numpy(preds[:, i : i + 1])
                targets_slice = torch.from_numpy(targets[:, i : i + 1])
            preds_loss = metric(
                preds_slice,
                targets_slice,
                mask[:, i : i + 1],
                weights,
                lt_mask[:, i] if lt_mask is not None else None,
                gt_mask[:, i] if gt_mask is not None else None,
            )
            individual_scores[metric.alias].append(preds_loss)

    logger.info("Test Set results:")
    for metric in metrics:
        avg_loss = sum(individual_scores[metric.alias]) / len(individual_scores[metric.alias])
        logger.info(f"test/{metric.alias}: {avg_loss}")

    if args.show_individual_scores:
        logger.info("Entire Test Set individual results:")
        for metric in metrics:
            for i, col in enumerate(args.target_columns):
                logger.info(f"test/{col}/{metric.alias}: {individual_scores[metric.alias][i]}")

    names = test_loader.dataset.names
    if isinstance(test_loader.dataset, MulticomponentDataset):
        namess = list(zip(*names))
    else:
        namess = [names]

    columns = args.input_columns + args.target_columns
    if "multiclass" in args.task_type:
        columns = columns + [f"{col}_prob" for col in args.target_columns]
        formatted_probability_strings = np.apply_along_axis(
            lambda x: ",".join(map(str, x)), 2, preds
        )
        predicted_class_labels = preds.argmax(axis=-1)
        df_preds = pd.DataFrame(
            list(zip(*namess, *predicted_class_labels.T, *formatted_probability_strings.T)),
            columns=columns,
        )
    else:
        df_preds = pd.DataFrame(list(zip(*namess, *preds.T)), columns=columns)
    df_preds.to_csv(model_output_dir / "test_predictions.csv", index=False)

def main(args):
    format_kwargs = dict(
        no_header_row=args.no_header_row,
        smiles_cols=args.smiles_columns,
        rxn_cols=args.reaction_columns,
        target_cols=args.target_columns,
        ignore_cols=args.ignore_columns,
        splits_col=args.splits_column,
        weight_col=args.weight_column,
        bounded=args.loss_function is not None and "bounded" in args.loss_function,
    )

    featurization_kwargs = dict(
        molecule_featurizers=args.molecule_featurizers,
        keep_h=args.keep_h,
        add_h=args.add_h,
        ignore_chirality=args.ignore_chirality,
    )

    splits = build_splits(args, format_kwargs, featurization_kwargs)

    for replicate_idx, (train_data, val_data, test_data) in enumerate(zip(*splits)):
        if args.num_replicates == 1:
            output_dir = args.output_dir
        else:
            output_dir = args.output_dir / f"replicate_{replicate_idx}"

        output_dir.mkdir(exist_ok=True, parents=True)

        train_dset, val_dset, test_dset = build_datasets(args, train_data, val_data, test_data)

        if args.save_smiles_splits:
            save_smiles_splits(args, output_dir, train_dset, val_dset, test_dset)

        if args.checkpoint or args.model_frzn is not None:
            model_paths = find_models(args.checkpoint)
            if len(model_paths) > 1:
                logger.warning(
                    "Multiple checkpoint files were loaded, but only the scalers from "
                    f"{model_paths[0]} are used. It is assumed that all models provided have the "
                    "same data scalings, meaning they were trained on the same data."
                )
            model_path = model_paths[0] if args.checkpoint else args.model_frzn
            load_and_use_pretrained_model_scalers(model_path, train_dset, val_dset)
            input_transforms = (None, None, None)
            output_transform = None
        else:
            input_transforms = normalize_inputs(train_dset, val_dset, args)

            if "regression" in args.task_type:
                output_scaler = train_dset.normalize_targets()
                val_dset.normalize_targets(output_scaler)
                logger.info(
                    f"Train data: mean = {output_scaler.mean_} | std = {output_scaler.scale_}"
                )
                output_transform = UnscaleTransform.from_standard_scaler(output_scaler)
            else:
                output_transform = None

        if not args.no_cache:
            train_dset.cache = True
            val_dset.cache = True

        train_loader = build_dataloader(
            train_dset,
            args.batch_size,
            args.num_workers,
            class_balance=args.class_balance,
            seed=args.data_seed,
        )
        if args.class_balance:
            logger.debug(
                f"With `--class-balance`, effective train size = {len(train_loader.sampler)}"
            )
        val_loader = build_dataloader(val_dset, args.batch_size, args.num_workers, shuffle=False)
        if test_dset is not None:
            test_loader = build_dataloader(
                test_dset, args.batch_size, args.num_workers, shuffle=False
            )
        else:
            test_loader = None

        train_model(
            args,
            train_loader,
            val_loader,
            test_loader,
            output_dir,
            output_transform,
            input_transforms,
        )


if __name__ == "__main__":
    # TODO: update this old code or remove it.
    parser = ArgumentParser()
    parser = TrainSubcommand.add_args(parser)

    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
    args = parser.parse_args()
    TrainSubcommand.func(args)
chemprop-updated/chemprop/cli/utils/__init__.py
ADDED
@@ -0,0 +1,30 @@
from .actions import LookupAction
from .args import bounded
from .command import Subcommand
from .parsing import (
    build_data_from_files,
    get_column_names,
    make_datapoints,
    make_dataset,
    parse_indices,
)
from .utils import _pop_attr, _pop_attr_d, pop_attr

__all__ = [
    "bounded",
    "LookupAction",
    "Subcommand",
    "build_data_from_files",
    "make_datapoints",
    "make_dataset",
    "get_column_names",
    "parse_indices",
    "actions",
    "args",
    "command",
    "parsing",
    "utils",
    "pop_attr",
    "_pop_attr",
    "_pop_attr_d",
]
chemprop-updated/chemprop/cli/utils/actions.py
ADDED
@@ -0,0 +1,19 @@
from argparse import _StoreAction
from typing import Any, Mapping


def LookupAction(obj: Mapping[str, Any]):
    class LookupAction_(_StoreAction):
        def __init__(self, option_strings, dest, default=None, choices=None, **kwargs):
            if default not in obj.keys() and default is not None:
                raise ValueError(
                    f"Invalid value for arg 'default': '{default}'. "
                    f"Expected one of {tuple(obj.keys())}"
                )

            kwargs["choices"] = choices if choices is not None else obj.keys()
            kwargs["default"] = default

            super().__init__(option_strings, dest, **kwargs)

    return LookupAction_
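For readers unfamiliar with custom argparse actions, a minimal usage sketch (illustrative only; `AggRegistry` below is a stand-in mapping, not part of this diff): `LookupAction` manufactures a `_StoreAction` subclass whose `choices` and `default` are validated against the mapping's keys.

from argparse import ArgumentParser

AggRegistry = {"mean": object(), "sum": object(), "norm": object()}  # hypothetical registry

parser = ArgumentParser()
parser.add_argument("--aggregation", action=LookupAction(AggRegistry), default="mean")
args = parser.parse_args(["--aggregation", "sum"])  # accepted: "sum" is a registry key
# parser.parse_args(["--aggregation", "max"])       # rejected: not among the choices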
chemprop-updated/chemprop/cli/utils/args.py
ADDED
@@ -0,0 +1,34 @@
import functools

__all__ = ["bounded"]


def bounded(lo: float | None = None, hi: float | None = None):
    if lo is None and hi is None:
        raise ValueError("No bounds provided!")

    def decorator(f):
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            x = f(*args, **kwargs)

            if (lo is not None and hi is not None) and not lo <= x <= hi:
                raise ValueError(f"Parsed value outside of range [{lo}, {hi}]! got: {x}")
            if hi is not None and x > hi:
                raise ValueError(f"Parsed value above {hi}! got: {x}")
            if lo is not None and x < lo:
                raise ValueError(f"Parsed value below {lo}! got: {x}")

            return x

        return wrapper

    return decorator


def uppercase(x: str):
    return x.upper()


def lowercase(x: str):
    return x.lower()
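A quick sketch of how `bounded` composes with an argparse `type` callable (illustrative; `fraction` is a hypothetical example, not part of this diff):

@bounded(lo=0.0, hi=1.0)
def fraction(arg: str) -> float:
    # parse then range-check; the decorator raises on out-of-range values
    return float(arg)

fraction("0.25")   # returns 0.25
# fraction("1.5")  # raises ValueError: Parsed value above 1.0! got: 1.5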
chemprop-updated/chemprop/cli/utils/command.py
ADDED
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from argparse import ArgumentParser, Namespace, _SubParsersAction


class Subcommand(ABC):
    COMMAND: str
    HELP: str | None = None

    @classmethod
    def add(cls, subparsers: _SubParsersAction, parents) -> ArgumentParser:
        parser = subparsers.add_parser(cls.COMMAND, help=cls.HELP, parents=parents)
        cls.add_args(parser).set_defaults(func=cls.func)

        return parser

    @classmethod
    @abstractmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        pass

    @classmethod
    @abstractmethod
    def func(cls, args: Namespace):
        pass
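A hedged sketch of a concrete subcommand built on this ABC (the `EchoSubcommand` name and behavior are hypothetical, not part of this diff):

class EchoSubcommand(Subcommand):
    COMMAND = "echo"
    HELP = "print a message and exit"

    @classmethod
    def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
        parser.add_argument("message")
        return parser

    @classmethod
    def func(cls, args: Namespace):
        print(args.message)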
chemprop-updated/chemprop/cli/utils/parsing.py
ADDED
@@ -0,0 +1,457 @@
import logging
from os import PathLike
from typing import Literal, Mapping, Sequence

import numpy as np
import pandas as pd

from chemprop.data.datapoints import MoleculeDatapoint, ReactionDatapoint
from chemprop.data.datasets import MoleculeDataset, ReactionDataset
from chemprop.featurizers.atom import get_multi_hot_atom_featurizer
from chemprop.featurizers.bond import MultiHotBondFeaturizer, RIGRBondFeaturizer
from chemprop.featurizers.molecule import MoleculeFeaturizerRegistry
from chemprop.featurizers.molgraph import (
    CondensedGraphOfReactionFeaturizer,
    SimpleMoleculeMolGraphFeaturizer,
)
from chemprop.utils import make_mol

logger = logging.getLogger(__name__)


def parse_csv(
    path: PathLike,
    smiles_cols: Sequence[str] | None,
    rxn_cols: Sequence[str] | None,
    target_cols: Sequence[str] | None,
    ignore_cols: Sequence[str] | None,
    splits_col: str | None,
    weight_col: str | None,
    bounded: bool = False,
    no_header_row: bool = False,
):
    df = pd.read_csv(path, header=None if no_header_row else "infer", index_col=False)

    if smiles_cols is not None and rxn_cols is not None:
        smiss = df[smiles_cols].T.values.tolist()
        rxnss = df[rxn_cols].T.values.tolist()
        input_cols = [*smiles_cols, *rxn_cols]
    elif smiles_cols is not None and rxn_cols is None:
        smiss = df[smiles_cols].T.values.tolist()
        rxnss = None
        input_cols = smiles_cols
    elif smiles_cols is None and rxn_cols is not None:
        smiss = None
        rxnss = df[rxn_cols].T.values.tolist()
        input_cols = rxn_cols
    else:
        smiss = df.iloc[:, [0]].T.values.tolist()
        rxnss = None
        input_cols = [df.columns[0]]

    if target_cols is None:
        target_cols = list(
            column
            for column in df.columns
            if column
            not in set(  # if splits or weight is None, df.columns will never have None
                input_cols + (ignore_cols or []) + [splits_col] + [weight_col]
            )
        )

    Y = df[target_cols]
    weights = None if weight_col is None else df[weight_col].to_numpy(np.single)

    if bounded:
        Y = Y.astype(str)
        lt_mask = Y.applymap(lambda x: "<" in x).to_numpy()
        gt_mask = Y.applymap(lambda x: ">" in x).to_numpy()
        Y = Y.applymap(lambda x: x.strip("<").strip(">")).to_numpy(np.single)
    else:
        Y = Y.to_numpy(np.single)
        lt_mask = None
        gt_mask = None

    return smiss, rxnss, Y, weights, lt_mask, gt_mask


def get_column_names(
    path: PathLike,
    smiles_cols: Sequence[str] | None,
    rxn_cols: Sequence[str] | None,
    target_cols: Sequence[str] | None,
    ignore_cols: Sequence[str] | None,
    splits_col: str | None,
    weight_col: str | None,
    no_header_row: bool = False,
) -> tuple[list[str], list[str]]:
    df_cols = pd.read_csv(path, index_col=False, nrows=0).columns.tolist()

    if no_header_row:
        return ["SMILES"], ["pred_" + str(i) for i in range(len(df_cols) - 1)]

    input_cols = (smiles_cols or []) + (rxn_cols or [])

    if len(input_cols) == 0:
        input_cols = [df_cols[0]]

    if target_cols is None:
        target_cols = list(
            column
            for column in df_cols
            if column
            not in set(
                input_cols + (ignore_cols or []) + ([splits_col] or []) + ([weight_col] or [])
            )
        )

    return input_cols, target_cols


def make_datapoints(
    smiss: list[list[str]] | None,
    rxnss: list[list[str]] | None,
    Y: np.ndarray,
    weights: np.ndarray | None,
    lt_mask: np.ndarray | None,
    gt_mask: np.ndarray | None,
    X_d: np.ndarray | None,
    V_fss: list[list[np.ndarray] | list[None]] | None,
    E_fss: list[list[np.ndarray] | list[None]] | None,
    V_dss: list[list[np.ndarray] | list[None]] | None,
    molecule_featurizers: list[str] | None,
    keep_h: bool,
    add_h: bool,
    ignore_chirality: bool,
) -> tuple[list[list[MoleculeDatapoint]], list[list[ReactionDatapoint]]]:
    """Make the :class:`MoleculeDatapoint`\s and :class:`ReactionDatapoint`\s for a given
    dataset.

    Parameters
    ----------
    smiss : list[list[str]] | None
        a list of ``j`` lists of ``n`` SMILES strings, where ``j`` is the number of molecules per
        datapoint and ``n`` is the number of datapoints. If ``None``, the corresponding list of
        :class:`MoleculeDatapoint`\s will be empty.
    rxnss : list[list[str]] | None
        a list of ``k`` lists of ``n`` reaction SMILES strings, where ``k`` is the number of
        reactions per datapoint. If ``None``, the corresponding list of :class:`ReactionDatapoint`\s
        will be empty.
    Y : np.ndarray
        the target values of shape ``n x m``, where ``m`` is the number of targets
    weights : np.ndarray | None
        the weights of the datapoints to use in the loss function of shape ``n x m``. If ``None``,
        the weights all default to 1.
    lt_mask : np.ndarray | None
        a boolean mask of shape ``n x m`` indicating whether the targets are less than inequality
        targets. If ``None``, ``lt_mask`` for all datapoints will be ``None``.
    gt_mask : np.ndarray | None
        a boolean mask of shape ``n x m`` indicating whether the targets are greater than inequality
        targets. If ``None``, ``gt_mask`` for all datapoints will be ``None``.
    X_d : np.ndarray | None
        the extra descriptors of shape ``n x p``, where ``p`` is the number of extra descriptors. If
        ``None``, ``x_d`` for all datapoints will be ``None``.
    V_fss : list[list[np.ndarray] | list[None]] | None
        a list of ``j`` lists of ``n`` np.ndarrays each of shape ``v_jn x q_j``, where ``v_jn`` is
        the number of atoms in the j-th molecule of the n-th datapoint and ``q_j`` is the number of
        extra atom features used for the j-th molecules. Any of the ``j`` lists can be a list of
        None values if the corresponding component does not use extra atom features. If ``None``,
        ``V_f`` for all datapoints will be ``None``.
    E_fss : list[list[np.ndarray] | list[None]] | None
        a list of ``j`` lists of ``n`` np.ndarrays each of shape ``e_jn x r_j``, where ``e_jn`` is
        the number of bonds in the j-th molecule of the n-th datapoint and ``r_j`` is the number of
        extra bond features used for the j-th molecules. Any of the ``j`` lists can be a list of
        None values if the corresponding component does not use extra bond features. If ``None``,
        ``E_f`` for all datapoints will be ``None``.
    V_dss : list[list[np.ndarray] | list[None]] | None
        a list of ``j`` lists of ``n`` np.ndarrays each of shape ``v_jn x s_j``, where ``s_j`` is
        the number of extra atom descriptors used for the j-th molecules. Any of the ``j`` lists can
        be a list of None values if the corresponding component does not use extra atom features. If
        ``None``, ``V_d`` for all datapoints will be ``None``.
    molecule_featurizers : list[str] | None
        a list of molecule featurizer names to generate additional molecule features to use as extra
        descriptors. If there are multiple molecules per datapoint, the featurizers will be applied
        to each molecule and concatenated. Note that a :code:`ReactionDatapoint` has two
        RDKit :class:`~rdkit.Chem.Mol` objects, reactant(s) and product(s). Each
        ``molecule_featurizer`` will be applied to both of these objects.
    keep_h : bool
        whether to keep hydrogen atoms
    add_h : bool
        whether to add hydrogen atoms
    ignore_chirality : bool
        whether to ignore chirality information

    Returns
    -------
    list[list[MoleculeDatapoint]]
        a list of ``j`` lists of ``n`` :class:`MoleculeDatapoint`\s
    list[list[ReactionDatapoint]]
        a list of ``k`` lists of ``n`` :class:`ReactionDatapoint`\s
    .. note::
        either ``j`` or ``k`` may be 0, in which case the corresponding list will be empty.

    Raises
    ------
    ValueError
        if both ``smiss`` and ``rxnss`` are ``None``.
        if ``smiss`` and ``rxnss`` are both given and have different lengths.
    """
    if smiss is None and rxnss is None:
        raise ValueError("args 'smiss' and 'rxnss' were both `None`!")
    elif rxnss is None:
        N = len(smiss[0])
        rxnss = []
    elif smiss is None:
        N = len(rxnss[0])
        smiss = []
    elif len(smiss[0]) != len(rxnss[0]):
        raise ValueError(
            f"args 'smiss' and 'rxnss' must have same length! got {len(smiss[0])} and {len(rxnss[0])}"
        )
    else:
        N = len(smiss[0])

    if len(smiss) > 0:
        molss = [[make_mol(smi, keep_h, add_h, ignore_chirality) for smi in smis] for smis in smiss]
    if len(rxnss) > 0:
        rctss = [
            [
                make_mol(
                    f"{rct_smi}.{agt_smi}" if agt_smi else rct_smi, keep_h, add_h, ignore_chirality
                )
                for rct_smi, agt_smi, _ in (rxn.split(">") for rxn in rxns)
            ]
            for rxns in rxnss
        ]
        pdtss = [
            [
                make_mol(pdt_smi, keep_h, add_h, ignore_chirality)
                for _, _, pdt_smi in (rxn.split(">") for rxn in rxns)
            ]
            for rxns in rxnss
        ]

    weights = np.ones(N, dtype=np.single) if weights is None else weights
    gt_mask = [None] * N if gt_mask is None else gt_mask
    lt_mask = [None] * N if lt_mask is None else lt_mask

    n_mols = len(smiss) if smiss else 0
    V_fss = [[None] * N] * n_mols if V_fss is None else V_fss
    E_fss = [[None] * N] * n_mols if E_fss is None else E_fss
    V_dss = [[None] * N] * n_mols if V_dss is None else V_dss

    if X_d is None and molecule_featurizers is None:
        X_d = [None] * N
    elif molecule_featurizers is None:
        pass
    else:
        molecule_featurizers = [MoleculeFeaturizerRegistry[mf]() for mf in molecule_featurizers]

        if len(smiss) > 0:
            mol_descriptors = np.hstack(
                [
                    np.vstack([np.hstack([mf(mol) for mf in molecule_featurizers]) for mol in mols])
                    for mols in molss
                ]
            )
            if X_d is None:
                X_d = mol_descriptors
            else:
                X_d = np.hstack([X_d, mol_descriptors])

        if len(rxnss) > 0:
            rct_pdt_descriptors = np.hstack(
                [
                    np.vstack(
                        [
                            np.hstack(
                                [mf(mol) for mf in molecule_featurizers for mol in (rct, pdt)]
                            )
                            for rct, pdt in zip(rcts, pdts)
                        ]
                    )
                    for rcts, pdts in zip(rctss, pdtss)
                ]
            )
            if X_d is None:
                X_d = rct_pdt_descriptors
            else:
                X_d = np.hstack([X_d, rct_pdt_descriptors])

    mol_data = [
        [
            MoleculeDatapoint(
                mol=molss[mol_idx][i],
                name=smis[i],
                y=Y[i],
                weight=weights[i],
                gt_mask=gt_mask[i],
                lt_mask=lt_mask[i],
                x_d=X_d[i],
                x_phase=None,
                V_f=V_fss[mol_idx][i],
                E_f=E_fss[mol_idx][i],
                V_d=V_dss[mol_idx][i],
            )
            for i in range(N)
        ]
        for mol_idx, smis in enumerate(smiss)
    ]
    rxn_data = [
        [
            ReactionDatapoint(
                rct=rctss[rxn_idx][i],
                pdt=pdtss[rxn_idx][i],
                name=rxns[i],
                y=Y[i],
                weight=weights[i],
                gt_mask=gt_mask[i],
                lt_mask=lt_mask[i],
                x_d=X_d[i],
                x_phase=None,
            )
            for i in range(N)
        ]
        for rxn_idx, rxns in enumerate(rxnss)
    ]

    return mol_data, rxn_data


def build_data_from_files(
    p_data: PathLike,
    no_header_row: bool,
    smiles_cols: Sequence[str] | None,
    rxn_cols: Sequence[str] | None,
    target_cols: Sequence[str] | None,
    ignore_cols: Sequence[str] | None,
    splits_col: str | None,
    weight_col: str | None,
    bounded: bool,
    p_descriptors: PathLike,
    p_atom_feats: dict[int, PathLike],
    p_bond_feats: dict[int, PathLike],
    p_atom_descs: dict[int, PathLike],
    **featurization_kwargs: Mapping,
) -> list[list[MoleculeDatapoint] | list[ReactionDatapoint]]:
    smiss, rxnss, Y, weights, lt_mask, gt_mask = parse_csv(
        p_data,
        smiles_cols,
        rxn_cols,
        target_cols,
        ignore_cols,
        splits_col,
        weight_col,
        bounded,
        no_header_row,
    )
    n_molecules = len(smiss) if smiss is not None else 0
    n_datapoints = len(Y)

    X_ds = load_input_feats_and_descs(p_descriptors, None, None, feat_desc="X_d")
    V_fss = load_input_feats_and_descs(p_atom_feats, n_molecules, n_datapoints, feat_desc="V_f")
    E_fss = load_input_feats_and_descs(p_bond_feats, n_molecules, n_datapoints, feat_desc="E_f")
    V_dss = load_input_feats_and_descs(p_atom_descs, n_molecules, n_datapoints, feat_desc="V_d")

    mol_data, rxn_data = make_datapoints(
        smiss,
        rxnss,
        Y,
        weights,
        lt_mask,
        gt_mask,
        X_ds,
        V_fss,
        E_fss,
        V_dss,
        **featurization_kwargs,
    )

    return mol_data + rxn_data


def load_input_feats_and_descs(
    paths: dict[int, PathLike] | PathLike,
    n_molecules: int | None,
    n_datapoints: int | None,
    feat_desc: str,
):
    if paths is None:
        return None

    match feat_desc:
        case "X_d":
            path = paths
            loaded_feature = np.load(path)
            features = loaded_feature["arr_0"]

        case _:
            for index in paths:
                if index >= n_molecules:
                    raise ValueError(
                        f"For {n_molecules} molecules, atom/bond features/descriptors can only be "
                        f"specified for indices 0-{n_molecules - 1}! Got index {index}."
                    )

            features = []
            for idx in range(n_molecules):
                path = paths.get(idx, None)

                if path is not None:
                    loaded_feature = np.load(path)
                    loaded_feature = [
                        loaded_feature[f"arr_{i}"] for i in range(len(loaded_feature))
                    ]
                else:
                    loaded_feature = [None] * n_datapoints

                features.append(loaded_feature)
    return features


def make_dataset(
    data: Sequence[MoleculeDatapoint] | Sequence[ReactionDatapoint],
    reaction_mode: str,
    multi_hot_atom_featurizer_mode: Literal["V1", "V2", "ORGANIC", "RIGR"] = "V2",
) -> MoleculeDataset | ReactionDataset:
    atom_featurizer = get_multi_hot_atom_featurizer(multi_hot_atom_featurizer_mode)
    match multi_hot_atom_featurizer_mode:
        case "RIGR":
            bond_featurizer = RIGRBondFeaturizer()
        case "V1" | "V2" | "ORGANIC":
            bond_featurizer = MultiHotBondFeaturizer()
        case _:
            raise TypeError(
                f"Unsupported atom featurizer mode '{multi_hot_atom_featurizer_mode=}'!"
            )

    if isinstance(data[0], MoleculeDatapoint):
        extra_atom_fdim = data[0].V_f.shape[1] if data[0].V_f is not None else 0
        extra_bond_fdim = data[0].E_f.shape[1] if data[0].E_f is not None else 0
        featurizer = SimpleMoleculeMolGraphFeaturizer(
            atom_featurizer=atom_featurizer,
            bond_featurizer=bond_featurizer,
            extra_atom_fdim=extra_atom_fdim,
            extra_bond_fdim=extra_bond_fdim,
        )
        return MoleculeDataset(data, featurizer)

    featurizer = CondensedGraphOfReactionFeaturizer(
        mode_=reaction_mode, atom_featurizer=atom_featurizer
    )

    return ReactionDataset(data, featurizer)


def parse_indices(idxs):
    """Parses a string of indices into a list of integers. e.g. '0,1,2-4' -> [0, 1, 2, 3, 4]"""
    if isinstance(idxs, str):
        indices = []
        for idx in idxs.split(","):
            if "-" in idx:
                start, end = map(int, idx.split("-"))
                indices.extend(range(start, end + 1))
            else:
                indices.append(int(idx))
        return indices
    return idxs
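A short sketch of `parse_indices` behavior under the docstring's own example (the assertions are added here for illustration only):

assert parse_indices("0,1,2-4") == [0, 1, 2, 3, 4]  # dash ranges are inclusive
assert parse_indices([5, 6]) == [5, 6]              # non-string inputs pass through unchanged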
chemprop-updated/chemprop/cli/utils/utils.py
ADDED
@@ -0,0 +1,31 @@
from typing import Any

__all__ = ["pop_attr"]


def pop_attr(o: object, attr: str, *args) -> Any | None:
    """like ``pop()`` but for attribute maps"""
    match len(args):
        case 0:
            return _pop_attr(o, attr)
        case 1:
            return _pop_attr_d(o, attr, args[0])
        case _:
            raise TypeError(f"Expected at most 2 arguments! got: {len(args)}")


def _pop_attr(o: object, attr: str) -> Any:
    val = getattr(o, attr)
    delattr(o, attr)

    return val


def _pop_attr_d(o: object, attr: str, default: Any | None = None) -> Any | None:
    try:
        val = getattr(o, attr)
        delattr(o, attr)
    except AttributeError:
        val = default

    return val
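A brief sketch of `pop_attr`'s dict.pop-like contract (illustrative only):

from types import SimpleNamespace

o = SimpleNamespace(x=1)
assert pop_attr(o, "x") == 1            # returns the value and deletes the attribute
assert pop_attr(o, "x", None) is None   # with a default, a missing attribute is not an error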
chemprop-updated/chemprop/conf.py
ADDED
@@ -0,0 +1,6 @@
"""Global configuration variables for chemprop"""

from chemprop.featurizers.molgraph.molecule import SimpleMoleculeMolGraphFeaturizer

DEFAULT_ATOM_FDIM, DEFAULT_BOND_FDIM = SimpleMoleculeMolGraphFeaturizer().shape
DEFAULT_HIDDEN_DIM = 300
chemprop-updated/chemprop/data/__init__.py
ADDED
@@ -0,0 +1,41 @@
from .collate import (
    BatchMolGraph,
    MulticomponentTrainingBatch,
    TrainingBatch,
    collate_batch,
    collate_multicomponent,
)
from .dataloader import build_dataloader
from .datapoints import MoleculeDatapoint, ReactionDatapoint
from .datasets import (
    Datum,
    MoleculeDataset,
    MolGraphDataset,
    MulticomponentDataset,
    ReactionDataset,
)
from .molgraph import MolGraph
from .samplers import ClassBalanceSampler, SeededSampler
from .splitting import SplitType, make_split_indices, split_data_by_indices

__all__ = [
    "BatchMolGraph",
    "TrainingBatch",
    "collate_batch",
    "MulticomponentTrainingBatch",
    "collate_multicomponent",
    "build_dataloader",
    "MoleculeDatapoint",
    "ReactionDatapoint",
    "MoleculeDataset",
    "ReactionDataset",
    "Datum",
    "MulticomponentDataset",
    "MolGraphDataset",
    "MolGraph",
    "ClassBalanceSampler",
    "SeededSampler",
    "SplitType",
    "make_split_indices",
    "split_data_by_indices",
]
chemprop-updated/chemprop/data/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (1.43 kB)
chemprop-updated/chemprop/data/__pycache__/data.cpython-37.pyc
ADDED
Binary file (43.5 kB)
chemprop-updated/chemprop/data/__pycache__/scaffold.cpython-37.pyc
ADDED
Binary file (7.37 kB)
chemprop-updated/chemprop/data/__pycache__/scaler.cpython-37.pyc
ADDED
Binary file (5.95 kB)
chemprop-updated/chemprop/data/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (35.1 kB)
chemprop-updated/chemprop/data/collate.py
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import InitVar, dataclass, field
|
2 |
+
from typing import Iterable, NamedTuple, Sequence
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
from torch import Tensor
|
7 |
+
|
8 |
+
from chemprop.data.datasets import Datum
|
9 |
+
from chemprop.data.molgraph import MolGraph
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass(repr=False, eq=False, slots=True)
|
13 |
+
class BatchMolGraph:
|
14 |
+
"""A :class:`BatchMolGraph` represents a batch of individual :class:`MolGraph`\s.
|
15 |
+
|
16 |
+
It has all the attributes of a ``MolGraph`` with the addition of the ``batch`` attribute. This
|
17 |
+
    class is intended for use with data loading, so it uses :obj:`~torch.Tensor`\s to store data
    """

    mgs: InitVar[Sequence[MolGraph]]
    """A list of individual :class:`MolGraph`\s to be batched together"""
    V: Tensor = field(init=False)
    """the atom feature matrix"""
    E: Tensor = field(init=False)
    """the bond feature matrix"""
    edge_index: Tensor = field(init=False)
    """a tensor of shape ``2 x E`` containing the edges of the graph in COO format"""
    rev_edge_index: Tensor = field(init=False)
    """a tensor of shape ``E`` that maps from an edge index to the index of the source of the
    reverse edge in the ``edge_index`` attribute."""
    batch: Tensor = field(init=False)
    """the index of the parent :class:`MolGraph` in the batched graph"""
    names: list[str] = field(init=False)  # the SMILES strings for the batch

    __size: int = field(init=False)

    def __post_init__(self, mgs: Sequence[MolGraph]):
        self.__size = len(mgs)

        Vs = []
        Es = []
        edge_indexes = []
        rev_edge_indexes = []
        batch_indexes = []
        self.names = []

        num_nodes = 0
        num_edges = 0
        for i, mg in enumerate(mgs):
            Vs.append(mg.V)
            Es.append(mg.E)
            edge_indexes.append(mg.edge_index + num_nodes)
            rev_edge_indexes.append(mg.rev_edge_index + num_edges)
            batch_indexes.append([i] * len(mg.V))
            self.names.append(mg.name)

            num_nodes += mg.V.shape[0]
            num_edges += mg.edge_index.shape[1]

        self.V = torch.from_numpy(np.concatenate(Vs)).float()
        self.E = torch.from_numpy(np.concatenate(Es)).float()
        self.edge_index = torch.from_numpy(np.hstack(edge_indexes)).long()
        self.rev_edge_index = torch.from_numpy(np.concatenate(rev_edge_indexes)).long()
        self.batch = torch.tensor(np.concatenate(batch_indexes)).long()

    def __len__(self) -> int:
        """the number of individual :class:`MolGraph`\s in this batch"""
        return self.__size

    def to(self, device: str | torch.device):
        self.V = self.V.to(device)
        self.E = self.E.to(device)
        self.edge_index = self.edge_index.to(device)
        self.rev_edge_index = self.rev_edge_index.to(device)
        self.batch = self.batch.to(device)


class TrainingBatch(NamedTuple):
    bmg: BatchMolGraph
    V_d: Tensor | None
    X_d: Tensor | None
    Y: Tensor | None
    w: Tensor
    lt_mask: Tensor | None
    gt_mask: Tensor | None


def collate_batch(batch: Iterable[Datum]) -> TrainingBatch:
    mgs, V_ds, x_ds, ys, weights, lt_masks, gt_masks = zip(*batch)

    return TrainingBatch(
        BatchMolGraph(mgs),
        None if V_ds[0] is None else torch.from_numpy(np.concatenate(V_ds)).float(),
        None if x_ds[0] is None else torch.from_numpy(np.array(x_ds)).float(),
        None if ys[0] is None else torch.from_numpy(np.array(ys)).float(),
        torch.tensor(weights, dtype=torch.float).unsqueeze(1),
        None if lt_masks[0] is None else torch.from_numpy(np.array(lt_masks)),
        None if gt_masks[0] is None else torch.from_numpy(np.array(gt_masks)),
    )


class MulticomponentTrainingBatch(NamedTuple):
    bmgs: list[BatchMolGraph]
    V_ds: list[Tensor | None]
    X_d: Tensor | None
    Y: Tensor | None
    w: Tensor
    lt_mask: Tensor | None
    gt_mask: Tensor | None


def collate_multicomponent(batches: Iterable[Iterable[Datum]]) -> MulticomponentTrainingBatch:
    tbs = [collate_batch(batch) for batch in zip(*batches)]

    return MulticomponentTrainingBatch(
        [tb.bmg for tb in tbs],
        [tb.V_d for tb in tbs],
        tbs[0].X_d,
        tbs[0].Y,
        tbs[0].w,
        tbs[0].lt_mask,
        tbs[0].gt_mask,
    )
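As a quick orientation, the following is a minimal sketch of how `collate_batch` stitches individual `Datum` tuples into one `TrainingBatch`; the SMILES strings and target values are illustrative placeholders, not data from this repository.

import numpy as np

from chemprop.data.collate import collate_batch
from chemprop.data.datapoints import MoleculeDatapoint
from chemprop.data.datasets import MoleculeDataset

smis = ["CCO", "c1ccccc1", "CC(=O)O"]
ys = np.array([[0.5], [1.2], [-0.3]])

dset = MoleculeDataset([MoleculeDatapoint.from_smi(s, y) for s, y in zip(smis, ys)])

# collate three Datum tuples into a single TrainingBatch
batch = collate_batch([dset[i] for i in range(len(dset))])

# the atom features of all three molecules are stacked into one matrix, and
# `batch.bmg.batch` records which parent molecule each atom came from
print(batch.bmg.V.shape, batch.bmg.batch.tolist())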
chemprop-updated/chemprop/data/dataloader.py
ADDED
@@ -0,0 +1,71 @@
import logging

from torch.utils.data import DataLoader

from chemprop.data.collate import collate_batch, collate_multicomponent
from chemprop.data.datasets import MoleculeDataset, MulticomponentDataset, ReactionDataset
from chemprop.data.samplers import ClassBalanceSampler, SeededSampler

logger = logging.getLogger(__name__)


def build_dataloader(
    dataset: MoleculeDataset | ReactionDataset | MulticomponentDataset,
    batch_size: int = 64,
    num_workers: int = 0,
    class_balance: bool = False,
    seed: int | None = None,
    shuffle: bool = True,
    **kwargs,
):
    """Return a :obj:`~torch.utils.data.DataLoader` for :class:`MolGraphDataset`\s

    Parameters
    ----------
    dataset : MoleculeDataset | ReactionDataset | MulticomponentDataset
        The dataset containing the molecules or reactions to load.
    batch_size : int, default=64
        the batch size to load.
    num_workers : int, default=0
        the number of workers used to build batches.
    class_balance : bool, default=False
        Whether to perform class balancing (i.e., use an equal number of positive and negative
        molecules). Class balance is only available for single task classification datasets. Set
        shuffle to True in order to get a random subset of the larger class.
    seed : int, default=None
        the random seed to use for shuffling (only used when `shuffle` is `True`).
    shuffle : bool, default=True
        whether to shuffle the data during sampling.
    """

    if class_balance:
        sampler = ClassBalanceSampler(dataset.Y, seed, shuffle)
    elif shuffle and seed is not None:
        sampler = SeededSampler(len(dataset), seed)
    else:
        sampler = None

    if isinstance(dataset, MulticomponentDataset):
        collate_fn = collate_multicomponent
    else:
        collate_fn = collate_batch

    if len(dataset) % batch_size == 1:
        logger.warning(
            f"Dropping last batch of size 1 to avoid issues with batch normalization \
(dataset size = {len(dataset)}, batch_size = {batch_size})"
        )
        drop_last = True
    else:
        drop_last = False

    return DataLoader(
        dataset,
        batch_size,
        sampler is None and shuffle,
        sampler,
        num_workers=num_workers,
        collate_fn=collate_fn,
        drop_last=drop_last,
        **kwargs,
    )
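A minimal sketch of using `build_dataloader` over a small regression dataset; the SMILES strings and random targets below are placeholders.

import numpy as np

from chemprop.data.dataloader import build_dataloader
from chemprop.data.datapoints import MoleculeDatapoint
from chemprop.data.datasets import MoleculeDataset

smis = ["CCO", "CCN", "CCC", "CCCl"]
ys = np.random.rand(4, 1)

dset = MoleculeDataset([MoleculeDatapoint.from_smi(s, y) for s, y in zip(smis, ys)])

# a seeded, shuffled loader; each iteration yields a TrainingBatch
loader = build_dataloader(dset, batch_size=2, shuffle=True, seed=0)
for batch in loader:
    print(batch.bmg.V.shape, batch.Y.shape)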
chemprop-updated/chemprop/data/datapoints.py
ADDED
@@ -0,0 +1,150 @@
from __future__ import annotations

from dataclasses import dataclass

import numpy as np
from rdkit.Chem import AllChem as Chem

from chemprop.featurizers import Featurizer
from chemprop.utils import make_mol

MoleculeFeaturizer = Featurizer[Chem.Mol, np.ndarray]


@dataclass(slots=True)
class _DatapointMixin:
    """A mixin class for molecule-, reaction-, and multicomponent-type data"""

    y: np.ndarray | None = None
    """the targets for the molecule with unknown targets indicated by `nan`s"""
    weight: float = 1.0
    """the weight of this datapoint for the loss calculation."""
    gt_mask: np.ndarray | None = None
    """Indicates whether the targets are an inequality regression target of the form `<x`"""
    lt_mask: np.ndarray | None = None
    """Indicates whether the targets are an inequality regression target of the form `>x`"""
    x_d: np.ndarray | None = None
    """A vector of length ``d_f`` containing additional features (e.g., Morgan fingerprint) that
    will be concatenated to the global representation *after* aggregation"""
    x_phase: list[float] = None
    """A one-hot vector indicating the phase of the data, as used in spectra data."""
    name: str | None = None
    """A string identifier for the datapoint."""

    def __post_init__(self):
        NAN_TOKEN = 0
        if self.x_d is not None:
            self.x_d[np.isnan(self.x_d)] = NAN_TOKEN

    @property
    def t(self) -> int | None:
        return len(self.y) if self.y is not None else None


@dataclass
class _MoleculeDatapointMixin:
    mol: Chem.Mol
    """the molecule associated with this datapoint"""

    @classmethod
    def from_smi(
        cls,
        smi: str,
        *args,
        keep_h: bool = False,
        add_h: bool = False,
        ignore_chirality: bool = False,
        **kwargs,
    ) -> _MoleculeDatapointMixin:
        mol = make_mol(smi, keep_h, add_h, ignore_chirality)

        kwargs["name"] = smi if "name" not in kwargs else kwargs["name"]

        return cls(mol, *args, **kwargs)


@dataclass
class MoleculeDatapoint(_DatapointMixin, _MoleculeDatapointMixin):
    """A :class:`MoleculeDatapoint` contains a single molecule and its associated features and targets."""

    V_f: np.ndarray | None = None
    """a numpy array of shape ``V x d_vf``, where ``V`` is the number of atoms in the molecule, and
    ``d_vf`` is the number of additional features that will be concatenated to atom-level features
    *before* message passing"""
    E_f: np.ndarray | None = None
    """A numpy array of shape ``E x d_ef``, where ``E`` is the number of bonds in the molecule, and
    ``d_ef`` is the number of additional features that will be concatenated to bond-level features
    *before* message passing"""
    V_d: np.ndarray | None = None
    """A numpy array of shape ``V x d_vd``, where ``V`` is the number of atoms in the molecule, and
    ``d_vd`` is the number of additional descriptors that will be concatenated to atom-level
    descriptors *after* message passing"""

    def __post_init__(self):
        NAN_TOKEN = 0
        if self.V_f is not None:
            self.V_f[np.isnan(self.V_f)] = NAN_TOKEN
        if self.E_f is not None:
            self.E_f[np.isnan(self.E_f)] = NAN_TOKEN
        if self.V_d is not None:
            self.V_d[np.isnan(self.V_d)] = NAN_TOKEN

        super().__post_init__()

    def __len__(self) -> int:
        return 1


@dataclass
class _ReactionDatapointMixin:
    rct: Chem.Mol
    """the reactant associated with this datapoint"""
    pdt: Chem.Mol
    """the product associated with this datapoint"""

    @classmethod
    def from_smi(
        cls,
        rxn_or_smis: str | tuple[str, str],
        *args,
        keep_h: bool = False,
        add_h: bool = False,
        ignore_chirality: bool = False,
        **kwargs,
    ) -> _ReactionDatapointMixin:
        match rxn_or_smis:
            case str():
                rct_smi, agt_smi, pdt_smi = rxn_or_smis.split(">")
                rct_smi = f"{rct_smi}.{agt_smi}" if agt_smi else rct_smi
                name = rxn_or_smis
            case tuple():
                rct_smi, pdt_smi = rxn_or_smis
                name = ">>".join(rxn_or_smis)
            case _:
                raise TypeError(
                    "Must provide either a reaction SMARTS string or a tuple of reactant and"
                    " product SMILES strings!"
                )

        rct = make_mol(rct_smi, keep_h, add_h, ignore_chirality)
        pdt = make_mol(pdt_smi, keep_h, add_h, ignore_chirality)

        kwargs["name"] = name if "name" not in kwargs else kwargs["name"]

        return cls(rct, pdt, *args, **kwargs)


@dataclass
class ReactionDatapoint(_DatapointMixin, _ReactionDatapointMixin):
    """A :class:`ReactionDatapoint` contains a single reaction and its associated features and targets."""

    def __post_init__(self):
        if self.rct is None:
            raise ValueError("Reactant cannot be `None`!")
        if self.pdt is None:
            raise ValueError("Product cannot be `None`!")

        return super().__post_init__()

    def __len__(self) -> int:
        return 2
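A short sketch of the two `from_smi` entry points defined above; the SMILES strings and target values are illustrative placeholders.

import numpy as np

from chemprop.data.datapoints import MoleculeDatapoint, ReactionDatapoint

# a molecule datapoint: `name` defaults to the input SMILES string
mol_dp = MoleculeDatapoint.from_smi("CCO", np.array([1.0]))

# a reaction datapoint from a single reaction SMILES (reactant>agent>product);
# any agent present is merged into the reactant
rxn_dp = ReactionDatapoint.from_smi("CCO>>CC=O", y=np.array([0.0]))

# ...or from an explicit (reactant, product) tuple
rxn_dp2 = ReactionDatapoint.from_smi(("CCO", "CC=O"), y=np.array([0.0]))
print(mol_dp.name, rxn_dp.name, len(rxn_dp2))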
chemprop-updated/chemprop/data/datasets.py
ADDED
@@ -0,0 +1,475 @@
from dataclasses import dataclass, field
from functools import cached_property
from typing import NamedTuple, TypeAlias

import numpy as np
from numpy.typing import ArrayLike
from rdkit import Chem
from rdkit.Chem import Mol
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset

from chemprop.data.datapoints import MoleculeDatapoint, ReactionDatapoint
from chemprop.data.molgraph import MolGraph
from chemprop.featurizers.base import Featurizer
from chemprop.featurizers.molgraph import CGRFeaturizer, SimpleMoleculeMolGraphFeaturizer
from chemprop.featurizers.molgraph.cache import MolGraphCache, MolGraphCacheOnTheFly
from chemprop.types import Rxn


class Datum(NamedTuple):
    """a singular training data point"""

    mg: MolGraph
    V_d: np.ndarray | None
    x_d: np.ndarray | None
    y: np.ndarray | None
    weight: float
    lt_mask: np.ndarray | None
    gt_mask: np.ndarray | None


MolGraphDataset: TypeAlias = Dataset[Datum]


class _MolGraphDatasetMixin:
    def __len__(self) -> int:
        return len(self.data)

    @cached_property
    def _Y(self) -> np.ndarray:
        """the raw targets of the dataset"""
        return np.array([d.y for d in self.data], float)

    @property
    def Y(self) -> np.ndarray:
        """the (scaled) targets of the dataset"""
        return self.__Y

    @Y.setter
    def Y(self, Y: ArrayLike):
        self._validate_attribute(Y, "targets")

        self.__Y = np.array(Y, float)

    @cached_property
    def _X_d(self) -> np.ndarray:
        """the raw extra descriptors of the dataset"""
        return np.array([d.x_d for d in self.data])

    @property
    def X_d(self) -> np.ndarray:
        """the (scaled) extra descriptors of the dataset"""
        return self.__X_d

    @X_d.setter
    def X_d(self, X_d: ArrayLike):
        self._validate_attribute(X_d, "extra descriptors")

        self.__X_d = np.array(X_d)

    @property
    def weights(self) -> np.ndarray:
        return np.array([d.weight for d in self.data])

    @property
    def gt_mask(self) -> np.ndarray:
        return np.array([d.gt_mask for d in self.data])

    @property
    def lt_mask(self) -> np.ndarray:
        return np.array([d.lt_mask for d in self.data])

    @property
    def t(self) -> int | None:
        return self.data[0].t if len(self.data) > 0 else None

    @property
    def d_xd(self) -> int:
        """the extra molecule descriptor dimension, if any"""
        return 0 if self.X_d[0] is None else self.X_d.shape[1]

    @property
    def names(self) -> list[str]:
        return [d.name for d in self.data]

    def normalize_targets(self, scaler: StandardScaler | None = None) -> StandardScaler:
        """Normalizes the targets of this dataset using a :obj:`StandardScaler`

        The :obj:`StandardScaler` subtracts the mean and divides by the standard deviation for
        each task independently. NOTE: This should only be used for regression datasets.

        Returns
        -------
        StandardScaler
            a scaler fit to the targets.
        """

        if scaler is None:
            scaler = StandardScaler().fit(self._Y)

        self.Y = scaler.transform(self._Y)

        return scaler

    def normalize_inputs(
        self, key: str = "X_d", scaler: StandardScaler | None = None
    ) -> StandardScaler:
        VALID_KEYS = {"X_d"}
        if key not in VALID_KEYS:
            raise ValueError(f"Invalid feature key! got: {key}. expected one of: {VALID_KEYS}")

        X = self.X_d if self.X_d[0] is not None else None

        if X is None:
            return scaler

        if scaler is None:
            scaler = StandardScaler().fit(X)

        self.X_d = scaler.transform(X)

        return scaler

    def reset(self):
        """Reset the atom and bond features; atom and extra descriptors; and targets of each
        datapoint to their initial, unnormalized values."""
        self.__Y = self._Y
        self.__X_d = self._X_d

    def _validate_attribute(self, X: np.ndarray, label: str):
        if not len(self.data) == len(X):
            raise ValueError(
                f"number of molecules ({len(self.data)}) and {label} ({len(X)}) "
                "must have same length!"
            )


@dataclass
class MoleculeDataset(_MolGraphDatasetMixin, MolGraphDataset):
    """A :class:`MoleculeDataset` composed of :class:`MoleculeDatapoint`\s

    A :class:`MoleculeDataset` produces featurized data for input to a
    :class:`MPNN` model. Typically, data featurization is performed on-the-fly
    and parallelized across multiple workers via the
    :class:`~torch.utils.data.DataLoader` class. However, for small datasets, it
    may be more efficient to featurize the data in advance and cache the results.
    This can be done by setting ``MoleculeDataset.cache=True``.

    Parameters
    ----------
    data : Iterable[MoleculeDatapoint]
        the data from which to create a dataset
    featurizer : MoleculeFeaturizer
        the featurizer with which to generate MolGraphs of the molecules
    """

    data: list[MoleculeDatapoint]
    featurizer: Featurizer[Mol, MolGraph] = field(default_factory=SimpleMoleculeMolGraphFeaturizer)

    def __post_init__(self):
        if self.data is None:
            raise ValueError("Data cannot be None!")

        self.reset()
        self.cache = False

    def __getitem__(self, idx: int) -> Datum:
        d = self.data[idx]
        mg = self.mg_cache[idx]

        # assign the SMILES string to the MolGraph
        mg_with_name = MolGraph(
            V=mg.V,
            E=mg.E,
            edge_index=mg.edge_index,
            rev_edge_index=mg.rev_edge_index,
            name=d.name,  # the SMILES string
        )

        return Datum(
            mg=mg_with_name,  # use the updated MolGraph
            V_d=self.V_ds[idx],
            x_d=self.X_d[idx],
            y=self.Y[idx],
            weight=d.weight,
            lt_mask=d.lt_mask,
            gt_mask=d.gt_mask,
        )

    @property
    def cache(self) -> bool:
        return self.__cache

    @cache.setter
    def cache(self, cache: bool = False):
        self.__cache = cache
        self._init_cache()

    def _init_cache(self):
        """initialize the cache"""
        self.mg_cache = (MolGraphCache if self.cache else MolGraphCacheOnTheFly)(
            self.mols, self.V_fs, self.E_fs, self.featurizer
        )

    @property
    def smiles(self) -> list[str]:
        """the SMILES strings associated with the dataset"""
        return [Chem.MolToSmiles(d.mol) for d in self.data]

    @property
    def mols(self) -> list[Chem.Mol]:
        """the molecules associated with the dataset"""
        return [d.mol for d in self.data]

    @property
    def _V_fs(self) -> list[np.ndarray]:
        """the raw atom features of the dataset"""
        return [d.V_f for d in self.data]

    @property
    def V_fs(self) -> list[np.ndarray]:
        """the (scaled) atom features of the dataset"""
        return self.__V_fs

    @V_fs.setter
    def V_fs(self, V_fs: list[np.ndarray]):
        self._validate_attribute(V_fs, "atom features")

        self.__V_fs = V_fs
        self._init_cache()

    @property
    def _E_fs(self) -> list[np.ndarray]:
        """the raw bond features of the dataset"""
        return [d.E_f for d in self.data]

    @property
    def E_fs(self) -> list[np.ndarray]:
        """the (scaled) bond features of the dataset"""
        return self.__E_fs

    @E_fs.setter
    def E_fs(self, E_fs: list[np.ndarray]):
        self._validate_attribute(E_fs, "bond features")

        self.__E_fs = E_fs
        self._init_cache()

    @property
    def _V_ds(self) -> list[np.ndarray]:
        """the raw atom descriptors of the dataset"""
        return [d.V_d for d in self.data]

    @property
    def V_ds(self) -> list[np.ndarray]:
        """the (scaled) atom descriptors of the dataset"""
        return self.__V_ds

    @V_ds.setter
    def V_ds(self, V_ds: list[np.ndarray]):
        self._validate_attribute(V_ds, "atom descriptors")

        self.__V_ds = V_ds

    @property
    def d_vf(self) -> int:
        """the extra atom feature dimension, if any"""
        return 0 if self.V_fs[0] is None else self.V_fs[0].shape[1]

    @property
    def d_ef(self) -> int:
        """the extra bond feature dimension, if any"""
        return 0 if self.E_fs[0] is None else self.E_fs[0].shape[1]

    @property
    def d_vd(self) -> int:
        """the extra atom descriptor dimension, if any"""
        return 0 if self.V_ds[0] is None else self.V_ds[0].shape[1]

    def normalize_inputs(
        self, key: str = "X_d", scaler: StandardScaler | None = None
    ) -> StandardScaler:
        VALID_KEYS = {"X_d", "V_f", "E_f", "V_d"}

        match key:
            case "X_d":
                X = None if self.d_xd == 0 else self.X_d
            case "V_f":
                X = None if self.d_vf == 0 else np.concatenate(self.V_fs, axis=0)
            case "E_f":
                X = None if self.d_ef == 0 else np.concatenate(self.E_fs, axis=0)
            case "V_d":
                X = None if self.d_vd == 0 else np.concatenate(self.V_ds, axis=0)
            case _:
                raise ValueError(f"Invalid feature key! got: {key}. expected one of: {VALID_KEYS}")

        if X is None:
            return scaler

        if scaler is None:
            scaler = StandardScaler().fit(X)

        match key:
            case "X_d":
                self.X_d = scaler.transform(X)
            case "V_f":
                self.V_fs = [scaler.transform(V_f) if V_f.size > 0 else V_f for V_f in self.V_fs]
            case "E_f":
                self.E_fs = [scaler.transform(E_f) if E_f.size > 0 else E_f for E_f in self.E_fs]
            case "V_d":
                self.V_ds = [scaler.transform(V_d) if V_d.size > 0 else V_d for V_d in self.V_ds]
            case _:
                raise RuntimeError("unreachable code reached!")

        return scaler

    def reset(self):
        """Reset the atom and bond features; atom and extra descriptors; and targets of each
        datapoint to their initial, unnormalized values."""
        super().reset()
        self.__V_fs = self._V_fs
        self.__E_fs = self._E_fs
        self.__V_ds = self._V_ds


@dataclass
class ReactionDataset(_MolGraphDatasetMixin, MolGraphDataset):
    """A :class:`ReactionDataset` composed of :class:`ReactionDatapoint`\s

    .. note::
        The featurized data provided by this class may be cached, similar to a
        :class:`MoleculeDataset`. To enable the cache, set
        ``ReactionDataset.cache=True``.
    """

    data: list[ReactionDatapoint]
    """the dataset from which to load"""
    featurizer: Featurizer[Rxn, MolGraph] = field(default_factory=CGRFeaturizer)
    """the featurizer with which to generate MolGraphs of the input"""

    def __post_init__(self):
        if self.data is None:
            raise ValueError("Data cannot be None!")

        self.reset()
        self.cache = False

    @property
    def cache(self) -> bool:
        return self.__cache

    @cache.setter
    def cache(self, cache: bool = False):
        self.__cache = cache
        self.mg_cache = (MolGraphCache if cache else MolGraphCacheOnTheFly)(
            self.mols, [None] * len(self), [None] * len(self), self.featurizer
        )

    def __getitem__(self, idx: int) -> Datum:
        d = self.data[idx]
        mg = self.mg_cache[idx]

        return Datum(mg, None, self.X_d[idx], self.Y[idx], d.weight, d.lt_mask, d.gt_mask)

    @property
    def smiles(self) -> list[tuple]:
        return [(Chem.MolToSmiles(d.rct), Chem.MolToSmiles(d.pdt)) for d in self.data]

    @property
    def mols(self) -> list[Rxn]:
        return [(d.rct, d.pdt) for d in self.data]

    @property
    def d_vf(self) -> int:
        return 0

    @property
    def d_ef(self) -> int:
        return 0

    @property
    def d_vd(self) -> int:
        return 0


@dataclass(repr=False, eq=False)
class MulticomponentDataset(_MolGraphDatasetMixin, Dataset):
    """A :class:`MulticomponentDataset` is a :class:`Dataset` composed of parallel
    :class:`MoleculeDataset`\s and :class:`ReactionDataset`\s"""

    datasets: list[MoleculeDataset | ReactionDataset]
    """the parallel datasets"""

    def __post_init__(self):
        sizes = [len(dset) for dset in self.datasets]
        if not all(sizes[0] == size for size in sizes[1:]):
            raise ValueError(f"Datasets must all have the same length! got: {sizes}")

    def __len__(self) -> int:
        return len(self.datasets[0])

    @property
    def n_components(self) -> int:
        return len(self.datasets)

    def __getitem__(self, idx: int) -> list[Datum]:
        return [dset[idx] for dset in self.datasets]

    @property
    def smiles(self) -> list[list[str]]:
        return list(zip(*[dset.smiles for dset in self.datasets]))

    @property
    def names(self) -> list[list[str]]:
        return list(zip(*[dset.names for dset in self.datasets]))

    @property
    def mols(self) -> list[list[Chem.Mol]]:
        return list(zip(*[dset.mols for dset in self.datasets]))

    def normalize_targets(self, scaler: StandardScaler | None = None) -> StandardScaler:
        return self.datasets[0].normalize_targets(scaler)

    def normalize_inputs(
        self, key: str = "X_d", scaler: list[StandardScaler] | None = None
    ) -> list[StandardScaler]:
        RXN_VALID_KEYS = {"X_d"}
        match scaler:
            case None:
                return [
                    dset.normalize_inputs(key)
                    if isinstance(dset, MoleculeDataset) or key in RXN_VALID_KEYS
                    else None
                    for dset in self.datasets
                ]
            case _:
                assert len(scaler) == len(
                    self.datasets
                ), "Number of scalers must match number of datasets!"

                return [
                    dset.normalize_inputs(key, s)
                    if isinstance(dset, MoleculeDataset) or key in RXN_VALID_KEYS
                    else None
                    for dset, s in zip(self.datasets, scaler)
                ]

    def reset(self):
        return [dset.reset() for dset in self.datasets]

    @property
    def d_xd(self) -> int:
        return self.datasets[0].d_xd

    @property
    def d_vf(self) -> int:
        return sum(dset.d_vf for dset in self.datasets)

    @property
    def d_ef(self) -> int:
        return sum(dset.d_ef for dset in self.datasets)

    @property
    def d_vd(self) -> int:
        return sum(dset.d_vd for dset in self.datasets)
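A minimal sketch of target normalization and reset on a `MoleculeDataset`; the data below are placeholders chosen so the effect is easy to see.

import numpy as np

from chemprop.data.datapoints import MoleculeDatapoint
from chemprop.data.datasets import MoleculeDataset

smis = ["CCO", "CCCO", "CCCCO"]
ys = np.array([[1.0], [2.0], [3.0]])
dset = MoleculeDataset([MoleculeDatapoint.from_smi(s, y) for s, y in zip(smis, ys)])

scaler = dset.normalize_targets()     # fit and apply a StandardScaler
print(dset.Y.mean(0), dset.Y.std(0))  # approximately 0 and 1 per task

dset.reset()                          # restore the raw, unscaled targets
print(dset.Y.ravel())                 # back to [1., 2., 3.]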
chemprop-updated/chemprop/data/molgraph.py
ADDED
@@ -0,0 +1,17 @@
from typing import NamedTuple

import numpy as np


class MolGraph(NamedTuple):
    """A :class:`MolGraph` represents the graph featurization of a molecule."""

    V: np.ndarray
    """an array of shape ``V x d_v`` containing the atom features of the molecule"""
    E: np.ndarray
    """an array of shape ``E x d_e`` containing the bond features of the molecule"""
    edge_index: np.ndarray
    """an array of shape ``2 x E`` containing the edges of the graph in COO format"""
    rev_edge_index: np.ndarray
    """an array of shape ``E`` that maps from an edge index to the index of the source of the reverse edge in the :attr:`edge_index` attribute."""
    name: str | None = None  # the SMILES string, as an optional attribute
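To make the directed-edge convention concrete, here is a hand-built toy `MolGraph` for a two-atom molecule; the feature values are arbitrary placeholders.

import numpy as np

from chemprop.data.molgraph import MolGraph

V = np.ones((2, 4))               # 2 atoms, 4 atom features each
E = np.ones((2, 3))               # 2 directed edges (one bond), 3 bond features
edge_index = np.array([[0, 1],    # edge 0: 0 -> 1, edge 1: 1 -> 0
                       [1, 0]])
rev_edge_index = np.array([1, 0])  # each edge's reverse is the other one

mg = MolGraph(V, E, edge_index, rev_edge_index, name="toy")
print(mg.name, mg.V.shape, mg.edge_index.shape)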
chemprop-updated/chemprop/data/samplers.py
ADDED
@@ -0,0 +1,66 @@
from itertools import chain
from typing import Iterator, Optional

import numpy as np
from torch.utils.data import Sampler


class SeededSampler(Sampler):
    """A :class:`SeededSampler` is a class for iterating through a dataset in a randomly seeded
    fashion"""

    def __init__(self, N: int, seed: int):
        if seed is None:
            raise ValueError("arg 'seed' was `None`! A SeededSampler must be seeded!")

        self.idxs = np.arange(N)
        self.rg = np.random.default_rng(seed)

    def __iter__(self) -> Iterator[int]:
        """an iterator over indices to sample."""
        self.rg.shuffle(self.idxs)

        return iter(self.idxs)

    def __len__(self) -> int:
        """the number of indices that will be sampled."""
        return len(self.idxs)


class ClassBalanceSampler(Sampler):
    """A :class:`ClassBalanceSampler` samples data from a :class:`MolGraphDataset` such that
    positive and negative classes are equally sampled

    Parameters
    ----------
    Y : np.ndarray
        the target matrix from which to determine the positive and negative classes
    seed : int
        the random seed to use for shuffling (only used when `shuffle` is `True`)
    shuffle : bool, default=False
        whether to shuffle the data during sampling
    """

    def __init__(self, Y: np.ndarray, seed: Optional[int] = None, shuffle: bool = False):
        self.shuffle = shuffle
        self.rg = np.random.default_rng(seed)

        idxs = np.arange(len(Y))
        actives = Y.any(1)

        self.pos_idxs = idxs[actives]
        self.neg_idxs = idxs[~actives]

        self.length = 2 * min(len(self.pos_idxs), len(self.neg_idxs))

    def __iter__(self) -> Iterator[int]:
        """an iterator over indices to sample."""
        if self.shuffle:
            self.rg.shuffle(self.pos_idxs)
            self.rg.shuffle(self.neg_idxs)

        return chain(*zip(self.pos_idxs, self.neg_idxs))

    def __len__(self) -> int:
        """the number of indices that will be sampled."""
        return self.length
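A small sketch of how `ClassBalanceSampler` interleaves the two classes; the target matrix is a placeholder with 2 positives and 4 negatives.

import numpy as np

from chemprop.data.samplers import ClassBalanceSampler

Y = np.array([[0], [1], [0], [0], [1], [0]])
sampler = ClassBalanceSampler(Y, seed=0, shuffle=True)

# yields 2 * min(n_pos, n_neg) = 4 indices, alternating positive/negative
print(len(sampler), list(sampler))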
chemprop-updated/chemprop/data/splitting.py
ADDED
@@ -0,0 +1,225 @@
from collections.abc import Iterable, Sequence
import copy
from enum import auto
import logging

from astartes import train_test_split, train_val_test_split
from astartes.molecules import train_test_split_molecules, train_val_test_split_molecules
import numpy as np
from rdkit import Chem

from chemprop.data.datapoints import MoleculeDatapoint, ReactionDatapoint
from chemprop.utils.utils import EnumMapping

logger = logging.getLogger(__name__)

Datapoints = Sequence[MoleculeDatapoint] | Sequence[ReactionDatapoint]
MulticomponentDatapoints = Sequence[Datapoints]


class SplitType(EnumMapping):
    SCAFFOLD_BALANCED = auto()
    RANDOM_WITH_REPEATED_SMILES = auto()
    RANDOM = auto()
    KENNARD_STONE = auto()
    KMEANS = auto()


def make_split_indices(
    mols: Sequence[Chem.Mol],
    split: SplitType | str = "random",
    sizes: tuple[float, float, float] = (0.8, 0.1, 0.1),
    seed: int = 0,
    num_replicates: int = 1,
    num_folds: None = None,
) -> tuple[list[list[int]], ...]:
    """Splits data into training, validation, and test splits.

    Parameters
    ----------
    mols : Sequence[Chem.Mol]
        Sequence of RDKit molecules to use for structure-based splitting
    split : SplitType | str, optional
        Split type, one of :class:`~chemprop.data.utils.SplitType`, by default "random"
    sizes : tuple[float, float, float], optional
        3-tuple with the proportions of data in the train, validation, and test sets, by default
        (0.8, 0.1, 0.1). Set the middle value to 0 for a two-way split.
    seed : int, optional
        The random seed passed to astartes, by default 0
    num_replicates : int, optional
        Number of replicates, by default 1
    num_folds : None, optional
        This argument was removed in v2.1 - use `num_replicates` instead.

    Returns
    -------
    tuple[list[list[int]], ...]
        2- or 3-member tuple containing ``num_replicates``-length lists of training, validation,
        and testing indexes.

        .. important::
            Validation may or may not be present

    Raises
    ------
    ValueError
        Requested split sizes tuple not of length 3
    ValueError
        Unsupported split method requested
    """
    if num_folds is not None:
        raise RuntimeError("This argument was removed in v2.1 - use `num_replicates` instead.")
    if num_replicates == 1:
        logger.warning(
            "The return type of make_split_indices has changed in v2.1 - see help(make_split_indices)"
        )
    if (num_splits := len(sizes)) != 3:
        raise ValueError(
            f"Specify sizes for train, validation, and test (got {num_splits} values)."
        )
    # typically include a validation set
    include_val = True
    split_fun = train_val_test_split
    mol_split_fun = train_val_test_split_molecules
    # default sampling arguments for the astartes sampler
    astartes_kwargs = dict(
        train_size=sizes[0], test_size=sizes[2], return_indices=True, random_state=seed
    )
    # if there is no validation set, reassign the splitting functions
    if sizes[1] == 0.0:
        include_val = False
        split_fun = train_test_split
        mol_split_fun = train_test_split_molecules
    else:
        astartes_kwargs["val_size"] = sizes[1]

    n_datapoints = len(mols)
    train_replicates, val_replicates, test_replicates = [], [], []
    for _ in range(num_replicates):
        train, val, test = None, None, None
        match SplitType.get(split):
            case SplitType.SCAFFOLD_BALANCED:
                mols_without_atommaps = []
                for mol in mols:
                    copied_mol = copy.deepcopy(mol)
                    for atom in copied_mol.GetAtoms():
                        atom.SetAtomMapNum(0)
                    mols_without_atommaps.append(copied_mol)
                result = mol_split_fun(
                    np.array(mols_without_atommaps), sampler="scaffold", **astartes_kwargs
                )
                train, val, test = _unpack_astartes_result(result, include_val)

            # used to constrain data with the same SMILES to go in the same split
            case SplitType.RANDOM_WITH_REPEATED_SMILES:
                # get two arrays: one of all the SMILES strings, one of just the unique ones
                all_smiles = np.array([Chem.MolToSmiles(mol) for mol in mols])
                unique_smiles = np.unique(all_smiles)

                # save a mapping of smiles -> all the indices that it appeared at
                smiles_indices = {}
                for smiles in unique_smiles:
                    smiles_indices[smiles] = np.where(all_smiles == smiles)[0].tolist()

                # randomly split the unique SMILES
                result = split_fun(
                    np.arange(len(unique_smiles)), sampler="random", **astartes_kwargs
                )
                train_idxs, val_idxs, test_idxs = _unpack_astartes_result(result, include_val)

                # convert these to the 'actual' indices from the original list using the dict we made
                train = sum((smiles_indices[unique_smiles[i]] for i in train_idxs), [])
                val = sum((smiles_indices[unique_smiles[j]] for j in val_idxs), [])
                test = sum((smiles_indices[unique_smiles[k]] for k in test_idxs), [])

            case SplitType.RANDOM:
                result = split_fun(np.arange(n_datapoints), sampler="random", **astartes_kwargs)
                train, val, test = _unpack_astartes_result(result, include_val)

            case SplitType.KENNARD_STONE:
                result = mol_split_fun(
                    np.array(mols),
                    sampler="kennard_stone",
                    hopts=dict(metric="jaccard"),
                    fingerprint="morgan_fingerprint",
                    fprints_hopts=dict(n_bits=2048),
                    **astartes_kwargs,
                )
                train, val, test = _unpack_astartes_result(result, include_val)

            case SplitType.KMEANS:
                result = mol_split_fun(
                    np.array(mols),
                    sampler="kmeans",
                    hopts=dict(metric="jaccard"),
                    fingerprint="morgan_fingerprint",
                    fprints_hopts=dict(n_bits=2048),
                    **astartes_kwargs,
                )
                train, val, test = _unpack_astartes_result(result, include_val)

            case _:
                raise RuntimeError("Unreachable code reached!")
        train_replicates.append(train)
        val_replicates.append(val)
        test_replicates.append(test)
        astartes_kwargs["random_state"] += 1
    return train_replicates, val_replicates, test_replicates


def _unpack_astartes_result(
    result: tuple, include_val: bool
) -> tuple[list[int], list[int], list[int]]:
    """Helper function to partition input data based on the output of the astartes sampler

    Parameters
    ----------
    result : tuple
        Output from a call to astartes containing the split indices
    include_val : bool
        True if a validation set is included, False otherwise.

    Returns
    -------
    train : list[int]
    val : list[int]
        .. important::
            validation is possibly empty
    test : list[int]
    """
    train_idxs, val_idxs, test_idxs = [], [], []
    # astartes returns a set of lists containing the data, clusters (if applicable),
    # and indices (always last), so we pull out the indices
    if include_val:
        train_idxs, val_idxs, test_idxs = result[-3], result[-2], result[-1]
    else:
        train_idxs, test_idxs = result[-2], result[-1]
    return list(train_idxs), list(val_idxs), list(test_idxs)


def split_data_by_indices(
    data: Datapoints | MulticomponentDatapoints,
    train_indices: Iterable[Iterable[int]] | None = None,
    val_indices: Iterable[Iterable[int]] | None = None,
    test_indices: Iterable[Iterable[int]] | None = None,
):
    """Splits data into training, validation, and test groups based on the split indices given."""

    train_data = _splitter_helper(data, train_indices)
    val_data = _splitter_helper(data, val_indices)
    test_data = _splitter_helper(data, test_indices)

    return train_data, val_data, test_data


def _splitter_helper(data, indices):
    if indices is None:
        return None

    if isinstance(data[0], (MoleculeDatapoint, ReactionDatapoint)):
        datapoints = data
        idxss = indices
        return [[datapoints[idx] for idx in idxs] for idxs in idxss]
    else:
        datapointss = data
        idxss = indices
        return [[[datapoints[idx] for idx in idxs] for datapoints in datapointss] for idxs in idxss]
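A minimal sketch of generating replicate splits and then applying them to the datapoints; the molecule list is an illustrative placeholder.

from rdkit import Chem

from chemprop.data.datapoints import MoleculeDatapoint
from chemprop.data.splitting import make_split_indices, split_data_by_indices

smis = ["CCO", "CCN", "CCC", "CCCl", "CCBr", "CCF", "CCI", "CCCC", "CC=O", "c1ccccc1"]
mols = [Chem.MolFromSmiles(s) for s in smis]
data = [MoleculeDatapoint(mol) for mol in mols]

# one random 80/10/10 replicate; each return value is a list of index lists
train_idxs, val_idxs, test_idxs = make_split_indices(mols, "random", (0.8, 0.1, 0.1))

train, val, test = split_data_by_indices(data, train_idxs, val_idxs, test_idxs)
print(len(train[0]), len(val[0]), len(test[0]))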
chemprop-updated/chemprop/exceptions.py
ADDED
@@ -0,0 +1,12 @@
from typing import Iterable

from chemprop.utils import pretty_shape


class InvalidShapeError(ValueError):
    def __init__(self, var_name: str, received: Iterable[int], expected: Iterable[int]):
        message = (
            f"arg '{var_name}' has incorrect shape! "
            f"got: `{pretty_shape(received)}`. expected: `{pretty_shape(expected)}`"
        )
        super().__init__(message)
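A tiny sketch of raising this error from validation code; the array shape and expectation are made up, and the exact message formatting comes from `pretty_shape`.

import numpy as np

from chemprop.exceptions import InvalidShapeError

Y = np.zeros((10, 2))
try:
    if Y.shape[1] != 3:
        raise InvalidShapeError("Y", Y.shape, (10, 3))
except InvalidShapeError as e:
    print(e)  # reports both the received and the expected shapes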
chemprop-updated/chemprop/features/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (1.46 kB)
chemprop-updated/chemprop/features/__pycache__/features_generators.cpython-37.pyc
ADDED
Binary file (5.71 kB)
chemprop-updated/chemprop/features/__pycache__/featurization.cpython-37.pyc
ADDED
Binary file (24.7 kB)
chemprop-updated/chemprop/features/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (4.86 kB)
chemprop-updated/chemprop/featurizers/__init__.py
ADDED
@@ -0,0 +1,52 @@
from .atom import AtomFeatureMode, MultiHotAtomFeaturizer, get_multi_hot_atom_featurizer
from .base import Featurizer, GraphFeaturizer, S, T, VectorFeaturizer
from .bond import MultiHotBondFeaturizer
from .molecule import (
    BinaryFeaturizerMixin,
    CountFeaturizerMixin,
    MoleculeFeaturizerRegistry,
    MorganBinaryFeaturizer,
    MorganCountFeaturizer,
    MorganFeaturizerMixin,
    RDKit2DFeaturizer,
    V1RDKit2DFeaturizer,
    V1RDKit2DNormalizedFeaturizer,
)
from .molgraph import (
    CGRFeaturizer,
    CondensedGraphOfReactionFeaturizer,
    MolGraphCache,
    MolGraphCacheFacade,
    MolGraphCacheOnTheFly,
    RxnMode,
    SimpleMoleculeMolGraphFeaturizer,
)

__all__ = [
    "Featurizer",
    "S",
    "T",
    "VectorFeaturizer",
    "GraphFeaturizer",
    "MultiHotAtomFeaturizer",
    "AtomFeatureMode",
    "get_multi_hot_atom_featurizer",
    "MultiHotBondFeaturizer",
    "MolGraphCacheFacade",
    "MolGraphCache",
    "MolGraphCacheOnTheFly",
    "SimpleMoleculeMolGraphFeaturizer",
    "CondensedGraphOfReactionFeaturizer",
    "CGRFeaturizer",
    "RxnMode",
    "MoleculeFeaturizer",
    "MorganFeaturizerMixin",
    "BinaryFeaturizerMixin",
    "CountFeaturizerMixin",
    "MorganBinaryFeaturizer",
    "MorganCountFeaturizer",
    "RDKit2DFeaturizer",
    "MoleculeFeaturizerRegistry",
    "V1RDKit2DFeaturizer",
    "V1RDKit2DNormalizedFeaturizer",
]
chemprop-updated/chemprop/featurizers/atom.py
ADDED
@@ -0,0 +1,281 @@
from enum import auto
from typing import Sequence

import numpy as np
from rdkit.Chem.rdchem import Atom, HybridizationType

from chemprop.featurizers.base import VectorFeaturizer
from chemprop.utils.utils import EnumMapping


class MultiHotAtomFeaturizer(VectorFeaturizer[Atom]):
    """A :class:`MultiHotAtomFeaturizer` uses a multi-hot encoding to featurize atoms.

    .. seealso::
        The class provides three default parameterization schemes:

        * :meth:`MultiHotAtomFeaturizer.v1`
        * :meth:`MultiHotAtomFeaturizer.v2`
        * :meth:`MultiHotAtomFeaturizer.organic`

    The generated atom features are ordered as follows:
    * atomic number
    * degree
    * formal charge
    * chiral tag
    * number of hydrogens
    * hybridization
    * aromaticity
    * mass

    .. important::
        Each feature, except for aromaticity and mass, includes a pad for unknown values.

    Parameters
    ----------
    atomic_nums : Sequence[int]
        the choices for atom type denoted by atomic number. Ex: ``[6, 7, 8]`` for C, N, and O.
    degrees : Sequence[int]
        the choices for number of bonds an atom is engaged in.
    formal_charges : Sequence[int]
        the choices for integer electronic charge assigned to an atom.
    chiral_tags : Sequence[int]
        the choices for an atom's chiral tag. See :class:`rdkit.Chem.rdchem.ChiralType` for possible integer values.
    num_Hs : Sequence[int]
        the choices for number of bonded hydrogen atoms.
    hybridizations : Sequence[int]
        the choices for an atom’s hybridization type. See :class:`rdkit.Chem.rdchem.HybridizationType` for possible integer values.
    """

    def __init__(
        self,
        atomic_nums: Sequence[int],
        degrees: Sequence[int],
        formal_charges: Sequence[int],
        chiral_tags: Sequence[int],
        num_Hs: Sequence[int],
        hybridizations: Sequence[int],
    ):
        self.atomic_nums = {j: i for i, j in enumerate(atomic_nums)}
        self.degrees = {i: i for i in degrees}
        self.formal_charges = {j: i for i, j in enumerate(formal_charges)}
        self.chiral_tags = {i: i for i in chiral_tags}
        self.num_Hs = {i: i for i in num_Hs}
        self.hybridizations = {ht: i for i, ht in enumerate(hybridizations)}

        self._subfeats: list[dict] = [
            self.atomic_nums,
            self.degrees,
            self.formal_charges,
            self.chiral_tags,
            self.num_Hs,
            self.hybridizations,
        ]
        subfeat_sizes = [
            1 + len(self.atomic_nums),
            1 + len(self.degrees),
            1 + len(self.formal_charges),
            1 + len(self.chiral_tags),
            1 + len(self.num_Hs),
            1 + len(self.hybridizations),
            1,
            1,
        ]
        self.__size = sum(subfeat_sizes)

    def __len__(self) -> int:
        return self.__size

    def __call__(self, a: Atom | None) -> np.ndarray:
        x = np.zeros(self.__size)

        if a is None:
            return x

        feats = [
            a.GetAtomicNum(),
            a.GetTotalDegree(),
            a.GetFormalCharge(),
            int(a.GetChiralTag()),
            int(a.GetTotalNumHs()),
            a.GetHybridization(),
        ]
        i = 0
        for feat, choices in zip(feats, self._subfeats):
            j = choices.get(feat, len(choices))
            x[i + j] = 1
            i += len(choices) + 1
        x[i] = int(a.GetIsAromatic())
        x[i + 1] = 0.01 * a.GetMass()

        return x

    def num_only(self, a: Atom) -> np.ndarray:
        """featurize the atom by setting only the atomic number bit"""
        x = np.zeros(len(self))

        if a is None:
            return x

        i = self.atomic_nums.get(a.GetAtomicNum(), len(self.atomic_nums))
        x[i] = 1

        return x

    @classmethod
    def v1(cls, max_atomic_num: int = 100):
        r"""The original implementation used in Chemprop V1 [1]_, [2]_.

        Parameters
        ----------
        max_atomic_num : int, default=100
            Include a bit for all atomic numbers in the interval :math:`[1, \mathtt{max\_atomic\_num}]`

        References
        ----------
        .. [1] Yang, K.; Swanson, K.; Jin, W.; Coley, C.; Eiden, P.; Gao, H.; Guzman-Perez, A.; Hopper, T.;
            Kelley, B.; Mathea, M.; Palmer, A. "Analyzing Learned Molecular Representations for Property Prediction."
            J. Chem. Inf. Model. 2019, 59 (8), 3370–3388. https://doi.org/10.1021/acs.jcim.9b00237
        .. [2] Heid, E.; Greenman, K.P.; Chung, Y.; Li, S.C.; Graff, D.E.; Vermeire, F.H.; Wu, H.; Green, W.H.; McGill,
            C.J. "Chemprop: A machine learning package for chemical property prediction." J. Chem. Inf. Model. 2024,
            64 (1), 9–17. https://doi.org/10.1021/acs.jcim.3c01250
        """

        return cls(
            atomic_nums=list(range(1, max_atomic_num + 1)),
            degrees=list(range(6)),
            formal_charges=[-1, -2, 1, 2, 0],
            chiral_tags=list(range(4)),
            num_Hs=list(range(5)),
            hybridizations=[
                HybridizationType.SP,
                HybridizationType.SP2,
                HybridizationType.SP3,
                HybridizationType.SP3D,
                HybridizationType.SP3D2,
            ],
        )

    @classmethod
    def v2(cls):
        """An implementation that includes an atom type bit for all elements in the first four rows of the periodic table plus iodine."""

        return cls(
            atomic_nums=list(range(1, 37)) + [53],
            degrees=list(range(6)),
            formal_charges=[-1, -2, 1, 2, 0],
            chiral_tags=list(range(4)),
            num_Hs=list(range(5)),
            hybridizations=[
                HybridizationType.S,
                HybridizationType.SP,
                HybridizationType.SP2,
                HybridizationType.SP2D,
                HybridizationType.SP3,
                HybridizationType.SP3D,
                HybridizationType.SP3D2,
            ],
        )

    @classmethod
    def organic(cls):
        r"""A specific parameterization intended for use with organic or drug-like molecules.

        This parameterization features:
        1. an atomic number bit only for H, B, C, N, O, F, Si, P, S, Cl, Br, and I atoms
        2. a hybridization bit for :math:`s, sp, sp^2` and :math:`sp^3` hybridizations.
        """

        return cls(
            atomic_nums=[1, 5, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53],
            degrees=list(range(6)),
            formal_charges=[-1, -2, 1, 2, 0],
            chiral_tags=list(range(4)),
            num_Hs=list(range(5)),
            hybridizations=[
                HybridizationType.S,
                HybridizationType.SP,
                HybridizationType.SP2,
                HybridizationType.SP3,
            ],
        )


class RIGRAtomFeaturizer(VectorFeaturizer[Atom]):
    """A :class:`RIGRAtomFeaturizer` uses a multi-hot encoding to featurize atoms using resonance-invariant features.

    The generated atom features are ordered as follows:
    * atomic number
    * degree
    * number of hydrogens
    * mass
    """

    def __init__(
        self,
        atomic_nums: Sequence[int] | None = None,
        degrees: Sequence[int] | None = None,
        num_Hs: Sequence[int] | None = None,
    ):
        self.atomic_nums = {j: i for i, j in enumerate(atomic_nums or list(range(1, 37)) + [53])}
        self.degrees = {i: i for i in (degrees or list(range(6)))}
        self.num_Hs = {i: i for i in (num_Hs or list(range(5)))}

        self._subfeats: list[dict] = [self.atomic_nums, self.degrees, self.num_Hs]
        subfeat_sizes = [1 + len(self.atomic_nums), 1 + len(self.degrees), 1 + len(self.num_Hs), 1]
        self.__size = sum(subfeat_sizes)

    def __len__(self) -> int:
        return self.__size

    def __call__(self, a: Atom | None) -> np.ndarray:
        x = np.zeros(self.__size)

        if a is None:
            return x

        feats = [a.GetAtomicNum(), a.GetTotalDegree(), int(a.GetTotalNumHs())]
        i = 0
        for feat, choices in zip(feats, self._subfeats):
            j = choices.get(feat, len(choices))
            x[i + j] = 1
            i += len(choices) + 1
        x[i] = 0.01 * a.GetMass()  # scaled to about the same range as the other features

        return x

    def num_only(self, a: Atom) -> np.ndarray:
        """featurize the atom by setting only the atomic number bit"""
        x = np.zeros(len(self))

        if a is None:
            return x

        i = self.atomic_nums.get(a.GetAtomicNum(), len(self.atomic_nums))
        x[i] = 1

        return x


class AtomFeatureMode(EnumMapping):
    """The mode by which an atom is featurized into a `MolGraph`"""

    V1 = auto()
    V2 = auto()
    ORGANIC = auto()
    RIGR = auto()


def get_multi_hot_atom_featurizer(mode: str | AtomFeatureMode) -> MultiHotAtomFeaturizer:
    """Build the corresponding multi-hot atom featurizer."""
    match AtomFeatureMode.get(mode):
        case AtomFeatureMode.V1:
            return MultiHotAtomFeaturizer.v1()
        case AtomFeatureMode.V2:
            return MultiHotAtomFeaturizer.v2()
        case AtomFeatureMode.ORGANIC:
            return MultiHotAtomFeaturizer.organic()
        case AtomFeatureMode.RIGR:
            return RIGRAtomFeaturizer()
        case _:
            raise RuntimeError("unreachable code reached!")
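A short sketch of featurizing a single atom with one of the default schemes; the molecule is a placeholder, and passing the mode as the string "V2" assumes `EnumMapping.get` resolves member names from strings, as its use above suggests.

from rdkit import Chem

from chemprop.featurizers import get_multi_hot_atom_featurizer

featurizer = get_multi_hot_atom_featurizer("V2")
atom = Chem.MolFromSmiles("CCO").GetAtomWithIdx(0)  # a carbon atom

x = featurizer(atom)
print(len(featurizer), x.shape)  # feature length and the multi-hot vector shape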
chemprop-updated/chemprop/featurizers/base.py
ADDED
@@ -0,0 +1,30 @@
from abc import abstractmethod
from collections.abc import Sized
from typing import Generic, TypeVar

import numpy as np

from chemprop.data.molgraph import MolGraph

S = TypeVar("S")
T = TypeVar("T")


class Featurizer(Generic[S, T]):
    """A :class:`Featurizer` featurizes inputs of type ``S`` into outputs of
    type ``T``."""

    @abstractmethod
    def __call__(self, input: S, *args, **kwargs) -> T:
        """featurize an input"""


class VectorFeaturizer(Featurizer[S, np.ndarray], Sized):
    ...


class GraphFeaturizer(Featurizer[S, MolGraph]):
    @property
    @abstractmethod
    def shape(self) -> tuple[int, int]:
        ...
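A minimal sketch of a custom featurizer built on these base classes; the two-feature encoding is purely illustrative.

import numpy as np
from rdkit.Chem.rdchem import Atom

from chemprop.featurizers.base import VectorFeaturizer


class TinyAtomFeaturizer(VectorFeaturizer[Atom]):
    """Encodes only the atomic number and aromaticity of an atom."""

    def __len__(self) -> int:
        return 2

    def __call__(self, a: Atom | None) -> np.ndarray:
        x = np.zeros(len(self))  # zero vector doubles as the "no atom" encoding
        if a is not None:
            x[0] = a.GetAtomicNum()
            x[1] = float(a.GetIsAromatic())
        return x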