hbhzm committed (verified)
Commit: 3ea26d1 · Parent(s): 8037081

Upload 625 files


add model and bbbp model weight

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set of 625 files.
Files changed (50)
  1. .gitattributes +2 -0
  2. chemprop-updated/.bumpversion.cfg +10 -0
  3. chemprop-updated/.dockerignore +3 -0
  4. chemprop-updated/.flake8 +9 -0
  5. chemprop-updated/.github/ISSUE_TEMPLATE/todo.md +11 -0
  6. chemprop-updated/.github/ISSUE_TEMPLATE/v1_bug_report.md +35 -0
  7. chemprop-updated/.github/ISSUE_TEMPLATE/v1_question.md +17 -0
  8. chemprop-updated/.github/ISSUE_TEMPLATE/v2_bug_report.md +35 -0
  9. chemprop-updated/.github/ISSUE_TEMPLATE/v2_feature_request.md +23 -0
  10. chemprop-updated/.github/ISSUE_TEMPLATE/v2_question.md +17 -0
  11. chemprop-updated/.github/PULL_REQUEST_TEMPLATE.md +18 -0
  12. chemprop-updated/.github/PULL_REQUEST_TEMPLATE/bugfix.md +12 -0
  13. chemprop-updated/.github/PULL_REQUEST_TEMPLATE/new_feature.md +15 -0
  14. chemprop-updated/.github/workflows/ci.yml +158 -0
  15. chemprop-updated/.gitignore +178 -0
  16. chemprop-updated/.readthedocs.yml +19 -0
  17. chemprop-updated/CITATIONS.bib +37 -0
  18. chemprop-updated/CONTRIBUTING.md +40 -0
  19. chemprop-updated/Dockerfile +50 -0
  20. chemprop-updated/LICENSE.txt +27 -0
  21. chemprop-updated/README.md +63 -0
  22. chemprop-updated/chemprop/__init__.py +5 -0
  23. chemprop-updated/chemprop/__pycache__/__init__.cpython-37.pyc +0 -0
  24. chemprop-updated/chemprop/__pycache__/args.cpython-37.pyc +0 -0
  25. chemprop-updated/chemprop/__pycache__/constants.cpython-37.pyc +0 -0
  26. chemprop-updated/chemprop/__pycache__/hyperopt_utils.cpython-37.pyc +0 -0
  27. chemprop-updated/chemprop/__pycache__/hyperparameter_optimization.cpython-37.pyc +0 -0
  28. chemprop-updated/chemprop/__pycache__/interpret.cpython-37.pyc +0 -0
  29. chemprop-updated/chemprop/__pycache__/multitask_utils.cpython-37.pyc +0 -0
  30. chemprop-updated/chemprop/__pycache__/nn_utils.cpython-37.pyc +0 -0
  31. chemprop-updated/chemprop/__pycache__/rdkit.cpython-37.pyc +0 -0
  32. chemprop-updated/chemprop/__pycache__/sklearn_predict.cpython-37.pyc +0 -0
  33. chemprop-updated/chemprop/__pycache__/sklearn_train.cpython-37.pyc +0 -0
  34. chemprop-updated/chemprop/__pycache__/spectra_utils.cpython-37.pyc +0 -0
  35. chemprop-updated/chemprop/__pycache__/utils.cpython-37.pyc +0 -0
  36. chemprop-updated/chemprop/cli/common.py +211 -0
  37. chemprop-updated/chemprop/cli/conf.py +9 -0
  38. chemprop-updated/chemprop/cli/convert.py +55 -0
  39. chemprop-updated/chemprop/cli/fingerprint.py +182 -0
  40. chemprop-updated/chemprop/cli/hpopt.py +537 -0
  41. chemprop-updated/chemprop/cli/main.py +85 -0
  42. chemprop-updated/chemprop/cli/predict.py +444 -0
  43. chemprop-updated/chemprop/cli/train.py +1340 -0
  44. chemprop-updated/chemprop/cli/utils/__init__.py +30 -0
  45. chemprop-updated/chemprop/cli/utils/actions.py +19 -0
  46. chemprop-updated/chemprop/cli/utils/args.py +34 -0
  47. chemprop-updated/chemprop/cli/utils/command.py +24 -0
  48. chemprop-updated/chemprop/cli/utils/parsing.py +446 -0
  49. chemprop-updated/chemprop/cli/utils/utils.py +31 -0
  50. chemprop-updated/chemprop/conf.py +6 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ chemprop-updated/docs/source/_static/images/message_passing.png filter=lfs diff=lfs merge=lfs -text
+ chemprop/docs/source/_static/images/message_passing.png filter=lfs diff=lfs merge=lfs -text
chemprop-updated/.bumpversion.cfg ADDED
@@ -0,0 +1,10 @@
+ [bumpversion]
+ current_version = 2.1.2
+ commit = True
+ tag = True
+
+ [bumpversion:file:pyproject.toml]
+
+ [bumpversion:file:chemprop/__init__.py]
+
+ [bumpversion:file:docs/source/conf.py]
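For context, the `bumpversion` tool reads this config and rewrites the tracked version string in each listed file, then (per `commit = True` / `tag = True`) commits and tags the result. A rough sketch of that substitution in Python — the file list and current version come from the config above; the new version and exact replacement semantics are illustrative assumptions, not the real tool:

```python
# Hypothetical sketch of what a `bumpversion patch` run would do with the
# config above. The file list and "2.1.2" come from the config; the new
# version string and single-pass replacement are illustrative assumptions.
from pathlib import Path

CURRENT, NEW = "2.1.2", "2.1.3"  # assumed patch bump
FILES = ["pyproject.toml", "chemprop/__init__.py", "docs/source/conf.py"]

for name in FILES:
    path = Path(name)
    path.write_text(path.read_text().replace(CURRENT, NEW))
    # with commit = True / tag = True, bumpversion would then commit the
    # change and create a git tag for the new version
```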
chemprop-updated/.dockerignore ADDED
@@ -0,0 +1,3 @@
+ **.git*
+ .dockerignore
+ Dockerfile
chemprop-updated/.flake8 ADDED
@@ -0,0 +1,9 @@
+ [flake8]
+ ignore = E203, E266, E501, F403, E741, W503, W605
+ max-line-length = 100
+ max-complexity = 18
+ per-file-ignores =
+     __init__.py: F401
+     chemprop/nn/predictors.py: F405
+     chemprop/nn/metrics.py: F405
+     tests/unit/nn/test_metrics.py: E121, E122, E131, E241, W291
chemprop-updated/.github/ISSUE_TEMPLATE/todo.md ADDED
@@ -0,0 +1,11 @@
+ ---
+ name: to-do
+ about: Add an item to the to-do list. More generic than a feature request
+ title: "[TODO]: "
+ labels: todo
+ assignees: ''
+
+ ---
+
+ **Notes**
+ _These could be implementation or more specific details to keep in mind, if they'll be helpful for issue tracking_
chemprop-updated/.github/ISSUE_TEMPLATE/v1_bug_report.md ADDED
@@ -0,0 +1,35 @@
+ ---
+ name: v1 Bug Report
+ about: Report a bug in v1 (will not be fixed)
+ title: "[v1 BUG]: "
+ labels: bug, v1-wontfix
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+ A clear and concise description of what the bug is.
+
+ **Example(s)**
+ Provide some examples of where the current code fails. Feel free to share your actual code for additional context, but a minimal and isolated example is preferred.
+
+ **Expected behavior**
+ A clear and concise description of what you expected to happen. If there is correct, expected output, include that here as well.
+
+ **Error Stack Trace**
+ If the bug is resulting in an error message, provide the _full_ stack trace (not just the last line). This is helpful for debugging, especially in cases where you aren't able to provide a minimal/isolated working example with accompanying files.
+
+ **Screenshots**
+ If applicable, add screenshots to help explain your problem.
+
+ **Environment**
+ - python version
+ - package versions: `conda list` or `pip list`
+ - OS
+
+ **Checklist**
+ - [ ] all dependencies are satisfied: `conda list` or `pip list` shows the packages listed in the `pyproject.toml`
+ - [ ] the unit tests are working: `pytest -v` reports no errors
+
+ **Additional context**
+ Add any other context about the problem here.
chemprop-updated/.github/ISSUE_TEMPLATE/v1_question.md ADDED
@@ -0,0 +1,17 @@
+ ---
+ name: v1 Question
+ about: Have a question about how to use Chemprop v1?
+ title: "[v1 QUESTION]: "
+ labels: question
+ assignees: ''
+
+ ---
+
+ **What are you trying to do?**
+ Please tell us what you're trying to do with Chemprop, providing as much detail as possible.
+
+ **Previous attempts**
+ If possible, provide some examples of what you've already tried and what the output was.
+
+ **Screenshots**
+ If applicable, add screenshots to help explain your problem.
chemprop-updated/.github/ISSUE_TEMPLATE/v2_bug_report.md ADDED
@@ -0,0 +1,35 @@
+ ---
+ name: v2 Bug Report
+ about: Create a report to help us improve
+ title: "[v2 BUG]: "
+ labels: bug
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+ A clear and concise description of what the bug is.
+
+ **Example(s)**
+ Provide some examples of where the current code fails. Feel free to share your actual code for additional context, but a minimal and isolated example is preferred.
+
+ **Expected behavior**
+ A clear and concise description of what you expected to happen. If there is correct, expected output, include that here as well.
+
+ **Error Stack Trace**
+ If the bug is resulting in an error message, provide the _full_ stack trace (not just the last line). This is helpful for debugging, especially in cases where you aren't able to provide a minimal/isolated working example with accompanying files.
+
+ **Screenshots**
+ If applicable, add screenshots to help explain your problem.
+
+ **Environment**
+ - python version
+ - package versions: `conda list` or `pip list`
+ - OS
+
+ **Checklist**
+ - [ ] all dependencies are satisfied: `conda list` or `pip list` shows the packages listed in the `pyproject.toml`
+ - [ ] the unit tests are working: `pytest -v` reports no errors
+
+ **Additional context**
+ Add any other context about the problem here.
chemprop-updated/.github/ISSUE_TEMPLATE/v2_feature_request.md ADDED
@@ -0,0 +1,23 @@
+ ---
+ name: v2 Feature Request
+ about: Suggest an idea for this project
+ title: "[v2 FEATURE]: "
+ labels: enhancement
+ assignees: ''
+
+ ---
+
+ **Is your feature request related to a problem? Please describe.**
+ A clear and concise description of what the problem is.
+
+ **Use-cases/examples of this new feature**
+ What are some example workflows that would employ this new feature? Are there any relevant issues?
+
+ **Desired solution/workflow**
+ A clear and concise description of what you want to happen. Include some (pseudo)code, if possible.
+
+ **Discussion**
+ What are some considerations around this new feature? Are there alternative approaches to consider? What should the scope of the feature be?
+
+ **Additional context**
+ Add any other context or screenshots about the feature request here.
chemprop-updated/.github/ISSUE_TEMPLATE/v2_question.md ADDED
@@ -0,0 +1,17 @@
+ ---
+ name: v2 Question
+ about: Have a question about how to use Chemprop v2?
+ title: "[v2 QUESTION]: "
+ labels: question
+ assignees: ''
+
+ ---
+
+ **What are you trying to do?**
+ Please tell us what you're trying to do with Chemprop, providing as much detail as possible.
+
+ **Previous attempts**
+ If possible, provide some examples of what you've already tried and what the output was.
+
+ **Screenshots**
+ If applicable, add screenshots to help explain your problem.
chemprop-updated/.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,18 @@
+ ## Description
+ Include a brief summary of the bug/feature/etc. that this PR seeks to address
+
+ ## Example / Current workflow
+ Include a sample workflow to either **(a)** reproduce the bug with the current codebase or **(b)** showcase the deficiency that this PR seeks to address
+
+ ## Bugfix / Desired workflow
+ Include either **(a)** the same workflow from above with the correct output produced via this PR or **(b)** some (pseudo)code containing the new workflow that this PR will (seek to) implement
+
+ ## Questions
+ If there are open questions about implementation strategy or scope of the PR, include them here
+
+ ## Relevant issues
+ If appropriate, please tag them here and include a quick summary
+
+ ## Checklist
+ - [ ] linted with flake8?
+ - [ ] (if appropriate) unit tests added?
chemprop-updated/.github/PULL_REQUEST_TEMPLATE/bugfix.md ADDED
@@ -0,0 +1,12 @@
+ ## Bug report
+ Include a brief summary of the bug that this PR seeks to address. If possible, include relevant issue tags
+
+ ## Example
+ Include a sample execution to reproduce the bug with the current codebase, and some sample output showcasing that the PR fixes this bug
+
+ ## Questions
+ If there are open questions about implementation strategy or scope of the PR, include them here
+
+ ## Checklist
+ - [ ] linted with flake8?
+ - [ ] (if necessary) appropriate unit tests added?
chemprop-updated/.github/PULL_REQUEST_TEMPLATE/new_feature.md ADDED
@@ -0,0 +1,15 @@
+ ## Statement of need
+ What deficiency does this PR seek to address? If there are relevant issues, please tag them here
+
+ ## Current workflow
+ How is this need achieved with the current codebase?
+
+ ## Desired workflow
+ Include some (pseudo)code containing the new workflow that this PR will (seek to) implement
+
+ ## Questions
+ If there are open questions about implementation strategy or scope of the PR, include them here
+
+ ## Checklist
+ - [ ] linted with flake8?
+ - [ ] appropriate unit tests added?
chemprop-updated/.github/workflows/ci.yml ADDED
@@ -0,0 +1,158 @@
+ # ci.yml
+ #
+ # Continuous Integration for Chemprop - checks build, code formatting, and runs tests for all
+ # proposed changes and on a regular schedule
+ #
+ # Note: this file contains extensive inline documentation to aid with knowledge transfer.
+
+ name: Continuous Integration
+
+ on:
+   # run on pushes/pull requests to/against main
+   push:
+     branches: [main]
+   pull_request:
+     branches: [main]
+   # run this in the morning on weekdays to catch dependency issues
+   schedule:
+     - cron: "0 8 * * 1-5"
+   # allow manual runs
+   workflow_dispatch:
+
+ # cancel previously running tests if new commits are made
+ # https://docs.github.com/en/actions/examples/using-concurrency-expressions-and-a-test-matrix
+ concurrency:
+   group: actions-id-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+   cancel-in-progress: true
+
+ env:
+   USE_LIBUV: 0  # libuv doesn't work on the GitHub Actions Windows runner
+
+ jobs:
+   build:
+     name: Check Build
+     runs-on: ubuntu-latest
+     steps:
+       # clone the repo, attempt to build
+       - uses: actions/checkout@v4
+       - run: python -m pip install build
+       - run: python -m build .
+
+   lint:
+     name: Check Formatting
+     needs: build
+     runs-on: ubuntu-latest
+     steps:
+       # clone the repo, run black and flake8 on it
+       - uses: actions/checkout@v4
+       - run: python -m pip install black==23.* flake8 isort
+       - run: black --check .
+       - run: flake8 .
+       - run: isort --check .
+
+   test:
+     name: Execute Tests
+     needs: lint
+     runs-on: ${{ matrix.os }}
+     defaults:
+       run:
+         # run with a login shell (so that the conda environment is activated)
+         # and exit immediately if any command fails (for easier debugging)
+         shell: bash -el {0}
+     strategy:
+       # if one platform/python version fails, continue testing the others
+       fail-fast: false
+       matrix:
+         # test on all platforms with both supported versions of Python
+         os: [ubuntu-latest, macos-13, windows-latest]
+         python-version: [3.11, 3.12]
+     steps:
+       - uses: actions/checkout@v4
+       # use a version of the conda virtual environment manager to set up an
+       # isolated environment with the Python version we want
+       - uses: conda-incubator/setup-miniconda@v3
+         with:
+           python-version: ${{ matrix.python-version }}
+           auto-update-conda: true
+           show-channel-urls: true
+           conda-remove-defaults: "true"
+           environment-file: environment.yml
+           activate-environment: chemprop
+       - name: Install dependencies
+         shell: bash -l {0}
+         run: |
+           python -m pip install nbmake
+           python -m pip install ".[dev,docs,test,hpopt]"
+       - name: Test with pytest
+         shell: bash -l {0}
+         run: |
+           pytest -v tests
+       - name: Test notebooks
+         shell: bash -l {0}
+         run: |
+           python -m pip install matplotlib
+           pytest --no-cov -v --nbmake $(find examples -name '*.ipynb' ! -name 'use_featurizer_with_other_libraries.ipynb' ! -name 'shapley_value_with_customized_featurizers.ipynb')
+           pytest --no-cov -v --nbmake $(find docs/source/tutorial/python -name "*.ipynb")
+
+   pypi:
+     name: Build and publish Python 🐍 distributions 📦 to PyPI
+     runs-on: ubuntu-latest
+     # only run if the tests pass
+     needs: [test]
+     # run only on pushes to main on chemprop
+     if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'chemprop/chemprop' }}
+     steps:
+       - uses: actions/checkout@master
+       - name: Set up Python 3.11
+         uses: actions/setup-python@v3
+         with:
+           python-version: "3.11"
+       - name: Install pypa/build
+         run: >-
+           python -m
+           pip install
+           build
+           --user
+       - name: Build a binary wheel and a source tarball
+         run: >-
+           python -m
+           build
+           --sdist
+           --wheel
+           --outdir dist/
+           .
+       - name: Publish distribution 📦 to PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
+           skip-existing: true
+           verbose: true
+
+   build-and-push-docker:
+     # shamelessly copied from:
+     # https://github.com/ReactionMechanismGenerator/RMG-Py/blob/bfaee1cad9909a17103a8e6ef9a22569c475964c/.github/workflows/CI.yml#L359C1-L386C54
+     # which is also shamelessly copied from somewhere
+     runs-on: ubuntu-latest
+     # only run if the tests pass
+     needs: [test]
+     # run only on pushes to main on chemprop
+     if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' && github.repository == 'chemprop/chemprop' }}
+     steps:
+       - name: Set up QEMU
+         uses: docker/setup-qemu-action@v2
+
+       - name: Set up Docker Buildx
+         uses: docker/setup-buildx-action@v2
+
+       - name: Login to Docker Hub
+         uses: docker/login-action@v2
+         with:
+           # repository secrets managed by the maintainers
+           username: ${{ secrets.DOCKERHUB_USERNAME }}
+           password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+       - name: Build and Push
+         uses: docker/build-push-action@v4
+         with:
+           push: true
+           tags: chemprop/chemprop:latest
chemprop-updated/.gitignore ADDED
@@ -0,0 +1,178 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ *.idea
+ *.DS_Store
+ *.vscode
+ *.csv
+ *.pkl
+ *.pt
+ *.json
+ *.sqlite3
+ *.yaml
+ *.tfevents.*
+ *.ckpt
+ chemprop/_version.py
+ *.ckpt
+ *.ipynb
+ config.toml
+
+ !tests/data/*
chemprop-updated/.readthedocs.yml ADDED
@@ -0,0 +1,19 @@
+ # .readthedocs.yml
+ # Read the Docs configuration file
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+ # Required
+ version: 2
+
+ # Set the OS, Python version and other tools you might need
+ build:
+   os: ubuntu-22.04
+   tools:
+     python: "3.11"
+   jobs:
+     post_install:
+       - python -m pip install --upgrade --upgrade-strategy only-if-needed --no-cache-dir ".[docs]"
+
+ # Build documentation in the docs/ directory with Sphinx
+ sphinx:
+   configuration: docs/source/conf.py
chemprop-updated/CITATIONS.bib ADDED
@@ -0,0 +1,37 @@
+ # this was downloaded from ACS: https://pubs.acs.org/doi/10.1021/acs.jcim.9b00237
+ @article{chemprop_theory,
+   author = {Yang, Kevin and Swanson, Kyle and Jin, Wengong and Coley, Connor and Eiden, Philipp and Gao, Hua and Guzman-Perez, Angel and Hopper, Timothy and Kelley, Brian and Mathea, Miriam and Palmer, Andrew and Settels, Volker and Jaakkola, Tommi and Jensen, Klavs and Barzilay, Regina},
+   title = {Analyzing Learned Molecular Representations for Property Prediction},
+   journal = {Journal of Chemical Information and Modeling},
+   volume = {59},
+   number = {8},
+   pages = {3370-3388},
+   year = {2019},
+   doi = {10.1021/acs.jcim.9b00237},
+   note = {PMID: 31361484},
+   URL = {https://doi.org/10.1021/acs.jcim.9b00237},
+   eprint = {https://doi.org/10.1021/acs.jcim.9b00237}
+ }
+
+ # this was downloaded from ACS: https://pubs.acs.org/doi/10.1021/acs.jcim.3c01250
+ @article{chemprop_software,
+   author = {Heid, Esther and Greenman, Kevin P. and Chung, Yunsie and Li, Shih-Cheng and Graff, David E. and Vermeire, Florence H. and Wu, Haoyang and Green, William H. and McGill, Charles J.},
+   title = {Chemprop: A Machine Learning Package for Chemical Property Prediction},
+   journal = {Journal of Chemical Information and Modeling},
+   volume = {64},
+   number = {1},
+   pages = {9-17},
+   year = {2024},
+   doi = {10.1021/acs.jcim.3c01250},
+   note = {PMID: 38147829},
+   URL = {https://doi.org/10.1021/acs.jcim.3c01250},
+   eprint = {https://doi.org/10.1021/acs.jcim.3c01250}
+ }
chemprop-updated/CONTRIBUTING.md ADDED
@@ -0,0 +1,40 @@
+ # How to contribute
+
+ We welcome contributions from external contributors, and this document
+ describes how to merge code changes into this repository.
+
+ ## Getting Started
+
+ * Make sure you have a [GitHub account](https://github.com/signup/free).
+ * [Fork](https://help.github.com/articles/fork-a-repo/) this repository on GitHub.
+ * On your local machine, [clone](https://help.github.com/articles/cloning-a-repository/) your fork of the repository.
+
+ ## Making Changes
+
+ * Add some really awesome code to your local fork. It's usually a [good idea](http://blog.jasonmeridth.com/posts/do-not-issue-pull-requests-from-your-master-branch/) to make changes on a [branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) with the branch name relating to the feature you are going to add.
+ * When you are ready for others to examine and comment on your new feature, navigate to your fork of `chemprop` on GitHub and open a [pull request](https://help.github.com/articles/using-pull-requests/) (PR). Note that after you launch a PR from one of your fork's branches, all subsequent commits to that branch will be added to the open pull request automatically. Each commit added to the PR will be validated for mergeability, compilation, and test suite compliance; the results of these tests will be visible on the PR page.
+ * If you're providing a new feature, you **must** add test cases and documentation.
+ * When the code is ready to go, run the test suite: `pytest`.
+ * When you're ready to be considered for merging, click the "Ready for review" box on the PR page to let the Chemprop devs know that the changes are complete. The code will not be merged until the continuous integration returns checkmarks and at least one core developer gives an "Approved" review.
+
+ ## Additional Resources
+
+ * [General GitHub documentation](https://help.github.com/)
+ * [PR best practices](http://codeinthehole.com/writing/pull-requests-and-other-good-practices-for-teams-using-github/)
+ * [A guide to contributing to software packages](http://www.contribution-guide.org)
chemprop-updated/Dockerfile ADDED
@@ -0,0 +1,50 @@
+ # Dockerfile
+ #
+ # Builds a Docker image containing Chemprop and its required dependencies.
+ #
+ # Build this image with:
+ #   git clone https://github.com/chemprop/chemprop.git
+ #   cd chemprop
+ #   docker build --tag=chemprop:latest .
+ #
+ # Run the built image with:
+ #   docker run --name chemprop_container -it chemprop:latest
+ #
+ # Note:
+ # This image only runs on CPU - we do not provide a Dockerfile
+ # for GPU use (see installation documentation).
+
+ # Parent Image
+ FROM continuumio/miniconda3:latest
+
+ # Install libxrender1 (required by RDKit) and then clean up
+ RUN apt-get update && \
+     apt-get install -y \
+     libxrender1 && \
+     apt-get autoremove -y && \
+     apt-get clean -y
+
+ WORKDIR /opt/chemprop
+
+ # build an empty conda environment with appropriate Python version
+ RUN conda create --name chemprop_env python=3.11*
+
+ # This runs all subsequent commands inside the chemprop_env conda environment
+ #
+ # Analogous to just activating the environment, which we can't actually do here
+ # since that requires running conda init and restarting the shell (not possible
+ # in a Dockerfile build script)
+ SHELL ["conda", "run", "--no-capture-output", "-n", "chemprop_env", "/bin/bash", "-c"]
+
+ # Follow the installation instructions then clear the cache
+ ADD chemprop chemprop
+ ENV PYTHONPATH /opt/chemprop
+ ADD LICENSE.txt pyproject.toml README.md ./
+ RUN conda install pytorch cpuonly -c pytorch && \
+     conda clean --all --yes && \
+     python -m pip install . && \
+     python -m pip cache purge
+
+ # when running this image, open an interactive bash terminal inside the conda environment
+ RUN echo "conda activate chemprop_env" > ~/.bashrc
+ ENTRYPOINT ["/bin/bash", "--login"]
chemprop-updated/LICENSE.txt ADDED
@@ -0,0 +1,27 @@
+ MIT License
+
+ Copyright (c) 2024 The Chemprop Development Team (Regina Barzilay,
+ Jackson Burns, Yunsie Chung, Anna Doner, Xiaorui Dong, David Graff,
+ William Green, Kevin Greenman, Yanfei Guan, Esther Heid, Lior Hirschfeld,
+ Tommi Jaakkola, Wengong Jin, Olivier Lafontant-Joseph, Shih-Cheng Li,
+ Mengjie Liu, Joel Manu, Charles McGill, Angiras Menon, Nathan Morgan,
+ Hao-Wei Pang, Kevin Spiekermann, Kyle Swanson, Allison Tam,
+ Florence Vermeire, Haoyang Wu, Kevin Yang, and Jonathan Zheng)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
chemprop-updated/README.md ADDED
@@ -0,0 +1,63 @@
+ <picture>
+   <source media="(prefers-color-scheme: dark)" srcset="docs/source/_static/images/logo/chemprop_logo_dark_mode.svg">
+   <img alt="Chemprop Logo" src="docs/source/_static/images/logo/chemprop_logo.svg">
+ </picture>
+
+ # Chemprop
+
+ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/chemprop)](https://badge.fury.io/py/chemprop)
+ [![PyPI version](https://badge.fury.io/py/chemprop.svg)](https://badge.fury.io/py/chemprop)
+ [![Anaconda-Server Badge](https://anaconda.org/conda-forge/chemprop/badges/version.svg)](https://anaconda.org/conda-forge/chemprop)
+ [![Build Status](https://github.com/chemprop/chemprop/workflows/tests/badge.svg)](https://github.com/chemprop/chemprop/actions/workflows/tests.yml)
+ [![Documentation Status](https://readthedocs.org/projects/chemprop/badge/?version=main)](https://chemprop.readthedocs.io/en/main/?badge=main)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Downloads](https://static.pepy.tech/badge/chemprop)](https://pepy.tech/project/chemprop)
+ [![Downloads](https://static.pepy.tech/badge/chemprop/month)](https://pepy.tech/project/chemprop)
+ [![Downloads](https://static.pepy.tech/badge/chemprop/week)](https://pepy.tech/project/chemprop)
+
+ Chemprop is a repository containing message passing neural networks for molecular property prediction.
+
+ Documentation can be found [here](https://chemprop.readthedocs.io/en/main/).
+
+ There are tutorial notebooks in the [`examples/`](https://github.com/chemprop/chemprop/tree/main/examples) directory.
+
+ Chemprop recently underwent a ground-up rewrite and new major release (v2.0.0). A helpful transition guide from Chemprop v1 to v2 can be found [here](https://docs.google.com/spreadsheets/u/3/d/e/2PACX-1vRshySIknVBBsTs5P18jL4WeqisxDAnDE5VRnzxqYEhYrMe4GLS17w5KeKPw9sged6TmmPZ4eEZSTIy/pubhtml). It includes a side-by-side comparison of CLI argument options, a list of which arguments will be implemented in later versions of v2, and a list of changes to default hyperparameters.
+
+ **License:** Chemprop is free to use under the [MIT License](LICENSE.txt). The Chemprop logo is free to use under [CC0 1.0](docs/source/_static/images/logo/LICENSE.txt).
+
+ **References**: Please cite the appropriate papers if Chemprop is helpful to your research.
+
+ - Chemprop was initially described in the papers [Analyzing Learned Molecular Representations for Property Prediction](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.9b00237) for molecules and [Machine Learning of Reaction Properties via Learned Representations of the Condensed Graph of Reaction](https://doi.org/10.1021/acs.jcim.1c00975) for reactions.
+ - The interpretation functionality (available in v1, but not yet implemented in v2) is based on the paper [Multi-Objective Molecule Generation using Interpretable Substructures](https://arxiv.org/abs/2002.03244).
+ - Chemprop now has its own dedicated manuscript that describes and benchmarks it in more detail: [Chemprop: A Machine Learning Package for Chemical Property Prediction](https://doi.org/10.1021/acs.jcim.3c01250).
+ - A paper describing and benchmarking the changes in v2.0.0 is forthcoming.
+
+ **Selected Applications**: Chemprop has been successfully used in the following works.
+
+ - [A Deep Learning Approach to Antibiotic Discovery](https://www.cell.com/cell/fulltext/S0092-8674(20)30102-1) - _Cell_ (2020): Chemprop was used to predict antibiotic activity against _E. coli_, leading to the discovery of [Halicin](https://en.wikipedia.org/wiki/Halicin), a novel antibiotic candidate. Model checkpoints are available on [Zenodo](https://doi.org/10.5281/zenodo.6527882).
+ - [Discovery of a structural class of antibiotics with explainable deep learning](https://www.nature.com/articles/s41586-023-06887-8) - _Nature_ (2023): Identified a structural class of antibiotics selective against methicillin-resistant _S. aureus_ (MRSA) and vancomycin-resistant enterococci using ensembles of Chemprop models, and explained results using Chemprop's interpret method.
+ - [ADMET-AI: A machine learning ADMET platform for evaluation of large-scale chemical libraries](https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/btae416/7698030?utm_source=authortollfreelink&utm_campaign=bioinformatics&utm_medium=email&guestAccessKey=f4fca1d2-49ec-4b10-b476-5aea3bf37045): Chemprop was trained on 41 absorption, distribution, metabolism, excretion, and toxicity (ADMET) datasets from the [Therapeutics Data Commons](https://tdcommons.ai). The Chemprop models in ADMET-AI are available both as a web server at [admet.ai.greenstonebio.com](https://admet.ai.greenstonebio.com) and as a Python package at [github.com/swansonk14/admet_ai](https://github.com/swansonk14/admet_ai).
+ - A more extensive list of successful Chemprop applications is given in our [2023 paper](https://doi.org/10.1021/acs.jcim.3c01250).
+
+ ## Version 1.x
+
+ For users who have not yet made the switch to Chemprop v2.0, please reference the following resources.
+
+ ### v1 Documentation
+
+ - Documentation of Chemprop v1 is available [here](https://chemprop.readthedocs.io/en/v1.7.1/). Note that the content of this site is several versions behind the final v1 release (v1.7.1) and does not cover the full scope of features available in Chemprop v1.
+ - The v1 [README](https://github.com/chemprop/chemprop/blob/v1.7.1/README.md) is the best source for documentation on more recently added features.
+ - Please also see descriptions of all the possible command line arguments in the v1 [`args.py`](https://github.com/chemprop/chemprop/blob/v1.7.1/chemprop/args.py) file.
+
+ ### v1 Tutorials and Examples
+
+ - [Benchmark scripts](https://github.com/chemprop/chemprop_benchmark) - scripts from our 2023 paper, providing examples of many features using Chemprop v1.6.1
+ - [ACS Fall 2023 Workshop](https://github.com/chemprop/chemprop-workshop-acs-fall2023) - presentation, interactive demo, and exercises on Google Colab with a solution key
+ - [Google Colab notebook](https://colab.research.google.com/github/chemprop/chemprop/blob/v1.7.1/colab_demo.ipynb) - several examples, intended to be run in Google Colab rather than as a Jupyter notebook on your local machine
+ - [nanoHUB tool](https://nanohub.org/resources/chempropdemo/) - a notebook of examples similar to the Colab notebook above; doesn't require any installation
+ - [YouTube video](https://www.youtube.com/watch?v=TeOl5E8Wo2M) - lecture accompanying the nanoHUB tool
+ - These [slides](https://docs.google.com/presentation/d/14pbd9LTXzfPSJHyXYkfLxnK8Q80LhVnjImg8a3WqCRM/edit?usp=sharing) provide a Chemprop tutorial and highlight additions as of April 28th, 2020
+
+ ### v1 Known Issues
+
+ We have discontinued support for v1 since v2 has been released, but we still appreciate v1 bug reports and will tag them as [`v1-wontfix`](https://github.com/chemprop/chemprop/issues?q=label%3Av1-wontfix+) so the community can find them easily.
chemprop-updated/chemprop/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from . import data, exceptions, featurizers, models, nn, schedulers, utils
+
+ __all__ = ["data", "featurizers", "models", "nn", "utils", "exceptions", "schedulers"]
+
+ __version__ = "2.1.2"
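The new top-level `__init__.py` re-exports the package's submodules and pins the version string that `.bumpversion.cfg` tracks. A minimal smoke test, assuming the package is installed in the current environment:

```python
# Minimal smoke test of the package layout defined above
# (assumes chemprop v2 is installed in the current environment).
import chemprop

assert chemprop.__version__ == "2.1.2"  # the string tracked by .bumpversion.cfg
print(sorted(chemprop.__all__))         # data, exceptions, featurizers, ...

from chemprop import data, models       # submodules importable from the top level
```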
chemprop-updated/chemprop/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (743 Bytes)
chemprop-updated/chemprop/__pycache__/args.cpython-37.pyc ADDED
Binary file (33.7 kB)
chemprop-updated/chemprop/__pycache__/constants.cpython-37.pyc ADDED
Binary file (430 Bytes)
chemprop-updated/chemprop/__pycache__/hyperopt_utils.cpython-37.pyc ADDED
Binary file (11.1 kB)
chemprop-updated/chemprop/__pycache__/hyperparameter_optimization.cpython-37.pyc ADDED
Binary file (6.15 kB)
chemprop-updated/chemprop/__pycache__/interpret.cpython-37.pyc ADDED
Binary file (14.2 kB)
chemprop-updated/chemprop/__pycache__/multitask_utils.cpython-37.pyc ADDED
Binary file (3.12 kB)
chemprop-updated/chemprop/__pycache__/nn_utils.cpython-37.pyc ADDED
Binary file (8.13 kB)
chemprop-updated/chemprop/__pycache__/rdkit.cpython-37.pyc ADDED
Binary file (1.43 kB)
chemprop-updated/chemprop/__pycache__/sklearn_predict.cpython-37.pyc ADDED
Binary file (2.82 kB)
chemprop-updated/chemprop/__pycache__/sklearn_train.cpython-37.pyc ADDED
Binary file (11.4 kB)
chemprop-updated/chemprop/__pycache__/spectra_utils.cpython-37.pyc ADDED
Binary file (5.1 kB)
chemprop-updated/chemprop/__pycache__/utils.cpython-37.pyc ADDED
Binary file (26.5 kB)
chemprop-updated/chemprop/cli/common.py ADDED
@@ -0,0 +1,211 @@
+ from argparse import ArgumentError, ArgumentParser, Namespace
+ import logging
+ from pathlib import Path
+
+ from chemprop.cli.utils import LookupAction
+ from chemprop.cli.utils.args import uppercase
+ from chemprop.featurizers import AtomFeatureMode, MoleculeFeaturizerRegistry, RxnMode
+
+ logger = logging.getLogger(__name__)
+
+
+ def add_common_args(parser: ArgumentParser) -> ArgumentParser:
+     data_args = parser.add_argument_group("Shared input data args")
+     data_args.add_argument(
+         "-s",
+         "--smiles-columns",
+         nargs="+",
+         help="Column names in the input CSV containing SMILES strings (uses the 0th column by default)",
+     )
+     data_args.add_argument(
+         "-r",
+         "--reaction-columns",
+         nargs="+",
+         help="Column names in the input CSV containing reaction SMILES in the format ``REACTANT>AGENT>PRODUCT``, where 'AGENT' is optional",
+     )
+     data_args.add_argument(
+         "--no-header-row",
+         action="store_true",
+         help="Turn off using the first row in the input CSV as column names",
+     )
+
+     dataloader_args = parser.add_argument_group("Dataloader args")
+     dataloader_args.add_argument(
+         "-n",
+         "--num-workers",
+         type=int,
+         default=0,
+         help="""Number of workers for parallel data loading, where 0 means sequential
+ (Warning: setting ``num_workers`` to a value greater than 0 can cause hangs on Windows and MacOS)""",
+     )
+     dataloader_args.add_argument("-b", "--batch-size", type=int, default=64, help="Batch size")
+
+     parser.add_argument(
+         "--accelerator", default="auto", help="Passed directly to the lightning ``Trainer()``"
+     )
+     parser.add_argument(
+         "--devices",
+         default="auto",
+         help="Passed directly to the lightning ``Trainer()`` (must be a single string of comma-separated devices, e.g. '1, 2' if specifying multiple devices)",
+     )
+
+     featurization_args = parser.add_argument_group("Featurization args")
+     featurization_args.add_argument(
+         "--rxn-mode",
+         "--reaction-mode",
+         type=uppercase,
+         default="REAC_DIFF",
+         choices=list(RxnMode.keys()),
+         help="""Choices for construction of atom and bond features for reactions (case insensitive):
+
+ - ``REAC_PROD``: concatenates the reactants feature with the products feature
+ - ``REAC_DIFF``: concatenates the reactants feature with the difference in features between reactants and products (default)
+ - ``PROD_DIFF``: concatenates the products feature with the difference in features between reactants and products
+ - ``REAC_PROD_BALANCE``: concatenates the reactants feature with the products feature, balances imbalanced reactions
+ - ``REAC_DIFF_BALANCE``: concatenates the reactants feature with the difference in features between reactants and products, balances imbalanced reactions
+ - ``PROD_DIFF_BALANCE``: concatenates the products feature with the difference in features between reactants and products, balances imbalanced reactions""",
+     )
+     # TODO: Update documentation for multi_hot_atom_featurizer_mode
+     featurization_args.add_argument(
+         "--multi-hot-atom-featurizer-mode",
+         type=uppercase,
+         default="V2",
+         choices=list(AtomFeatureMode.keys()),
+         help="""Choices for multi-hot atom featurization scheme. This will affect both non-reaction and reaction featurization (case insensitive):
+
+ - ``V1``: Corresponds to the original configuration employed in Chemprop v1
+ - ``V2``: Tailored for a broad range of molecules, this configuration encompasses all elements in the first four rows of the periodic table, along with iodine. It is the default in Chemprop v2.
+ - ``ORGANIC``: This configuration is designed specifically for use with organic molecules for drug research and development and includes a subset of elements most common in organic chemistry, including H, B, C, N, O, F, Si, P, S, Cl, Br, and I.
+ - ``RIGR``: Modified V2 (default) featurizer using only the resonance-invariant atom and bond features""",
+     )
+     featurization_args.add_argument(
+         "--keep-h",
+         action="store_true",
+         help="Whether hydrogens explicitly specified in input should be kept in the mol graph",
+     )
+     featurization_args.add_argument(
+         "--add-h", action="store_true", help="Whether hydrogens should be added to the mol graph"
+     )
+     featurization_args.add_argument(
+         "--molecule-featurizers",
+         "--features-generators",
+         nargs="+",
+         action=LookupAction(MoleculeFeaturizerRegistry),
+         help="Method(s) of generating molecule features to use as extra descriptors",
+     )
+     # TODO: add in v2.1 to deprecate features-generators and then remove in v2.2
+     # featurization_args.add_argument(
+     #     "--features-generators", nargs="+", help="Renamed to `--molecule-featurizers`."
+     # )
+     featurization_args.add_argument(
+         "--descriptors-path",
+         type=Path,
+         help="Path to extra descriptors to concatenate to learned representation",
+     )
+     # TODO: Add in v2.1
+     # featurization_args.add_argument(
+     #     "--phase-features-path",
+     #     help="Path to features used to indicate the phase of the data in one-hot vector form. Used in spectra datatype.",
+     # )
+     featurization_args.add_argument(
+         "--no-descriptor-scaling", action="store_true", help="Turn off extra descriptor scaling"
+     )
+     featurization_args.add_argument(
+         "--no-atom-feature-scaling", action="store_true", help="Turn off extra atom feature scaling"
+     )
+     featurization_args.add_argument(
+         "--no-atom-descriptor-scaling",
+         action="store_true",
+         help="Turn off extra atom descriptor scaling",
+     )
+     featurization_args.add_argument(
+         "--no-bond-feature-scaling", action="store_true", help="Turn off extra bond feature scaling"
+     )
+     featurization_args.add_argument(
+         "--atom-features-path",
+         nargs="+",
+         action="append",
+         help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional atom features to supply before message passing (e.g., ``--atom-features-path 0 /path/to/features_0.npz`` indicates that the features at the given path should be supplied to the 0-th component). To supply additional features for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--atom-features-path [...] --atom-features-path [...]``).",
+     )
+     featurization_args.add_argument(
+         "--atom-descriptors-path",
+         nargs="+",
+         action="append",
+         help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional atom descriptors to supply after message passing (e.g., ``--atom-descriptors-path 0 /path/to/descriptors_0.npz`` indicates that the descriptors at the given path should be supplied to the 0-th component). To supply additional descriptors for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--atom-descriptors-path [...] --atom-descriptors-path [...]``).",
+     )
+     featurization_args.add_argument(
+         "--bond-features-path",
+         nargs="+",
+         action="append",
+         help="If a single path is given, it is assumed to correspond to the 0-th molecule. Alternatively, it can be a two-tuple of molecule index and path to additional bond features to supply before message passing (e.g., ``--bond-features-path 0 /path/to/features_0.npz`` indicates that the features at the given path should be supplied to the 0-th component). To supply additional features for multiple components, repeat this argument on the command line for each component's respective values (e.g., ``--bond-features-path [...] --bond-features-path [...]``).",
+     )
+     # TODO: Add in v2.2
+     # parser.add_argument(
+     #     "--constraints-path",
+     #     help="Path to constraints applied to atomic/bond properties prediction.",
+     # )
+
+     return parser
+
+
+ def process_common_args(args: Namespace) -> Namespace:
+     # TODO: add in v2.1 to deprecate features-generators and then remove in v2.2
+     # if args.features_generators is not None:
+     #     raise ArgumentError(
+     #         argument=None,
+     #         message="`--features-generators` has been renamed to `--molecule-featurizers`.",
+     #     )
+
+     for key in ["atom_features_path", "atom_descriptors_path", "bond_features_path"]:
+         inds_paths = getattr(args, key)
+
+         if not inds_paths:
+             continue
+
+         ind_path_dict = {}
+
+         for ind_path in inds_paths:
+             if len(ind_path) > 2:
+                 raise ArgumentError(
+                     argument=None,
+                     message="Too many arguments were given for atom features/descriptors or bond features. It should be either a two-tuple of molecule index and a path, or a single path (assumed to be the 0-th molecule).",
+                 )
+
+             if len(ind_path) == 1:
+                 ind = 0
+                 path = ind_path[0]
+             else:
+                 ind, path = ind_path
+
+             if ind_path_dict.get(int(ind), None):
+                 raise ArgumentError(
+                     argument=None,
+                     message=f"Duplicate atom features/descriptors or bond features given for molecule index {ind}",
+                 )
+
+             ind_path_dict[int(ind)] = Path(path)
+
+         setattr(args, key, ind_path_dict)
+
+     return args
+
+
+ def validate_common_args(args):
+     pass
+
+
+ def find_models(model_paths: list[Path]):
+     collected_model_paths = []
+
+     for model_path in model_paths:
+         if model_path.suffix in [".ckpt", ".pt"]:
+             collected_model_paths.append(model_path)
+         elif model_path.is_dir():
+             collected_model_paths.extend(list(model_path.rglob("*.pt")))
+         else:
+             raise ArgumentError(
+                 argument=None,
+                 message=f"Expected a .ckpt or .pt file, or a directory. Got {model_path}",
+             )
+
+     return collected_model_paths
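To illustrate `process_common_args`: repeated `--atom-features-path` values arrive as lists of `[index, path]` pairs (or a bare path, assumed to be the 0-th molecule) and are normalized into a dict keyed by molecule index. A small sketch, assuming the module is importable in the current environment; the feature paths are hypothetical:

```python
# Sketch of the index/path normalization in process_common_args()
# (paths are hypothetical; all three keys must exist on the namespace).
from argparse import Namespace

from chemprop.cli.common import process_common_args

args = Namespace(
    atom_features_path=[["0", "feats_0.npz"], ["1", "feats_1.npz"]],  # two components
    atom_descriptors_path=None,
    bond_features_path=None,
)
args = process_common_args(args)
print(args.atom_features_path)  # e.g., {0: Path('feats_0.npz'), 1: Path('feats_1.npz')}
```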
chemprop-updated/chemprop/cli/conf.py ADDED
@@ -0,0 +1,9 @@
+ from datetime import datetime
+ import logging
+ import os
+ from pathlib import Path
+
+ LOG_DIR = Path(os.getenv("CHEMPROP_LOG_DIR", "chemprop_logs"))
+ LOG_LEVELS = {0: logging.INFO, 1: logging.DEBUG, -1: logging.WARNING, -2: logging.ERROR}
+ NOW = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+ CHEMPROP_TRAIN_DIR = Path(os.getenv("CHEMPROP_TRAIN_DIR", "chemprop_training"))
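Note that these constants are resolved from the environment at import time, so any override has to be in place before `chemprop.cli.conf` is first imported. A minimal sketch (the directory names are hypothetical):

```python
# Override the log/training directories before the module is first imported;
# once imported, LOG_DIR and CHEMPROP_TRAIN_DIR are fixed for the process.
import os

os.environ["CHEMPROP_LOG_DIR"] = "/tmp/my_chemprop_logs"        # hypothetical paths
os.environ["CHEMPROP_TRAIN_DIR"] = "/tmp/my_chemprop_training"

from chemprop.cli.conf import CHEMPROP_TRAIN_DIR, LOG_DIR

print(LOG_DIR, CHEMPROP_TRAIN_DIR)
```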
chemprop-updated/chemprop/cli/convert.py ADDED
@@ -0,0 +1,55 @@
+ from argparse import ArgumentError, ArgumentParser, Namespace
+ import logging
+ from pathlib import Path
+ import sys
+
+ from chemprop.cli.utils import Subcommand
+ from chemprop.utils.v1_to_v2 import convert_model_file_v1_to_v2
+
+ logger = logging.getLogger(__name__)
+
+
+ class ConvertSubcommand(Subcommand):
+     COMMAND = "convert"
+     HELP = "Convert a v1 model checkpoint (.pt) to a v2 model checkpoint (.pt)."
+
+     @classmethod
+     def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
+         parser.add_argument(
+             "-i",
+             "--input-path",
+             required=True,
+             type=Path,
+             help="Path to a v1 model .pt checkpoint file",
+         )
+         parser.add_argument(
+             "-o",
+             "--output-path",
+             type=Path,
+             help="Path to which the converted model will be saved (``CURRENT_DIRECTORY/STEM_OF_INPUT_v2.pt`` by default)",
+         )
+         return parser
+
+     @classmethod
+     def func(cls, args: Namespace):
+         if args.output_path is None:
+             args.output_path = Path(args.input_path.stem + "_v2.pt")
+         if args.output_path.suffix != ".pt":
+             raise ArgumentError(
+                 argument=None, message=f"Output must be a `.pt` file. Got {args.output_path}"
+             )
+
+         logger.info(
+             f"Converting v1 model checkpoint '{args.input_path}' to v2 model checkpoint '{args.output_path}'..."
+         )
+         convert_model_file_v1_to_v2(args.input_path, args.output_path)
+
+
+ if __name__ == "__main__":
+     parser = ArgumentParser()
+     parser = ConvertSubcommand.add_args(parser)
+
+     logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
+
+     args = parser.parse_args()
+     ConvertSubcommand.func(args)
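The subcommand is a thin wrapper around `convert_model_file_v1_to_v2`, so the same conversion can be scripted directly. A sketch with hypothetical file names, mirroring the subcommand's default output naming:

```python
# Programmatic equivalent of `chemprop convert -i model_v1.pt`
# (file names are hypothetical; the v1 checkpoint must exist on disk).
from pathlib import Path

from chemprop.utils.v1_to_v2 import convert_model_file_v1_to_v2

input_path = Path("model_v1.pt")
output_path = Path(input_path.stem + "_v2.pt")  # mirrors the subcommand's default
convert_model_file_v1_to_v2(input_path, output_path)
```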
chemprop-updated/chemprop/cli/fingerprint.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentError, ArgumentParser, Namespace
2
+ import logging
3
+ from pathlib import Path
4
+ import sys
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+
10
+ from chemprop import data
11
+ from chemprop.cli.common import add_common_args, process_common_args, validate_common_args
12
+ from chemprop.cli.predict import find_models
13
+ from chemprop.cli.utils import Subcommand, build_data_from_files, make_dataset
14
+ from chemprop.models import load_model
15
+ from chemprop.nn.metrics import LossFunctionRegistry
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class FingerprintSubcommand(Subcommand):
21
+ COMMAND = "fingerprint"
22
+ HELP = "Use a pretrained chemprop model to calculate learned representations."
23
+
24
+ @classmethod
25
+ def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
26
+ parser = add_common_args(parser)
27
+ parser.add_argument(
28
+ "-i",
29
+ "--test-path",
30
+ required=True,
31
+ type=Path,
32
+ help="Path to an input CSV file containing SMILES",
33
+ )
34
+ parser.add_argument(
35
+ "-o",
36
+ "--output",
37
+ "--preds-path",
38
+ type=Path,
39
+ help="Specify the path where predictions will be saved. If the file extension is .npz, they will be saved as a npz file. Otherwise, the predictions will be saved as a CSV. The index of the model will be appended to the filename's stem. By default, predictions will be saved to the same location as ``--test-path`` with '_fps' appended (e.g., 'PATH/TO/TEST_PATH_fps_0.csv').",
40
+ )
41
+ parser.add_argument(
42
+ "--model-paths",
43
+ "--model-path",
44
+ required=True,
45
+ type=Path,
46
+ nargs="+",
47
+ help="Specify location of checkpoint(s) or model file(s) to use for prediction. It can be a path to either a single pretrained model checkpoint (.ckpt) or single pretrained model file (.pt), a directory that contains these files, or a list of path(s) and directory(s). If a directory, chemprop will recursively search and predict on all found (.pt) models.",
48
+ )
49
+ parser.add_argument(
50
+ "--ffn-block-index",
51
+ required=True,
52
+ type=int,
53
+ default=-1,
54
+ help="The index indicates which linear layer returns the encoding in the FFN. An index of 0 denotes the post-aggregation representation through a 0-layer MLP, while an index of 1 represents the output from the first linear layer in the FFN, and so forth.",
55
+ )
56
+
57
+ return parser
58
+
59
+ @classmethod
60
+ def func(cls, args: Namespace):
61
+ args = process_common_args(args)
62
+ validate_common_args(args)
63
+ args = process_fingerprint_args(args)
64
+ main(args)
65
+
66
+
67
+ def process_fingerprint_args(args: Namespace) -> Namespace:
68
+ if args.test_path.suffix not in [".csv"]:
69
+ raise ArgumentError(
70
+ argument=None, message=f"Input data must be a CSV file. Got {args.test_path}"
71
+ )
72
+ if args.output is None:
73
+ args.output = args.test_path.parent / (args.test_path.stem + "_fps.csv")
74
+ if args.output.suffix not in [".csv", ".npz"]:
75
+ raise ArgumentError(
76
+ argument=None, message=f"Output must be a CSV or NPZ file. Got '{args.output}'."
77
+ )
78
+ return args
79
+
80
+
81
+ def make_fingerprint_for_model(
82
+ args: Namespace, model_path: Path, multicomponent: bool, output_path: Path
83
+ ):
84
+ model = load_model(model_path, multicomponent)
85
+ model.eval()
86
+
87
+ bounded = any(
88
+ isinstance(model.criterion, LossFunctionRegistry[loss_function])
89
+ for loss_function in LossFunctionRegistry.keys()
90
+ if "bounded" in loss_function
91
+ )
92
+
93
+ format_kwargs = dict(
94
+ no_header_row=args.no_header_row,
95
+ smiles_cols=args.smiles_columns,
96
+ rxn_cols=args.reaction_columns,
97
+ target_cols=[],
98
+ ignore_cols=None,
99
+ splits_col=None,
100
+ weight_col=None,
101
+ bounded=bounded,
102
+ )
103
+
104
+ featurization_kwargs = dict(
105
+ molecule_featurizers=args.molecule_featurizers, keep_h=args.keep_h, add_h=args.add_h
106
+ )
107
+
108
+ test_data = build_data_from_files(
109
+ args.test_path,
110
+ **format_kwargs,
111
+ p_descriptors=args.descriptors_path,
112
+ p_atom_feats=args.atom_features_path,
113
+ p_bond_feats=args.bond_features_path,
114
+ p_atom_descs=args.atom_descriptors_path,
115
+ **featurization_kwargs,
116
+ )
117
+ logger.info(f"test size: {len(test_data[0])}")
118
+ test_dsets = [
119
+ make_dataset(d, args.rxn_mode, args.multi_hot_atom_featurizer_mode) for d in test_data
120
+ ]
121
+
122
+ if multicomponent:
123
+ test_dset = data.MulticomponentDataset(test_dsets)
124
+ else:
125
+ test_dset = test_dsets[0]
126
+
127
+ test_loader = data.build_dataloader(test_dset, args.batch_size, args.num_workers, shuffle=False)
128
+
129
+ logger.info(model)
130
+
131
+ with torch.no_grad():
132
+ if multicomponent:
133
+ encodings = [
134
+ model.encoding(batch.bmgs, batch.V_ds, batch.X_d, args.ffn_block_index)
135
+ for batch in test_loader
136
+ ]
137
+ else:
138
+ encodings = [
139
+ model.encoding(batch.bmg, batch.V_d, batch.X_d, args.ffn_block_index)
140
+ for batch in test_loader
141
+ ]
142
+ H = torch.cat(encodings, 0).numpy()
143
+
144
+ if output_path.suffix in [".npz"]:
145
+ np.savez(output_path, H=H)
146
+ elif output_path.suffix == ".csv":
147
+ fingerprint_columns = [f"fp_{i}" for i in range(H.shape[1])]
148
+ df_fingerprints = pd.DataFrame(H, columns=fingerprint_columns)
149
+ df_fingerprints.to_csv(output_path, index=False)
150
+ else:
151
+ raise ArgumentError(
152
+ argument=None, message=f"Output must be a CSV or NPZ file. Got '{output_path}'."
153
+ )
154
+ logger.info(f"Fingerprints saved to '{output_path}'")
155
+
156
+
157
+ def main(args):
158
+ match (args.smiles_columns, args.reaction_columns):
159
+ case [None, None]:
160
+ n_components = 1
161
+ case [_, None]:
162
+ n_components = len(args.smiles_columns)
163
+ case [None, _]:
164
+ n_components = len(args.reaction_columns)
165
+ case _:
166
+ n_components = len(args.smiles_columns) + len(args.reaction_columns)
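+ # e.g., two SMILES columns plus one reaction column gives n_components == 3,
+ # which routes everything below through the multicomponent code path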
167
+
168
+ multicomponent = n_components > 1
169
+
170
+ for i, model_path in enumerate(find_models(args.model_paths)):
171
+ logger.info(f"Fingerprints with model {i} at '{model_path}'")
172
+ output_path = args.output.parent / f"{args.output.stem}_{i}{args.output.suffix}"
173
+ make_fingerprint_for_model(args, model_path, multicomponent, output_path)
174
+
175
+
176
+ if __name__ == "__main__":
177
+ parser = ArgumentParser()
178
+ parser = FingerprintSubcommand.add_args(parser)
179
+
180
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
181
+ args = parser.parse_args()
182
+ args = FingerprintSubcommand.func(args)
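
A minimal sketch of driving the subcommand above programmatically, mirroring its ``__main__`` block (the CSV and checkpoint file names here are hypothetical):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser = FingerprintSubcommand.add_args(parser)
    # -i/--test-path and --model-paths are required; --ffn-block-index -1 takes
    # the encoding just before the final FFN layer
    args = parser.parse_args(
        ["-i", "smiles.csv", "--model-paths", "model_0.pt", "--ffn-block-index", "-1"]
    )
    FingerprintSubcommand.func(args)  # by default writes smiles_fps_0.csv next to the input
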
chemprop-updated/chemprop/cli/hpopt.py ADDED
@@ -0,0 +1,537 @@
1
+ from copy import deepcopy
2
+ import logging
3
+ from pathlib import Path
4
+ import shutil
5
+ import sys
6
+
7
+ from configargparse import ArgumentParser, Namespace
8
+ from lightning import pytorch as pl
9
+ from lightning.pytorch.callbacks import EarlyStopping
10
+ import numpy as np
11
+ import torch
12
+
13
+ from chemprop.cli.common import add_common_args, process_common_args, validate_common_args
14
+ from chemprop.cli.train import (
15
+ TrainSubcommand,
16
+ add_train_args,
17
+ build_datasets,
18
+ build_model,
19
+ build_splits,
20
+ normalize_inputs,
21
+ process_train_args,
22
+ save_config,
23
+ validate_train_args,
24
+ )
25
+ from chemprop.cli.utils.command import Subcommand
26
+ from chemprop.data import build_dataloader
27
+ from chemprop.nn import AggregationRegistry, MetricRegistry
28
+ from chemprop.nn.transforms import UnscaleTransform
29
+ from chemprop.nn.utils import Activation
30
+
31
+ NO_RAY = False
32
+ DEFAULT_SEARCH_SPACE = {
33
+ "activation": None,
34
+ "aggregation": None,
35
+ "aggregation_norm": None,
36
+ "batch_size": None,
37
+ "depth": None,
38
+ "dropout": None,
39
+ "ffn_hidden_dim": None,
40
+ "ffn_num_layers": None,
41
+ "final_lr_ratio": None,
42
+ "message_hidden_dim": None,
43
+ "init_lr_ratio": None,
44
+ "max_lr": None,
45
+ "warmup_epochs": None,
46
+ }
47
+
48
+ try:
49
+ import ray
50
+ from ray import tune
51
+ from ray.train import CheckpointConfig, RunConfig, ScalingConfig
52
+ from ray.train.lightning import (
53
+ RayDDPStrategy,
54
+ RayLightningEnvironment,
55
+ RayTrainReportCallback,
56
+ prepare_trainer,
57
+ )
58
+ from ray.train.torch import TorchTrainer
59
+ from ray.tune.schedulers import ASHAScheduler, FIFOScheduler
60
+
61
+ DEFAULT_SEARCH_SPACE = {
62
+ "activation": tune.choice(categories=list(Activation.keys())),
63
+ "aggregation": tune.choice(categories=list(AggregationRegistry.keys())),
64
+ "aggregation_norm": tune.quniform(lower=1, upper=200, q=1),
65
+ "batch_size": tune.choice([16, 32, 64, 128, 256]),
66
+ "depth": tune.qrandint(lower=2, upper=6, q=1),
67
+ "dropout": tune.choice([0.0] * 8 + list(np.arange(0.05, 0.45, 0.05))),
68
+ "ffn_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
69
+ "ffn_num_layers": tune.qrandint(lower=1, upper=3, q=1),
70
+ "final_lr_ratio": tune.loguniform(lower=1e-2, upper=1),
71
+ "message_hidden_dim": tune.qrandint(lower=300, upper=2400, q=100),
72
+ "init_lr_ratio": tune.loguniform(lower=1e-2, upper=1),
73
+ "max_lr": tune.loguniform(lower=1e-4, upper=1e-2),
74
+ "warmup_epochs": None,
75
+ }
76
+ except ImportError:
77
+ NO_RAY = True
78
+
79
+ NO_HYPEROPT = False
80
+ try:
81
+ from ray.tune.search.hyperopt import HyperOptSearch
82
+ except ImportError:
83
+ NO_HYPEROPT = True
84
+
85
+ NO_OPTUNA = False
86
+ try:
87
+ from ray.tune.search.optuna import OptunaSearch
88
+ except ImportError:
89
+ NO_OPTUNA = True
90
+
91
+
92
+ logger = logging.getLogger(__name__)
93
+
94
+ SEARCH_SPACE = DEFAULT_SEARCH_SPACE
95
+
96
+ SEARCH_PARAM_KEYWORDS_MAP = {
97
+ "basic": ["depth", "ffn_num_layers", "dropout", "ffn_hidden_dim", "message_hidden_dim"],
98
+ "learning_rate": ["max_lr", "init_lr_ratio", "final_lr_ratio", "warmup_epochs"],
99
+ "all": list(DEFAULT_SEARCH_SPACE.keys()),
100
+ "init_lr": ["init_lr_ratio"],
101
+ "final_lr": ["final_lr_ratio"],
102
+ }
103
+
104
+
105
+ class HpoptSubcommand(Subcommand):
106
+ COMMAND = "hpopt"
107
+ HELP = "Perform hyperparameter optimization on the given task."
108
+
109
+ @classmethod
110
+ def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
111
+ parser = add_common_args(parser)
112
+ parser = add_train_args(parser)
113
+ return add_hpopt_args(parser)
114
+
115
+ @classmethod
116
+ def func(cls, args: Namespace):
117
+ args = process_common_args(args)
118
+ args = process_train_args(args)
119
+ args = process_hpopt_args(args)
120
+ validate_common_args(args)
121
+ validate_train_args(args)
122
+ main(args)
123
+
124
+
125
+ def add_hpopt_args(parser: ArgumentParser) -> ArgumentParser:
126
+ hpopt_args = parser.add_argument_group("Chemprop hyperparameter optimization arguments")
127
+
128
+ hpopt_args.add_argument(
129
+ "--search-parameter-keywords",
130
+ type=str,
131
+ nargs="+",
132
+ default=["basic"],
133
+ help=f"""The model parameters over which to search for an optimal hyperparameter configuration. Some options are bundles of parameters or otherwise special parameter operations. Special keywords include:
134
+ - ``basic``: Default set of hyperparameters for search (depth, ffn_num_layers, dropout, message_hidden_dim, and ffn_hidden_dim)
135
+ - ``learning_rate``: Search for max_lr, init_lr_ratio, final_lr_ratio, and warmup_epochs. The init_lr and final_lr values are searched as fractions of the max_lr value, and warmup_epochs is searched as a fraction of the total number of epochs used.
136
+ - ``all``: Include search for all 13 individual keyword options (i.e., also activation, aggregation, aggregation_norm, and batch_size, which aren't covered by the other two keywords).
137
+ Individual supported parameters:
138
+ {list(DEFAULT_SEARCH_SPACE.keys())}
139
+ """,
140
+ )
141
+
142
+ hpopt_args.add_argument(
143
+ "--hpopt-save-dir",
144
+ type=Path,
145
+ help="Directory to save the hyperparameter optimization results",
146
+ )
147
+
148
+ raytune_args = parser.add_argument_group("Ray Tune arguments")
149
+
150
+ raytune_args.add_argument(
151
+ "--raytune-num-samples",
152
+ type=int,
153
+ default=10,
154
+ help="Passed directly to Ray Tune ``TuneConfig`` to control number of trials to run",
155
+ )
156
+
157
+ raytune_args.add_argument(
158
+ "--raytune-search-algorithm",
159
+ choices=["random", "hyperopt", "optuna"],
160
+ default="hyperopt",
161
+ help="Passed to Ray Tune ``TuneConfig`` to control search algorithm",
162
+ )
163
+
164
+ raytune_args.add_argument(
165
+ "--raytune-trial-scheduler",
166
+ choices=["FIFO", "AsyncHyperBand"],
167
+ default="FIFO",
168
+ help="Passed to Ray Tune ``TuneConfig`` to control trial scheduler",
169
+ )
170
+
171
+ raytune_args.add_argument(
172
+ "--raytune-num-workers",
173
+ type=int,
174
+ default=1,
175
+ help="Passed directly to Ray Tune ``ScalingConfig`` to control number of workers to use",
176
+ )
177
+
178
+ raytune_args.add_argument(
179
+ "--raytune-use-gpu",
180
+ action="store_true",
181
+ help="Passed directly to Ray Tune ``ScalingConfig`` to control whether to use GPUs",
182
+ )
183
+
184
+ raytune_args.add_argument(
185
+ "--raytune-num-checkpoints-to-keep",
186
+ type=int,
187
+ default=1,
188
+ help="Passed directly to Ray Tune ``CheckpointConfig`` to control number of checkpoints to keep",
189
+ )
190
+
191
+ raytune_args.add_argument(
192
+ "--raytune-grace-period",
193
+ type=int,
194
+ default=10,
195
+ help="Passed directly to Ray Tune ``ASHAScheduler`` to control grace period",
196
+ )
197
+
198
+ raytune_args.add_argument(
199
+ "--raytune-reduction-factor",
200
+ type=int,
201
+ default=2,
202
+ help="Passed directly to Ray Tune ``ASHAScheduler`` to control reduction factor",
203
+ )
204
+
205
+ raytune_args.add_argument(
206
+ "--raytune-temp-dir", help="Passed directly to Ray Tune init to control temporary directory"
207
+ )
208
+
209
+ raytune_args.add_argument(
210
+ "--raytune-num-cpus",
211
+ type=int,
212
+ help="Passed directly to Ray Tune init to control number of CPUs to use",
213
+ )
214
+
215
+ raytune_args.add_argument(
216
+ "--raytune-num-gpus",
217
+ type=int,
218
+ help="Passed directly to Ray Tune init to control number of GPUs to use",
219
+ )
220
+
221
+ raytune_args.add_argument(
222
+ "--raytune-max-concurrent-trials",
223
+ type=int,
224
+ help="Passed directly to Ray Tune ``TuneConfig`` to control the maximum number of concurrent trials",
225
+ )
226
+
227
+ hyperopt_args = parser.add_argument_group("Hyperopt arguments")
228
+
229
+ hyperopt_args.add_argument(
230
+ "--hyperopt-n-initial-points",
231
+ type=int,
232
+ help="Passed directly to ``HyperOptSearch`` to control number of initial points to sample",
233
+ )
234
+
235
+ hyperopt_args.add_argument(
236
+ "--hyperopt-random-state-seed",
237
+ type=int,
238
+ default=None,
239
+ help="Passed directly to ``HyperOptSearch`` to control random state seed",
240
+ )
241
+
242
+ return parser
243
+
244
+
245
+ def process_hpopt_args(args: Namespace) -> Namespace:
246
+ if args.hpopt_save_dir is None:
247
+ args.hpopt_save_dir = Path(f"chemprop_hpopt/{args.data_path.stem}")
248
+
249
+ args.hpopt_save_dir.mkdir(exist_ok=True, parents=True)
250
+
251
+ search_parameters = set()
252
+
253
+ available_search_parameters = list(SEARCH_SPACE.keys()) + list(SEARCH_PARAM_KEYWORDS_MAP.keys())
254
+
255
+ for keyword in args.search_parameter_keywords:
256
+ if keyword not in available_search_parameters:
257
+ raise ValueError(
258
+ f"Search parameter keyword: {keyword} not in available options: {available_search_parameters}."
259
+ )
260
+
261
+ search_parameters.update(
262
+ SEARCH_PARAM_KEYWORDS_MAP[keyword]
263
+ if keyword in SEARCH_PARAM_KEYWORDS_MAP
264
+ else [keyword]
265
+ )
266
+
267
+ args.search_parameter_keywords = list(search_parameters)
268
+
269
+ if not args.hyperopt_n_initial_points:
270
+ args.hyperopt_n_initial_points = args.raytune_num_samples // 2
271
+
272
+ return args
273
+
274
+
275
+ def build_search_space(search_parameters: list[str], train_epochs: int) -> dict:
276
+ if "warmup_epochs" in search_parameters and SEARCH_SPACE.get("warmup_epochs", None) is None:
277
+ assert (
278
+ train_epochs >= 6
279
+ ), "Training epochs must be at least 6 to perform hyperparameter optimization for warmup_epochs."
280
+ SEARCH_SPACE["warmup_epochs"] = tune.qrandint(lower=1, upper=train_epochs // 2, q=1)
281
+
282
+ return {param: SEARCH_SPACE[param] for param in search_parameters}
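+ # e.g., build_search_space(["depth", "dropout"], 50) keeps just the tune.qrandint
+ # and tune.choice samplers registered for those two keys in SEARCH_SPACE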
283
+
284
+
285
+ def update_args_with_config(args: Namespace, config: dict) -> Namespace:
286
+ args = deepcopy(args)
287
+
288
+ for key, value in config.items():
289
+ match key:
290
+ case "final_lr_ratio":
291
+ setattr(args, "final_lr", value * config.get("max_lr", args.max_lr))
292
+
293
+ case "init_lr_ratio":
294
+ setattr(args, "init_lr", value * config.get("max_lr", args.max_lr))
295
+
296
+ case _:
297
+ assert key in args, f"Key: {key} not found in args."
298
+ setattr(args, key, value)
299
+
300
+ return args
301
+
302
+
303
+ def train_model(config, args, train_dset, val_dset, logger, output_transform, input_transforms):
304
+ args = update_args_with_config(args, config)
305
+
306
+ train_loader = build_dataloader(
307
+ train_dset, args.batch_size, args.num_workers, seed=args.data_seed
308
+ )
309
+ val_loader = build_dataloader(val_dset, args.batch_size, args.num_workers, shuffle=False)
310
+
311
+ seed = args.pytorch_seed if args.pytorch_seed is not None else torch.seed()
312
+
313
+ torch.manual_seed(seed)
314
+
315
+ model = build_model(args, train_loader.dataset, output_transform, input_transforms)
316
+ logger.info(model)
317
+
318
+ if args.tracking_metric == "val_loss":
319
+ T_tracking_metric = model.criterion.__class__
320
+ else:
321
+ T_tracking_metric = MetricRegistry[args.tracking_metric]
322
+ args.tracking_metric = "val/" + args.tracking_metric
323
+
324
+ monitor_mode = "max" if T_tracking_metric.higher_is_better else "min"
325
+ logger.debug(f"Evaluation metric: '{T_tracking_metric.alias}', mode: '{monitor_mode}'")
326
+
327
+ patience = args.patience if args.patience is not None else args.epochs
328
+ early_stopping = EarlyStopping(args.tracking_metric, patience=patience, mode=monitor_mode)
329
+
330
+ trainer = pl.Trainer(
331
+ accelerator=args.accelerator,
332
+ devices=args.devices,
333
+ max_epochs=args.epochs,
334
+ gradient_clip_val=args.grad_clip,
335
+ strategy=RayDDPStrategy(),
336
+ callbacks=[RayTrainReportCallback(), early_stopping],
337
+ plugins=[RayLightningEnvironment()],
338
+ deterministic=args.pytorch_seed is not None,
339
+ )
340
+ trainer = prepare_trainer(trainer)
341
+ trainer.fit(model, train_loader, val_loader)
342
+
343
+
344
+ def tune_model(
345
+ args, train_dset, val_dset, logger, monitor_mode, output_transform, input_transforms
346
+ ):
347
+ match args.raytune_trial_scheduler:
348
+ case "FIFO":
349
+ scheduler = FIFOScheduler()
350
+ case "AsyncHyperBand":
351
+ scheduler = ASHAScheduler(
352
+ max_t=args.epochs,
353
+ grace_period=min(args.raytune_grace_period, args.epochs),
354
+ reduction_factor=args.raytune_reduction_factor,
355
+ )
356
+ case _:
357
+ raise ValueError(f"Invalid trial scheduler! Got: {args.raytune_trial_scheduler}.")
358
+
359
+ resources_per_worker = {}
360
+ if args.raytune_num_cpus and args.raytune_max_concurrent_trials:
361
+ resources_per_worker["CPU"] = args.raytune_num_cpus / args.raytune_max_concurrent_trials
362
+ if args.raytune_num_gpus and args.raytune_max_concurrent_trials:
363
+ resources_per_worker["GPU"] = args.raytune_num_gpus / args.raytune_max_concurrent_trials
364
+ if not resources_per_worker:
365
+ resources_per_worker = None
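+ # e.g., --raytune-num-cpus 8 with --raytune-max-concurrent-trials 4 gives each worker 2 CPUs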
366
+
367
+ if args.raytune_num_gpus:
368
+ use_gpu = True
369
+ else:
370
+ use_gpu = args.raytune_use_gpu
371
+
372
+ scaling_config = ScalingConfig(
373
+ num_workers=args.raytune_num_workers,
374
+ use_gpu=use_gpu,
375
+ resources_per_worker=resources_per_worker,
376
+ trainer_resources={"CPU": 0},
377
+ )
378
+
379
+ checkpoint_config = CheckpointConfig(
380
+ num_to_keep=args.raytune_num_checkpoints_to_keep,
381
+ checkpoint_score_attribute=args.tracking_metric,
382
+ checkpoint_score_order=monitor_mode,
383
+ )
384
+
385
+ run_config = RunConfig(
386
+ checkpoint_config=checkpoint_config,
387
+ storage_path=args.hpopt_save_dir.absolute() / "ray_results",
388
+ )
389
+
390
+ ray_trainer = TorchTrainer(
391
+ lambda config: train_model(
392
+ config, args, train_dset, val_dset, logger, output_transform, input_transforms
393
+ ),
394
+ scaling_config=scaling_config,
395
+ run_config=run_config,
396
+ )
397
+
398
+ match args.raytune_search_algorithm:
399
+ case "random":
400
+ search_alg = None
401
+ case "hyperopt":
402
+ if NO_HYPEROPT:
403
+ raise ImportError(
404
+ "HyperOptSearch requires hyperopt to be installed. Use 'pip install -U hyperopt' to install it, or, if you installed from source, use 'pip install -e .[hpopt]' in the chemprop folder to install all hpopt-relevant packages."
405
+ )
406
+
407
+ search_alg = HyperOptSearch(
408
+ n_initial_points=args.hyperopt_n_initial_points,
409
+ random_state_seed=args.hyperopt_random_state_seed,
410
+ )
411
+ case "optuna":
412
+ if NO_OPTUNA:
413
+ raise ImportError(
414
+ "OptunaSearch requires optuna to be installed. Use 'pip install -U optuna' to install it, or, if you installed from source, use 'pip install -e .[hpopt]' in the chemprop folder to install all hpopt-relevant packages."
415
+ )
416
+
417
+ search_alg = OptunaSearch()
418
+
419
+ tune_config = tune.TuneConfig(
420
+ metric=args.tracking_metric,
421
+ mode=monitor_mode,
422
+ num_samples=args.raytune_num_samples,
423
+ scheduler=scheduler,
424
+ search_alg=search_alg,
425
+ trial_dirname_creator=lambda trial: str(trial.trial_id),
426
+ )
427
+
428
+ tuner = tune.Tuner(
429
+ ray_trainer,
430
+ param_space={
431
+ "train_loop_config": build_search_space(args.search_parameter_keywords, args.epochs)
432
+ },
433
+ tune_config=tune_config,
434
+ )
435
+
436
+ return tuner.fit()
437
+
438
+
439
+ def main(args: Namespace):
440
+ if NO_RAY:
441
+ raise ImportError(
442
+ "Ray Tune requires ray to be installed. If you installed Chemprop from PyPI, run 'pip install -U ray[tune]' to install ray. If you installed from source, use 'pip install -e .[hpopt]' in the Chemprop folder to install all hpopt-relevant packages."
443
+ )
444
+
445
+ if not ray.is_initialized():
446
+ try:
447
+ ray.init(
448
+ _temp_dir=args.raytune_temp_dir,
449
+ num_cpus=args.raytune_num_cpus,
450
+ num_gpus=args.raytune_num_gpus,
451
+ )
452
+ except OSError as e:
453
+ if "AF_UNIX path length cannot exceed 107 bytes" in str(e):
454
+ raise OSError(
455
+ f"Ray Tune fails due to: {e}. This can sometimes be solved by providing a temporary directory, num_cpus, and num_gpus to Ray Tune via the CLI: --raytune-temp-dir <absolute_path> --raytune-num-cpus <int> --raytune-num-gpus <int>."
456
+ )
457
+ else:
458
+ raise e
459
+ else:
460
+ logger.info("Ray is already initialized.")
461
+
462
+ format_kwargs = dict(
463
+ no_header_row=args.no_header_row,
464
+ smiles_cols=args.smiles_columns,
465
+ rxn_cols=args.reaction_columns,
466
+ target_cols=args.target_columns,
467
+ ignore_cols=args.ignore_columns,
468
+ splits_col=args.splits_column,
469
+ weight_col=args.weight_column,
470
+ bounded=args.loss_function is not None and "bounded" in args.loss_function,
471
+ )
472
+
473
+ featurization_kwargs = dict(
474
+ molecule_featurizers=args.molecule_featurizers, keep_h=args.keep_h, add_h=args.add_h
475
+ )
476
+
477
+ train_data, val_data, test_data = build_splits(args, format_kwargs, featurization_kwargs)
478
+ train_dset, val_dset, test_dset = build_datasets(args, train_data[0], val_data[0], test_data[0])
479
+
480
+ input_transforms = normalize_inputs(train_dset, val_dset, args)
481
+
482
+ if "regression" in args.task_type:
483
+ output_scaler = train_dset.normalize_targets()
484
+ val_dset.normalize_targets(output_scaler)
485
+ logger.info(f"Train data: mean = {output_scaler.mean_} | std = {output_scaler.scale_}")
486
+ output_transform = UnscaleTransform.from_standard_scaler(output_scaler)
487
+ else:
488
+ output_transform = None
489
+
490
+ train_loader = build_dataloader(
491
+ train_dset, args.batch_size, args.num_workers, seed=args.data_seed
492
+ )
493
+
494
+ model = build_model(args, train_loader.dataset, output_transform, input_transforms)
495
+ monitor_mode = "max" if model.metrics[0].higher_is_better else "min"
496
+
497
+ results = tune_model(
498
+ args, train_dset, val_dset, logger, monitor_mode, output_transform, input_transforms
499
+ )
500
+
501
+ best_result = results.get_best_result()
502
+ best_config = best_result.config["train_loop_config"]
503
+ best_checkpoint_path = Path(best_result.checkpoint.path) / "checkpoint.ckpt"
504
+
505
+ best_config_save_path = args.hpopt_save_dir / "best_config.toml"
506
+ best_checkpoint_save_path = args.hpopt_save_dir / "best_checkpoint.ckpt"
507
+ all_progress_save_path = args.hpopt_save_dir / "all_progress.csv"
508
+
509
+ logger.info(f"Best hyperparameters saved to: '{best_config_save_path}'")
510
+
511
+ args = update_args_with_config(args, best_config)
512
+
513
+ args = TrainSubcommand.parser.parse_known_args(namespace=args)[0]
514
+ save_config(TrainSubcommand.parser, args, best_config_save_path)
515
+
516
+ logger.info(
517
+ f"Best hyperparameter configuration checkpoint saved to '{best_checkpoint_save_path}'"
518
+ )
519
+
520
+ shutil.copyfile(best_checkpoint_path, best_checkpoint_save_path)
521
+
522
+ logger.info(f"Hyperparameter optimization results saved to '{all_progress_save_path}'")
523
+
524
+ result_df = results.get_dataframe()
525
+
526
+ result_df.to_csv(all_progress_save_path, index=False)
527
+
528
+ ray.shutdown()
529
+
530
+
531
+ if __name__ == "__main__":
532
+ parser = ArgumentParser()
533
+ parser = HpoptSubcommand.add_args(parser)
534
+
535
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
536
+ args = parser.parse_args()
537
+ HpoptSubcommand.func(args)
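
A hedged sketch of how a sampled trial config flows back into the CLI namespace through ``update_args_with_config`` above (``args`` is a parsed train namespace, and the numbers are purely illustrative):

    config = {"max_lr": 1e-3, "init_lr_ratio": 0.1, "final_lr_ratio": 0.01, "depth": 4}
    args = update_args_with_config(args, config)
    # args.init_lr  -> 1e-4 (0.1  * max_lr)
    # args.final_lr -> 1e-5 (0.01 * max_lr)
    # args.max_lr and args.depth are set verbatim; unknown keys raise an AssertionError
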
chemprop-updated/chemprop/cli/main.py ADDED
@@ -0,0 +1,85 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ from configargparse import ArgumentParser
6
+
7
+ from chemprop.cli.conf import LOG_DIR, LOG_LEVELS, NOW
8
+ from chemprop.cli.convert import ConvertSubcommand
9
+ from chemprop.cli.fingerprint import FingerprintSubcommand
10
+ from chemprop.cli.hpopt import HpoptSubcommand
11
+ from chemprop.cli.predict import PredictSubcommand
12
+ from chemprop.cli.train import TrainSubcommand
13
+ from chemprop.cli.utils import pop_attr
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ SUBCOMMANDS = [
18
+ TrainSubcommand,
19
+ PredictSubcommand,
20
+ ConvertSubcommand,
21
+ FingerprintSubcommand,
22
+ HpoptSubcommand,
23
+ ]
24
+
25
+
26
+ def construct_parser():
27
+ parser = ArgumentParser()
28
+ subparsers = parser.add_subparsers(title="mode", dest="mode", required=True)
29
+
30
+ parent = ArgumentParser(add_help=False)
31
+ parent.add_argument(
32
+ "--logfile",
33
+ "--log",
34
+ nargs="?",
35
+ const="default",
36
+ help=f"Path to which the log file should be written (specifying just the flag alone will automatically log to a file ``{LOG_DIR}/MODE/TIMESTAMP.log``, where 'MODE' is the CLI mode chosen, e.g., ``{LOG_DIR}/MODE/{NOW}.log``)",
37
+ )
38
+ parent.add_argument("-v", action="store_true", help="Increase verbosity level to DEBUG")
39
+ parent.add_argument(
40
+ "-q",
41
+ action="count",
42
+ default=0,
43
+ help="Decrease verbosity level to WARNING or ERROR if specified twice",
44
+ )
45
+
46
+ parents = [parent]
47
+ for subcommand in SUBCOMMANDS:
48
+ subcommand.add(subparsers, parents)
49
+
50
+ return parser
51
+
52
+
53
+ def main():
54
+ parser = construct_parser()
55
+ args = parser.parse_args()
56
+ logfile, v_flag, q_count, mode, func = (
57
+ pop_attr(args, attr) for attr in ["logfile", "v", "q", "mode", "func"]
58
+ )
59
+
60
+ if v_flag and q_count:
61
+ parser.error("The -v and -q options cannot be used together.")
62
+
63
+ match logfile:
64
+ case None:
65
+ handler = logging.StreamHandler(sys.stderr)
66
+ case "default":
67
+ (LOG_DIR / mode).mkdir(parents=True, exist_ok=True)
68
+ handler = logging.FileHandler(str(LOG_DIR / mode / f"{NOW}.log"))
69
+ case _:
70
+ Path(logfile).parent.mkdir(parents=True, exist_ok=True)
71
+ handler = logging.FileHandler(logfile)
72
+
73
+ verbosity = q_count * -1 if q_count else (1 if v_flag else 0)
74
+ logging_level = LOG_LEVELS.get(verbosity, logging.ERROR)
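+ # per the -v/-q help text: -v -> DEBUG, -q -> WARNING, -qq -> ERROR (default is assumed to be INFO)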
75
+ logging.basicConfig(
76
+ handlers=[handler],
77
+ format="%(asctime)s - %(levelname)s:%(name)s - %(message)s",
78
+ level=logging_level,
79
+ datefmt="%Y-%m-%dT%H:%M:%S",
80
+ force=True,
81
+ )
82
+
83
+ logger.info(f"Running in mode '{mode}' with args: {vars(args)}")
84
+
85
+ func(args)
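
A few illustrative invocations of the entry point above, using only the flags it defines and assuming the usual ``chemprop`` console entry point (trailing arguments are sketched, not exhaustive):

    chemprop train -v ...                  # DEBUG-level logging to stderr
    chemprop predict -q --log run.log ...  # WARNING-level logging to run.log
    chemprop hpopt --logfile ...           # logs to LOG_DIR/hpopt/<timestamp>.log
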
chemprop-updated/chemprop/cli/predict.py ADDED
@@ -0,0 +1,444 @@
1
+ from argparse import ArgumentError, ArgumentParser, Namespace
2
+ import logging
3
+ from pathlib import Path
4
+ import sys
5
+ from typing import Iterator
6
+
7
+ from lightning import pytorch as pl
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+
12
+ from chemprop import data
13
+ from chemprop.cli.common import (
14
+ add_common_args,
15
+ find_models,
16
+ process_common_args,
17
+ validate_common_args,
18
+ )
19
+ from chemprop.cli.utils import LookupAction, Subcommand, build_data_from_files, make_dataset
20
+ from chemprop.models.utils import load_model, load_output_columns
21
+ from chemprop.nn.metrics import LossFunctionRegistry
22
+ from chemprop.nn.predictors import EvidentialFFN, MulticlassClassificationFFN, MveFFN
23
+ from chemprop.uncertainty import (
24
+ MVEWeightingCalibrator,
25
+ NoUncertaintyEstimator,
26
+ RegressionCalibrator,
27
+ RegressionEvaluator,
28
+ UncertaintyCalibratorRegistry,
29
+ UncertaintyEstimatorRegistry,
30
+ UncertaintyEvaluatorRegistry,
31
+ )
32
+ from chemprop.utils import Factory
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ class PredictSubcommand(Subcommand):
38
+ COMMAND = "predict"
39
+ HELP = "Use a pretrained chemprop model for prediction."
40
+
41
+ @classmethod
42
+ def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
43
+ parser = add_common_args(parser)
44
+ return add_predict_args(parser)
45
+
46
+ @classmethod
47
+ def func(cls, args: Namespace):
48
+ args = process_common_args(args)
49
+ validate_common_args(args)
50
+ args = process_predict_args(args)
51
+ main(args)
52
+
53
+
54
+ def add_predict_args(parser: ArgumentParser) -> ArgumentParser:
55
+ parser.add_argument(
56
+ "-i",
57
+ "--test-path",
58
+ required=True,
59
+ type=Path,
60
+ help="Path to an input CSV file containing SMILES",
61
+ )
62
+ parser.add_argument(
63
+ "-o",
64
+ "--output",
65
+ "--preds-path",
66
+ type=Path,
67
+ help="Specify path to which predictions will be saved. If the file extension is .pkl, it will be saved as a pickle file. Otherwise, chemprop will save predictions as a CSV. If multiple models are used to make predictions, the average predictions will be saved in the file, and another file ending in '_individual' with the same file extension will save the predictions for each individual model, with the column names being the target names appended with the model index (e.g., '_model_<index>').",
68
+ )
69
+ parser.add_argument(
70
+ "--drop-extra-columns",
71
+ action="store_true",
72
+ help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns",
73
+ )
74
+ parser.add_argument(
75
+ "--model-paths",
76
+ "--model-path",
77
+ required=True,
78
+ type=Path,
79
+ nargs="+",
80
+ help="Location of checkpoint(s) or model file(s) to use for prediction. It can be a path to either a single pretrained model checkpoint (.ckpt) or a single pretrained model file (.pt), a directory that contains these files, or a list of paths and/or directories. If a directory, chemprop will recursively search it and predict with all (.pt) models found.",
81
+ )
82
+
83
+ unc_args = parser.add_argument_group("Uncertainty and calibration args")
84
+ unc_args.add_argument(
85
+ "--cal-path", type=Path, help="Path to data file to be used for uncertainty calibration."
86
+ )
87
+ unc_args.add_argument(
88
+ "--uncertainty-method",
89
+ default="none",
90
+ action=LookupAction(UncertaintyEstimatorRegistry),
91
+ help="The method of calculating uncertainty.",
92
+ )
93
+ unc_args.add_argument(
94
+ "--calibration-method",
95
+ action=LookupAction(UncertaintyCalibratorRegistry),
96
+ help="The method used for calibrating the uncertainty calculated with uncertainty method.",
97
+ )
98
+ unc_args.add_argument(
99
+ "--evaluation-methods",
100
+ "--evaluation-method",
101
+ nargs="+",
102
+ action=LookupAction(UncertaintyEvaluatorRegistry),
103
+ help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.",
104
+ )
105
+ # unc_args.add_argument(
106
+ # "--evaluation-scores-path", help="Location to save the results of uncertainty evaluations."
107
+ # )
108
+ unc_args.add_argument(
109
+ "--uncertainty-dropout-p",
110
+ type=float,
111
+ default=0.1,
112
+ help="The probability to use for Monte Carlo dropout uncertainty estimation.",
113
+ )
114
+ unc_args.add_argument(
115
+ "--dropout-sampling-size",
116
+ type=int,
117
+ default=10,
118
+ help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.",
119
+ )
120
+ unc_args.add_argument(
121
+ "--calibration-interval-percentile",
122
+ type=float,
123
+ default=95,
124
+ help="Sets the percentile used in the calibration methods. Must be in the range (1, 100).",
125
+ )
126
+ unc_args.add_argument(
127
+ "--conformal-alpha",
128
+ type=float,
129
+ default=0.1,
130
+ help="Target error rate for conformal prediction. Must be in the range (0, 1).",
131
+ )
132
+ # TODO: Decide if we want to implement this in v2.1.x
133
+ # unc_args.add_argument(
134
+ # "--regression-calibrator-metric",
135
+ # choices=["stdev", "interval"],
136
+ # help="Regression calibrators can output either a stdev or an interval.",
137
+ # )
138
+ unc_args.add_argument(
139
+ "--cal-descriptors-path",
140
+ nargs="+",
141
+ action="append",
142
+ help="Path to extra descriptors to concatenate to learned representation in calibration dataset.",
143
+ )
144
+ # TODO: Add in v2.1.x
145
+ # unc_args.add_argument(
146
+ # "--calibration-phase-features-path",
147
+ # help=" ",
148
+ # )
149
+ unc_args.add_argument(
150
+ "--cal-atom-features-path",
151
+ nargs="+",
152
+ action="append",
153
+ help="Path to the extra atom features in calibration dataset.",
154
+ )
155
+ unc_args.add_argument(
156
+ "--cal-atom-descriptors-path",
157
+ nargs="+",
158
+ action="append",
159
+ help="Path to the extra atom descriptors in calibration dataset.",
160
+ )
161
+ unc_args.add_argument(
162
+ "--cal-bond-features-path",
163
+ nargs="+",
164
+ action="append",
165
+ help="Path to the extra bond features in the calibration dataset.",
166
+ )
167
+
168
+ return parser
169
+
170
+
171
+ def process_predict_args(args: Namespace) -> Namespace:
172
+ if args.test_path.suffix not in [".csv"]:
173
+ raise ArgumentError(
174
+ argument=None, message=f"Input data must be a CSV file. Got {args.test_path}"
175
+ )
176
+ if args.output is None:
177
+ args.output = args.test_path.parent / (args.test_path.stem + "_preds.csv")
178
+ if args.output.suffix not in [".csv", ".pkl"]:
179
+ raise ArgumentError(
180
+ argument=None, message=f"Output must be a CSV or Pickle file. Got {args.output}"
181
+ )
182
+ return args
183
+
184
+
185
+ def prepare_data_loader(
186
+ args: Namespace, multicomponent: bool, is_calibration: bool, format_kwargs: dict
187
+ ):
188
+ data_path = args.cal_path if is_calibration else args.test_path
189
+ descriptors_path = args.cal_descriptors_path if is_calibration else args.descriptors_path
190
+ atom_feats_path = args.cal_atom_features_path if is_calibration else args.atom_features_path
191
+ bond_feats_path = args.cal_bond_features_path if is_calibration else args.bond_features_path
192
+ atom_descs_path = (
193
+ args.cal_atom_descriptors_path if is_calibration else args.atom_descriptors_path
194
+ )
195
+
196
+ featurization_kwargs = dict(
197
+ molecule_featurizers=args.molecule_featurizers, keep_h=args.keep_h, add_h=args.add_h
198
+ )
199
+
200
+ datas = build_data_from_files(
201
+ data_path,
202
+ **format_kwargs,
203
+ p_descriptors=descriptors_path,
204
+ p_atom_feats=atom_feats_path,
205
+ p_bond_feats=bond_feats_path,
206
+ p_atom_descs=atom_descs_path,
207
+ **featurization_kwargs,
208
+ )
209
+
210
+ dsets = [make_dataset(d, args.rxn_mode, args.multi_hot_atom_featurizer_mode) for d in datas]
211
+ dset = data.MulticomponentDataset(dsets) if multicomponent else dsets[0]
212
+
213
+ return data.build_dataloader(dset, args.batch_size, args.num_workers, shuffle=False)
214
+
215
+
216
+ def make_prediction_for_models(
217
+ args: Namespace, model_paths: list[Path], multicomponent: bool, output_path: Path
218
+ ):
219
+ model = load_model(model_paths[0], multicomponent)
220
+ output_columns = load_output_columns(model_paths[0])
221
+ bounded = any(
222
+ isinstance(model.criterion, LossFunctionRegistry[loss_function])
223
+ for loss_function in LossFunctionRegistry.keys()
224
+ if "bounded" in loss_function
225
+ )
226
+ format_kwargs = dict(
227
+ no_header_row=args.no_header_row,
228
+ smiles_cols=args.smiles_columns,
229
+ rxn_cols=args.reaction_columns,
230
+ ignore_cols=None,
231
+ splits_col=None,
232
+ weight_col=None,
233
+ bounded=bounded,
234
+ )
235
+ format_kwargs["target_cols"] = output_columns if args.evaluation_methods is not None else []
236
+ test_loader = prepare_data_loader(args, multicomponent, False, format_kwargs)
237
+ logger.info(f"test size: {len(test_loader.dataset)}")
238
+ if args.cal_path is not None:
239
+ format_kwargs["target_cols"] = output_columns
240
+ cal_loader = prepare_data_loader(args, multicomponent, True, format_kwargs)
241
+ logger.info(f"calibration size: {len(cal_loader.dataset)}")
242
+
243
+ uncertainty_estimator = Factory.build(
244
+ UncertaintyEstimatorRegistry[args.uncertainty_method],
245
+ ensemble_size=args.dropout_sampling_size,
246
+ dropout=args.uncertainty_dropout_p,
247
+ )
248
+
249
+ models = [load_model(model_path, multicomponent) for model_path in model_paths]
250
+ trainer = pl.Trainer(
251
+ logger=False, enable_progress_bar=True, accelerator=args.accelerator, devices=args.devices
252
+ )
253
+ test_individual_preds, test_individual_uncs = uncertainty_estimator(
254
+ test_loader, models, trainer
255
+ )
256
+ test_preds = torch.mean(test_individual_preds, dim=0)
257
+ if not isinstance(uncertainty_estimator, NoUncertaintyEstimator):
258
+ test_uncs = torch.mean(test_individual_uncs, dim=0)
259
+ else:
260
+ test_uncs = None
261
+
262
+ if args.calibration_method is not None:
263
+ uncertainty_calibrator = Factory.build(
264
+ UncertaintyCalibratorRegistry[args.calibration_method],
265
+ p=args.calibration_interval_percentile / 100,
266
+ alpha=args.conformal_alpha,
267
+ )
268
+ cal_targets = cal_loader.dataset.Y
269
+ cal_mask = torch.from_numpy(np.isfinite(cal_targets))
270
+ cal_targets = np.nan_to_num(cal_targets, nan=0.0)
271
+ cal_targets = torch.from_numpy(cal_targets)
272
+ cal_individual_preds, cal_individual_uncs = uncertainty_estimator(
273
+ cal_loader, models, trainer
274
+ )
275
+ cal_preds = torch.mean(cal_individual_preds, dim=0)
276
+ cal_uncs = torch.mean(cal_individual_uncs, dim=0)
277
+ if isinstance(uncertainty_calibrator, MVEWeightingCalibrator):
278
+ uncertainty_calibrator.fit(cal_preds, cal_individual_uncs, cal_targets, cal_mask)
279
+ test_uncs = uncertainty_calibrator.apply(cal_individual_uncs)
280
+ else:
281
+ if isinstance(uncertainty_calibrator, RegressionCalibrator):
282
+ uncertainty_calibrator.fit(cal_preds, cal_uncs, cal_targets, cal_mask)
283
+ else:
284
+ uncertainty_calibrator.fit(cal_uncs, cal_targets, cal_mask)
285
+ test_uncs = uncertainty_calibrator.apply(test_uncs)
286
+ for i in range(test_individual_uncs.shape[0]):
287
+ test_individual_uncs[i] = uncertainty_calibrator.apply(test_individual_uncs[i])
288
+
289
+ if args.evaluation_methods is not None:
290
+ uncertainty_evaluators = [
291
+ Factory.build(UncertaintyEvaluatorRegistry[method])
292
+ for method in args.evaluation_methods
293
+ ]
294
+ logger.info("Uncertainty evaluation metric:")
295
+ for evaluator in uncertainty_evaluators:
296
+ test_targets = test_loader.dataset.Y
297
+ test_mask = torch.from_numpy(np.isfinite(test_targets))
298
+ test_targets = np.nan_to_num(test_targets, nan=0.0)
299
+ test_targets = torch.from_numpy(test_targets)
300
+ if isinstance(evaluator, RegressionEvaluator):
301
+ metric_value = evaluator.evaluate(test_preds, test_uncs, test_targets, test_mask)
302
+ else:
303
+ metric_value = evaluator.evaluate(test_uncs, test_targets, test_mask)
304
+ logger.info(f"{evaluator.alias}: {metric_value.tolist()}")
305
+
306
+ if args.uncertainty_method == "none" and (
307
+ isinstance(model.predictor, MveFFN) or isinstance(model.predictor, EvidentialFFN)
308
+ ):
309
+ test_preds = test_preds[..., 0]
310
+ test_individual_preds = test_individual_preds[..., 0]
311
+
312
+ if output_columns is None:
313
+ output_columns = [
314
+ f"pred_{i}" for i in range(test_preds.shape[1])
315
+ ] # TODO: need to improve this for cases like multi-task MVE and multi-task multiclass
316
+
317
+ save_predictions(args, model, output_columns, test_preds, test_uncs, output_path)
318
+
319
+ if len(model_paths) > 1:
320
+ save_individual_predictions(
321
+ args,
322
+ model,
323
+ model_paths,
324
+ output_columns,
325
+ test_individual_preds,
326
+ test_individual_uncs,
327
+ output_path,
328
+ )
329
+
330
+
331
+ def save_predictions(args, model, output_columns, test_preds, test_uncs, output_path):
332
+ unc_columns = [f"{col}_unc" for col in output_columns]
333
+
334
+ if isinstance(model.predictor, MulticlassClassificationFFN):
335
+ output_columns = output_columns + [f"{col}_prob" for col in output_columns]
336
+ predicted_class_labels = test_preds.argmax(axis=-1)
337
+ formatted_probability_strings = np.apply_along_axis(
338
+ lambda x: ",".join(map(str, x)), 2, test_preds.numpy()
339
+ )
340
+ test_preds = np.concatenate(
341
+ (predicted_class_labels, formatted_probability_strings), axis=-1
342
+ )
343
+
344
+ df_test = pd.read_csv(
345
+ args.test_path, header=None if args.no_header_row else "infer", index_col=False
346
+ )
347
+ df_test[output_columns] = test_preds
348
+
349
+ if args.uncertainty_method not in ["none", "classification"]:
350
+ df_test[unc_columns] = np.round(test_uncs, 6)
351
+
352
+ if output_path.suffix == ".pkl":
353
+ df_test = df_test.reset_index(drop=True)
354
+ df_test.to_pickle(output_path)
355
+ else:
356
+ df_test.to_csv(output_path, index=False)
357
+ logger.info(f"Predictions saved to '{output_path}'")
358
+
359
+
360
+ def save_individual_predictions(
361
+ args,
362
+ model,
363
+ model_paths,
364
+ output_columns,
365
+ test_individual_preds,
366
+ test_individual_uncs,
367
+ output_path,
368
+ ):
369
+ unc_columns = [
370
+ f"{col}_unc_model_{i}" for i in range(len(model_paths)) for col in output_columns
371
+ ]
372
+
373
+ if isinstance(model.predictor, MulticlassClassificationFFN):
374
+ output_columns = [
375
+ item
376
+ for i in range(len(model_paths))
377
+ for col in output_columns
378
+ for item in (f"{col}_model_{i}", f"{col}_prob_model_{i}")
379
+ ]
380
+
381
+ predicted_class_labels = test_individual_preds.argmax(axis=-1)
382
+ formatted_probability_strings = np.apply_along_axis(
383
+ lambda x: ",".join(map(str, x)), 3, test_individual_preds.numpy()
384
+ )
385
+ test_individual_preds = np.concatenate(
386
+ (predicted_class_labels, formatted_probability_strings), axis=-1
387
+ )
388
+ else:
389
+ output_columns = [
390
+ f"{col}_model_{i}" for i in range(len(model_paths)) for col in output_columns
391
+ ]
392
+
393
+ m, n, t = test_individual_preds.shape
394
+ test_individual_preds = np.transpose(test_individual_preds, (1, 0, 2)).reshape(n, m * t)
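+ # (n_models, n_rows, n_tasks) -> (n_rows, n_models * n_tasks): one column per model/task pair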
395
+ df_test = pd.read_csv(
396
+ args.test_path, header=None if args.no_header_row else "infer", index_col=False
397
+ )
398
+ df_test[output_columns] = test_individual_preds
399
+
400
+ if args.uncertainty_method not in ["none", "classification", "ensemble"]:
401
+ m, n, t = test_individual_uncs.shape
402
+ test_individual_uncs = np.transpose(test_individual_uncs, (1, 0, 2)).reshape(n, m * t)
403
+ df_test[unc_columns] = np.round(test_individual_uncs, 6)
404
+
405
+ output_path = output_path.parent / Path(
406
+ str(args.output.stem) + "_individual" + str(output_path.suffix)
407
+ )
408
+ if output_path.suffix == ".pkl":
409
+ df_test = df_test.reset_index(drop=True)
410
+ df_test.to_pickle(output_path)
411
+ else:
412
+ df_test.to_csv(output_path, index=False)
413
+ logger.info(f"Individual predictions saved to '{output_path}'")
414
+ for i, model_path in enumerate(model_paths):
415
+ logger.info(
416
+ f"Results from model path {model_path} are saved under the column name ending with 'model_{i}'"
417
+ )
418
+
419
+
420
+ def main(args):
421
+ match (args.smiles_columns, args.reaction_columns):
422
+ case [None, None]:
423
+ n_components = 1
424
+ case [_, None]:
425
+ n_components = len(args.smiles_columns)
426
+ case [None, _]:
427
+ n_components = len(args.reaction_columns)
428
+ case _:
429
+ n_components = len(args.smiles_columns) + len(args.reaction_columns)
430
+
431
+ multicomponent = n_components > 1
432
+
433
+ model_paths = find_models(args.model_paths)
434
+
435
+ make_prediction_for_models(args, model_paths, multicomponent, output_path=args.output)
436
+
437
+
438
+ if __name__ == "__main__":
439
+ parser = ArgumentParser()
440
+ parser = PredictSubcommand.add_args(parser)
441
+
442
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
443
+ args = parser.parse_args()
444
+ args = PredictSubcommand.func(args)
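
A minimal sketch of an ensemble prediction with post-hoc calibration through the parser above (the paths are hypothetical, and ``zscaling`` is assumed to be a key registered in ``UncertaintyCalibratorRegistry``):

    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser = PredictSubcommand.add_args(parser)
    args = parser.parse_args(
        [
            "-i", "test.csv",
            "--model-paths", "ckpts/",  # a directory: every .pt model found is used
            "--uncertainty-method", "ensemble",
            "--cal-path", "cal.csv",
            "--calibration-method", "zscaling",
        ]
    )
    PredictSubcommand.func(args)  # writes test_preds.csv and test_preds_individual.csv
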
chemprop-updated/chemprop/cli/train.py ADDED
@@ -0,0 +1,1340 @@
1
+ from copy import deepcopy
2
+ from io import StringIO
3
+ import json
4
+ import logging
5
+ from pathlib import Path
6
+ import sys
7
+ from tempfile import TemporaryDirectory
8
+
9
+ from configargparse import ArgumentError, ArgumentParser, Namespace
10
+ from lightning import pytorch as pl
11
+ from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
12
+ from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger
13
+ from lightning.pytorch.strategies import DDPStrategy
14
+ import numpy as np
15
+ import pandas as pd
16
+ from rich.console import Console
17
+ from rich.table import Column, Table
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from chemprop.cli.common import (
22
+ add_common_args,
23
+ find_models,
24
+ process_common_args,
25
+ validate_common_args,
26
+ )
27
+ from chemprop.cli.conf import CHEMPROP_TRAIN_DIR, NOW
28
+ from chemprop.cli.utils import (
29
+ LookupAction,
30
+ Subcommand,
31
+ build_data_from_files,
32
+ get_column_names,
33
+ make_dataset,
34
+ parse_indices,
35
+ )
36
+ from chemprop.cli.utils.args import uppercase
37
+ from chemprop.data import (
38
+ MoleculeDataset,
39
+ MolGraphDataset,
40
+ MulticomponentDataset,
41
+ ReactionDatapoint,
42
+ SplitType,
43
+ build_dataloader,
44
+ make_split_indices,
45
+ split_data_by_indices,
46
+ )
47
+ from chemprop.data.datasets import _MolGraphDatasetMixin
48
+ from chemprop.models import MPNN, MulticomponentMPNN, save_model
49
+ from chemprop.nn import AggregationRegistry, LossFunctionRegistry, MetricRegistry, PredictorRegistry
50
+ from chemprop.nn.message_passing import (
51
+ AtomMessagePassing,
52
+ BondMessagePassing,
53
+ MulticomponentMessagePassing,
54
+ )
55
+ from chemprop.nn.transforms import GraphTransform, ScaleTransform, UnscaleTransform
56
+ from chemprop.nn.utils import Activation
57
+ from chemprop.utils import Factory
58
+
59
+ logger = logging.getLogger(__name__)
60
+
61
+
62
+ _CV_REMOVAL_ERROR = (
63
+ "The -k/--num-folds argument was removed in v2.1.0 - use --num-replicates instead."
64
+ )
65
+
66
+
67
+ class TrainSubcommand(Subcommand):
68
+ COMMAND = "train"
69
+ HELP = "Train a chemprop model."
70
+ parser = None
71
+
72
+ @classmethod
73
+ def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
74
+ parser = add_common_args(parser)
75
+ parser = add_train_args(parser)
76
+ cls.parser = parser
77
+ return parser
78
+
79
+ @classmethod
80
+ def func(cls, args: Namespace):
81
+ args = process_common_args(args)
82
+ validate_common_args(args)
83
+ args = process_train_args(args)
84
+ validate_train_args(args)
85
+
86
+ args.output_dir.mkdir(exist_ok=True, parents=True)
87
+ config_path = args.output_dir / "config.toml"
88
+ save_config(cls.parser, args, config_path)
89
+ main(args)
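+ # the config.toml saved above can be passed back via --config-path to reproduce the run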
90
+
91
+
92
+ def add_train_args(parser: ArgumentParser) -> ArgumentParser:
93
+ parser.add_argument(
94
+ "--config-path",
95
+ type=Path,
96
+ is_config_file=True,
97
+ help="Path to a configuration file (command line arguments override values in the configuration file)",
98
+ )
99
+ parser.add_argument(
100
+ "-i",
101
+ "--data-path",
102
+ type=Path,
103
+ help="Path to an input CSV file containing SMILES and the associated target values",
104
+ )
105
+ parser.add_argument(
106
+ "-o",
107
+ "--output-dir",
108
+ "--save-dir",
109
+ type=Path,
110
+ help="Directory where training outputs will be saved (defaults to ``CURRENT_DIRECTORY/chemprop_training/STEM_OF_INPUT/TIME_STAMP``)",
111
+ )
112
+ parser.add_argument(
113
+ "--remove-checkpoints",
114
+ action="store_true",
115
+ help="Remove intermediate checkpoint files after training is complete.",
116
+ )
117
+
118
+ # TODO: Add in v2.1; see if we can tell lightning how often to log training loss
119
+ # parser.add_argument(
120
+ # "--log-frequency",
121
+ # type=int,
122
+ # default=10,
123
+ # help="The number of batches between each logging of the training loss.",
124
+ # )
125
+
126
+ transfer_args = parser.add_argument_group("transfer learning args")
127
+ transfer_args.add_argument(
128
+ "--checkpoint",
129
+ type=Path,
130
+ nargs="+",
131
+ help="Path to checkpoint(s) or model file(s) for loading and overwriting weights. Accepts a single pre-trained model checkpoint (.ckpt), a single model file (.pt), a directory containing such files, or a list of paths and directories. If a directory is provided, it will recursively search for and use all (.pt) files found.",
132
+ )
133
+ transfer_args.add_argument(
134
+ "--freeze-encoder",
135
+ action="store_true",
136
+ help="Freeze the message passing layer from the checkpoint model (specified by ``--checkpoint``).",
137
+ )
138
+ transfer_args.add_argument(
139
+ "--model-frzn",
140
+ help="Path to model checkpoint file to be loaded for overwriting and freezing weights. By default, all MPNN weights are frozen with this option.",
141
+ )
142
+ transfer_args.add_argument(
143
+ "--frzn-ffn-layers",
144
+ type=int,
145
+ default=0,
146
+ help="Freeze the first ``n`` layers of the FFN from the checkpoint model (specified by ``--checkpoint``). The message passing layer should also be frozen with ``--freeze-encoder``.",
147
+ )
148
+ # transfer_args.add_argument(
149
+ # "--freeze-first-only",
150
+ # action="store_true",
151
+ # help="Determines whether or not to use checkpoint_frzn for just the first encoder. Default (False) is to use the checkpoint to freeze all encoders. (only relevant for number_of_molecules > 1, where checkpoint model has number_of_molecules = 1)",
152
+ # )
153
+
154
+ # TODO: Add in v2.1
155
+ # parser.add_argument(
156
+ # "--resume-experiment",
157
+ # action="store_true",
158
+ # help="Whether to resume the experiment. Loads test results from any folds that have already been completed and skips training those folds.",
159
+ # )
160
+ # parser.add_argument(
161
+ # "--config-path",
162
+ # help="Path to a :code:`.json` file containing arguments. Any arguments present in the config file will override arguments specified via the command line or by the defaults.",
163
+ # )
164
+ parser.add_argument(
165
+ "--ensemble-size",
166
+ type=int,
167
+ default=1,
168
+ help="Number of models in ensemble for each splitting of data",
169
+ )
170
+
171
+ # TODO: Add in v2.2
172
+ # abt_args = parser.add_argument_group("atom/bond target args")
173
+ # abt_args.add_argument(
174
+ # "--is-atom-bond-targets",
175
+ # action="store_true",
176
+ # help="Whether this is atomic/bond properties prediction.",
177
+ # )
178
+ # abt_args.add_argument(
179
+ # "--no-adding-bond-types",
180
+ # action="store_true",
181
+ # help="Whether the bond types determined by RDKit molecules are added to the output of bond targets. This option is intended to be used with the :code:`is_atom_bond_targets`.",
182
+ # )
183
+ # abt_args.add_argument(
184
+ # "--keeping-atom-map",
185
+ # action="store_true",
186
+ # help="Whether RDKit molecules keep the original atom mapping. This option is intended to be used when providing atom-mapped SMILES with the :code:`is_atom_bond_targets`.",
187
+ # )
188
+ # abt_args.add_argument(
189
+ # "--no-shared-atom-bond-ffn",
190
+ # action="store_true",
191
+ # help="Whether the FFN weights for atom and bond targets should be independent between tasks.",
192
+ # )
193
+ # abt_args.add_argument(
194
+ # "--weights-ffn-num-layers",
195
+ # type=int,
196
+ # default=2,
197
+ # help="Number of layers in FFN for determining weights used in constrained targets.",
198
+ # )
199
+
200
+ mp_args = parser.add_argument_group("message passing")
201
+ mp_args.add_argument(
202
+ "--message-hidden-dim", type=int, default=300, help="Hidden dimension of the messages"
203
+ )
204
+ mp_args.add_argument(
205
+ "--message-bias", action="store_true", help="Add bias to the message passing layers"
206
+ )
207
+ mp_args.add_argument("--depth", type=int, default=3, help="Number of message passing steps")
208
+ mp_args.add_argument(
209
+ "--undirected",
210
+ action="store_true",
211
+ help="Pass messages on undirected bonds/edges (always sum the two relevant bond vectors)",
212
+ )
213
+ mp_args.add_argument(
214
+ "--dropout",
215
+ type=float,
216
+ default=0.0,
217
+ help="Dropout probability in message passing/FFN layers",
218
+ )
219
+ mp_args.add_argument(
220
+ "--mpn-shared",
221
+ action="store_true",
222
+ help="Whether to use the same message passing neural network for all input molecules (only relevant if ``number_of_molecules`` > 1)",
223
+ )
224
+ mp_args.add_argument(
225
+ "--activation",
226
+ type=uppercase,
227
+ default="RELU",
228
+ choices=list(Activation.keys()),
229
+ help="Activation function in message passing/FFN layers",
230
+ )
231
+ mp_args.add_argument(
232
+ "--aggregation",
233
+ "--agg",
234
+ default="norm",
235
+ action=LookupAction(AggregationRegistry),
236
+ help="Aggregation mode to use during graph prediction",
237
+ )
238
+ mp_args.add_argument(
239
+ "--aggregation-norm",
240
+ type=float,
241
+ default=100,
242
+ help="Normalization factor by which to divide summed up atomic features for ``norm`` aggregation",
243
+ )
244
+ mp_args.add_argument(
245
+ "--atom-messages", action="store_true", help="Pass messages on atoms rather than bonds."
246
+ )
247
+
248
+ # TODO: Add in v2.1
249
+ # mpsolv_args = parser.add_argument_group("message passing with solvent")
250
+ # mpsolv_args.add_argument(
251
+ # "--reaction-solvent",
252
+ # action="store_true",
253
+ # help="Whether to adjust the MPNN layer to take as input a reaction and a molecule, and to encode them with separate MPNNs.",
254
+ # )
255
+ # mpsolv_args.add_argument(
256
+ # "--bias-solvent",
257
+ # action="store_true",
258
+ # help="Whether to add bias to linear layers for solvent MPN if :code:`reaction_solvent` is True.",
259
+ # )
260
+ # mpsolv_args.add_argument(
261
+ # "--hidden-size-solvent",
262
+ # type=int,
263
+ # default=300,
264
+ # help="Dimensionality of hidden layers in solvent MPN if :code:`reaction_solvent` is True.",
265
+ # )
266
+ # mpsolv_args.add_argument(
267
+ # "--depth-solvent",
268
+ # type=int,
269
+ # default=3,
270
+ # help="Number of message passing steps for solvent if :code:`reaction_solvent` is True.",
271
+ # )
272
+
273
+ ffn_args = parser.add_argument_group("FFN args")
274
+ ffn_args.add_argument(
275
+ "--ffn-hidden-dim", type=int, default=300, help="Hidden dimension in the FFN top model"
276
+ )
277
+ ffn_args.add_argument( # TODO: the default in v1 was 2. (see weights_ffn_num_layers option) Do we really want the default to now be 1?
278
+ "--ffn-num-layers", type=int, default=1, help="Number of layers in FFN top model"
279
+ )
280
+ # TODO: Decide if we want to implement this in v2
281
+ # ffn_args.add_argument(
282
+ # "--features-only",
283
+ # action="store_true",
284
+ # help="Use only the additional features in an FFN, no graph network.",
285
+ # )
286
+
287
+ extra_mpnn_args = parser.add_argument_group("extra MPNN args")
288
+ extra_mpnn_args.add_argument(
289
+ "--batch-norm", action="store_true", help="Turn on batch normalization after aggregation"
290
+ )
291
+ extra_mpnn_args.add_argument(
292
+ "--multiclass-num-classes",
293
+ type=int,
294
+ default=3,
295
+ help="Number of classes when running multiclass classification",
296
+ )
297
+ # TODO: Add in v2.1
298
+ # extra_mpnn_args.add_argument(
299
+ # "--spectral-activation",
300
+ # default="exp",
301
+ # choices=["softplus", "exp"],
302
+ # help="Indicates which function to use in task_type spectra training to constrain outputs to be positive.",
303
+ # )
304
+
305
+ train_data_args = parser.add_argument_group("training input data args")
306
+ train_data_args.add_argument(
307
+ "-w",
308
+ "--weight-column",
309
+ help="Name of the column in the input CSV containing individual data weights",
310
+ )
311
+ train_data_args.add_argument(
312
+ "--target-columns",
313
+ nargs="+",
314
+ help="Name of the columns containing target values (by default, uses all columns except the SMILES column and the ``ignore_columns``)",
315
+ )
316
+ train_data_args.add_argument(
317
+ "--ignore-columns",
318
+ nargs="+",
319
+ help="Name of the columns to ignore when ``target_columns`` is not provided",
320
+ )
321
+ train_data_args.add_argument(
322
+ "--no-cache",
323
+ action="store_true",
324
+ help="Turn off caching the featurized ``MolGraph`` s at the beginning of training",
325
+ )
326
+ train_data_args.add_argument(
327
+ "--splits-column",
328
+ help="Name of the column in the input CSV file containing 'train', 'val', or 'test' for each row.",
329
+ )
330
+ # TODO: Add in v2.1
331
+ # train_data_args.add_argument(
332
+ # "--spectra-phase-mask-path",
333
+ # help="Path to a file containing a phase mask array, used for excluding particular regions in spectra predictions.",
334
+ # )
335
+
336
+ train_args = parser.add_argument_group("training args")
337
+ train_args.add_argument(
338
+ "-t",
339
+ "--task-type",
340
+ default="regression",
341
+ action=LookupAction(PredictorRegistry),
342
+ help="Type of dataset (determines the default loss function used during training, defaults to ``regression``)",
343
+ )
344
+ train_args.add_argument(
345
+ "-l",
346
+ "--loss-function",
347
+ action=LookupAction(LossFunctionRegistry),
348
+ help="Loss function to use during training (will use the default loss function for the given task type if not specified)",
349
+ )
350
+ train_args.add_argument(
351
+ "--v-kl",
352
+ "--evidential-regularization",
353
+ type=float,
354
+ default=0.0,
355
+ help="Specify the value used in regularization for evidential loss function. The default value recommended by Soleimany et al. (2021) is 0.2. However, the optimal value is dataset-dependent, so it is recommended that users test different values to find the best value for their model.",
356
+ )
357
+
358
+ train_args.add_argument(
359
+ "--eps", type=float, default=1e-8, help="Evidential regularization epsilon"
360
+ )
361
+ train_args.add_argument(
362
+ "--alpha", type=float, default=0.1, help="Target error bounds for quantile interval loss"
363
+ )
364
+ # TODO: Add in v2.1
365
+ # train_args.add_argument( # TODO: Is threshold the same thing as the spectra target floor? I'm not sure but combined them.
366
+ # "-T",
367
+ # "--threshold",
368
+ # "--spectra-target-floor",
369
+ # type=float,
370
+ # default=1e-8,
371
+ # help="spectral threshold limit. v1 help string: Values in targets for dataset type spectra are replaced with this value, intended to be a small positive number used to enforce positive values.",
372
+ # )
373
+ train_args.add_argument(
374
+ "--metrics",
375
+ "--metric",
376
+ nargs="+",
377
+ action=LookupAction(MetricRegistry),
378
+ help="Specify the evaluation metrics. If unspecified, chemprop will use the following metrics for given dataset types: regression -> ``rmse``, classification -> ``roc``, multiclass -> ``ce`` ('cross entropy'), spectral -> ``sid``. If multiple metrics are provided, the 0-th one will be used for early stopping and checkpointing.",
379
+ )
380
+ train_args.add_argument(
381
+ "--tracking-metric",
382
+ default="val_loss",
383
+ help="The metric to track for early stopping and checkpointing. Defaults to the criterion used during training.",
384
+ )
385
+ train_args.add_argument(
386
+ "--show-individual-scores",
387
+ action="store_true",
388
+ help="Show all scores for individual targets, not just average, at the end.",
389
+ )
390
+ train_args.add_argument(
391
+ "--task-weights",
392
+ nargs="+",
393
+ type=float,
394
+ help="Weights to apply for whole tasks in the loss function",
395
+ )
396
+ train_args.add_argument(
397
+ "--warmup-epochs",
398
+ type=int,
399
+ default=2,
400
+ help="Number of epochs during which learning rate increases linearly from ``init_lr`` to ``max_lr`` (afterwards, learning rate decreases exponentially from ``max_lr`` to ``final_lr``)",
401
+ )
402
+
403
+ train_args.add_argument("--init-lr", type=float, default=1e-4, help="Initial learning rate.")
404
+ train_args.add_argument("--max-lr", type=float, default=1e-3, help="Maximum learning rate.")
405
+ train_args.add_argument("--final-lr", type=float, default=1e-4, help="Final learning rate.")
406
+ train_args.add_argument("--epochs", type=int, default=50, help="Number of epochs to train over")
407
+ train_args.add_argument(
408
+ "--patience",
409
+ type=int,
410
+ default=None,
411
+ help="Number of epochs to wait for improvement before early stopping",
412
+ )
413
+ train_args.add_argument(
414
+ "--grad-clip",
415
+ type=float,
416
+ help="Passed directly to the lightning trainer which controls grad clipping (see the ``Trainer()`` docstring for details)",
417
+ )
418
+ train_args.add_argument(
419
+ "--class-balance",
420
+ action="store_true",
421
+ help="Ensures each training batch contains an equal number of positive and negative samples.",
422
+ )
423
+
424
+ split_args = parser.add_argument_group("split args")
425
+ split_args.add_argument(
426
+ "--split",
427
+ "--split-type",
428
+ type=uppercase,
429
+ default="RANDOM",
430
+ choices=list(SplitType.keys()),
431
+ help="Method of splitting the data into train/val/test (case insensitive)",
432
+ )
433
+ split_args.add_argument(
434
+ "--split-sizes",
435
+ type=float,
436
+ nargs=3,
437
+ default=[0.8, 0.1, 0.1],
438
+ help="Split proportions for train/validation/test sets",
439
+ )
440
+ split_args.add_argument(
441
+ "--split-key-molecule",
442
+ type=int,
443
+ default=0,
444
+ help="Specify the index of the key molecule used for splitting when multiple molecules are present and constrained split_type is used (e.g., ``scaffold_balanced`` or ``random_with_repeated_smiles``). Note that this index begins with zero for the first molecule.",
445
+ )
446
+ split_args.add_argument("--num-replicates", type=int, default=1, help="Number of replicates.")
447
+ split_args.add_argument("-k", "--num-folds", help=_CV_REMOVAL_ERROR)
448
+ split_args.add_argument(
449
+ "--save-smiles-splits",
450
+ action="store_true",
451
+ help="Whether to store the SMILES in each train/val/test split",
452
+ )
453
+ split_args.add_argument(
454
+ "--splits-file",
455
+ type=Path,
456
+ help="Path to a JSON file containing pre-defined splits for the input data, formatted as a list of dictionaries with keys ``train``, ``val``, and ``test`` and values as lists of indices or formatted strings (e.g. [0, 1, 2, 4] or '0-2,4')",
457
+ )
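+ # Illustrative `--splits-file` contents, following the format described above
+ # (indices may be given as lists or as range strings like '0-2,4'):
+ #
+ #     [
+ #         {"train": [0, 1, 2, 4], "val": "5-7", "test": [8, 9]},
+ #         {"train": "0-4", "val": [5, 6, 7], "test": "8,9"}
+ #     ]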
458
+ split_args.add_argument(
459
+ "--data-seed",
460
+ type=int,
461
+ default=0,
462
+ help="Specify the random seed to use when splitting data into train/val/test sets. When ``--num-replicates`` > 1, the first replicate uses this seed and all subsequent replicates add 1 to the seed (also used for shuffling data in ``build_dataloader`` when ``shuffle`` is True).",
463
+ )
464
+
465
+ parser.add_argument(
466
+ "--pytorch-seed",
467
+ type=int,
468
+ default=None,
469
+ help="Seed for PyTorch randomness (e.g., random initial weights)",
470
+ )
471
+
472
+ return parser
473
+
474
+
475
+ def process_train_args(args: Namespace) -> Namespace:
476
+ if args.output_dir is None:
477
+ args.output_dir = CHEMPROP_TRAIN_DIR / args.data_path.stem / NOW
478
+
479
+ return args
480
+
481
+
482
+ def validate_train_args(args):
483
+ if args.config_path is None and args.data_path is None:
484
+ raise ArgumentError(argument=None, message="Data path must be provided for training.")
485
+
486
+ if args.num_folds is not None: # i.e. user-specified
487
+ raise ArgumentError(argument=None, message=_CV_REMOVAL_ERROR)
488
+
489
+ if args.data_path.suffix not in [".csv"]:
490
+ raise ArgumentError(
491
+ argument=None, message=f"Input data must be a CSV file. Got {args.data_path}"
492
+ )
493
+
494
+ if args.epochs != -1 and args.epochs <= args.warmup_epochs:
495
+ raise ArgumentError(
496
+ argument=None,
497
+ message=f"The number of epochs should be higher than the number of epochs during warmup. Got {args.epochs} epochs and {args.warmup_epochs} warmup epochs",
498
+ )
499
+
500
+ # TODO: model_frzn is deprecated and will be removed in v2.2
501
+ if args.checkpoint is not None and args.model_frzn is not None:
502
+ raise ArgumentError(
503
+ argument=None,
504
+ message="`--checkpoint` and `--model-frzn` cannot be used at the same time.",
505
+ )
506
+
507
+ if "--model-frzn" in sys.argv:
508
+ logger.warning(
509
+ "`--model-frzn` is deprecated and will be removed in v2.2. "
510
+ "Please use `--checkpoint` with `--freeze-encoder` instead."
511
+ )
512
+
513
+ if args.freeze_encoder and args.checkpoint is None:
514
+ raise ArgumentError(
515
+ argument=None,
516
+ message="`--freeze-encoder` can only be used when `--checkpoint` is used.",
517
+ )
518
+
519
+ if args.frzn_ffn_layers > 0:
520
+ if args.checkpoint is None and args.model_frzn is None:
521
+ raise ArgumentError(
522
+ argument=None,
523
+ message="`--frzn-ffn-layers` can only be used when `--checkpoint` or `--model-frzn` (depreciated in v2.1) is used.",
524
+ )
525
+ if args.checkpoint is not None and not args.freeze_encoder:
526
+ raise ArgumentError(
527
+ argument=None,
528
+ message="To freeze the first `n` layers of the FFN via `--frzn-ffn-layers`. The message passing layer should also be frozen with `--freeze-encoder`.",
529
+ )
530
+
531
+ if args.class_balance and args.task_type != "classification":
532
+ raise ArgumentError(
533
+ argument=None, message="Class balance is only applicable for classification tasks."
534
+ )
535
+
536
+ valid_tracking_metrics = (
537
+ args.metrics or [PredictorRegistry[args.task_type]._T_default_metric.alias]
538
+ ) + ["val_loss"]
539
+ if args.tracking_metric not in valid_tracking_metrics:
540
+ raise ArgumentError(
541
+ argument=None,
542
+ message=f"Tracking metric must be one of {','.join(valid_tracking_metrics)}. "
543
+ f"Got {args.tracking_metric}. Additional tracking metric options can be specified with "
544
+ "the `--metrics` flag.",
545
+ )
546
+
547
+ input_cols, target_cols = get_column_names(
548
+ args.data_path,
549
+ args.smiles_columns,
550
+ args.reaction_columns,
551
+ args.target_columns,
552
+ args.ignore_columns,
553
+ args.splits_column,
554
+ args.weight_column,
555
+ args.no_header_row,
556
+ )
557
+
558
+ args.input_columns = input_cols
559
+ args.target_columns = target_cols
560
+
561
+ return args
562
+
563
+
564
+ def normalize_inputs(train_dset, val_dset, args):
565
+ multicomponent = isinstance(train_dset, MulticomponentDataset)
566
+ num_components = train_dset.n_components if multicomponent else 1
567
+
568
+ X_d_transform = None
569
+ V_f_transforms = [nn.Identity()] * num_components
570
+ E_f_transforms = [nn.Identity()] * num_components
571
+ V_d_transforms = [None] * num_components
572
+ graph_transforms = []
573
+
574
+ d_xd = train_dset.d_xd
575
+ d_vf = train_dset.d_vf
576
+ d_ef = train_dset.d_ef
577
+ d_vd = train_dset.d_vd
578
+
579
+ if d_xd > 0 and not args.no_descriptor_scaling:
580
+ scaler = train_dset.normalize_inputs("X_d")
581
+ val_dset.normalize_inputs("X_d", scaler)
582
+
583
+ scaler = scaler if not isinstance(scaler, list) else scaler[0]
584
+
585
+ if scaler is not None:
586
+ logger.info(
587
+ f"Descriptors: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
588
+ )
589
+ X_d_transform = ScaleTransform.from_standard_scaler(scaler)
590
+
591
+ if d_vf > 0 and not args.no_atom_feature_scaling:
592
+ scaler = train_dset.normalize_inputs("V_f")
593
+ val_dset.normalize_inputs("V_f", scaler)
594
+
595
+ scalers = [scaler] if not isinstance(scaler, list) else scaler
596
+
597
+ for i, scaler in enumerate(scalers):
598
+ if scaler is None:
599
+ continue
600
+
601
+ logger.info(
602
+ f"Atom features for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
603
+ )
604
+ featurizer = (
605
+ train_dset.datasets[i].featurizer if multicomponent else train_dset.featurizer
606
+ )
607
+ V_f_transforms[i] = ScaleTransform.from_standard_scaler(
608
+ scaler, pad=featurizer.atom_fdim - featurizer.extra_atom_fdim
609
+ )
610
+
611
+ if d_ef > 0 and not args.no_bond_feature_scaling:
612
+ scaler = train_dset.normalize_inputs("E_f")
613
+ val_dset.normalize_inputs("E_f", scaler)
614
+
615
+ scalers = [scaler] if not isinstance(scaler, list) else scaler
616
+
617
+ for i, scaler in enumerate(scalers):
618
+ if scaler is None:
619
+ continue
620
+
621
+ logger.info(
622
+ f"Bond features for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
623
+ )
624
+ featurizer = (
625
+ train_dset.datasets[i].featurizer if multicomponent else train_dset.featurizer
626
+ )
627
+ E_f_transforms[i] = ScaleTransform.from_standard_scaler(
628
+ scaler, pad=featurizer.bond_fdim - featurizer.extra_bond_fdim
629
+ )
630
+
631
+ for V_f_transform, E_f_transform in zip(V_f_transforms, E_f_transforms):
632
+ graph_transforms.append(GraphTransform(V_f_transform, E_f_transform))
633
+
634
+ if d_vd > 0 and not args.no_atom_descriptor_scaling:
635
+ scaler = train_dset.normalize_inputs("V_d")
636
+ val_dset.normalize_inputs("V_d", scaler)
637
+
638
+ scalers = [scaler] if not isinstance(scaler, list) else scaler
639
+
640
+ for i, scaler in enumerate(scalers):
641
+ if scaler is None:
642
+ continue
643
+
644
+ logger.info(
645
+ f"Atom descriptors for mol {i}: loc = {np.array2string(scaler.mean_, precision=3)}, scale = {np.array2string(scaler.scale_, precision=3)}"
646
+ )
647
+ V_d_transforms[i] = ScaleTransform.from_standard_scaler(scaler)
648
+
649
+ return X_d_transform, graph_transforms, V_d_transforms
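+ # The tuple returned above mirrors what `build_model` unpacks: `X_d_transform`
+ # scales extra datapoint descriptors, each `GraphTransform` pairs the per-component
+ # atom (V_f) and bond (E_f) feature scalers, and `V_d_transforms` scale extra atom
+ # descriptors. `nn.Identity()`/`None` entries mean no scaling was fit.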
650
+
651
+
652
+ def load_and_use_pretrained_model_scalers(model_path: Path, train_dset, val_dset) -> None:
653
+ if isinstance(train_dset, MulticomponentDataset):
654
+ _model = MulticomponentMPNN.load_from_file(model_path)
655
+ blocks = _model.message_passing.blocks
656
+ train_dsets = train_dset.datasets
657
+ val_dsets = val_dset.datasets
658
+ else:
659
+ _model = MPNN.load_from_file(model_path)
660
+ blocks = [_model.message_passing]
661
+ train_dsets = [train_dset]
662
+ val_dsets = [val_dset]
663
+
664
+ for i in range(len(blocks)):
665
+ if isinstance(_model.X_d_transform, ScaleTransform):
666
+ scaler = _model.X_d_transform.to_standard_scaler()
667
+ train_dsets[i].normalize_inputs("X_d", scaler)
668
+ val_dsets[i].normalize_inputs("X_d", scaler)
669
+
670
+ if isinstance(blocks[i].graph_transform, GraphTransform):
671
+ if isinstance(blocks[i].graph_transform.V_transform, ScaleTransform):
672
+ V_anti_pad = (
673
+ train_dsets[i].featurizer.atom_fdim - train_dsets[i].featurizer.extra_atom_fdim
674
+ )
675
+ scaler = blocks[i].graph_transform.V_transform.to_standard_scaler(
676
+ anti_pad=V_anti_pad
677
+ )
678
+ train_dsets[i].normalize_inputs("V_f", scaler)
679
+ val_dsets[i].normalize_inputs("V_f", scaler)
680
+ if isinstance(blocks[i].graph_transform.E_transform, ScaleTransform):
681
+ E_anti_pad = (
682
+ train_dsets[i].featurizer.bond_fdim - train_dsets[i].featurizer.extra_bond_fdim
683
+ )
684
+ scaler = blocks[i].graph_transform.E_transform.to_standard_scaler(
685
+ anti_pad=E_anti_pad
686
+ )
687
+ train_dsets[i].normalize_inputs("E_f", scaler)
688
+ val_dsets[i].normalize_inputs("E_f", scaler)
689
+
690
+ if isinstance(blocks[i].V_d_transform, ScaleTransform):
691
+ scaler = blocks[i].V_d_transform.to_standard_scaler()
692
+ train_dsets[i].normalize_inputs("V_d", scaler)
693
+ val_dsets[i].normalize_inputs("V_d", scaler)
694
+
695
+ if isinstance(_model.predictor.output_transform, UnscaleTransform):
696
+ scaler = _model.predictor.output_transform.to_standard_scaler()
697
+ train_dset.normalize_targets(scaler)
698
+ val_dset.normalize_targets(scaler)
699
+
700
+
701
+ def save_config(parser: ArgumentParser, args: Namespace, config_path: Path):
702
+ config_args = deepcopy(args)
703
+ for key, value in vars(config_args).items():
704
+ if isinstance(value, Path):
705
+ setattr(config_args, key, str(value))
706
+
707
+ for key in ["atom_features_path", "atom_descriptors_path", "bond_features_path"]:
708
+ if getattr(config_args, key) is not None:
709
+ for index, path in getattr(config_args, key).items():
710
+ getattr(config_args, key)[index] = str(path)
711
+
712
+ parser.write_config_file(parsed_namespace=config_args, output_file_paths=[str(config_path)])
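+ # NOTE: `write_config_file` is not part of stdlib argparse; this assumes the
+ # parser is a configargparse-style `ArgumentParser` that supports config files.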
713
+
714
+
715
+ def save_smiles_splits(args: Namespace, output_dir, train_dset, val_dset, test_dset):
716
+ match (args.smiles_columns, args.reaction_columns):
717
+ case [_, None]:
718
+ column_labels = deepcopy(args.smiles_columns)
719
+ case [None, _]:
720
+ column_labels = deepcopy(args.reaction_columns)
721
+ case _:
722
+ column_labels = deepcopy(args.smiles_columns)
723
+ column_labels.extend(args.reaction_columns)
724
+
725
+ train_smis = train_dset.names
726
+ df_train = pd.DataFrame(train_smis, columns=column_labels)
727
+ df_train.to_csv(output_dir / "train_smiles.csv", index=False)
728
+
729
+ val_smis = val_dset.names
730
+ df_val = pd.DataFrame(val_smis, columns=column_labels)
731
+ df_val.to_csv(output_dir / "val_smiles.csv", index=False)
732
+
733
+ if test_dset is not None:
734
+ test_smis = test_dset.names
735
+ df_test = pd.DataFrame(test_smis, columns=column_labels)
736
+ df_test.to_csv(output_dir / "test_smiles.csv", index=False)
737
+
738
+
739
+ def build_splits(args, format_kwargs, featurization_kwargs):
740
+ """build the train/val/test splits"""
741
+ logger.info(f"Pulling data from file: {args.data_path}")
742
+ all_data = build_data_from_files(
743
+ args.data_path,
744
+ p_descriptors=args.descriptors_path,
745
+ p_atom_feats=args.atom_features_path,
746
+ p_bond_feats=args.bond_features_path,
747
+ p_atom_descs=args.atom_descriptors_path,
748
+ **format_kwargs,
749
+ **featurization_kwargs,
750
+ )
751
+
752
+ if args.splits_column is not None:
753
+ df = pd.read_csv(
754
+ args.data_path, header=None if args.no_header_row else "infer", index_col=False
755
+ )
756
+ grouped = df.groupby(df[args.splits_column].str.lower())
757
+ train_indices = grouped.groups.get("train", pd.Index([])).tolist()
758
+ val_indices = grouped.groups.get("val", pd.Index([])).tolist()
759
+ test_indices = grouped.groups.get("test", pd.Index([])).tolist()
760
+ train_indices, val_indices, test_indices = [train_indices], [val_indices], [test_indices]
761
+
762
+ elif args.splits_file is not None:
763
+ with open(args.splits_file, "rb") as json_file:
764
+ split_idxss = json.load(json_file)
765
+ train_indices = [parse_indices(d["train"]) for d in split_idxss]
766
+ val_indices = [parse_indices(d["val"]) for d in split_idxss]
767
+ test_indices = [parse_indices(d["test"]) for d in split_idxss]
768
+ args.num_replicates = len(split_idxss)
769
+
770
+ else:
771
+ splitting_data = all_data[args.split_key_molecule]
772
+ if isinstance(splitting_data[0], ReactionDatapoint):
773
+ splitting_mols = [datapoint.rct for datapoint in splitting_data]
774
+ else:
775
+ splitting_mols = [datapoint.mol for datapoint in splitting_data]
776
+ train_indices, val_indices, test_indices = make_split_indices(
777
+ splitting_mols, args.split, args.split_sizes, args.data_seed, args.num_replicates
778
+ )
779
+
780
+ train_data, val_data, test_data = split_data_by_indices(
781
+ all_data, train_indices, val_indices, test_indices
782
+ )
783
+ for i_split in range(len(train_data)):
784
+ sizes = [len(train_data[i_split][0]), len(val_data[i_split][0]), len(test_data[i_split][0])]
785
+ logger.info(f"train/val/test split_{i_split} sizes: {sizes}")
786
+
787
+ return train_data, val_data, test_data
788
+
789
+
790
+ def summarize(
791
+ target_cols: list[str], task_type: str, dataset: _MolGraphDatasetMixin
792
+ ) -> tuple[list, list]:
793
+ if task_type in [
794
+ "regression",
795
+ "regression-mve",
796
+ "regression-evidential",
797
+ "regression-quantile",
798
+ ]:
799
+ if isinstance(dataset, MulticomponentDataset):
800
+ y = dataset.datasets[0].Y
801
+ else:
802
+ y = dataset.Y
803
+ y_mean = np.nanmean(y, axis=0)
804
+ y_std = np.nanstd(y, axis=0)
805
+ y_median = np.nanmedian(y, axis=0)
806
+ mean_dev_abs = np.abs(y - y_mean)
807
+ num_targets = np.sum(~np.isnan(y), axis=0)
808
+ frac_1_sigma = np.sum((mean_dev_abs < y_std), axis=0) / num_targets
809
+ frac_2_sigma = np.sum((mean_dev_abs < 2 * y_std), axis=0) / num_targets
810
+
811
+ column_headers = ["Statistic"] + [f"Value ({target_cols[i]})" for i in range(y.shape[1])]
812
+ table_rows = [
813
+ ["Num. smiles"] + [f"{len(y)}" for i in range(y.shape[1])],
814
+ ["Num. targets"] + [f"{num_targets[i]}" for i in range(y.shape[1])],
815
+ ["Num. NaN"] + [f"{len(y) - num_targets[i]}" for i in range(y.shape[1])],
816
+ ["Mean"] + [f"{mean:0.3g}" for mean in y_mean],
817
+ ["Std. dev."] + [f"{std:0.3g}" for std in y_std],
818
+ ["Median"] + [f"{median:0.3g}" for median in y_median],
819
+ ["% within 1 s.d."] + [f"{sigma:0.0%}" for sigma in frac_1_sigma],
820
+ ["% within 2 s.d."] + [f"{sigma:0.0%}" for sigma in frac_2_sigma],
821
+ ]
822
+ return (column_headers, table_rows)
823
+ elif task_type in [
824
+ "classification",
825
+ "classification-dirichlet",
826
+ "multiclass",
827
+ "multiclass-dirichlet",
828
+ ]:
829
+ if isinstance(dataset, MulticomponentDataset):
830
+ y = dataset.datasets[0].Y
831
+ else:
832
+ y = dataset.Y
833
+
834
+ mask = np.isnan(y)
835
+ classes = np.sort(np.unique(y[~mask]))
836
+
837
+ class_counts = np.stack([(classes[:, None] == y[:, i]).sum(1) for i in range(y.shape[1])])
838
+ class_fracs = class_counts / y.shape[0]
839
+ nan_count = np.nansum(mask, axis=0)
840
+ nan_frac = nan_count / y.shape[0]
841
+
842
+ column_headers = ["Class"] + [f"Count/Percent {target_cols[i]}" for i in range(y.shape[1])]
843
+
844
+ table_rows = [
845
+ [f"{k}"] + [f"{class_counts[j, i]}/{class_fracs[j, i]:0.0%}" for j in range(y.shape[1])]
846
+ for i, k in enumerate(classes)
847
+ ]
848
+
849
+ nan_row = ["NaN"] + [f"{nan_count[i]}/{nan_frac[i]:0.0%}" for i in range(y.shape[1])]
850
+ table_rows.append(nan_row)
851
+
852
+ total_row = ["Total"] + [f"{y.shape[0]}/{100.00}%" for i in range(y.shape[1])]
853
+ table_rows.append(total_row)
854
+
855
+ return (column_headers, table_rows)
856
+ else:
857
+ raise ValueError(f"unsupported task type! Task type '{task_type}' was not recognized.")
858
+
859
+
860
+ def build_table(column_headers: list[str], table_rows: list[str], title: str | None = None) -> str:
861
+ right_justified_columns = [
862
+ Column(header=column_header, justify="right") for column_header in column_headers
863
+ ]
864
+ table = Table(*right_justified_columns, title=title)
865
+ for row in table_rows:
866
+ table.add_row(*row)
867
+
868
+ console = Console(record=True, file=StringIO(), width=200)
869
+ console.print(table)
870
+ return console.export_text()
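+ # Illustrative usage:
+ #     build_table(["Statistic", "Value (y)"], [["Mean", "1.23"]], "Summary of Training Data")
+ # renders a right-justified table via rich and returns it as plain text for logging.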
871
+
872
+
873
+ def build_datasets(args, train_data, val_data, test_data):
874
+ """build the train/val/test datasets, where :attr:`test_data` may be None"""
875
+ multicomponent = len(train_data) > 1
876
+ if multicomponent:
877
+ train_dsets = [
878
+ make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
879
+ for data in train_data
880
+ ]
881
+ val_dsets = [
882
+ make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
883
+ for data in val_data
884
+ ]
885
+ train_dset = MulticomponentDataset(train_dsets)
886
+ val_dset = MulticomponentDataset(val_dsets)
887
+ if len(test_data[0]) > 0:
888
+ test_dsets = [
889
+ make_dataset(data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
890
+ for data in test_data
891
+ ]
892
+ test_dset = MulticomponentDataset(test_dsets)
893
+ else:
894
+ test_dset = None
895
+ else:
896
+ train_data = train_data[0]
897
+ val_data = val_data[0]
898
+ test_data = test_data[0]
899
+ train_dset = make_dataset(train_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
900
+ val_dset = make_dataset(val_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
901
+ if len(test_data) > 0:
902
+ test_dset = make_dataset(test_data, args.rxn_mode, args.multi_hot_atom_featurizer_mode)
903
+ else:
904
+ test_dset = None
905
+ if args.task_type != "spectral":
906
+ for dataset, label in zip(
907
+ [train_dset, val_dset, test_dset], ["Training", "Validation", "Test"]
908
+ ):
909
+ column_headers, table_rows = summarize(args.target_columns, args.task_type, dataset)
910
+ output = build_table(column_headers, table_rows, f"Summary of {label} Data")
911
+ logger.info("\n" + output)
912
+
913
+ return train_dset, val_dset, test_dset
914
+
915
+
916
+ def build_model(
917
+ args,
918
+ train_dset: MolGraphDataset | MulticomponentDataset,
919
+ output_transform: UnscaleTransform,
920
+ input_transforms: tuple[ScaleTransform, list[GraphTransform], list[ScaleTransform]],
921
+ ) -> MPNN:
922
+ mp_cls = AtomMessagePassing if args.atom_messages else BondMessagePassing
923
+
924
+ X_d_transform, graph_transforms, V_d_transforms = input_transforms
925
+ if isinstance(train_dset, MulticomponentDataset):
926
+ mp_blocks = [
927
+ mp_cls(
928
+ train_dset.datasets[i].featurizer.atom_fdim,
929
+ train_dset.datasets[i].featurizer.bond_fdim,
930
+ d_h=args.message_hidden_dim,
931
+ d_vd=(
932
+ train_dset.datasets[i].d_vd
933
+ if isinstance(train_dset.datasets[i], MoleculeDataset)
934
+ else 0
935
+ ),
936
+ bias=args.message_bias,
937
+ depth=args.depth,
938
+ undirected=args.undirected,
939
+ dropout=args.dropout,
940
+ activation=args.activation,
941
+ V_d_transform=V_d_transforms[i],
942
+ graph_transform=graph_transforms[i],
943
+ )
944
+ for i in range(train_dset.n_components)
945
+ ]
946
+ if args.mpn_shared:
947
+ if args.reaction_columns is not None and args.smiles_columns is not None:
948
+ raise ArgumentError(
949
+ argument=None,
950
+ message="Cannot use shared MPNN with both molecule and reaction data.",
951
+ )
952
+
953
+ mp_block = MulticomponentMessagePassing(mp_blocks, train_dset.n_components, args.mpn_shared)
954
+ # NOTE(degraff): this if/else block should be handled by the init of MulticomponentMessagePassing
955
+ # if args.mpn_shared:
956
+ # mp_block = MulticomponentMessagePassing(mp_blocks[0], n_components, args.mpn_shared)
957
+ # else:
958
+ d_xd = train_dset.datasets[0].d_xd
959
+ n_tasks = train_dset.datasets[0].Y.shape[1]
960
+ mpnn_cls = MulticomponentMPNN
961
+ else:
962
+ mp_block = mp_cls(
963
+ train_dset.featurizer.atom_fdim,
964
+ train_dset.featurizer.bond_fdim,
965
+ d_h=args.message_hidden_dim,
966
+ d_vd=train_dset.d_vd if isinstance(train_dset, MoleculeDataset) else 0,
967
+ bias=args.message_bias,
968
+ depth=args.depth,
969
+ undirected=args.undirected,
970
+ dropout=args.dropout,
971
+ activation=args.activation,
972
+ V_d_transform=V_d_transforms[0],
973
+ graph_transform=graph_transforms[0],
974
+ )
975
+ d_xd = train_dset.d_xd
976
+ n_tasks = train_dset.Y.shape[1]
977
+ mpnn_cls = MPNN
978
+
979
+ agg = Factory.build(AggregationRegistry[args.aggregation], norm=args.aggregation_norm)
980
+ predictor_cls = PredictorRegistry[args.task_type]
981
+ if args.loss_function is not None:
982
+ task_weights = torch.ones(n_tasks) if args.task_weights is None else args.task_weights
983
+ criterion = Factory.build(
984
+ LossFunctionRegistry[args.loss_function],
985
+ task_weights=task_weights,
986
+ v_kl=args.v_kl,
987
+ # threshold=args.threshold, TODO: Add in v2.1
988
+ eps=args.eps,
989
+ alpha=args.alpha,
990
+ )
991
+ else:
992
+ criterion = None
993
+ if args.metrics is not None:
994
+ metrics = [Factory.build(MetricRegistry[metric]) for metric in args.metrics]
995
+ else:
996
+ metrics = None
997
+
998
+ predictor = Factory.build(
999
+ predictor_cls,
1000
+ input_dim=mp_block.output_dim + d_xd,
1001
+ n_tasks=n_tasks,
1002
+ hidden_dim=args.ffn_hidden_dim,
1003
+ n_layers=args.ffn_num_layers,
1004
+ dropout=args.dropout,
1005
+ activation=args.activation,
1006
+ criterion=criterion,
1007
+ task_weights=args.task_weights,
1008
+ n_classes=args.multiclass_num_classes,
1009
+ output_transform=output_transform,
1010
+ # spectral_activation=args.spectral_activation, TODO: Add in v2.1
1011
+ )
1012
+
1013
+ if args.loss_function is None:
1014
+ logger.info(
1015
+ f"No loss function was specified! Using class default: {predictor_cls._T_default_criterion}"
1016
+ )
1017
+
1018
+ return mpnn_cls(
1019
+ mp_block,
1020
+ agg,
1021
+ predictor,
1022
+ args.batch_norm,
1023
+ metrics,
1024
+ args.warmup_epochs,
1025
+ args.init_lr,
1026
+ args.max_lr,
1027
+ args.final_lr,
1028
+ X_d_transform=X_d_transform,
1029
+ )
1030
+
1031
+
1032
+ def train_model(
1033
+ args, train_loader, val_loader, test_loader, output_dir, output_transform, input_transforms
1034
+ ):
1035
+ if args.checkpoint is not None:
1036
+ model_paths = find_models(args.checkpoint)
1037
+ if args.ensemble_size != len(model_paths):
1038
+ logger.warning(
1039
+ f"The number of models in ensemble for each splitting of data is set to {len(model_paths)}."
1040
+ )
1041
+ args.ensemble_size = len(model_paths)
1042
+
1043
+ for model_idx in range(args.ensemble_size):
1044
+ model_output_dir = output_dir / f"model_{model_idx}"
1045
+ model_output_dir.mkdir(exist_ok=True, parents=True)
1046
+
1047
+ if args.pytorch_seed is None:
1048
+ seed = torch.seed()
1049
+ deterministic = False
1050
+ else:
1051
+ seed = args.pytorch_seed + model_idx
1052
+ deterministic = True
1053
+
1054
+ torch.manual_seed(seed)
1055
+
1056
+ if args.checkpoint or args.model_frzn is not None:
1057
+ mpnn_cls = (
1058
+ MulticomponentMPNN
1059
+ if isinstance(train_loader.dataset, MulticomponentDataset)
1060
+ else MPNN
1061
+ )
1062
+ model_path = model_paths[model_idx] if args.checkpoint else args.model_frzn
1063
+ model = mpnn_cls.load_from_file(model_path)
1064
+
1065
+ if args.checkpoint:
1066
+ model.apply(
1067
+ lambda m: setattr(m, "p", args.dropout)
1068
+ if isinstance(m, torch.nn.Dropout)
1069
+ else None
1070
+ )
1071
+
1072
+ # TODO: model_frzn is deprecated and will be removed in v2.2
1073
+ if args.model_frzn or args.freeze_encoder:
1074
+ model.message_passing.apply(lambda module: module.requires_grad_(False))
1075
+ model.message_passing.eval()
1076
+ model.bn.apply(lambda module: module.requires_grad_(False))
1077
+ model.bn.eval()
1078
+ for idx in range(args.frzn_ffn_layers):
1079
+ model.predictor.ffn[idx].requires_grad_(False)
1080
+ model.predictor.ffn[idx + 1].eval()
1081
+ else:
1082
+ model = build_model(args, train_loader.dataset, output_transform, input_transforms)
1083
+ logger.info(model)
1084
+
1085
+ try:
1086
+ trainer_logger = TensorBoardLogger(
1087
+ model_output_dir, "trainer_logs", default_hp_metric=False
1088
+ )
1089
+ except ModuleNotFoundError as e:
1090
+ logger.warning(
1091
+ f"Unable to import TensorBoardLogger, reverting to CSVLogger (original error: {e})."
1092
+ )
1093
+ trainer_logger = CSVLogger(model_output_dir, "trainer_logs")
1094
+
1095
+ if args.tracking_metric == "val_loss":
1096
+ T_tracking_metric = model.criterion.__class__
1097
+ tracking_metric = args.tracking_metric
1098
+ else:
1099
+ T_tracking_metric = MetricRegistry[args.tracking_metric]
1100
+ tracking_metric = "val/" + args.tracking_metric
1101
+
1102
+ monitor_mode = "max" if T_tracking_metric.higher_is_better else "min"
1103
+ logger.debug(f"Evaluation metric: '{T_tracking_metric.alias}', mode: '{monitor_mode}'")
1104
+
1105
+ if args.remove_checkpoints:
1106
+ temp_dir = TemporaryDirectory()
1107
+ checkpoint_dir = Path(temp_dir.name)
1108
+ else:
1109
+ checkpoint_dir = model_output_dir
1110
+
1111
+ checkpoint_filename = (
1112
+ f"best-epoch={{epoch}}-{tracking_metric.replace('/', '_')}="
1113
+ f"{{{tracking_metric}:.2f}}"
1114
+ )
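+ # With Lightning's filename templating this yields names like
+ # "best-epoch=12-val_loss=0.45.ckpt" (illustrative values).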
1115
+ checkpointing = ModelCheckpoint(
1116
+ checkpoint_dir / "checkpoints",
1117
+ checkpoint_filename,
1118
+ tracking_metric,
1119
+ mode=monitor_mode,
1120
+ save_last=True,
1121
+ auto_insert_metric_name=False,
1122
+ )
1123
+
1124
+ if args.epochs != -1:
1125
+ patience = args.patience if args.patience is not None else args.epochs
1126
+ early_stopping = EarlyStopping(tracking_metric, patience=patience, mode=monitor_mode)
1127
+ callbacks = [checkpointing, early_stopping]
1128
+ else:
1129
+ callbacks = [checkpointing]
1130
+
1131
+ trainer = pl.Trainer(
1132
+ logger=trainer_logger,
1133
+ enable_progress_bar=True,
1134
+ accelerator=args.accelerator,
1135
+ devices=args.devices,
1136
+ max_epochs=args.epochs,
1137
+ callbacks=callbacks,
1138
+ gradient_clip_val=args.grad_clip,
1139
+ deterministic=deterministic,
1140
+ )
1141
+ trainer.fit(model, train_loader, val_loader)
1142
+
1143
+ if test_loader is not None:
1144
+ if isinstance(trainer.strategy, DDPStrategy):
1145
+ torch.distributed.destroy_process_group()
1146
+
1147
+ best_ckpt_path = trainer.checkpoint_callback.best_model_path
1148
+ trainer = pl.Trainer(
1149
+ logger=trainer_logger,
1150
+ enable_progress_bar=True,
1151
+ accelerator=args.accelerator,
1152
+ devices=1,
1153
+ )
1154
+ model = model.load_from_checkpoint(best_ckpt_path)
1155
+ predss = trainer.predict(model, dataloaders=test_loader)
1156
+ else:
1157
+ predss = trainer.predict(dataloaders=test_loader)
1158
+
1159
+ preds = torch.concat(predss, 0)
1160
+ if model.predictor.n_targets > 1:
1161
+ preds = preds[..., 0]
1162
+ preds = preds.numpy()
1163
+
1164
+ evaluate_and_save_predictions(
1165
+ preds, test_loader, model.metrics[:-1], model_output_dir, args
1166
+ )
1167
+
1168
+ best_model_path = checkpointing.best_model_path
1169
+ model = model.__class__.load_from_checkpoint(best_model_path)
1170
+ p_model = model_output_dir / "best.pt"
1171
+ save_model(p_model, model, args.target_columns)
1172
+ logger.info(f"Best model saved to '{p_model}'")
1173
+
1174
+ if args.remove_checkpoints:
1175
+ temp_dir.cleanup()
1176
+
1177
+
1178
+ def evaluate_and_save_predictions(preds, test_loader, metrics, model_output_dir, args):
1179
+ if isinstance(test_loader.dataset, MulticomponentDataset):
1180
+ test_dset = test_loader.dataset.datasets[0]
1181
+ else:
1182
+ test_dset = test_loader.dataset
1183
+ targets = test_dset.Y
1184
+ mask = torch.from_numpy(np.isfinite(targets))
1185
+ targets = np.nan_to_num(targets, nan=0.0)
1186
+ weights = torch.ones(len(test_dset))
1187
+ lt_mask = torch.from_numpy(test_dset.lt_mask) if test_dset.lt_mask[0] is not None else None
1188
+ gt_mask = torch.from_numpy(test_dset.gt_mask) if test_dset.gt_mask[0] is not None else None
1189
+
1190
+ individual_scores = dict()
1191
+ for metric in metrics:
1192
+ individual_scores[metric.alias] = []
1193
+ for i, col in enumerate(args.target_columns):
1194
+ if "multiclass" in args.task_type:
1195
+ preds_slice = torch.from_numpy(preds[:, i : i + 1, :])
1196
+ targets_slice = torch.from_numpy(targets[:, i : i + 1])
1197
+ else:
1198
+ preds_slice = torch.from_numpy(preds[:, i : i + 1])
1199
+ targets_slice = torch.from_numpy(targets[:, i : i + 1])
1200
+ preds_loss = metric(
1201
+ preds_slice,
1202
+ targets_slice,
1203
+ mask[:, i : i + 1],
1204
+ weights,
1205
+ lt_mask[:, i] if lt_mask is not None else None,
1206
+ gt_mask[:, i] if gt_mask is not None else None,
1207
+ )
1208
+ individual_scores[metric.alias].append(preds_loss)
1209
+
1210
+ logger.info("Test Set results:")
1211
+ for metric in metrics:
1212
+ avg_loss = sum(individual_scores[metric.alias]) / len(individual_scores[metric.alias])
1213
+ logger.info(f"test/{metric.alias}: {avg_loss}")
1214
+
1215
+ if args.show_individual_scores:
1216
+ logger.info("Entire Test Set individual results:")
1217
+ for metric in metrics:
1218
+ for i, col in enumerate(args.target_columns):
1219
+ logger.info(f"test/{col}/{metric.alias}: {individual_scores[metric.alias][i]}")
1220
+
1221
+ names = test_loader.dataset.names
1222
+ if isinstance(test_loader.dataset, MulticomponentDataset):
1223
+ namess = list(zip(*names))
1224
+ else:
1225
+ namess = [names]
1226
+
1227
+ columns = args.input_columns + args.target_columns
1228
+ if "multiclass" in args.task_type:
1229
+ columns = columns + [f"{col}_prob" for col in args.target_columns]
1230
+ formatted_probability_strings = np.apply_along_axis(
1231
+ lambda x: ",".join(map(str, x)), 2, preds
1232
+ )
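+ # e.g. per-class probabilities [0.1, 0.7, 0.2] become the string "0.1,0.7,0.2"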
1233
+ predicted_class_labels = preds.argmax(axis=-1)
1234
+ df_preds = pd.DataFrame(
1235
+ list(zip(*namess, *predicted_class_labels.T, *formatted_probability_strings.T)),
1236
+ columns=columns,
1237
+ )
1238
+ else:
1239
+ df_preds = pd.DataFrame(list(zip(*namess, *preds.T)), columns=columns)
1240
+ df_preds.to_csv(model_output_dir / "test_predictions.csv", index=False)
1241
+
1242
+
1243
+ def main(args):
1244
+ format_kwargs = dict(
1245
+ no_header_row=args.no_header_row,
1246
+ smiles_cols=args.smiles_columns,
1247
+ rxn_cols=args.reaction_columns,
1248
+ target_cols=args.target_columns,
1249
+ ignore_cols=args.ignore_columns,
1250
+ splits_col=args.splits_column,
1251
+ weight_col=args.weight_column,
1252
+ bounded=args.loss_function is not None and "bounded" in args.loss_function,
1253
+ )
1254
+
1255
+ featurization_kwargs = dict(
1256
+ molecule_featurizers=args.molecule_featurizers, keep_h=args.keep_h, add_h=args.add_h
1257
+ )
1258
+
1259
+ splits = build_splits(args, format_kwargs, featurization_kwargs)
1260
+
1261
+ for replicate_idx, (train_data, val_data, test_data) in enumerate(zip(*splits)):
1262
+ if args.num_replicates == 1:
1263
+ output_dir = args.output_dir
1264
+ else:
1265
+ output_dir = args.output_dir / f"replicate_{replicate_idx}"
1266
+
1267
+ output_dir.mkdir(exist_ok=True, parents=True)
1268
+
1269
+ train_dset, val_dset, test_dset = build_datasets(args, train_data, val_data, test_data)
1270
+
1271
+ if args.save_smiles_splits:
1272
+ save_smiles_splits(args, output_dir, train_dset, val_dset, test_dset)
1273
+
1274
+ if args.checkpoint or args.model_frzn is not None:
1275
+ model_paths = find_models(args.checkpoint)
1276
+ if len(model_paths) > 1:
1277
+ logger.warning(
1278
+ "Multiple checkpoint files were loaded, but only the scalers from "
1279
+ f"{model_paths[0]} are used. It is assumed that all models provided have the "
1280
+ "same data scalings, meaning they were trained on the same data."
1281
+ )
1282
+ model_path = model_paths[0] if args.checkpoint else args.model_frzn
1283
+ load_and_use_pretrained_model_scalers(model_path, train_dset, val_dset)
1284
+ input_transforms = (None, None, None)
1285
+ output_transform = None
1286
+ else:
1287
+ input_transforms = normalize_inputs(train_dset, val_dset, args)
1288
+
1289
+ if "regression" in args.task_type:
1290
+ output_scaler = train_dset.normalize_targets()
1291
+ val_dset.normalize_targets(output_scaler)
1292
+ logger.info(
1293
+ f"Train data: mean = {output_scaler.mean_} | std = {output_scaler.scale_}"
1294
+ )
1295
+ output_transform = UnscaleTransform.from_standard_scaler(output_scaler)
1296
+ else:
1297
+ output_transform = None
1298
+
1299
+ if not args.no_cache:
1300
+ train_dset.cache = True
1301
+ val_dset.cache = True
1302
+
1303
+ train_loader = build_dataloader(
1304
+ train_dset,
1305
+ args.batch_size,
1306
+ args.num_workers,
1307
+ class_balance=args.class_balance,
1308
+ seed=args.data_seed,
1309
+ )
1310
+ if args.class_balance:
1311
+ logger.debug(
1312
+ f"With `--class-balance`, effective train size = {len(train_loader.sampler)}"
1313
+ )
1314
+ val_loader = build_dataloader(val_dset, args.batch_size, args.num_workers, shuffle=False)
1315
+ if test_dset is not None:
1316
+ test_loader = build_dataloader(
1317
+ test_dset, args.batch_size, args.num_workers, shuffle=False
1318
+ )
1319
+ else:
1320
+ test_loader = None
1321
+
1322
+ train_model(
1323
+ args,
1324
+ train_loader,
1325
+ val_loader,
1326
+ test_loader,
1327
+ output_dir,
1328
+ output_transform,
1329
+ input_transforms,
1330
+ )
1331
+
1332
+
1333
+ if __name__ == "__main__":
1334
+ # TODO: update this old code or remove it.
1335
+ parser = ArgumentParser()
1336
+ parser = TrainSubcommand.add_args(parser)
1337
+
1338
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)
1339
+ args = parser.parse_args()
1340
+ TrainSubcommand.func(args)
chemprop-updated/chemprop/cli/utils/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ from .actions import LookupAction
2
+ from .args import bounded
3
+ from .command import Subcommand
4
+ from .parsing import (
5
+ build_data_from_files,
6
+ get_column_names,
7
+ make_datapoints,
8
+ make_dataset,
9
+ parse_indices,
10
+ )
11
+ from .utils import _pop_attr, _pop_attr_d, pop_attr
12
+
13
+ __all__ = [
14
+ "bounded",
15
+ "LookupAction",
16
+ "Subcommand",
17
+ "build_data_from_files",
18
+ "make_datapoints",
19
+ "make_dataset",
20
+ "get_column_names",
21
+ "parse_indices",
22
+ "actions",
23
+ "args",
24
+ "command",
25
+ "parsing",
26
+ "utils",
27
+ "pop_attr",
28
+ "_pop_attr",
29
+ "_pop_attr_d",
30
+ ]
chemprop-updated/chemprop/cli/utils/actions.py ADDED
@@ -0,0 +1,19 @@
1
+ from argparse import _StoreAction
2
+ from typing import Any, Mapping
3
+
4
+
5
+ def LookupAction(obj: Mapping[str, Any]):
6
+ class LookupAction_(_StoreAction):
7
+ def __init__(self, option_strings, dest, default=None, choices=None, **kwargs):
8
+ if default not in obj.keys() and default is not None:
9
+ raise ValueError(
10
+ f"Invalid value for arg 'default': '{default}'. "
11
+ f"Expected one of {tuple(obj.keys())}"
12
+ )
13
+
14
+ kwargs["choices"] = choices if choices is not None else obj.keys()
15
+ kwargs["default"] = default
16
+
17
+ super().__init__(option_strings, dest, **kwargs)
18
+
19
+ return LookupAction_
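+ # Example usage (as in train.py within this diff):
+ #     parser.add_argument("--aggregation", action=LookupAction(AggregationRegistry))
+ # restricts `choices` to the registry's keys and validates any `default` against them.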
chemprop-updated/chemprop/cli/utils/args.py ADDED
@@ -0,0 +1,34 @@
1
+ import functools
2
+
3
+ __all__ = ["bounded"]
4
+
5
+
6
+ def bounded(lo: float | None = None, hi: float | None = None):
7
+ if lo is None and hi is None:
8
+ raise ValueError("No bounds provided!")
9
+
10
+ def decorator(f):
11
+ @functools.wraps(f)
12
+ def wrapper(*args, **kwargs):
13
+ x = f(*args, **kwargs)
14
+
15
+ if (lo is not None and hi is not None) and not lo <= x <= hi:
16
+ raise ValueError(f"Parsed value outside of range [{lo}, {hi}]! got: {x}")
17
+ if hi is not None and x > hi:
18
+ raise ValueError(f"Parsed value below {hi}! got: {x}")
19
+ if lo is not None and x < lo:
20
+ raise ValueError(f"Parsed value above {lo}]! got: {x}")
21
+
22
+ return x
23
+
24
+ return wrapper
25
+
26
+ return decorator
27
+
28
+
29
+ def uppercase(x: str):
30
+ return x.upper()
31
+
32
+
33
+ def lowercase(x: str):
34
+ return x.lower()
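+ # Illustrative usage of `bounded`: wrap a parsing callable so out-of-range CLI
+ # values raise a ValueError instead of passing through silently:
+ #
+ #     parser.add_argument("--dropout", type=bounded(lo=0.0, hi=1.0)(float))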
chemprop-updated/chemprop/cli/utils/command.py ADDED
@@ -0,0 +1,24 @@
1
+ from abc import ABC, abstractmethod
2
+ from argparse import ArgumentParser, Namespace, _SubParsersAction
3
+
4
+
5
+ class Subcommand(ABC):
6
+ COMMAND: str
7
+ HELP: str | None = None
8
+
9
+ @classmethod
10
+ def add(cls, subparsers: _SubParsersAction, parents) -> ArgumentParser:
11
+ parser = subparsers.add_parser(cls.COMMAND, help=cls.HELP, parents=parents)
12
+ cls.add_args(parser).set_defaults(func=cls.func)
13
+
14
+ return parser
15
+
16
+ @classmethod
17
+ @abstractmethod
18
+ def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
19
+ pass
20
+
21
+ @classmethod
22
+ @abstractmethod
23
+ def func(cls, args: Namespace):
24
+ pass
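+ # Minimal sketch of a concrete subcommand (hypothetical example, not part of the CLI):
+ #
+ #     class EchoSubcommand(Subcommand):
+ #         COMMAND = "echo"
+ #         HELP = "Print a message."
+ #
+ #         @classmethod
+ #         def add_args(cls, parser: ArgumentParser) -> ArgumentParser:
+ #             parser.add_argument("--message", default="hello")
+ #             return parser
+ #
+ #         @classmethod
+ #         def func(cls, args: Namespace):
+ #             print(args.message)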
chemprop-updated/chemprop/cli/utils/parsing.py ADDED
@@ -0,0 +1,446 @@
1
+ import logging
2
+ from os import PathLike
3
+ from typing import Literal, Mapping, Sequence
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from chemprop.data.datapoints import MoleculeDatapoint, ReactionDatapoint
9
+ from chemprop.data.datasets import MoleculeDataset, ReactionDataset
10
+ from chemprop.featurizers.atom import get_multi_hot_atom_featurizer
11
+ from chemprop.featurizers.bond import MultiHotBondFeaturizer, RIGRBondFeaturizer
12
+ from chemprop.featurizers.molecule import MoleculeFeaturizerRegistry
13
+ from chemprop.featurizers.molgraph import (
14
+ CondensedGraphOfReactionFeaturizer,
15
+ SimpleMoleculeMolGraphFeaturizer,
16
+ )
17
+ from chemprop.utils import make_mol
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def parse_csv(
23
+ path: PathLike,
24
+ smiles_cols: Sequence[str] | None,
25
+ rxn_cols: Sequence[str] | None,
26
+ target_cols: Sequence[str] | None,
27
+ ignore_cols: Sequence[str] | None,
28
+ splits_col: str | None,
29
+ weight_col: str | None,
30
+ bounded: bool = False,
31
+ no_header_row: bool = False,
32
+ ):
33
+ df = pd.read_csv(path, header=None if no_header_row else "infer", index_col=False)
34
+
35
+ if smiles_cols is not None and rxn_cols is not None:
36
+ smiss = df[smiles_cols].T.values.tolist()
37
+ rxnss = df[rxn_cols].T.values.tolist()
38
+ input_cols = [*smiles_cols, *rxn_cols]
39
+ elif smiles_cols is not None and rxn_cols is None:
40
+ smiss = df[smiles_cols].T.values.tolist()
41
+ rxnss = None
42
+ input_cols = smiles_cols
43
+ elif smiles_cols is None and rxn_cols is not None:
44
+ smiss = None
45
+ rxnss = df[rxn_cols].T.values.tolist()
46
+ input_cols = rxn_cols
47
+ else:
48
+ smiss = df.iloc[:, [0]].T.values.tolist()
49
+ rxnss = None
50
+ input_cols = [df.columns[0]]
51
+
52
+ if target_cols is None:
53
+ target_cols = list(
54
+ column
55
+ for column in df.columns
56
+ if column
57
+ not in set( # if splits or weight is None, df.columns will never have None
58
+ input_cols + (ignore_cols or []) + [splits_col] + [weight_col]
59
+ )
60
+ )
61
+
62
+ Y = df[target_cols]
63
+ weights = None if weight_col is None else df[weight_col].to_numpy(np.single)
64
+
65
+ if bounded:
66
+ lt_mask = Y.applymap(lambda x: "<" in x).to_numpy()
67
+ gt_mask = Y.applymap(lambda x: ">" in x).to_numpy()
68
+ Y = Y.applymap(lambda x: x.strip("<").strip(">")).to_numpy(np.single)
69
+ else:
70
+ Y = Y.to_numpy(np.single)
71
+ lt_mask = None
72
+ gt_mask = None
73
+
74
+ return smiss, rxnss, Y, weights, lt_mask, gt_mask
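+ # Illustrative behavior with `bounded=True`: a target cell "<5.0" yields
+ # lt_mask=True, gt_mask=False, and a Y value of 5.0; ">0.2" yields gt_mask=True and 0.2.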
75
+
76
+
77
+ def get_column_names(
78
+ path: PathLike,
79
+ smiles_cols: Sequence[str] | None,
80
+ rxn_cols: Sequence[str] | None,
81
+ target_cols: Sequence[str] | None,
82
+ ignore_cols: Sequence[str] | None,
83
+ splits_col: str | None,
84
+ weight_col: str | None,
85
+ no_header_row: bool = False,
86
+ ) -> tuple[list[str], list[str]]:
87
+ df_cols = pd.read_csv(path, index_col=False, nrows=0).columns.tolist()
88
+
89
+ if no_header_row:
90
+ return ["SMILES"], ["pred_" + str(i) for i in range((len(df_cols) - 1))]
91
+
92
+ input_cols = (smiles_cols or []) + (rxn_cols or [])
93
+
94
+ if len(input_cols) == 0:
95
+ input_cols = [df_cols[0]]
96
+
97
+ if target_cols is None:
98
+ target_cols = list(
99
+ column
100
+ for column in df_cols
101
+ if column
102
+ not in set(
103
+ input_cols + (ignore_cols or []) + ([splits_col] or []) + ([weight_col] or [])
104
+ )
105
+ )
106
+
107
+ return input_cols, target_cols
108
+
109
+
110
+ def make_datapoints(
111
+ smiss: list[list[str]] | None,
112
+ rxnss: list[list[str]] | None,
113
+ Y: np.ndarray,
114
+ weights: np.ndarray | None,
115
+ lt_mask: np.ndarray | None,
116
+ gt_mask: np.ndarray | None,
117
+ X_d: np.ndarray | None,
118
+ V_fss: list[list[np.ndarray] | list[None]] | None,
119
+ E_fss: list[list[np.ndarray] | list[None]] | None,
120
+ V_dss: list[list[np.ndarray] | list[None]] | None,
121
+ molecule_featurizers: list[str] | None,
122
+ keep_h: bool,
123
+ add_h: bool,
124
+ ) -> tuple[list[list[MoleculeDatapoint]], list[list[ReactionDatapoint]]]:
125
+ """Make the :class:`MoleculeDatapoint`s and :class:`ReactionDatapoint`s for a given
126
+ dataset.
127
+
128
+ Parameters
129
+ ----------
130
+ smiss : list[list[str]] | None
131
+ a list of ``j`` lists of ``n`` SMILES strings, where ``j`` is the number of molecules per
132
+ datapoint and ``n`` is the number of datapoints. If ``None``, the corresponding list of
133
+ :class:`MoleculeDatapoint`\s will be empty.
134
+ rxnss : list[list[str]] | None
135
+ a list of ``k`` lists of ``n`` reaction SMILES strings, where ``k`` is the number of
136
+ reactions per datapoint. If ``None``, the corresponding list of :class:`ReactionDatapoint`\s
137
+ will be empty.
138
+ Y : np.ndarray
139
+ the target values of shape ``n x m``, where ``m`` is the number of targets
140
+ weights : np.ndarray | None
141
+ the per-datapoint weights to use in the loss function, of length ``n``. If ``None``,
142
+ the weights all default to 1.
143
+ lt_mask : np.ndarray | None
144
+ a boolean mask of shape ``n x m`` indicating which targets are less-than ('<') inequality
145
+ targets. If ``None``, ``lt_mask`` for all datapoints will be ``None``.
146
+ gt_mask : np.ndarray | None
147
+ a boolean mask of shape ``n x m`` indicating which targets are greater-than ('>') inequality
148
+ targets. If ``None``, ``gt_mask`` for all datapoints will be ``None``.
149
+ X_d : np.ndarray | None
150
+ the extra descriptors of shape ``n x p``, where ``p`` is the number of extra descriptors. If
151
+ ``None``, ``x_d`` for all datapoints will be ``None``.
152
+ V_fss : list[list[np.ndarray] | list[None]] | None
153
+ a list of ``j`` lists of ``n`` np.ndarrays each of shape ``v_jn x q_j``, where ``v_jn`` is
154
+ the number of atoms in the j-th molecule of the n-th datapoint and ``q_j`` is the number of
155
+ extra atom features used for the j-th molecules. Any of the ``j`` lists can be a list of
156
+ None values if the corresponding component does not use extra atom features. If ``None``,
157
+ ``V_f`` for all datapoints will be ``None``.
158
+ E_fss : list[list[np.ndarray] | list[None]] | None
159
+ a list of ``j`` lists of ``n`` np.ndarrays each of shape ``e_jn x r_j``, where ``e_jn`` is
160
+ the number of bonds in the j-th molecule of the n-th datapoint and ``r_j`` is the number of
161
+ extra bond features used for the j-th molecules. Any of the ``j`` lists can be a list of
162
+ None values if the corresponding component does not use extra bond features. If ``None``,
163
+ ``E_f`` for all datapoints will be ``None``.
164
+ V_dss : list[list[np.ndarray] | list[None]] | None
165
+ a list of ``j`` lists of ``n`` np.ndarrays each of shape ``v_jn x s_j``, where ``s_j`` is
166
+ the number of extra atom descriptors used for the j-th molecules. Any of the ``j`` lists can
167
+ be a list of None values if the corresponding component does not use extra atom descriptors. If
168
+ ``None``, ``V_d`` for all datapoints will be ``None``.
169
+ molecule_featurizers : list[str] | None
170
+ a list of molecule featurizer names to generate additional molecule features to use as extra
171
+ descriptors. If there are multiple molecules per datapoint, the featurizers will be applied
172
+ to each molecule and concatenated. Note that a :code:`ReactionDatapoint` has two
173
+ RDKit :class:`~rdkit.Chem.Mol` objects, reactant(s) and product(s). Each
174
+ ``molecule_featurizer`` will be applied to both of these objects.
175
+ keep_h : bool
176
+ add_h : bool
177
+
178
+ Returns
179
+ -------
180
+ list[list[MoleculeDatapoint]]
181
+ a list of ``j`` lists of ``n`` :class:`MoleculeDatapoint`\s
182
+ list[list[ReactionDatapoint]]
183
+ a list of ``k`` lists of ``n`` :class:`ReactionDatapoint`\s
184
+ .. note::
185
+ either ``j`` or ``k`` may be 0, in which case the corresponding list will be empty.
186
+
187
+ Raises
188
+ ------
189
+ ValueError
190
+ if both ``smiss`` and ``rxnss`` are ``None``.
191
+ if ``smiss`` and ``rxnss`` are both given and have different lengths.
192
+ """
193
+ if smiss is None and rxnss is None:
194
+ raise ValueError("args 'smiss' and 'rnxss' were both `None`!")
195
+ elif rxnss is None:
196
+ N = len(smiss[0])
197
+ rxnss = []
198
+ elif smiss is None:
199
+ N = len(rxnss[0])
200
+ smiss = []
201
+ elif len(smiss[0]) != len(rxnss[0]):
202
+ raise ValueError(
203
+ f"args 'smiss' and 'rxnss' must have same length! got {len(smiss[0])} and {len(rxnss[0])}"
204
+ )
205
+ else:
206
+ N = len(smiss[0])
207
+
208
+ if len(smiss) > 0:
209
+ molss = [[make_mol(smi, keep_h, add_h) for smi in smis] for smis in smiss]
210
+ if len(rxnss) > 0:
211
+ rctss = [
212
+ [
213
+ make_mol(f"{rct_smi}.{agt_smi}" if agt_smi else rct_smi, keep_h, add_h)
214
+ for rct_smi, agt_smi, _ in (rxn.split(">") for rxn in rxns)
215
+ ]
216
+ for rxns in rxnss
217
+ ]
218
+ pdtss = [
219
+ [make_mol(pdt_smi, keep_h, add_h) for _, _, pdt_smi in (rxn.split(">") for rxn in rxns)]
220
+ for rxns in rxnss
221
+ ]
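+ # Reaction SMILES take the form "reactants>agents>products"; e.g. (illustrative) for
+ # "CC(=O)O.OCC>O>CC(=O)OCC", the reactant mol is built from "CC(=O)O.OCC.O"
+ # (agents are merged into the reactants) and the product mol from "CC(=O)OCC".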
222
+
223
+ weights = np.ones(N, dtype=np.single) if weights is None else weights
224
+ gt_mask = [None] * N if gt_mask is None else gt_mask
225
+ lt_mask = [None] * N if lt_mask is None else lt_mask
226
+
227
+ n_mols = len(smiss) if smiss else 0
228
+ V_fss = [[None] * N] * n_mols if V_fss is None else V_fss
229
+ E_fss = [[None] * N] * n_mols if E_fss is None else E_fss
230
+ V_dss = [[None] * N] * n_mols if V_dss is None else V_dss
231
+
232
+     if X_d is None and molecule_featurizers is None:
+         X_d = [None] * N
+     elif molecule_featurizers is None:
+         pass
+     else:
+         molecule_featurizers = [MoleculeFeaturizerRegistry[mf]() for mf in molecule_featurizers]
+ 
+         if len(smiss) > 0:
+             # one descriptor row per datapoint, concatenated across featurizers and then across
+             # molecule components
+             mol_descriptors = np.hstack(
+                 [
+                     np.vstack([np.hstack([mf(mol) for mf in molecule_featurizers]) for mol in mols])
+                     for mols in molss
+                 ]
+             )
+             if X_d is None:
+                 X_d = mol_descriptors
+             else:
+                 X_d = np.hstack([X_d, mol_descriptors])
+ 
+         if len(rxnss) > 0:
+             # for reactions, each featurizer is applied to both the reactant and the product mol
+             rct_pdt_descriptors = np.hstack(
+                 [
+                     np.vstack(
+                         [
+                             np.hstack(
+                                 [mf(mol) for mf in molecule_featurizers for mol in (rct, pdt)]
+                             )
+                             for rct, pdt in zip(rcts, pdts)
+                         ]
+                     )
+                     for rcts, pdts in zip(rctss, pdtss)
+                 ]
+             )
+             if X_d is None:
+                 X_d = rct_pdt_descriptors
+             else:
+                 X_d = np.hstack([X_d, rct_pdt_descriptors])
+ 
+     mol_data = [
+         [
+             MoleculeDatapoint(
+                 mol=molss[mol_idx][i],
+                 name=smis[i],
+                 y=Y[i],
+                 weight=weights[i],
+                 gt_mask=gt_mask[i],
+                 lt_mask=lt_mask[i],
+                 x_d=X_d[i],
+                 x_phase=None,
+                 V_f=V_fss[mol_idx][i],
+                 E_f=E_fss[mol_idx][i],
+                 V_d=V_dss[mol_idx][i],
+             )
+             for i in range(N)
+         ]
+         for mol_idx, smis in enumerate(smiss)
+     ]
+     rxn_data = [
+         [
+             ReactionDatapoint(
+                 rct=rctss[rxn_idx][i],
+                 pdt=pdtss[rxn_idx][i],
+                 name=rxns[i],
+                 y=Y[i],
+                 weight=weights[i],
+                 gt_mask=gt_mask[i],
+                 lt_mask=lt_mask[i],
+                 x_d=X_d[i],
+                 x_phase=None,
+             )
+             for i in range(N)
+         ]
+         for rxn_idx, rxns in enumerate(rxnss)
+     ]
+ 
+     return mol_data, rxn_data
+ 
+ 
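+ # Illustrative sketch (not part of the module): calling ``make_datapoints`` directly for a
+ # single-component molecule dataset. The SMILES and targets are placeholders; the positional
+ # argument order follows the call in ``build_data_from_files`` below.
+ #
+ #     smiss = [["CCO", "c1ccccc1"]]  # one molecule component, two datapoints
+ #     Y = np.zeros((2, 1))
+ #     mol_data, rxn_data = make_datapoints(
+ #         smiss, None, Y, None, None, None, None, None, None, None,
+ #         molecule_featurizers=None, keep_h=False, add_h=False,
+ #     )
+ #     assert rxn_data == []  # no reactions were supplied
+ 
+ 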
+ def build_data_from_files(
+     p_data: PathLike,
+     no_header_row: bool,
+     smiles_cols: Sequence[str] | None,
+     rxn_cols: Sequence[str] | None,
+     target_cols: Sequence[str] | None,
+     ignore_cols: Sequence[str] | None,
+     splits_col: str | None,
+     weight_col: str | None,
+     bounded: bool,
+     p_descriptors: PathLike,
+     p_atom_feats: dict[int, PathLike],
+     p_bond_feats: dict[int, PathLike],
+     p_atom_descs: dict[int, PathLike],
+     **featurization_kwargs: Mapping,
+ ) -> list[list[MoleculeDatapoint] | list[ReactionDatapoint]]:
+     smiss, rxnss, Y, weights, lt_mask, gt_mask = parse_csv(
+         p_data,
+         smiles_cols,
+         rxn_cols,
+         target_cols,
+         ignore_cols,
+         splits_col,
+         weight_col,
+         bounded,
+         no_header_row,
+     )
+     n_molecules = len(smiss) if smiss is not None else 0
+     n_datapoints = len(Y)
+ 
+     X_ds = load_input_feats_and_descs(p_descriptors, None, None, feat_desc="X_d")
+     V_fss = load_input_feats_and_descs(p_atom_feats, n_molecules, n_datapoints, feat_desc="V_f")
+     E_fss = load_input_feats_and_descs(p_bond_feats, n_molecules, n_datapoints, feat_desc="E_f")
+     V_dss = load_input_feats_and_descs(p_atom_descs, n_molecules, n_datapoints, feat_desc="V_d")
+ 
+     mol_data, rxn_data = make_datapoints(
+         smiss,
+         rxnss,
+         Y,
+         weights,
+         lt_mask,
+         gt_mask,
+         X_ds,
+         V_fss,
+         E_fss,
+         V_dss,
+         **featurization_kwargs,
+     )
+ 
+     # molecule components are listed first, followed by reaction components
+     return mol_data + rxn_data
+ 
+ 
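+ # Illustrative sketch: loading a dataset from a CSV with one SMILES column and one target
+ # column. The column names and file paths are placeholders, not chemprop defaults, and the
+ # featurization kwargs are assumed to be forwarded to ``make_datapoints`` as above.
+ #
+ #     all_data = build_data_from_files(
+ #         "data.csv", no_header_row=False, smiles_cols=["smiles"], rxn_cols=None,
+ #         target_cols=["y"], ignore_cols=None, splits_col=None, weight_col=None, bounded=False,
+ #         p_descriptors=None, p_atom_feats={}, p_bond_feats={}, p_atom_descs={},
+ #         molecule_featurizers=None, keep_h=False, add_h=False,
+ #     )
+ 
+ 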
+ def load_input_feats_and_descs(
+     paths: dict[int, PathLike] | PathLike,
+     n_molecules: int | None,
+     n_datapoints: int | None,
+     feat_desc: str,
+ ):
+     if paths is None:
+         return None
+ 
+     match feat_desc:
+         case "X_d":
+             # extra datapoint descriptors live in a single .npz archive under the key "arr_0"
+             path = paths
+             loaded_feature = np.load(path)
+             features = loaded_feature["arr_0"]
+ 
+         case _:
+             for index in paths:
+                 if index >= n_molecules:
+                     raise ValueError(
+                         f"For {n_molecules} molecules, atom/bond features/descriptors can only be "
+                         f"specified for indices 0-{n_molecules - 1}! Got index {index}."
+                     )
+ 
+             # one .npz archive per molecule component, holding one array per datapoint under the
+             # positional keys "arr_0", "arr_1", ...
+             features = []
+             for idx in range(n_molecules):
+                 path = paths.get(idx, None)
+ 
+                 if path is not None:
+                     loaded_feature = np.load(path)
+                     loaded_feature = [
+                         loaded_feature[f"arr_{i}"] for i in range(len(loaded_feature))
+                     ]
+                 else:
+                     loaded_feature = [None] * n_datapoints
+ 
+                 features.append(loaded_feature)
+     return features
+ 
+ 
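+ # Sketch of the on-disk layout implied by the loader above (an assumption, not a spec): per-atom
+ # features for one molecule component are saved as a single .npz archive holding one array per
+ # datapoint, so ``np.savez``'s automatic ``arr_0``, ``arr_1``, ... keys line up with datapoint
+ # indices:
+ #
+ #     import numpy as np
+ #     V_fs = [np.random.rand(n_atoms, 4) for n_atoms in (3, 5)]  # 4 extra features per atom
+ #     np.savez("atom_feats_0.npz", *V_fs)
+ 
+ 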
+ def make_dataset(
+     data: Sequence[MoleculeDatapoint] | Sequence[ReactionDatapoint],
+     reaction_mode: str,
+     multi_hot_atom_featurizer_mode: Literal["V1", "V2", "ORGANIC", "RIGR"] = "V2",
+ ) -> MoleculeDataset | ReactionDataset:
+     atom_featurizer = get_multi_hot_atom_featurizer(multi_hot_atom_featurizer_mode)
+     match multi_hot_atom_featurizer_mode:
+         case "RIGR":
+             bond_featurizer = RIGRBondFeaturizer()
+         case "V1" | "V2" | "ORGANIC":
+             bond_featurizer = MultiHotBondFeaturizer()
+         case _:
+             raise TypeError(
+                 f"Unsupported atom featurizer mode '{multi_hot_atom_featurizer_mode}'!"
+             )
+ 
+     if isinstance(data[0], MoleculeDatapoint):
+         extra_atom_fdim = data[0].V_f.shape[1] if data[0].V_f is not None else 0
+         extra_bond_fdim = data[0].E_f.shape[1] if data[0].E_f is not None else 0
+         featurizer = SimpleMoleculeMolGraphFeaturizer(
+             atom_featurizer=atom_featurizer,
+             bond_featurizer=bond_featurizer,
+             extra_atom_fdim=extra_atom_fdim,
+             extra_bond_fdim=extra_bond_fdim,
+         )
+         return MoleculeDataset(data, featurizer)
+ 
+     featurizer = CondensedGraphOfReactionFeaturizer(
+         mode_=reaction_mode, atom_featurizer=atom_featurizer
+     )
+ 
+     return ReactionDataset(data, featurizer)
+ 
+ 
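+ # Illustrative sketch: wrapping a list of datapoints in a dataset. ``reaction_mode`` is only
+ # used for reaction data; "REAC_DIFF" is one of chemprop's CGR modes.
+ #
+ #     dset = make_dataset(mol_data[0], reaction_mode="REAC_DIFF")
+ 
+ 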
+ def parse_indices(idxs):
+     """Parses a string of indices into a list of integers, e.g. '0,1,2-4' -> [0, 1, 2, 3, 4]"""
+     if isinstance(idxs, str):
+         indices = []
+         for idx in idxs.split(","):
+             if "-" in idx:
+                 # a hyphenated token denotes an inclusive range
+                 start, end = map(int, idx.split("-"))
+                 indices.extend(range(start, end + 1))
+             else:
+                 indices.append(int(idx))
+         return indices
+     return idxs
chemprop-updated/chemprop/cli/utils/utils.py ADDED
@@ -0,0 +1,31 @@
+ from typing import Any
+ 
+ __all__ = ["pop_attr"]
+ 
+ 
+ def pop_attr(o: object, attr: str, *args) -> Any | None:
+     """like ``dict.pop()`` but for object attributes: return ``o.attr`` and delete it, optionally
+     falling back to a default instead of raising ``AttributeError``"""
+     match len(args):
+         case 0:
+             return _pop_attr(o, attr)
+         case 1:
+             return _pop_attr_d(o, attr, args[0])
+         case _:
+             raise TypeError(f"Expected at most 2 arguments! got: {len(args)}")
+ 
+ 
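+ # Illustrative usage on a hypothetical namespace object:
+ #
+ #     ns = argparse.Namespace(foo=1)
+ #     pop_attr(ns, "foo")        # -> 1, and ``ns`` no longer has "foo"
+ #     pop_attr(ns, "foo", None)  # -> None instead of raising AttributeError
+ 
+ 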
+ def _pop_attr(o: object, attr: str) -> Any:
+     val = getattr(o, attr)
+     delattr(o, attr)
+ 
+     return val
+ 
+ 
+ def _pop_attr_d(o: object, attr: str, default: Any | None = None) -> Any | None:
+     try:
+         val = getattr(o, attr)
+         delattr(o, attr)
+     except AttributeError:
+         val = default
+ 
+     return val
chemprop-updated/chemprop/conf.py ADDED
@@ -0,0 +1,6 @@
+ """Global configuration variables for chemprop"""
2
+
3
+ from chemprop.featurizers.molgraph.molecule import SimpleMoleculeMolGraphFeaturizer
4
+
5
+ DEFAULT_ATOM_FDIM, DEFAULT_BOND_FDIM = SimpleMoleculeMolGraphFeaturizer().shape
6
+ DEFAULT_HIDDEN_DIM = 300
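+ # note: ``shape`` on the featurizer is taken here to be an (atom_fdim, bond_fdim) pair, so these
+ # defaults track whatever the default featurization scheme produces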