# ---
# jupyter:
#   jupytext:
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.1
# ---
# %% [markdown]
# # GROVER
# Generate GROVER fingerprints for the drug SMILES coming from LINCS + SciPlex3.
#
# Steps:
# 1. Load `lincs_trapnell.smiles` as the list of SMILES to be encoded
# 2. Generate fingerprints using GROVER
# 3. Save the SMILES -> fingerprint mapping as a pandas DataFrame.
#
# %%
from pathlib import Path

import numpy as np
import pandas as pd
import rdkit
import scanpy as sc
from rdkit import Chem

rdkit.__version__

# %%
# Set paths
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
# %% [markdown]
# ## Step 1: Generate fingerprints
#
# - TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones? (See the sanity-check sketch below.)
# - TODO: There are also pretrained & finetuned models available; those might be useful for us:
#     - SIDER: drug side-effect prediction task
#     - ClinTox: drug toxicity prediction task
#     - ChEMBL: log P prediction task
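# %% [markdown]
# As a quick sanity check for the first TODO, the `rdkit_2d_normalized` features
# can also be computed directly with `descriptastorus`, the library that GROVER's
# `save_features.py` wraps. A minimal sketch, assuming `descriptastorus` is
# installed and that its `RDKit2DNormalized` generator matches what GROVER uses.
# %%
# Sketch (assumption: descriptastorus is installed, e.g. `pip install descriptastorus`).
# process() returns a list whose first element is a success flag, followed by
# the normalized descriptor values.
from descriptastorus.descriptors import rdNormalizedDescriptors

generator = rdNormalizedDescriptors.RDKit2DNormalized()
flag, *features = generator.process("CC(=O)Oc1ccccc1C(=O)O")  # aspirin, as an example
print("success:", flag, "| n_features:", len(features))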
# %% language="bash"
# set -euox pipefail
#
# # move csv of all smiles to be encoded into current workdir
# cp ../lincs_trapnell.smiles data/embeddings/lincs_trapnell.csv
# file="data/embeddings/lincs_trapnell.csv"
#
# # First we generate the feature embedding for the SMILES, which is an extra input
# # into GROVER
# echo "FILE: $file"
# features=$(echo $file | sed 's:.csv:.npz:')
# if [[ ! -f $features ]]; then
#   echo "Generating features: $features"
#   python scripts/save_features.py --data_path "$file" \
#     --save_path "$features" \
#     --features_generator rdkit_2d_normalized \
#     --restart
# fi;
#
# # Second we input SMILES + Features into grover and get the fingerprint out
# # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
# outfile=$(echo $file | sed 's:.csv:_grover_base_both.npz:')
# echo "EMB: $outfile"
# if [[ ! -f $outfile ]]; then
#   echo "Generating embedding: $outfile"
#   python main.py fingerprint --data_path "$file" \
#     --features_path "$features" \
#     --checkpoint_path data/model/grover_base.pt \
#     --fingerprint_source both \
#     --output "$outfile"
# fi;
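# %% [markdown]
# Before moving on, it can be worth peeking at the intermediate feature file
# written by `save_features.py`. A minimal sketch: just load the `.npz` and list
# its keys and shapes (the exact key name is not asserted here).
# %%
# Inspect the intermediate rdkit_2d_normalized feature file generated above.
feats = np.load("data/embeddings/lincs_trapnell.npz")
print("keys:", list(feats.keys()), "| shapes:", [feats[k].shape for k in feats.keys()])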
# %%
lincs_trapnell_base = np.load("data/embeddings/lincs_trapnell_grover_base_both.npz")
print("Shape of GROVER_base embedding:", lincs_trapnell_base["fps"].shape)
# %% [markdown]
# ## Step 2: Generate DataFrame with SMILES -> Embedding mapping
# %%
def flatten(x: np.ndarray):
    # helper to squeeze a (1, n) array down to shape (n,) (not used below)
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]

embeddings_fpath = Path("data/embeddings")
smiles_file = embeddings_fpath / "lincs_trapnell.csv"
emb_file = embeddings_fpath / "lincs_trapnell_grover_base_both.npz"
# read list of smiles
smiles_df = pd.read_csv(smiles_file)
# read generated embedding (.npz has only one key, 'fps')
emb = np.load(emb_file)["fps"]
assert len(smiles_df) == emb.shape[0]
# generate a DataFrame with SMILES and Embedding in each row
final_df = pd.DataFrame(
    emb,
    index=smiles_df["smiles"].values,
    columns=[f"latent_{i+1}" for i in range(emb.shape[1])],
)
# drop duplicated indices (= SMILES); likely a no-op since the SMILES list is unique
final_df = final_df[~final_df.index.duplicated(keep="first")]
final_df.to_parquet(embeddings_fpath / "grover_base.parquet")
# %%
df = pd.read_parquet("data/embeddings/grover_base.parquet")
# %%
df
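# %% [markdown]
# Downstream code can now map a canonical SMILES string to its GROVER fingerprint
# via a plain index lookup. A minimal usage sketch (the query below just takes the
# first SMILES in the index as a stand-in for a real drug):
# %%
query = df.index[0]  # stand-in for any SMILES present in the index
fp = df.loc[query].values  # 1D array of length equal to the fingerprint dimension
print(query, "->", fp.shape)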
# %% [markdown]
# ## Step 3: Check
# Make extra sure the index of the generated DataFrame is correct by loading our list of canonical SMILES again
# %%
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
all_smiles = pd.read_csv(all_smiles_fpath)["smiles"].values
assert sorted(list(df.index)) == sorted(list(all_smiles))
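# %% [markdown]
# Optionally, we can re-canonicalize every SMILES in the index with RDKit and
# compare. This sketch assumes `lincs_trapnell.smiles` was canonicalized with
# RDKit in the first place; if a different toolkit was used, mismatches here
# would not necessarily indicate an error.
# %%
canonical = [Chem.MolToSmiles(Chem.MolFromSmiles(s)) for s in df.index]
print("all RDKit-canonical:", all(c == s for c, s in zip(canonical, df.index)))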