# ---
# jupyter:
#   jupytext:
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.1
# ---
# %% [markdown]
# **Requirements**
# * As in this [paper](https://arxiv.org/pdf/1904.01561.pdf), the features are computed with the [descriptastorus](https://github.com/bp-kelley/descriptastorus) package
# * Install it via: `pip install git+https://github.com/bp-kelley/descriptastorus`
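# %% [markdown]
# Optional sanity check that the install worked (both `descriptastorus` and its `rdkit` dependency should import cleanly):
# %%
import descriptastorus
import rdkit

print(f"rdkit version: {rdkit.__version__}")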
# %% [markdown]
# ## General imports
# %%
import sys
# this depends on the notebook depth and must be adapted per notebook
sys.path.insert(0, "/")
# %%
import numpy as np
# %%
import scanpy as sc
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from chemCPA.helper import canonicalize_smiles
from chemCPA.paths import DATA_DIR, EMBEDDING_DIR
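# %% [markdown]
# In case `chemCPA.helper` is not importable in your environment, a minimal stand-in
# for `canonicalize_smiles` could look like this (a sketch: it assumes the helper
# simply wraps RDKit canonicalisation; the actual implementation in `chemCPA.helper`
# may differ):
# %%
try:
    from chemCPA.helper import canonicalize_smiles
except ImportError:
    from rdkit import Chem

    def canonicalize_smiles(smiles):
        # parse the SMILES and re-emit it in RDKit's canonical form;
        # return None for strings RDKit cannot parse
        mol = Chem.MolFromSmiles(smiles)
        return Chem.MolToSmiles(mol, canonical=True) if mol is not None else None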
# %% [markdown]
# ## Load Smiles list
# %%
adata = sc.read(DATA_DIR / "adata_biolord_split_30.h5ad")
# %%
smiles_list = adata.obs["smiles"].unique()
# exclude nan from smiles_list
smiles_list = [canonicalize_smiles(s) for s in smiles_list if s != "nan"]
# %%
print(f"Number of smiles strings: {len(smiles_list)}")
# %%
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
generator = MakeGenerator(("RDKit2D",))
for name, numpy_type in generator.GetColumns():
print(f"{name}({numpy_type.__name__})")
# %%
n_jobs = 16
data = Parallel(n_jobs=n_jobs)(
    delayed(generator.process)(smiles) for smiles in tqdm(smiles_list, position=0, leave=True)
)
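# %% [markdown]
# Optionally flag molecules whose descriptor computation failed (this assumes
# `process` returns `None` or a leading `False` flag on failure):
# %%
failed = [i for i, d in enumerate(data) if d is None or not d[0]]
print(f"Failed descriptor computations: {failed}")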
# %% [markdown]
# Note: `generator.process` returns `[RDKit2D_calculated, descriptor_1, ...]`. Keep
# the leading flag column for now; its index then lines up with
# `generator.GetColumns()`, and it is dropped later as `latent_0`.
# %%
embedding = np.array(data)
embedding.shape
# %% [markdown]
# ## Check `nans` and `infs`
# %% [markdown]
# Check for `nans`
# %%
drug_idx, feature_idx = np.where(np.isnan(embedding))
print(f"drug_idx:\n {drug_idx}")
print(f"feature_idx:\n {feature_idx}")
# %% [markdown]
# Check for `infs` and add to idx lists
# %%
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))
drug_idx = np.concatenate((drug_idx, drug_idx_infs))
feature_idx = np.concatenate((feature_idx, feature_idx_infs))
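# %% [markdown]
# Tally how many entries are affected in total:
# %%
print(f"Total invalid (drug, feature) entries: {len(drug_idx)}")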
# %% [markdown]
# Features that have these invalid values:
# %%
np.array(generator.GetColumns())[np.unique(feature_idx)]
# %% [markdown]
# Inspect the invalid values, then set them to `0`
# %%
embedding[drug_idx, feature_idx]
# %%
embedding[drug_idx, feature_idx] = 0
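# %% [markdown]
# Verify that no `nans` or `infs` remain:
# %%
assert not np.isnan(embedding).any()
assert not np.isinf(embedding).any()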
# %% [markdown]
# ## Save
# %%
import pandas as pd
df = pd.DataFrame(data=embedding, index=smiles_list, columns=[f"latent_{i}" for i in range(embedding.shape[1])])
# Drop the first feature from the generator (the `RDKit2D_calculated` flag)
df.drop(columns=["latent_0"], inplace=True)
# Drop columns with near-zero standard deviation; `idx+1` maps the positional
# index back to the column name because `latent_0` was already dropped
threshold = 0.01
columns = [f"latent_{idx+1}" for idx in np.where(df.std() <= threshold)[0]]
print(f"Deleting columns with std<={threshold}: {columns}")
df.drop(columns=columns, inplace=True)
# %% [markdown]
# Check that correct columns were deleted:
# %%
np.where(df.std() <= threshold)
# %% [markdown]
# ### Normalise dataframe
# %%
normalized_df = (df - df.mean()) / df.std()
# %%
normalized_df
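# %% [markdown]
# Sanity check: every remaining column should now have mean `0` and std `1`
# (up to float precision):
# %%
assert np.allclose(normalized_df.mean(), 0.0, atol=1e-6)
assert np.allclose(normalized_df.std(), 1.0, atol=1e-6)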
# %% [markdown]
# Create the destination folder
# %%
model_name = "rdkit2D"
dataset_name = "biolord"
fname = f"{model_name}_embedding_{dataset_name}.parquet"
directory = EMBEDDING_DIR / "rdkit" / "data" / "embeddings"
directory.mkdir(parents=True, exist_ok=True)
# %% [markdown]
# Save normalised version
# %%
normalized_df.to_parquet(directory / fname)
# %% [markdown]
# Check that it worked
# %%
df = pd.read_parquet(directory / fname)
df
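# %% [markdown]
# Round-trip check: the reloaded frame should equal what was written:
# %%
pd.testing.assert_frame_equal(df, normalized_df)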
# %%
directory / fname