**Requirements** 
* According to this [paper](https://arxiv.org/pdf/1904.01561.pdf), features are computed with [descriptastorus](https://github.com/bp-kelley/descriptastorus) package
* Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`

## General imports

In [4]:
import sys

# this depends on the notebook depth and must be adapted per notebook
sys.path.insert(0, "/")  
from chemCPA.paths import DATA_DIR, EMBEDDING_DIR

In [5]:
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load Smiles list

In [6]:
import scanpy as sc
from chemCPA.helper import canonicalize_smiles

In [7]:
adata = sc.read(DATA_DIR/ "adata_biolord_split_30.h5ad")

  utils.warn_names_duplicates("obs")


In [8]:
smiles_list = adata.obs["smiles"].unique()
# exclude nan from smiles_list 
smiles_list = [canonicalize_smiles(s) for s in smiles_list if s !="nan"]

In [9]:
print(f'Number of smiles strings: {len(smiles_list)}')

Number of smiles strings: 186


In [10]:
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
generator = MakeGenerator(("RDKit2D",))
for name, numpy_type in generator.GetColumns():
    print(f"{name}({numpy_type.__name__})")

RDKit2D_calculated(bool)
BalabanJ(float64)
BertzCT(float64)
Chi0(float64)
Chi0n(float64)
Chi0v(float64)
Chi1(float64)
Chi1n(float64)
Chi1v(float64)
Chi2n(float64)
Chi2v(float64)
Chi3n(float64)
Chi3v(float64)
Chi4n(float64)
Chi4v(float64)
EState_VSA1(float64)
EState_VSA10(float64)
EState_VSA11(float64)
EState_VSA2(float64)
EState_VSA3(float64)
EState_VSA4(float64)
EState_VSA5(float64)
EState_VSA6(float64)
EState_VSA7(float64)
EState_VSA8(float64)
EState_VSA9(float64)
ExactMolWt(float64)
FpDensityMorgan1(float64)
FpDensityMorgan2(float64)
FpDensityMorgan3(float64)
FractionCSP3(float64)
HallKierAlpha(float64)
HeavyAtomCount(float64)
HeavyAtomMolWt(float64)
Ipc(float64)
Kappa1(float64)
Kappa2(float64)
Kappa3(float64)
LabuteASA(float64)
MaxAbsEStateIndex(float64)
MaxAbsPartialCharge(float64)
MaxEStateIndex(float64)
MaxPartialCharge(float64)
MinAbsEStateIndex(float64)
MinAbsPartialCharge(float64)
MinEStateIndex(float64)
MinPartialCharge(float64)
MolLogP(float64)
MolMR(float64)
MolWt(float64)

In [11]:
n_jobs = 16
data = Parallel(n_jobs=n_jobs)(delayed(generator.process)(smiles) for smiles in tqdm(smiles_list, position=0, leave=True) )

  0%|          | 0/186 [00:00<?, ?it/s]

In [12]:
data = [d[1:] for d in data]

In [13]:
embedding = np.array(data)
embedding.shape

(186, 200)

## Check `nans` and `infs`

Check for `nans`

In [14]:
drug_idx, feature_idx = np.where(np.isnan(embedding))
print(f'drug_idx:\n {drug_idx}')
print(f'feature_idx:\n {feature_idx}')

drug_idx:
 []
feature_idx:
 []


Check for `infs` and add to idx lists

In [15]:
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))

drug_idx = np.concatenate((drug_idx, drug_idx_infs))
feature_idx = np.concatenate((feature_idx, feature_idx_infs))

Features that have these invalid values:

In [16]:
np.array(generator.GetColumns())[np.unique(feature_idx)]

array([], shape=(0, 2), dtype=object)

Set values to `0`

In [17]:
embedding[drug_idx, feature_idx] 

array([], dtype=float64)

In [18]:
embedding[drug_idx, feature_idx] = 0

## Save

In [19]:
import pandas as pd

df = pd.DataFrame(data=embedding,index=smiles_list,columns=[f'latent_{i}' for i in range(embedding.shape[1])]) 

# Drop first feature from generator (RDKit2D_calculated)
df.drop(columns=['latent_0'], inplace=True)

# Drop columns with 0 standard deviation
threshold = 0.01
columns=[f'latent_{idx+1}' for idx in np.where(df.std() <= threshold)[0]]
print(f'Deleting columns with std<={threshold}: {columns}')
df.drop(columns=[f'latent_{idx+1}' for idx in np.where(df.std() <= 0.01)[0]], inplace=True)

Deleting columns with std<=0.01: ['latent_60', 'latent_89', 'latent_102', 'latent_137', 'latent_145', 'latent_146', 'latent_147', 'latent_149', 'latent_151', 'latent_152', 'latent_159', 'latent_160', 'latent_163', 'latent_164', 'latent_167', 'latent_174', 'latent_177', 'latent_182', 'latent_186', 'latent_188', 'latent_193', 'latent_195']


Check that correct columns were deleted: 

In [20]:
np.where(df.std() <= threshold)

(array([], dtype=int64),)

### Normalise dataframe

In [21]:
normalized_df=(df-df.mean())/df.std()

In [22]:
normalized_df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_187,latent_189,latent_190,latent_191,latent_192,latent_194,latent_196,latent_197,latent_198,latent_199
O=C([O-])CCCc1ccccc1.[Na+],-1.666455,-1.600675,-1.385933,-1.134785,-1.605154,-1.460731,-1.536649,-1.465486,-1.543553,-1.452940,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.121276
CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)cccn4n3)c2)CC1,0.863033,0.376632,0.455178,0.475603,0.463247,0.462348,0.777506,0.503243,0.879424,0.503081,...,1.457503,-0.19722,-0.273387,3.977884,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.060135
COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccncc3)ccc2c1,0.944786,0.248064,0.295501,0.318775,0.412792,0.268612,0.506380,0.178487,0.384114,0.284651,...,1.457503,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.496321
COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc23)c(Cl)cc1Cl,0.603507,0.746555,0.789836,0.920548,0.783927,0.701164,0.700273,0.582936,0.573394,0.651072,...,1.457503,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,0.628009,-0.284531,-0.508369
COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O,-0.393464,-0.067916,-0.163103,-0.268156,-0.140663,-0.313933,-0.456744,-0.450171,-0.621424,-0.534237,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.379172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c3[nH]c4ncc(C)cc4c23)c1,1.286344,0.740428,0.887842,0.900547,0.732054,0.910649,1.136028,1.014856,1.426770,1.140227,...,1.457503,-0.19722,-0.273387,3.977884,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.325822
CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2c1.Cl.O,0.123124,0.267321,0.453349,0.473807,0.289093,0.295629,0.117262,0.090182,-0.130664,0.137535,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.746358
Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2ccccc21,-0.028239,-0.011372,0.230844,0.391784,0.111804,0.200441,0.027626,0.214944,-0.017353,0.364936,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.495217
CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4)cc3)c3ccccc3)c2c1,1.115222,0.770763,0.847426,0.860852,0.884172,0.997570,1.217879,0.869661,1.210711,0.882431,...,-0.535847,-0.19722,3.638151,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-1.140448


Check destination folder

In [23]:
model_name = "rdkit2D"
dataset_name= "biolord"
fname = f'{model_name}_embedding_{dataset_name}.parquet'

directory = EMBEDDING_DIR /'rdkit' / 'data' /'embeddings'
directory.mkdir(parents=True, exist_ok=True)

Save normalised version

In [24]:
normalized_df.to_parquet(directory / fname)

Check that it worked

In [25]:
df = pd.read_parquet(directory/ fname)
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_187,latent_189,latent_190,latent_191,latent_192,latent_194,latent_196,latent_197,latent_198,latent_199
O=C([O-])CCCc1ccccc1.[Na+],-1.666455,-1.600675,-1.385933,-1.134785,-1.605154,-1.460731,-1.536649,-1.465486,-1.543553,-1.452940,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.121276
CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)cccn4n3)c2)CC1,0.863033,0.376632,0.455178,0.475603,0.463247,0.462348,0.777506,0.503243,0.879424,0.503081,...,1.457503,-0.19722,-0.273387,3.977884,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.060135
COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccncc3)ccc2c1,0.944786,0.248064,0.295501,0.318775,0.412792,0.268612,0.506380,0.178487,0.384114,0.284651,...,1.457503,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.496321
COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc23)c(Cl)cc1Cl,0.603507,0.746555,0.789836,0.920548,0.783927,0.701164,0.700273,0.582936,0.573394,0.651072,...,1.457503,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,0.628009,-0.284531,-0.508369
COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O,-0.393464,-0.067916,-0.163103,-0.268156,-0.140663,-0.313933,-0.456744,-0.450171,-0.621424,-0.534237,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,0.379172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c3[nH]c4ncc(C)cc4c23)c1,1.286344,0.740428,0.887842,0.900547,0.732054,0.910649,1.136028,1.014856,1.426770,1.140227,...,1.457503,-0.19722,-0.273387,3.977884,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.325822
CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2c1.Cl.O,0.123124,0.267321,0.453349,0.473807,0.289093,0.295629,0.117262,0.090182,-0.130664,0.137535,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.746358
Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2ccccc21,-0.028239,-0.011372,0.230844,0.391784,0.111804,0.200441,0.027626,0.214944,-0.017353,0.364936,...,-0.535847,-0.19722,-0.273387,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-0.495217
CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4)cc3)c3ccccc3)c2c1,1.115222,0.770763,0.847426,0.860852,0.884172,0.997570,1.217879,0.869661,1.210711,0.882431,...,-0.535847,-0.19722,3.638151,-0.250038,-0.073324,-0.199302,-0.19722,-0.270527,-0.284531,-1.140448


In [26]:
directory/ fname

PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/embeddings/rdkit/data/embeddings/rdkit2D_embedding_biolord.parquet')