# ---
# jupyter:
#   jupytext:
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#     jupytext_version: 1.14.1
# ---

# %% [markdown]
# **Requirements**
# * According to this [paper](https://arxiv.org/pdf/1904.01561.pdf), features are computed with [descriptastorus](https://github.com/bp-kelley/descriptastorus) package
# * Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`

# %% [markdown]
# ## General imports

# %%
import sys

# this depends on the notebook depth and must be adapted per notebook
sys.path.insert(0, "/")

# %%
import numpy as np

# %%
import scanpy as sc
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

from chemCPA.helper import canonicalize_smiles
from chemCPA.paths import DATA_DIR, EMBEDDING_DIR

# %% [markdown]
# ## Load Smiles list

# %%
adata = sc.read(DATA_DIR / "adata_biolord_split_30.h5ad")

# %%
smiles_list = adata.obs["smiles"].unique()
# Exclude the literal string "nan" (missing-value placeholder in the obs column)
# and canonicalize the remaining SMILES.
# NOTE(review): if the column can hold an actual float NaN rather than the
# string "nan", this filter misses it — confirm against the dataset.
smiles_list = [canonicalize_smiles(s) for s in smiles_list if s != "nan"]

# %%
print(f"Number of smiles strings: {len(smiles_list)}")

# %%
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator

generator = MakeGenerator(("RDKit2D",))
for name, numpy_type in generator.GetColumns():
    print(f"{name}({numpy_type.__name__})")

# %%
# Compute the RDKit 2D descriptor vector for every SMILES in parallel.
n_jobs = 16
data = Parallel(n_jobs=n_jobs)(
    delayed(generator.process)(smiles) for smiles in tqdm(smiles_list, position=0, leave=True)
)

# %%
# Drop the first entry of each result (the generator's leading
# "RDKit2D_calculated" success flag), keeping only the descriptor values.
data = [d[1:] for d in data]

# %%
embedding = np.array(data)
embedding.shape

# %% [markdown]
# ## Check `nans` and `infs`

# %% [markdown]
# Check for `nans`

# %%
drug_idx, feature_idx = np.where(np.isnan(embedding))
print(f"drug_idx:\n {drug_idx}")
print(f"feature_idx:\n {feature_idx}")

# %% [markdown]
# Check for `infs` and add to idx lists

# %%
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))

drug_idx = np.concatenate((drug_idx, drug_idx_infs))
feature_idx = np.concatenate((feature_idx, feature_idx_infs))

# %% [markdown]
# Features that have these invalid values:

# %% tags=[]
# NOTE(review): feature_idx indexes the embedding *after* the leading flag was
# stripped, while GetColumns() still includes that flag as its first entry, so
# the names printed here may be shifted by one — verify before relying on them.
np.array(generator.GetColumns())[np.unique(feature_idx)]

# %% [markdown]
# Set values to `0`

# %%
embedding[drug_idx, feature_idx]

# %%
# Zero out every NaN/inf entry found above.
embedding[drug_idx, feature_idx] = 0

# %% [markdown]
# ## Save

# %%
import pandas as pd

df = pd.DataFrame(data=embedding, index=smiles_list, columns=[f"latent_{i}" for i in range(embedding.shape[1])])

# Drop first feature from generator (RDKit2D_calculated)
# NOTE(review): the success flag was already removed via d[1:] above, so
# latent_0 here is the first real descriptor — confirm this extra drop is
# actually intended and not a double removal.
df.drop(columns=["latent_0"], inplace=True)

# Drop (near-)constant columns: features whose std is at or below the
# threshold carry no usable signal for the embedding.
threshold = 0.01
columns = [f"latent_{idx+1}" for idx in np.where(df.std() <= threshold)[0]]
print(f"Deleting columns with std<={threshold}: {columns}")
# Reuse the computed list instead of re-deriving it with a hard-coded 0.01,
# so the printed message and the actual drop can never diverge.
df.drop(columns=columns, inplace=True)

# %% [markdown]
# Check that correct columns were deleted:

# %%
np.where(df.std() <= threshold)

# %% [markdown]
# ### Normalise dataframe

# %%
# Column-wise z-score normalisation (zero mean, unit std per feature).
normalized_df = (df - df.mean()) / df.std()

# %%
normalized_df

# %% [markdown]
# Check destination folder

# %%
model_name = "rdkit2D"
dataset_name = "biolord"
fname = f"{model_name}_embedding_{dataset_name}.parquet"

directory = EMBEDDING_DIR / "rdkit" / "data" / "embeddings"
directory.mkdir(parents=True, exist_ok=True)

# %% [markdown]
# Save normalised version

# %%
normalized_df.to_parquet(directory / fname)

# %% [markdown]
# Check that it worked

# %%
df = pd.read_parquet(directory / fname)
df

# %%
directory / fname

# %%