|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import sys |
|
|
|
|
|
sys.path.insert(0, "/") |
|
|
|
import numpy as np |
|
|
|
|
|
import scanpy as sc |
|
from joblib import Parallel, delayed |
|
from tqdm.notebook import tqdm |
|
|
|
from chemCPA.helper import canonicalize_smiles |
|
from chemCPA.paths import DATA_DIR, EMBEDDING_DIR |
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load the biolord split and collect the unique drug SMILES strings.
adata = sc.read(DATA_DIR / "adata_biolord_split_30.h5ad")

smiles_list = adata.obs["smiles"].unique()

# Canonicalize every SMILES, skipping missing values. Comparing via str(s)
# also catches genuine float NaN entries (str(nan) == "nan"), which the
# plain `s != "nan"` check would let through and crash canonicalize_smiles.
smiles_list = [canonicalize_smiles(s) for s in smiles_list if str(s) != "nan"]

print(f"Number of smiles strings: {len(smiles_list)}")
|
|
|
|
|
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator

# Build the RDKit 2D descriptor generator used for all drugs below.
generator = MakeGenerator(("RDKit2D",))

# List every output column with its numpy dtype, e.g. "MaxEStateIndex(float64)".
for column_name, column_dtype in generator.GetColumns():
    print(f"{column_name}({column_dtype.__name__})")
|
|
|
|
|
# Compute the RDKit2D descriptor vector for every SMILES, in parallel.
n_jobs = 16
process_one = delayed(generator.process)
progress = tqdm(smiles_list, position=0, leave=True)
data = Parallel(n_jobs=n_jobs)(process_one(smiles) for smiles in progress)

# Each result row starts with a "calculated" success flag — strip it,
# keeping only the descriptor values.
data = [row[1:] for row in data]

# Stack into a (num_drugs, num_features) matrix.
embedding = np.array(data)
embedding.shape
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Locate every NaN entry in the descriptor matrix.
drug_idx, feature_idx = np.where(np.isnan(embedding))

print(f"drug_idx:\n {drug_idx}")

print(f"feature_idx:\n {feature_idx}")

# Also locate infinite entries and merge them into the same index lists.
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))

drug_idx = np.concatenate((drug_idx, drug_idx_infs))

feature_idx = np.concatenate((feature_idx, feature_idx_infs))

# Show which descriptor columns contained invalid values.
# FIX: `embedding` columns exclude the leading "calculated" flag column
# (stripped via `d[1:]` above), so feature_idx indexes into
# GetColumns()[1:], not GetColumns(); the previous indexing was off by one.
np.array(generator.GetColumns())[1:][np.unique(feature_idx)]

# Inspect the offending values, then zero them out in place.
embedding[drug_idx, feature_idx]

embedding[drug_idx, feature_idx] = 0
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd

# Wrap the embedding in a DataFrame indexed by canonical SMILES, with one
# "latent_i" column per descriptor.
df = pd.DataFrame(data=embedding, index=smiles_list, columns=[f"latent_{i}" for i in range(embedding.shape[1])])

# Drop the first column — presumably considered uninformative here; note the
# success flag was already stripped earlier via d[1:] — TODO confirm intent.
df.drop(columns=["latent_0"], inplace=True)

# Remove near-constant features (std below threshold), which carry no signal
# and would blow up the normalization below.
# FIX: select the columns directly by name instead of reconstructing
# "latent_{idx+1}" from positions, and reuse `threshold` rather than
# duplicating the std computation with a hard-coded 0.01.
threshold = 0.01
columns = list(df.columns[df.std() <= threshold])
print(f"Deleting columns with std<={threshold}: {columns}")
df.drop(columns=columns, inplace=True)

# Sanity check: no remaining column should be near-constant.
np.where(df.std() <= threshold)
|
|
|
|
|
|
|
|
|
|
|
# Standardize each feature column to zero mean and unit variance.
column_means = df.mean()
column_stds = df.std()
normalized_df = (df - column_means) / column_stds

# Display the normalized embedding.
normalized_df
|
|
|
|
|
|
|
|
|
|
|
# Persist the normalized embedding as a parquet file under EMBEDDING_DIR.
model_name = "rdkit2D"
dataset_name = "biolord"
fname = "_".join([model_name, "embedding", dataset_name]) + ".parquet"

directory = EMBEDDING_DIR.joinpath("rdkit", "data", "embeddings")
directory.mkdir(parents=True, exist_ok=True)

normalized_df.to_parquet(directory / fname)
|
|
|
|
|
|
|
|
|
|
|
# Round-trip check: read the file we just wrote and display its contents.
reloaded_path = directory / fname
df = pd.read_parquet(reloaded_path)
df

# Show where the embedding was stored.
directory / fname
|
|
|
|
|
|