chemCPA / preprocessing /4_sciplex_SMILES.py
github-actions[bot]
HF snapshot
a48f0ae
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.1
# kernelspec:
# display_name: chemical_CPA
# language: python
# name: python3
# ---
# # 4 SCIPLEX SMILES
#
# This is an updated version of `sciplex_SMILES.ipynb` which relies on a `drug_dict` to assign SMILES strings.
# The `sciplex_SMILES.ipynb` notebook is not applicable to the full sciplex data as it relies on the `.obs_names`.
# Hence, the second half of the dataset (left out in the original CPA publication) would be left without SMILES entries.
#
# **Requires**
# * `'sciplex3_matched_genes_lincs.h5ad'`
# * `'sciplex3_lincs_genes.h5ad'`
# * `'trapnell_final_V7.h5ad'`
#
# **Output**
# * `'trapnell_cpa(_lincs_genes).h5ad'`
# * `'trapnell_cpa_subset(_lincs_genes).h5ad'`
#
#
# ## Description
# This script assigns SMILES strings to drug conditions in the sciplex dataset, serving as a counterpart to `2_lincs_SMILES.py` but handling sciplex data. Below is a summary of its key steps:
#
# 1. **Load Data**: The script uses either `sciplex3_lincs_genes.h5ad` or `sciplex3_matched_genes_lincs.h5ad` as the target dataset to which SMILES strings are added. The choice depends on the `LINCS_GENES` flag: if `LINCS_GENES` is `True`, the dataset with the LINCS gene subset (`sciplex3_lincs_genes.h5ad`) is used; if `False`, the matched genes dataset (`sciplex3_matched_genes_lincs.h5ad`) is used.
#
# 2. **Create and Assign SMILES**: A dictionary (`drug_dict`) is created by zipping the `condition` and `SMILES` columns from `trapnell_final_V7.h5ad`. The script assigns SMILES to the target dataset by applying `drug_dict` to the `condition` column.
#
# 3. **Canonicalization and Validation**: The SMILES strings are validated using `rdkit` to make them canonical. The notebook also checks that each drug condition is assigned a unique SMILES string.
#
# 4. **Subset Creation**: A subset of the target dataset is created by sampling up to 50 observations per drug condition to reduce data size. The subsets are concatenated into `adata_cpa_subset`.
#
# 5. **Output**: Depending on the `LINCS_GENES` flag, the script creates two output files:
#
# - If `LINCS_GENES` is `True`, the produced files are the whole set `trapnell_cpa_lincs_genes.h5ad` and the subset as `trapnell_cpa_subset_lincs_genes.h5ad`.
# - Analogously, if `LINCS_GENES` is `False`, the produced files are `trapnell_cpa.h5ad` and `trapnell_cpa_subset.h5ad`.
#
#
#
#
# ## Imports
# +
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdkit
import scanpy as sc
from rdkit import Chem
import warnings
from chemCPA.paths import DATA_DIR, PROJECT_DIR
import os
import sys
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
import raw_data.datasets as datasets
import logging
logging.basicConfig(level=logging.INFO)
from notebook_utils import suppress_output
# Run scanpy's figure setup and version header inside `suppress_output()`
# (helper from `notebook_utils`; presumably silences their console output),
# and install an 'ignore' filter for Python warnings.
# NOTE(review): the extent of the `with` body is reconstructed here (all three
# statements placed inside) — confirm against the upstream notebook.
with suppress_output():
    sc.set_figure_params(dpi=80, frameon=False)
    sc.logging.print_header()
    warnings.filterwarnings('ignore')
# -
# %load_ext autoreload
# %autoreload 2
# ## Load data
# Note: Run this notebook once for each setting of `LINCS_GENES` (`True` and `False`) so that both adata objects are produced.
# +
# Gene-set switch: True selects the 977-gene LINCS file, False the 2000-gene
# matched-genes file.
LINCS_GENES = False
_gene_set_tag = 'lincs_genes' if LINCS_GENES else 'matched_genes_lincs'
# Target dataset that will receive the SMILES annotation.
adata_cpa = sc.read(DATA_DIR / f"sciplex3_{_gene_set_tag}.h5ad")
# Reference dataset that supplies the (condition, SMILES) pairs.
adata_cpi = sc.read(datasets.trapnell_final_v7())
# -
# Show a summary of the loaded target dataset.
adata_cpa
# Output file paths (full dataset and subset); the `_lincs_genes` suffix is
# appended only when the LINCS gene set is selected.
_out_suffix = '_lincs_genes' if LINCS_GENES else ''
adata_out = DATA_DIR / f"trapnell_cpa{_out_suffix}.h5ad"
adata_out_subset = DATA_DIR / f"trapnell_cpa_subset{_out_suffix}.h5ad"
# Overview over adata files
# +
# adata_cpa
# +
# adata_cpi
# -
# __________
# ### Drug is combined with acid
# In the `adata_cpi` we distinguish between `'ENMD-2076'` and `'ENMD-2076 L-(+)-Tartaric acid'`.
# They also have different SMILES strings in `.obs.SMILES`.
# Since we do not keep this distinction in the `.obs.condition` column,
# which is a copy of `.obs.product_name` for `adata_cpa`, see `'lincs_sciplex_gene_matching.ipynb'`,
# I am ignoring this. As a result, we only have 188 drugs in the sciplex dataset.
# Show the product names of all cells annotated with the SMILES that includes
# the acid component.
_salt_smiles = 'O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1 |r,c:24,26,28,36,38,t:17,22,32|'
adata_cpi.obs['product_name'][adata_cpi.obs['SMILES'] == _salt_smiles]
# +
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
def mol_with_atom_index(mol):
    """Label every atom of *mol* with its own index and return *mol*.

    Each atom's atom-map number is set to the value of its `GetIdx()`.
    The molecule is modified in place; the same object is returned so the
    call can be used inline.

    Args:
        mol: object exposing `GetAtoms()`, whose items provide `GetIdx()`
            and `SetAtomMapNum()` (e.g. an rdkit molecule).

    Returns:
        The same `mol` object that was passed in.
    """
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(atom.GetIdx())
    return mol
# Parse the SMILES that has no acid component and display the molecule.
_smiles_without_acid = "CN1CCN(CC1)C1=CC(NC2=NNC(C)=C2)=NC(\\C=C\\C2=CC=CC=C2)=N1"
mol = Chem.MolFromSmiles(_smiles_without_acid)
# Default rendering
mol
# -
# Parse the two-component SMILES (acid part + drug part, separated by `.`)
# and display the molecule.
_smiles_with_acid = "O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1"
mol = Chem.MolFromSmiles(_smiles_with_acid)
# Default rendering
mol
# ___________
# ## Create drug SMILES dict
# Pair every cell's condition with its SMILES; when a condition occurs more
# than once, the last pair seen is the one that is kept.
drug_dict = {
    condition: smiles
    for condition, smiles in zip(adata_cpi.obs['condition'], adata_cpi.obs['SMILES'])
}
# The dict has 188 different entries
len(drug_dict)
# Checking that the `'ENMD-2076'` entry does not include the acid:
Chem.MolFromSmiles(drug_dict['ENMD-2076'])
# This is a good way to check the unique `(drug, smiles)` combinations that exist in the `adata_cpi`
# +
# np.unique([f'{condition}_{smiles}' for condition, smiles in list(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES))])
# -
# ## Rename drug `(+)-JQ1`
# This had a different name in the old Sciplex dataset, where it was called `JQ1`. We rename it for consistency.
# Relabel the `(+)-JQ1` category as `JQ1`; all other categories are untouched.
_condition_renames = {"(+)-JQ1": "JQ1"}
adata_cpa.obs["condition"] = adata_cpa.obs["condition"].cat.rename_categories(_condition_renames)
# ## Add SMILES to `adata_cpa`
# Look up each cell's SMILES in `drug_dict` by its condition name.
adata_cpa.obs['SMILES'] = adata_cpa.obs['condition'].map(drug_dict)
# Display the distinct SMILES values carried by the JQ1 cells.
_is_jq1 = adata_cpa.obs["condition"] == "JQ1"
adata_cpa[_is_jq1].obs["SMILES"].unique()
# ## Check that SMILES match `obs.condition` data
#
# Print some stats on the `condition` columns
# List the drug names (index of `condition.value_counts()`) of the target dataset ...
_cpa_drug_names = list(adata_cpa.obs.condition.value_counts().index)
print(f'We have {len(_cpa_drug_names)} drug names in adata_cpa: \n\n\t{_cpa_drug_names}\n\n')
# ... and of the reference dataset.
_cpi_drug_names = list(adata_cpi.obs.condition.value_counts().index)
print(f'We have {len(_cpi_drug_names)} drug names in adata_cpi: \n\n\t{_cpi_drug_names}')
# Count the cells whose condition is the literal string `'nan'`.
# (That each condition has just one SMILES string is checked under
# "Check double assigned condition" further down.)
adata_cpa.obs['condition'].eq('nan').sum()
# ### Check for nans
adata_cpa.obs['condition'].eq('nan').sum()
# ### Take care of `control` SMILES
# SMILES values that actually occur on the control cells.
_is_control = adata_cpa.obs.condition == 'control'
counts = adata_cpa[_is_control].obs.SMILES.value_counts()
list(counts.index[counts > 0])
# Add DMSO SMILES:`CS(C)=O`
# The empty-string SMILES category is relabelled as DMSO.
_smiles_as_category = adata_cpa.obs["SMILES"].astype("category")
adata_cpa.obs["SMILES"] = _smiles_as_category.cat.rename_categories({"": "CS(C)=O"})
# Show the SMILES counts of the control cells after the relabelling.
adata_cpa.obs.loc[adata_cpa.obs.condition == 'control', 'SMILES'].value_counts()
# ### Check double assigned condition
# Check that conditions align with SMILES
#
# If everything is correct there should be no output
#
# Report every condition that carries more than one distinct SMILES string.
# `nunique()` counts only the SMILES values that actually occur in the group,
# so zero-count categories of a categorical column are not included.
for pert, df in adata_cpa.obs.groupby('condition'):
    n_smiles = df.SMILES.nunique()
    if n_smiles > 1:
        print(f"{pert}: {n_smiles}")
# ## Make SMILES canonical
# +
# Record which rdkit version performs the canonicalization.
print(f'rdkit version: {rdkit.__version__}\n')
# Replace every SMILES entry with the result of `Chem.CanonSmiles`.
adata_cpa.obs["SMILES"] = adata_cpa.obs["SMILES"].apply(Chem.CanonSmiles)
# -
# ## Add a random split to adata_cpa
# +
# This does not make sense
# from sklearn.model_selection import train_test_split
# if 'split' not in list(adata_cpa.obs):
# print("Addig 'split' to 'adata_cpa.obs'.")
# unique_drugs = np.unique(adata_cpa.obs.SMILES)
# drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2)
# drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5)
# adata_cpa.obs['split'] = 'train'
# adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_val), 'split'] = 'test'
# adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_test), 'split'] = 'ood'
# -
# ## Create subset `adata_cpa_subset` from `adata_cpa`
# +
# Keep at most this many cells per condition in the subset.
MAX_CELLS_PER_CONDITION = 50
adatas = []
for drug in np.unique(adata_cpa.obs.condition):
    tmp = adata_cpa[adata_cpa.obs.condition == drug].copy()
    # Cap the sample size at the group size: requesting a fixed 50 cells from
    # a condition with fewer cells cannot be satisfied by a draw without
    # replacement. Conditions with >= 50 cells are sampled exactly as before.
    n_keep = min(MAX_CELLS_PER_CONDITION, tmp.n_obs)
    tmp = sc.pp.subsample(tmp, n_obs=n_keep, copy=True, random_state=42)
    adatas.append(tmp)
# NOTE(review): newer anndata releases deprecate `AnnData.concatenate` in favour
# of `anndata.concat`; not switched here because the two differ in their
# output — verify before migrating.
adata_cpa_subset = adatas[0].concatenate(adatas[1:])
# Carry the unstructured annotations of the full dataset over to the subset.
adata_cpa_subset.uns = adata_cpa.uns.copy()
adata_cpa_subset
# -
# ## Save both adata objects
# Write the full annotated dataset first, then the per-condition subset.
for _adata_obj, _target_path in ((adata_cpa, adata_out), (adata_cpa_subset, adata_out_subset)):
    _adata_obj.write(_target_path)
# ### Loading the result for `adata_out_subset`
# Read the subset file back from disk and show how often each dose occurs.
adata = sc.read(adata_out_subset)
adata.obs["dose"].value_counts()
# NOTE(review): this inspects the in-memory `adata_cpa`, not the reloaded
# `adata` — confirm that this is intended.
adata_cpa.uns["log1p"]