# --- | |
# jupyter: | |
# jupytext: | |
# text_representation: | |
# extension: .py | |
# format_name: light | |
# format_version: '1.5' | |
# jupytext_version: 1.16.1 | |
# kernelspec: | |
# display_name: chemical_CPA | |
# language: python | |
# name: python3 | |
# --- | |
# # 4 SCIPLEX SMILES | |
# | |
# This is an updated version of `sciplex_SMILES.ipynb` which relies on a `drug_dict` to assign SMILES strings. | |
# The `sciplex_SMILES.ipynb` notebook is not applicable to the full sciplex data as it relies on the `.obs_names`. | |
# Hence, the second half of the dataset (left out in the original CPA publication) would be left without SMILES entries. | |
# | |
# **Requires** | |
# * `'sciplex3_matched_genes_lincs.h5ad'` | |
# * `'sciplex3_lincs_genes.h5ad'` | |
# * `'trapnell_final_V7.h5ad'` | |
# | |
# **Output** | |
# * `'trapnell_cpa(_lincs_genes).h5ad'` | |
# * `'trapnell_cpa_subset(_lincs_genes).h5ad'` | |
# | |
# | |
# ## Description | |
# This script assigns SMILES strings to drug conditions in the sciplex dataset, serving as a counterpart to `2_lincs_SMILES.py` but handling sciplex data. Below is a summary of its key steps: | |
# | |
# 1. **Load Data**: The script uses either `sciplex3_lincs_genes.h5ad` or `sciplex3_matched_genes_lincs.h5ad` as the target dataset to which SMILES strings are added. The choice depends on the `LINCS_GENES` flag: if `LINCS_GENES` is `True`, the dataset with the LINCS gene subset (`sciplex3_lincs_genes.h5ad`) is used; if `False`, the matched genes dataset (`sciplex3_matched_genes_lincs.h5ad`) is used. | |
# | |
# 2. **Create and Assign SMILES**: A dictionary (`drug_dict`) is created by zipping the `condition` and `SMILES` columns from `trapnell_final_V7.h5ad`. The script assigns SMILES to the target dataset by applying `drug_dict` to the `condition` column. | |
# | |
# 3. **Canonicalization and Validation**: The SMILES strings are validated using `rdkit` to make them canonical. The notebook also checks that each drug condition is assigned a unique SMILES string. | |
# | |
# 4. **Subset Creation**: A subset of the target dataset is created by sampling up to 50 observations per drug condition to reduce data size. The subsets are concatenated into `adata_cpa_subset`. | |
# | |
# 5. **Output**: Depending on the `LINCS_GENES` flag, the script creates two output files: | |
# | |
# - If `LINCS_GENES` is `True`, the produced files are the whole set `trapnell_cpa_lincs_genes.h5ad` and the subset as `trapnell_cpa_subset_lincs_genes.h5ad`. | |
# - Analogously if`LINCS_GENES` is `False`, the produced files are `trapnell_cpa.h5ad` and `trapnell_cpa_subset.h5ad`. | |
# | |
# | |
# | |
# | |
# ## Imports | |
# + | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import rdkit | |
import scanpy as sc | |
from rdkit import Chem | |
import warnings | |
from chemCPA.paths import DATA_DIR, PROJECT_DIR | |
import os | |
import sys | |
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
sys.path.append(root_dir) | |
import raw_data.datasets as datasets | |
import logging | |
logging.basicConfig(level=logging.INFO) | |
from notebook_utils import suppress_output | |
with suppress_output(): | |
sc.set_figure_params(dpi=80, frameon=False) | |
sc.logging.print_header() | |
warnings.filterwarnings('ignore') | |
# - | |
# %load_ext autoreload | |
# %autoreload 2 | |
# ## Load data | |
# Note: Run notebook for both adata objects (LINCS_GENES) | |
# + | |
# Switch between 977 (True) and 2000 (False) gene set | |
LINCS_GENES = False | |
adata_cpa = sc.read(DATA_DIR/f"sciplex3_{'matched_genes_lincs' if not LINCS_GENES else 'lincs_genes'}.h5ad") | |
adata_cpi = sc.read(datasets.trapnell_final_v7()) | |
# - | |
adata_cpa | |
# Determine output directory | |
adata_out = DATA_DIR / f"trapnell_cpa{'_lincs_genes' if LINCS_GENES else ''}.h5ad" | |
adata_out_subset = DATA_DIR / f"trapnell_cpa_subset{'_lincs_genes' if LINCS_GENES else ''}.h5ad" | |
# Overview over adata files | |
# + | |
# adata_cpa | |
# + | |
# adata_cpi | |
# - | |
# __________ | |
# ### Drug is combined with acid | |
# In the `adata_cpi` we distinguish between `'ENMD-2076'` and `'ENMD-2076 L-(+)-Tartaric acid'`. | |
# They have different also different SMILES strings in `.obs.SMILES`. | |
# Since we do not keep this different in the `.obs.condition` columns, | |
# which is a copy of `.obs.product_name` for `adata_cpa`, see `'lincs_sciplex_gene_matching.ipynb'`, | |
# I am ignoring this. As result we only have 188 drugs in the sciplex dataset. | |
adata_cpi.obs.product_name[adata_cpi.obs.SMILES == 'O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1 |r,c:24,26,28,36,38,t:17,22,32|'] | |
# + | |
from rdkit import Chem | |
from rdkit.Chem.Draw import IPythonConsole | |
from rdkit.Chem import Draw | |
def mol_with_atom_index(mol): | |
for atom in mol.GetAtoms(): | |
atom.SetAtomMapNum(atom.GetIdx()) | |
return mol | |
# Test in a kinase inhibitor | |
mol = Chem.MolFromSmiles("CN1CCN(CC1)C1=CC(NC2=NNC(C)=C2)=NC(\\C=C\\C2=CC=CC=C2)=N1") | |
# Default | |
mol | |
# - | |
# Test in a kinase inhibitor | |
mol = Chem.MolFromSmiles("O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1") | |
# Default | |
mol | |
# ___________ | |
# ## Create drug SMILES dict | |
drug_dict = dict(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES)) | |
# The dict has 188 different entries | |
len(drug_dict) | |
# Checking that the `'ENMD-2076'` entry does not include the adid: | |
Chem.MolFromSmiles(drug_dict['ENMD-2076']) | |
# This is a good wat to check the unique `(drug, smiles)` combinations that exist in the `adata_cpi` | |
# + | |
# np.unique([f'{condition}_{smiles}' for condition, smiles in list(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES))]) | |
# - | |
# ## Rename drug `(+)-JQ1` | |
# This had a different name in the old Sciplex dataset, where it was called `JQ1`. We rename it for consistency. | |
adata_cpa.obs["condition"] = adata_cpa.obs["condition"].cat.rename_categories({"(+)-JQ1": "JQ1"}) | |
# ## Add SMILES to `adata_cpa` | |
adata_cpa.obs['SMILES'] = adata_cpa.obs.condition.map(drug_dict) | |
adata_cpa[adata_cpa.obs["condition"] == "JQ1"].obs["SMILES"].unique() | |
# ## Check that SMILES match `obs.condition` data | |
# | |
# Print some stats on the `condition` columns | |
print(f'We have {len(list(adata_cpa.obs.condition.value_counts().index))} drug names in adata_cpa: \n\n\t{list(adata_cpa.obs.condition.value_counts().index)}\n\n') | |
print(f'We have {len(list(adata_cpi.obs.condition.value_counts().index))} drug names in adata_cpi: \n\n\t{list(adata_cpi.obs.condition.value_counts().index)}') | |
# Check that assigned SMILES match the condition, | |
# it should be just one smiles string per condition | |
(adata_cpa.obs.condition=='nan').sum() | |
# ### Check for nans | |
(adata_cpa.obs.condition=='nan').sum() | |
# ### Take care of `control` SMILES | |
counts = adata_cpa[adata_cpa.obs.condition=='control'].obs.SMILES.value_counts() | |
list(counts.index[counts>0]) | |
# Add DMSO SMILES:`CS(C)=O` | |
adata_cpa.obs["SMILES"] = adata_cpa.obs["SMILES"].astype("category").cat.rename_categories({"": "CS(C)=O"}) | |
adata_cpa.obs.loc[adata_cpa.obs.condition=='control', 'SMILES'].value_counts() | |
# ### Check double assigned condition | |
for pert, df in adata_cpa.obs.groupby('condition'): | |
n_smiles = (df.SMILES.value_counts()!=0).sum() | |
print(f"{pert}: {n_smiles}") if n_smiles > 1 else None | |
# Check that condition align with SMILES | |
# | |
# If everything is correct there should be no output | |
for pert, df in adata_cpa.obs.groupby('condition'): | |
n_smiles = (df.SMILES.value_counts()!=0).sum() | |
print(f"{pert}: {n_smiles}") if n_smiles > 1 else None | |
# ## Make SMILES canonical | |
# + | |
print(f'rdkit version: {rdkit.__version__}\n') | |
adata_cpa.obs.SMILES = adata_cpa.obs.SMILES.apply(Chem.CanonSmiles) | |
# - | |
# ## Add a random split to adata_cpa | |
# + | |
# This does not make sense | |
# from sklearn.model_selection import train_test_split | |
# if 'split' not in list(adata_cpa.obs): | |
# print("Addig 'split' to 'adata_cpa.obs'.") | |
# unique_drugs = np.unique(adata_cpa.obs.SMILES) | |
# drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2) | |
# drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5) | |
# adata_cpa.obs['split'] = 'train' | |
# adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_val), 'split'] = 'test' | |
# adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_test), 'split'] = 'ood' | |
# - | |
# ## Create subset `adata_cpa_subset` from `adata_cpa` | |
# + | |
adatas = [] | |
for drug in np.unique(adata_cpa.obs.condition): | |
tmp = adata_cpa[adata_cpa.obs.condition == drug].copy() | |
tmp = sc.pp.subsample(tmp, n_obs=50, copy=True, random_state=42) | |
adatas.append(tmp) | |
adata_cpa_subset = adatas[0].concatenate(adatas[1:]) | |
adata_cpa_subset.uns = adata_cpa.uns.copy() | |
adata_cpa_subset | |
# - | |
# ## Safe both adata objects | |
adata_cpa.write(adata_out) | |
adata_cpa_subset.write(adata_out_subset) | |
# ### Loading the result for `adata_out` | |
adata = sc.read(adata_out_subset) | |
adata.obs.dose.value_counts() | |
adata_cpa.uns["log1p"] | |