# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.1
#   kernelspec:
#     display_name: chemical_CPA
#     language: python
#     name: python3
# ---

# # 4 SCIPLEX SMILES
#
# This is an updated version of `sciplex_SMILES.ipynb` which relies on a `drug_dict` to assign SMILES strings.  
# The `sciplex_SMILES.ipynb` notebook is not applicable to the full sciplex data as it relies on the `.obs_names`.  
# Hence, the second half of the dataset (left out in the original CPA publication) would be left without SMILES entries. 
#
# **Requires**
# * `'sciplex3_matched_genes_lincs.h5ad'`
# * `'sciplex3_lincs_genes.h5ad'`
# * `'trapnell_final_V7.h5ad'`
#
# **Output**
# * `'trapnell_cpa(_lincs_genes).h5ad'`
# * `'trapnell_cpa_subset(_lincs_genes).h5ad'`
#
#
# ## Description
# This script assigns SMILES strings to drug conditions in the sciplex dataset, serving as a counterpart to `2_lincs_SMILES.py` but handling sciplex data. Below is a summary of its key steps:
#
# 1. **Load Data**: The script uses either `sciplex3_lincs_genes.h5ad` or `sciplex3_matched_genes_lincs.h5ad` as the target dataset to which SMILES strings are added. The choice depends on the `LINCS_GENES` flag: if `LINCS_GENES` is `True`, the dataset with the LINCS gene subset (`sciplex3_lincs_genes.h5ad`) is used; if `False`, the matched genes dataset (`sciplex3_matched_genes_lincs.h5ad`) is used.
#
# 2. **Create and Assign SMILES**: A dictionary (`drug_dict`) is created by zipping the `condition` and `SMILES` columns from `trapnell_final_V7.h5ad`. The script assigns SMILES to the target dataset by applying `drug_dict` to the `condition` column.
#
# 3. **Canonicalization and Validation**: The SMILES strings are validated using `rdkit` to make them canonical. The notebook also checks that each drug condition is assigned a unique SMILES string.
#
# 4. **Subset Creation**: A subset of the target dataset is created by sampling up to 50 observations per drug condition to reduce data size. The subsets are concatenated into `adata_cpa_subset`.
#
# 5. **Output**: Depending on the `LINCS_GENES` flag, the script creates two output files:
#
#    - If `LINCS_GENES` is `True`, the produced files are the whole set `trapnell_cpa_lincs_genes.h5ad` and the subset as `trapnell_cpa_subset_lincs_genes.h5ad`.
#    - Analogously, if `LINCS_GENES` is `False`, the produced files are `trapnell_cpa.h5ad` and `trapnell_cpa_subset.h5ad`.
#
#
#

#
# ## Imports

# +
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdkit
import scanpy as sc
from rdkit import Chem

import warnings
from chemCPA.paths import DATA_DIR, PROJECT_DIR

import os
import sys
# Put the parent of this file's directory on `sys.path`, presumably so that the
# project-local `raw_data` package imported right below can be resolved.
# `__file__` only exists when the file is executed as a script; in an
# interactive kernel (the notebook paired with this file) it is undefined, so
# fall back to the current working directory.
# NOTE(review): the fallback assumes the kernel's working directory is the
# folder containing this notebook -- confirm for your setup.
try:
    _this_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _this_dir = os.getcwd()
root_dir = os.path.dirname(_this_dir)
# Re-running the cell must not pile up duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)
import raw_data.datasets as datasets
import logging

# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
from notebook_utils import suppress_output

# Apply the scanpy figure defaults and print the scanpy version header inside
# `suppress_output` (presumably this hides the printed output -- confirm in
# `notebook_utils`).
with suppress_output():
    sc.set_figure_params(dpi=80, frameon=False)
    sc.logging.print_header()
    # Installs an "ignore" filter for all warnings.
    # NOTE(review): whether the filter stays active after the `with` block
    # depends on how `suppress_output` is implemented -- confirm.
    warnings.filterwarnings('ignore')
# -

# %load_ext autoreload
# %autoreload 2

# ## Load data
# Note: Run notebook for both adata objects (LINCS_GENES)

# +
# Gene-set switch: True -> 977 genes (LINCS subset), False -> 2000 genes
LINCS_GENES = False

# File-name stem of the sciplex input that belongs to the selected gene set
_sciplex_variant = "lincs_genes" if LINCS_GENES else "matched_genes_lincs"
adata_cpa = sc.read(DATA_DIR / f"sciplex3_{_sciplex_variant}.h5ad")
# Dataset whose `.obs` provides the `condition` -> `SMILES` pairs used below
adata_cpi = sc.read(datasets.trapnell_final_v7())
# -

adata_cpa

# Determine the output file paths

# Both outputs share the same gene-set tag in their file names
_gene_set_tag = "_lincs_genes" if LINCS_GENES else ""
adata_out = DATA_DIR / f"trapnell_cpa{_gene_set_tag}.h5ad"
adata_out_subset = DATA_DIR / f"trapnell_cpa_subset{_gene_set_tag}.h5ad"

# Overview over adata files

# +
# adata_cpa

# +
# adata_cpi
# -

# __________
# ### Drug is combined with acid

# In the `adata_cpi` we distinguish between `'ENMD-2076'` and `'ENMD-2076 L-(+)-Tartaric acid'`.  
# They also have different SMILES strings in `.obs.SMILES`. 
# Since we do not keep this distinction in the `.obs.condition` column,  
# which is a copy of `.obs.product_name` for `adata_cpa`, see `'lincs_sciplex_gene_matching.ipynb'`,  
# I am ignoring this. As a result we only have 188 drugs in the sciplex dataset.

# Product names of all cells annotated with the SMILES string that contains the acid fragment
_smiles_with_acid = 'O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1 |r,c:24,26,28,36,38,t:17,22,32|'
adata_cpi.obs.loc[adata_cpi.obs.SMILES == _smiles_with_acid, 'product_name']

# +
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw


def mol_with_atom_index(mol):
    """Label every atom of ``mol`` with its own index as atom-map number.

    The molecule is modified in place: for each atom returned by
    ``mol.GetAtoms()`` the value of ``atom.GetIdx()`` is passed to
    ``atom.SetAtomMapNum``.

    Returns the same ``mol`` object that was passed in.
    """
    for atom in mol.GetAtoms():
        atom_idx = atom.GetIdx()
        atom.SetAtomMapNum(atom_idx)
    return mol

# Parse the SMILES of a kinase inhibitor and show the molecule with the default rendering
_inhibitor_smiles = "CN1CCN(CC1)C1=CC(NC2=NNC(C)=C2)=NC(\\C=C\\C2=CC=CC=C2)=N1"
mol = Chem.MolFromSmiles(_inhibitor_smiles)
mol
# -

# Same test for the SMILES variant that carries an additional acid fragment (the part before the `.`)
_inhibitor_with_acid_smiles = "O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\C=C\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1"
mol = Chem.MolFromSmiles(_inhibitor_with_acid_smiles)
mol

# ___________

# ## Create drug SMILES dict

# Pair every cell's condition with its SMILES; for a condition that occurs with
# several SMILES strings the pair seen last is the one kept in the dict
_cpi_obs = adata_cpi.obs
drug_dict = dict(zip(_cpi_obs["condition"], _cpi_obs["SMILES"]))

# The dict has 188 different entries

len(drug_dict)

# Checking that the `'ENMD-2076'` entry does not include the acid:

Chem.MolFromSmiles(drug_dict['ENMD-2076'])

# This is a good way to check the unique `(drug, smiles)` combinations that exist in the `adata_cpi`

# +
# np.unique([f'{condition}_{smiles}' for condition, smiles in list(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES))])
# -

# ## Rename drug `(+)-JQ1`
# This had a different name in the old Sciplex dataset, where it was called `JQ1`. We rename it for consistency.

# Rename at the category level: only the label changes, the per-cell assignment stays as is
_renamed_conditions = adata_cpa.obs["condition"].cat.rename_categories({"(+)-JQ1": "JQ1"})
adata_cpa.obs["condition"] = _renamed_conditions

# ## Add SMILES to `adata_cpa`

# Look up the SMILES of every cell through its condition name
adata_cpa.obs['SMILES'] = adata_cpa.obs['condition'].map(drug_dict)

# Show which SMILES ended up on the renamed `JQ1` condition
_is_jq1 = adata_cpa.obs["condition"] == "JQ1"
adata_cpa[_is_jq1].obs["SMILES"].unique()

# ## Check that SMILES match `obs.condition` data
#
# Print some stats on the `condition` columns

# Drug names ordered by how many cells carry them (order of `value_counts`)
_cpa_drug_names = list(adata_cpa.obs.condition.value_counts().index)
print(f'We have {len(_cpa_drug_names)} drug names in adata_cpa: \n\n\t{_cpa_drug_names}\n\n')
_cpi_drug_names = list(adata_cpi.obs.condition.value_counts().index)
print(f'We have {len(_cpi_drug_names)} drug names in adata_cpi: \n\n\t{_cpi_drug_names}')

# Check that every cell received a SMILES string.  
# A condition that is missing from `drug_dict` is mapped to NaN, so this count should be 0

adata_cpa.obs.SMILES.isna().sum()

# ### Check for nans
#
# Number of cells whose condition is the literal string `'nan'`

(adata_cpa.obs.condition=='nan').sum()

# ### Take care of `control` SMILES

# SMILES values that actually occur on control cells
_is_control = adata_cpa.obs.condition == 'control'
counts = adata_cpa[_is_control].obs.SMILES.value_counts()
list(counts[counts > 0].index)

# Add DMSO SMILES:`CS(C)=O`

# Rename the empty-string category to the DMSO SMILES
_smiles_categorical = adata_cpa.obs["SMILES"].astype("category")
adata_cpa.obs["SMILES"] = _smiles_categorical.cat.rename_categories({"": "CS(C)=O"})

# Control cells should now be listed under the DMSO SMILES
adata_cpa.obs["SMILES"][adata_cpa.obs["condition"] == 'control'].value_counts()

# ### Check double assigned condition

# Report every condition that carries more than one distinct SMILES string
for pert, df in adata_cpa.obs.groupby('condition'):
    n_smiles = df.SMILES.nunique()
    if n_smiles > 1:
        print(f"{pert}: {n_smiles}")

# Check that the conditions align with the SMILES
#
# If everything is correct there should be no output

# Prints "<condition>: <count>" only for conditions with several distinct SMILES strings
for pert, df in adata_cpa.obs.groupby('condition'):
    n_smiles = df.SMILES.nunique()
    if n_smiles <= 1:
        continue
    print(f"{pert}: {n_smiles}")

# ## Make SMILES canonical

# +
print(f'rdkit version: {rdkit.__version__}\n')

# Replace every SMILES string by the canonical form computed by RDKit
_canonical_smiles = adata_cpa.obs["SMILES"].apply(Chem.CanonSmiles)
adata_cpa.obs["SMILES"] = _canonical_smiles
# -

# ## Add a random split to adata_cpa

# +
# This does not make sense

# from sklearn.model_selection import train_test_split

# if 'split' not in list(adata_cpa.obs):
#     print("Addig 'split' to 'adata_cpa.obs'.")
#     unique_drugs = np.unique(adata_cpa.obs.SMILES)
#     drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2)
#     drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5)

#     adata_cpa.obs['split'] = 'train'
#     adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_val), 'split'] = 'test'
#     adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_test), 'split'] = 'ood'
# -

# ## Create subset `adata_cpa_subset` from `adata_cpa`

# +
adatas = []

# Keep at most this many cells per condition.
_max_cells_per_condition = 50

for drug in np.unique(adata_cpa.obs.condition):
    # A view is sufficient here: `subsample(..., copy=True)` returns its own
    # copy of the selected cells, so copying the whole group first is not needed.
    drug_cells = adata_cpa[adata_cpa.obs.condition == drug]
    # `sc.pp.subsample` draws without replacement and fails when asked for
    # more cells than the group contains, so cap the request at the group size.
    n_keep = min(_max_cells_per_condition, drug_cells.n_obs)
    adatas.append(sc.pp.subsample(drug_cells, n_obs=n_keep, copy=True, random_state=42))

# Stack the per-condition samples and carry over the unstructured annotations
adata_cpa_subset = adatas[0].concatenate(adatas[1:])
adata_cpa_subset.uns = adata_cpa.uns.copy()

adata_cpa_subset
# -

# ## Save both adata objects

# Store the full annotated dataset and the per-condition subset as `.h5ad` files
adata_cpa.write_h5ad(adata_out)
adata_cpa_subset.write_h5ad(adata_out_subset)

# ### Loading the result for `adata_out_subset`

# Read the subset back from disk and show how many cells there are per dose
adata = sc.read(adata_out_subset)
adata.obs["dose"].value_counts()

# Show the `log1p` entry stored in `.uns` of the full dataset
adata_cpa.uns["log1p"]