File size: 2,811 Bytes
a48f0ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.1
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# + jupyter={"outputs_hidden": true} pycharm={"name": "#%%\n"}
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

# Global plotting defaults for every figure produced by this notebook.
matplotlib.style.use("fivethirtyeight")
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so use the new name when the installed
# Matplotlib provides it and fall back to the legacy name otherwise.
_talk_style = (
    "seaborn-v0_8-talk"
    if "seaborn-v0_8-talk" in matplotlib.style.available
    else "seaborn-talk"
)
matplotlib.style.use(_talk_style)
matplotlib.rcParams["font.family"] = "monospace"
matplotlib.rcParams["figure.dpi"] = 200
# Saved figures get a white background regardless of the active style.
plt.rcParams["savefig.facecolor"] = "white"
sn.set_context("poster")
# Turn off the SVG flag of RDKit's IPython integration.
IPythonConsole.ipython_useSVG = False


# + jupyter={"outputs_hidden": false} pycharm={"name": "#%%\n"}
# Load the Trapnell and LINCS drug tables. Column names are passed explicitly
# via `names=`, and surrounding whitespace is stripped from every SMILES string.
trapnell_df = pd.read_csv(
    "../embeddings/trapnell_drugs_smiles.csv",
    names=["drug", "smiles", "pathway"],
).assign(smiles=lambda frame: frame["smiles"].str.strip())
lincs_df = pd.read_csv(
    "../embeddings/lincs_drugs_smiles.csv",
    names=["drug", "smiles"],
).assign(smiles=lambda frame: frame["smiles"].str.strip())


# + jupyter={"outputs_hidden": false} pycharm={"name": "#%%\n"}
def _smiles_to_fingerprint(smiles):
    """Parse *smiles* with RDKit and return its ``Chem.RDKFingerprint``.

    Raises:
        ValueError: If RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending string instead of an opaque argument error from
        # the fingerprint call.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")
    return Chem.RDKFingerprint(mol)


def tanimoto_score(input_smiles, target_smiles):
    """Return the Tanimoto similarity between two molecules given as SMILES.

    Both strings are converted to RDKit fingerprints and compared with
    ``DataStructs.TanimotoSimilarity``.

    Args:
        input_smiles: SMILES string of the first molecule.
        target_smiles: SMILES string of the second molecule.

    Returns:
        The similarity value computed by ``DataStructs.TanimotoSimilarity``.

    Raises:
        ValueError: If either SMILES string cannot be parsed by RDKit.
    """
    input_fp = _smiles_to_fingerprint(input_smiles)
    target_fp = _smiles_to_fingerprint(target_smiles)
    return DataStructs.TanimotoSimilarity(input_fp, target_fp)


# -

# ## Checking the 3 held-out drugs
# For each of our 3 held-out drugs in Trapnell, look for the most similar drugs in LINCS

# + jupyter={"outputs_hidden": false} pycharm={"name": "#%%\n"}
# Keep only the Trapnell rows for the three drugs named below, then show them.
loo_drugs = trapnell_df.loc[
    trapnell_df["drug"].isin(["Quisinostat", "Flavopiridol", "BMS-754807"])
]
loo_drugs

# + jupyter={"outputs_hidden": false} pycharm={"name": "#%%\n"}
# For every drug in loo_drugs, score each LINCS compound against it, record
# the best-matching LINCS SMILES, and print a short report.
smiles_orig = []
smiles_lincs = []
# Only the name and SMILES are used, so iterate those two columns by name
# instead of unpacking whole rows positionally.
for drug, smiles in zip(loo_drugs["drug"], loo_drugs["smiles"]):
    tanimoto_sim_col = f"tanimoto_sim_{drug}"
    # The per-drug similarity is stored as a new column on lincs_df.
    lincs_df[tanimoto_sim_col] = lincs_df["smiles"].apply(
        lambda lincs_smiles, target=smiles: tanimoto_score(lincs_smiles, target)
    )
    # Sort once and reuse the ranking for both the best hit and the top-5 table.
    top_hits = lincs_df.sort_values(tanimoto_sim_col, ascending=False).head(5)
    most_similar = top_hits.head(1)
    smiles_orig.append(smiles)
    smiles_lincs.append(most_similar["smiles"].item())
    # True when the exact same SMILES string already occurs in LINCS.
    exact_match = bool(lincs_df["smiles"].isin([smiles]).any())
    print(drug, exact_match, most_similar[tanimoto_sim_col].item(), most_similar["drug"].item())
    print(top_hits[["drug", tanimoto_sim_col]])

# + jupyter={"outputs_hidden": false} pycharm={"name": "#%%\n"}
# Draw each recorded pair side by side: the SMILES from smiles_orig next to
# its counterpart from smiles_lincs, each labelled with its SMILES string.
for orig, lincs in zip(smiles_orig, smiles_lincs):
    pair = [orig, lincs]
    mols = [Chem.MolFromSmiles(smi) for smi in pair]
    im = Draw.MolsToGridImage(mols, subImgSize=(600, 400), legends=pair)
    # NOTE(review): tight_layout() acts on the current matplotlib figure, not
    # on `im` - confirm whether this call is still needed.
    plt.tight_layout()
    # `display` is not imported in this file; presumably the notebook kernel
    # supplies it - verify before running as a plain script.
    display(im)