# ---
# jupyter:
#   jupytext:
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.1
# ---

# %% [markdown]
# # GROVER
# Generate GROVER fingerprints for the SMILES drugs coming from LINCS + SciPlex3.
#
# Steps:
# 1. Load `lincs_trapnell.smiles` as the list of SMILES to be encoded
# 2. Generate fingerprints using GROVER
# 3. Save the SMILES -> fingerprint mapping as a pandas DataFrame

# %%
from pathlib import Path

import numpy as np
import pandas as pd

# %%
import rdkit
import scanpy as sc
from rdkit import Chem

rdkit.__version__
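
# %% [markdown]
# Quick illustration (not part of the pipeline): RDKit's parse/write round-trip
# is what makes a SMILES list usable as a stable index, since equivalent inputs
# map to the same canonical string. The example molecule below is arbitrary.

# %%
example = "CC(=O)Oc1ccccc1C(=O)O"  # aspirin, chosen purely for the demo
mol = Chem.MolFromSmiles(example)
Chem.MolToSmiles(mol)  # canonical SMILES, identical for equivalent inputs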

# %%
# Set paths: datasets directory and the canonical list of SMILES to encode
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"

# %% [markdown]
# ## Step 1: Generate fingerprints
#
# - TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones?
#   (A sketch of how these features are computed follows after this list.)
# - TODO: There are also pretrained & fine-tuned models available; maybe those are useful for us:
#     - SIDER: drug side-effect prediction task
#     - ClinTox: drug toxicity prediction task
#     - ChEMBL: log P prediction task

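# %% [markdown]
# For reference, a minimal sketch of what `scripts/save_features.py` computes
# per SMILES, assuming GROVER follows the chemprop convention of delegating
# `rdkit_2d_normalized` to the `descriptastorus` package. Treat this as an
# illustration of the feature generator, not the exact code path.

# %%
# Sketch only; requires `pip install descriptastorus`.
from descriptastorus.descriptors import rdNormalizedDescriptors

generator = rdNormalizedDescriptors.RDKit2DNormalized()
# process() returns [is_valid, feature_1, ..., feature_n]
out = generator.process("CC(=O)Oc1ccccc1C(=O)O")
is_valid, feats = out[0], np.array(out[1:])
print(is_valid, feats.shape)
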
# %% language="bash"
# set -euox pipefail
#
# # move csv of all smiles to be encoded into current workdir
# cp ../lincs_trapnell.smiles data/embeddings/lincs_trapnell.csv
# file="data/embeddings/lincs_trapnell.csv"
#
# # First we generate the feature embedding for the SMILES, which is an extra input
# # into GROVER
# echo "FILE: $file"
# features="${file%.csv}.npz"
# if [[ ! -f $features ]]; then
#     echo "Generating features: $features"
#     python scripts/save_features.py --data_path "$file" \
#                             --save_path "$features" \
#                             --features_generator rdkit_2d_normalized \
#                             --restart
# fi;
#
# # Second we input SMILES + Features into grover and get the fingerprint out
# # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
# outfile="${file%.csv}_grover_base_both.npz"
# echo "EMB: $outfile"
# if [[ ! -f $outfile ]]; then
#     echo "Generating embedding: $outfile"
#     python main.py fingerprint --data_path "$file" \
#                        --features_path "$features" \
#                        --checkpoint_path data/model/grover_base.pt \
#                        --fingerprint_source both \
#                        --output "$outfile"
# fi;

# %%
lincs_trapnell_base = np.load("data/embeddings/lincs_trapnell_grover_base_both.npz")
print("Shape of GROVER_base embedding:", lincs_trapnell_base["fps"].shape)

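# %% [markdown]
# The intermediate feature file can be inspected the same way. The exact key
# name inside this `.npz` is an assumption (chemprop's `save_features.py`
# stores under `features`), so we just list the keys here.

# %%
features_npz = np.load("data/embeddings/lincs_trapnell.npz")
print("Keys in features file:", features_npz.files)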

# %% [markdown]
# ## Step 2: Generate DataFrame with SMILES -> Embedding mapping

# %%
def flatten(x: np.ndarray):
    """Squeeze a (1, d) row matrix into a flat (d,) vector."""
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]


embeddings_fpath = Path("data/embeddings")
smiles_file = embeddings_fpath / "lincs_trapnell.csv"
emb_file = embeddings_fpath / "lincs_trapnell_grover_base_both.npz"

# read list of smiles
smiles_df = pd.read_csv(smiles_file)
# read generated embedding (.npz has only one key, 'fps')
emb = np.load(emb_file)["fps"]
assert len(smiles_df) == emb.shape[0]

# build a DataFrame with one row per SMILES, columns = embedding dimensions
final_df = pd.DataFrame(
    emb,
    index=smiles_df["smiles"].values,
    columns=[f"latent_{i+1}" for i in range(emb.shape[1])],
)
# drop duplicate indices (= SMILES); likely a no-op since the SMILES list is already unique
final_df = final_df[~final_df.index.duplicated(keep="first")]
final_df.to_parquet(embeddings_fpath / "grover_base.parquet")

# %%
df = pd.read_parquet("data/embeddings/grover_base.parquet")

# %%
df

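# %% [markdown]
# Usage example: downstream code can fetch an embedding by its (canonical)
# SMILES directly via the index. The first index entry is used here purely
# for illustration.

# %%
example_smiles = df.index[0]
vec = df.loc[example_smiles].to_numpy()
print(example_smiles, vec.shape)
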
# %% [markdown]
# ## Step 3: Check
# Make extra sure the index of the generated DataFrame is correct by loading our list of canonical SMILES again.

# %%
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
all_smiles = pd.read_csv(all_smiles_fpath)["smiles"].values
assert sorted(list(df.index)) == sorted(list(all_smiles))
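
# %% [markdown]
# Extra spot-check (beyond set equality of the indices): the row stored for the
# first SMILES in the csv should equal the first row of the raw GROVER output,
# i.e. building the DataFrame must not have shuffled rows.

# %%
first_smiles = smiles_df["smiles"].iloc[0]
assert np.allclose(df.loc[first_smiles].to_numpy(), emb[0])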