# ---
# jupyter:
#   jupytext:
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.14.1
# ---
# %% [markdown]
# # GROVER
# Generate GROVER fingerprints for the drug SMILES coming from LINCS + SciPlex3.
#
# Steps:
# 1. Load `lincs_trapnell.smiles` as the list of SMILES to be encoded
# 2. Generate fingerprints using GROVER
# 3. Save the SMILES -> fingerprint mapping as a pandas DataFrame.
#
# %%
from pathlib import Path

import numpy as np
import pandas as pd
import rdkit
import scanpy as sc
from rdkit import Chem

rdkit.__version__

# %%
# Set paths
datasets_fpath = Path("/home/icb/simon.boehm/Masters_thesis/MT_code/datasets")
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
# %% [markdown]
# ## Step 1: Generate fingerprints
#
# - TODO: Right now we generate `rdkit_2d_normalized` features. Are these the correct ones? (See the sanity-check sketch below.)
# - TODO: There are also pretrained & finetuned models available; those might be useful for us:
#     - SIDER: drug side-effect prediction task
#     - ClinTox: drug toxicity prediction task
#     - ChEMBL: log P prediction task
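# %% [markdown]
# As a quick sanity check for the first TODO, the `rdkit_2d_normalized` features
# can also be computed directly with `descriptastorus`, the library that GROVER's
# `save_features.py` wraps. A minimal sketch, assuming `descriptastorus` is
# installed and that its `RDKit2DNormalized` generator matches what GROVER uses.
# %%
# Sketch (assumption: descriptastorus is installed, e.g. `pip install descriptastorus`).
# process() returns a list whose first element is a success flag, followed by
# the normalized descriptor values.
from descriptastorus.descriptors import rdNormalizedDescriptors

generator = rdNormalizedDescriptors.RDKit2DNormalized()
flag, *features = generator.process("CC(=O)Oc1ccccc1C(=O)O")  # aspirin, as an example
print("success:", flag, "| n_features:", len(features))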
# %% language="bash"
# set -euox pipefail
#
# # move csv of all smiles to be encoded into current workdir
# cp ../lincs_trapnell.smiles data/embeddings/lincs_trapnell.csv
# file="data/embeddings/lincs_trapnell.csv"
#
# # First we generate the feature embedding for the SMILES, which is an extra input
# # into GROVER
# echo "FILE: $file"
# features=$(echo $file | sed 's:.csv:.npz:')
# if [[ ! -f $features ]]; then
#   echo "Generating features: $features"
#   python scripts/save_features.py --data_path "$file" \
#     --save_path "$features" \
#     --features_generator rdkit_2d_normalized \
#     --restart
# fi;
#
# # Second we input SMILES + Features into grover and get the fingerprint out
# # 'both' means we get a concatenated fingerprint of combined atoms + bonds features
# outfile=$(echo $file | sed 's:.csv:_grover_base_both.npz:')
# echo "EMB: $outfile"
# if [[ ! -f $outfile ]]; then
#   echo "Generating embedding: $outfile"
#   python main.py fingerprint --data_path "$file" \
#     --features_path "$features" \
#     --checkpoint_path data/model/grover_base.pt \
#     --fingerprint_source both \
#     --output "$outfile"
# fi;
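# %% [markdown]
# Before moving on, it can be worth peeking at the intermediate feature file
# written by `save_features.py`. A minimal sketch: just load the `.npz` and list
# its keys and shapes (the exact key name is not asserted here).
# %%
# Inspect the intermediate rdkit_2d_normalized feature file generated above.
feats = np.load("data/embeddings/lincs_trapnell.npz")
print("keys:", list(feats.keys()), "| shapes:", [feats[k].shape for k in feats.keys()])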
# %%
lincs_trapnell_base = np.load("data/embeddings/lincs_trapnell_grover_base_both.npz")
print("Shape of GROVER_base embedding:", lincs_trapnell_base["fps"].shape)
# %% [markdown]
# ## Step 2: Generate DataFrame with SMILES -> Embedding mapping
# %%
def flatten(x: np.ndarray):
    # helper to squeeze a (1, n) array down to shape (n,) (not used below)
    assert len(x.shape) == 2 and x.shape[0] == 1
    return x[0]

embeddings_fpath = Path("data/embeddings")
smiles_file = embeddings_fpath / "lincs_trapnell.csv"
emb_file = embeddings_fpath / "lincs_trapnell_grover_base_both.npz"
# read list of smiles
smiles_df = pd.read_csv(smiles_file)
# read generated embedding (.npz has only one key, 'fps')
emb = np.load(emb_file)["fps"]
assert len(smiles_df) == emb.shape[0]
# generate a DataFrame with SMILES and Embedding in each row
final_df = pd.DataFrame(
    emb,
    index=smiles_df["smiles"].values,
    columns=[f"latent_{i+1}" for i in range(emb.shape[1])],
)
# drop duplicated indices (= SMILES); likely a no-op since the SMILES list is unique
final_df = final_df[~final_df.index.duplicated(keep="first")]
final_df.to_parquet(embeddings_fpath / "grover_base.parquet")
# %%
df = pd.read_parquet("data/embeddings/grover_base.parquet")
# %%
df
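# %% [markdown]
# Downstream code can now map a canonical SMILES string to its GROVER fingerprint
# via a plain index lookup. A minimal usage sketch (the query below just takes the
# first SMILES in the index as a stand-in for a real drug):
# %%
query = df.index[0]  # stand-in for any SMILES present in the index
fp = df.loc[query].values  # 1D array of length equal to the fingerprint dimension
print(query, "->", fp.shape)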
# %% [markdown]
# ## Step 3: Check
# Make extra sure the index of the generated DataFrame is correct by loading our list of canonical SMILES again
# %%
all_smiles_fpath = Path.cwd().parent / "lincs_trapnell.smiles"
all_smiles = pd.read_csv(all_smiles_fpath)["smiles"].values
assert sorted(list(df.index)) == sorted(list(all_smiles))
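# %% [markdown]
# Optionally, we can re-canonicalize every SMILES in the index with RDKit and
# compare. This sketch assumes `lincs_trapnell.smiles` was canonicalized with
# RDKit in the first place; if a different toolkit was used, mismatches here
# would not necessarily indicate an error.
# %%
canonical = [Chem.MolToSmiles(Chem.MolFromSmiles(s)) for s in df.index]
print("all RDKit-canonical:", all(c == s for c, s in zip(canonical, df.index)))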