File size: 2,600 Bytes
a48f0ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute embeddings for the datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## RDKit"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute an RDKit embedding file for every dataset listed below.\n",
"import os\n",
"import sys\n",
"\n",
"from tqdm.auto import tqdm\n",
"\n",
"from chemCPA.paths import DATA_DIR, PROJECT_DIR, ROOT, EMBEDDING_DIR\n",
"\n",
"# Add the parent directory of `embeddings` to the Python path. The guard keeps\n",
"# sys.path free of duplicate entries when this cell is re-run.\n",
"if str(ROOT) not in sys.path:\n",
"    sys.path.append(str(ROOT))\n",
"\n",
"import embeddings.rdkit.embedding_rdkit as embedding_rdkit\n",
"\n",
"# Datasets to process: (file name inside DATA_DIR, SMILES key for that file)\n",
"datasets = [\n",
"    ('lincs_smiles.h5ad', 'SMILES'),\n",
"    ('lincs_full_smiles.h5ad', 'smiles'),  # this file uses a lowercase SMILES key\n",
"    ('sciplex_complete.h5ad', 'SMILES'),\n",
"    ('adata_MCF7.h5ad', 'SMILES'),\n",
"    ('adata_MCF7_lincs_genes.h5ad', 'SMILES'),\n",
"    ('adata_K562.h5ad', 'SMILES'),\n",
"    ('adata_K562_lincs_genes.h5ad', 'SMILES'),\n",
"    ('adata_A549.h5ad', 'SMILES'),\n",
"    ('adata_A549_lincs_genes.h5ad', 'SMILES'),\n",
"    ('sciplex_complete_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
"    ('sciplex_complete_middle_subset_v2.h5ad', 'SMILES'),\n",
"    ('sciplex_complete_middle_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
"    ('sciplex_complete_v2.h5ad', 'SMILES'),\n",
"    ('sciplex_complete_lincs_genes_v2.h5ad', 'SMILES')\n",
"]\n",
"\n",
"# Every output lands in the same directory, so create it once up front\n",
"# instead of on each loop iteration.\n",
"rdkit_output_dir = os.path.join(EMBEDDING_DIR, 'rdkit')\n",
"os.makedirs(rdkit_output_dir, exist_ok=True)\n",
"\n",
"# Names of datasets whose embedding computation raised, for the final summary.\n",
"failed_datasets = []\n",
"\n",
"# Process each dataset\n",
"for dataset, smiles_key in tqdm(datasets, desc=\"Computing RDKit embeddings\"):\n",
"    h5ad_path = os.path.join(DATA_DIR, dataset)\n",
"    base_name = os.path.splitext(dataset)[0]\n",
"    output_filename = f\"{base_name}_rdkit2D_embedding.parquet\"\n",
"    output_path = os.path.join(rdkit_output_dir, output_filename)\n",
"\n",
"    try:\n",
"        embedding_rdkit.compute_rdkit_embeddings(h5ad_path, output_path=output_path, smiles_key=smiles_key)\n",
"    except Exception as e:\n",
"        # Best effort: a failing dataset is reported and skipped so that the\n",
"        # remaining datasets are still processed.\n",
"        tqdm.write(f\"Error processing {dataset}: {str(e)}\")\n",
"        failed_datasets.append(dataset)\n",
"\n",
"# Surface failures once more at the end so they are not lost among the\n",
"# per-dataset progress output.\n",
"if failed_datasets:\n",
"    tqdm.write(f\"{len(failed_datasets)} of {len(datasets)} datasets failed: {', '.join(failed_datasets)}\")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|