File size: 2,600 Bytes
a48f0ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compute embeddings for the datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RDKit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute RDKit embeddings for every dataset listed below (one output path per dataset)\n",
    "\n",
    "# RDKit\n",
    "from chemCPA.paths import DATA_DIR, PROJECT_DIR, ROOT, EMBEDDING_DIR\n",
    "import sys\n",
    "import os\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "# Add ROOT (the parent directory of `embeddings`) to sys.path so the import below resolves\n",
    "sys.path.append(str(ROOT))\n",
    "\n",
    "import embeddings.rdkit.embedding_rdkit as embedding_rdkit\n",
    "\n",
    "# Define the datasets to process with their corresponding SMILES keys\n",
    "datasets = [\n",
    "    ('lincs_smiles.h5ad', 'SMILES'),\n",
    "    ('lincs_full_smiles.h5ad', 'smiles'),  # lowercase SMILES key, unlike the other datasets\n",
    "    ('sciplex_complete.h5ad', 'SMILES'),\n",
    "    ('adata_MCF7.h5ad', 'SMILES'),\n",
    "    ('adata_MCF7_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('adata_K562.h5ad', 'SMILES'),\n",
    "    ('adata_K562_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('adata_A549.h5ad', 'SMILES'),\n",
    "    ('adata_A549_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_middle_subset_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_middle_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_lincs_genes_v2.h5ad', 'SMILES')\n",
    "]\n",
    "\n",
    "# Process each dataset\n",
    "for dataset, smiles_key in tqdm(datasets, desc=\"Computing RDKit embeddings\"):\n",
    "    h5ad_path = os.path.join(DATA_DIR, dataset)\n",
    "    base_name = os.path.splitext(dataset)[0]\n",
    "    output_filename = f\"{base_name}_rdkit2D_embedding.parquet\"\n",
    "    output_path = os.path.join(EMBEDDING_DIR, 'rdkit', output_filename)\n",
    "    \n",
    "    # Create the output directory if it doesn't exist\n",
    "    os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "    \n",
    "    try:\n",
    "        embedding_rdkit.compute_rdkit_embeddings(h5ad_path, output_path=output_path, smiles_key=smiles_key)\n",
    "    except Exception as e:\n",
    "        tqdm.write(f\"Error processing {dataset}: {str(e)}\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}