{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Computes embeddings for the dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RDKIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports and path setup for the RDKit embedding run.\n",
    "import os\n",
    "import sys\n",
    "\n",
    "from tqdm.auto import tqdm\n",
    "\n",
    "from chemCPA.paths import DATA_DIR, PROJECT_DIR, ROOT, EMBEDDING_DIR\n",
    "\n",
    "# The `embeddings` package is imported via ROOT, so ROOT has to be on\n",
    "# sys.path before the import below. The membership check keeps re-runs of\n",
    "# this cell from appending duplicate entries.\n",
    "if str(ROOT) not in sys.path:\n",
    "    sys.path.append(str(ROOT))\n",
    "\n",
    "import embeddings.rdkit.embedding_rdkit as embedding_rdkit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Datasets to process, as (h5ad file name inside DATA_DIR, SMILES key) pairs.\n",
    "# The key is forwarded unchanged as `smiles_key` to compute_rdkit_embeddings.\n",
    "datasets = [\n",
    "    ('lincs_smiles.h5ad', 'SMILES'),\n",
    "    ('lincs_full_smiles.h5ad', 'smiles'),  # note: lowercase key for this file\n",
    "    ('sciplex_complete.h5ad', 'SMILES'),\n",
    "    ('adata_MCF7.h5ad', 'SMILES'),\n",
    "    ('adata_MCF7_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('adata_K562.h5ad', 'SMILES'),\n",
    "    ('adata_K562_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('adata_A549.h5ad', 'SMILES'),\n",
    "    ('adata_A549_lincs_genes.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_middle_subset_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_middle_subset_lincs_genes_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_v2.h5ad', 'SMILES'),\n",
    "    ('sciplex_complete_lincs_genes_v2.h5ad', 'SMILES'),\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every embedding is written to the same directory, so create it once up\n",
    "# front instead of on every loop iteration.\n",
    "rdkit_output_dir = os.path.join(EMBEDDING_DIR, 'rdkit')\n",
    "os.makedirs(rdkit_output_dir, exist_ok=True)\n",
    "\n",
    "# Best-effort batch: a dataset that raises is reported and skipped so the\n",
    "# remaining ones are still processed. Failures are collected so they can be\n",
    "# summarised after the loop instead of getting lost between progress updates.\n",
    "failed_datasets = []\n",
    "\n",
    "for dataset, smiles_key in tqdm(datasets, desc=\"Computing RDKit embeddings\"):\n",
    "    h5ad_path = os.path.join(DATA_DIR, dataset)\n",
    "    base_name = os.path.splitext(dataset)[0]\n",
    "    output_filename = f\"{base_name}_rdkit2D_embedding.parquet\"\n",
    "    output_path = os.path.join(rdkit_output_dir, output_filename)\n",
    "\n",
    "    try:\n",
    "        embedding_rdkit.compute_rdkit_embeddings(\n",
    "            h5ad_path, output_path=output_path, smiles_key=smiles_key\n",
    "        )\n",
    "    except Exception as e:\n",
    "        failed_datasets.append(dataset)\n",
    "        # tqdm.write keeps the message from breaking the progress bar.\n",
    "        tqdm.write(f\"Error processing {dataset}: {str(e)}\")\n",
    "\n",
    "# Make partial failure visible at the end of the run.\n",
    "if failed_datasets:\n",
    "    failed_names = \", \".join(failed_datasets)\n",
    "    tqdm.write(f\"{len(failed_datasets)} of {len(datasets)} datasets failed: {failed_names}\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}