# Computes embeddings for the dataset

# RDKIT

In [None]:
# Computes RDKit embeddings for each dataset listed below

# RDKIT
from chemCPA.paths import DATA_DIR, PROJECT_DIR, ROOT, EMBEDDING_DIR
import sys
import os
from tqdm.auto import tqdm

# Add the parent directory of embeddings to Python path
sys.path.append(str(ROOT))

import embeddings.rdkit.embedding_rdkit as embedding_rdkit

# Datasets to process, as (h5ad file name, SMILES key) pairs.
# Every file uses the key 'SMILES' unless it appears in the overrides below.
_default_smiles_key = 'SMILES'
_smiles_key_overrides = {
    'lincs_full_smiles.h5ad': 'smiles',  # this file uses a lowercase key
}
_dataset_files = (
    'lincs_smiles.h5ad',
    'lincs_full_smiles.h5ad',
    'sciplex_complete.h5ad',
    'adata_MCF7.h5ad',
    'adata_MCF7_lincs_genes.h5ad',
    'adata_K562.h5ad',
    'adata_K562_lincs_genes.h5ad',
    'adata_A549.h5ad',
    'adata_A549_lincs_genes.h5ad',
    'sciplex_complete_subset_lincs_genes_v2.h5ad',
    'sciplex_complete_middle_subset_v2.h5ad',
    'sciplex_complete_middle_subset_lincs_genes_v2.h5ad',
    'sciplex_complete_v2.h5ad',
    'sciplex_complete_lincs_genes_v2.h5ad',
)

# Pair each file with its SMILES key, preserving the order above.
datasets = [
    (file_name, _smiles_key_overrides.get(file_name, _default_smiles_key))
    for file_name in _dataset_files
]

# Process each dataset: the input is <DATA_DIR>/<dataset> and the output path
# handed to compute_rdkit_embeddings is
# <EMBEDDING_DIR>/rdkit/<base_name>_rdkit2D_embedding.parquet.
rdkit_output_dir = os.path.join(EMBEDDING_DIR, 'rdkit')

# The output directory is identical for every dataset, so create it once up
# front rather than on every iteration.
os.makedirs(rdkit_output_dir, exist_ok=True)

# Names of datasets whose embedding computation raised, for the final summary.
failed_datasets = []

for dataset, smiles_key in tqdm(datasets, desc="Computing RDKit embeddings"):
    h5ad_path = os.path.join(DATA_DIR, dataset)
    base_name = os.path.splitext(dataset)[0]
    output_filename = f"{base_name}_rdkit2D_embedding.parquet"
    output_path = os.path.join(rdkit_output_dir, output_filename)

    try:
        embedding_rdkit.compute_rdkit_embeddings(
            h5ad_path, output_path=output_path, smiles_key=smiles_key
        )
    except Exception as e:
        # Best effort: one failing dataset must not abort the remaining ones.
        # tqdm.write keeps the message from clobbering the progress bar.
        tqdm.write(f"Error processing {dataset}: {e}")
        failed_datasets.append(dataset)

# Surface failures once more at the end so they are not lost in the scrollback.
if failed_datasets:
    tqdm.write(
        f"{len(failed_datasets)} of {len(datasets)} datasets failed: "
        f"{', '.join(failed_datasets)}"
    )