|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from chemCPA.paths import DATA_DIR, PROJECT_DIR, ROOT, EMBEDDING_DIR |
|
import sys |
|
import os |
|
from tqdm.auto import tqdm |
|
import pandas as pd |
|
|
|
|
|
sys.path.append(str(ROOT)) |
|
|
|
import embeddings.rdkit.embedding_rdkit as embedding_rdkit |
|
|
|
|
|
# Work list for the embedding run. Each entry is a pair:
#   1. the .h5ad file name, which the processing loop joins onto DATA_DIR;
#   2. the key handed to compute_rdkit_embeddings as `smiles_key`.
# Order is preserved because the loop processes entries sequentially.
datasets = [
    ("lincs_smiles.h5ad", "SMILES"),
    ("lincs_full_smiles.h5ad", "canonical_smiles"),
    ("sciplex_complete_v2.h5ad", "SMILES"),
    ("sciplex_complete_lincs_genes_v2.h5ad", "SMILES"),
]
|
|
|
|
|
# Run configuration.
# NOTE(review): in the visible part of this script FIXED_EMBEDDING_DIM is only
# echoed in the banner below and is never passed to the embedding call --
# confirm whether it is still expected to have an effect.
FIXED_EMBEDDING_DIM = 200

# Forwarded unchanged as `skip_variance_filter` to compute_rdkit_embeddings.
SKIP_VARIANCE_FILTER = False

# Start-of-run banner: one print call, one line per argument.
print(
    "\nComputing and analyzing embeddings:",
    f"Using fixed embedding dimension: {FIXED_EMBEDDING_DIM}",
    f"Skip variance filtering: {SKIP_VARIANCE_FILTER}",
    "-" * 50,
    sep="\n",
)
|
|
|
|
|
# Compute one RDKit embedding file per configured dataset, then read it back
# and print a short summary. A failure in one dataset is reported and the loop
# moves on to the next entry.
for dataset, smiles_key in tqdm(datasets, desc="Computing RDKit embeddings"):
    # Input:  <DATA_DIR>/<dataset>
    # Output: <EMBEDDING_DIR>/rdkit/<dataset stem>_rdkit2D_embedding.parquet
    h5ad_path = os.path.join(DATA_DIR, dataset)
    base_name, _ = os.path.splitext(dataset)
    output_filename = f"{base_name}_rdkit2D_embedding.parquet"
    output_path = os.path.join(EMBEDDING_DIR, "rdkit", output_filename)

    # Create the target directory up front so the writer never hits a missing
    # folder.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    try:
        embedding_rdkit.compute_rdkit_embeddings(
            h5ad_path,
            output_path=output_path,
            smiles_key=smiles_key,
            skip_variance_filter=SKIP_VARIANCE_FILTER,
        )

        # Reload what was just written so the summary describes the file on
        # disk.
        embeddings_df = pd.read_parquet(output_path)

        print(f"\nEmbedding analysis for {dataset}:")
        print(f"Shape: {embeddings_df.shape}")
        print(f"Number of features: {embeddings_df.shape[1]}")
        print(f"Memory usage: {embeddings_df.memory_usage().sum() / 2**20:.2f} MB")
        print(f"File location: {output_path}")
        print("-" * 50)
    except Exception as exc:
        # Deliberate best-effort: report through tqdm.write so the progress bar
        # is not garbled, then continue with the remaining datasets.
        tqdm.write(f"Error processing {dataset}: {exc}")
|
|