{ "cells": [ { "cell_type": "markdown", "id": "45d54f76-45c5-46c1-aeb2-dde40c63e8fc", "metadata": {}, "source": [ "**Requirements** \n", "* According to this [paper](https://arxiv.org/pdf/1904.01561.pdf), features are computed with the [descriptastorus](https://github.com/bp-kelley/descriptastorus) package\n", "* Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`" ] }, { "cell_type": "markdown", "id": "fa137ded", "metadata": {}, "source": [ "## General imports" ] }, { "cell_type": "code", "execution_count": 4, "id": "6c950b63", "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "# this depends on the notebook depth and must be adapted per notebook\n", "sys.path.insert(0, \"/\") \n", "from chemCPA.paths import DATA_DIR, EMBEDDING_DIR" ] }, { "cell_type": "code", "execution_count": 5, "id": "4643260d", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from joblib import Parallel, delayed\n", "from tqdm.notebook import tqdm" ] }, { "cell_type": "markdown", "id": "db8608d6", "metadata": {}, "source": [ "## Load SMILES list" ] }, { "cell_type": "code", "execution_count": 6, "id": "db1601d1", "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "from chemCPA.helper import canonicalize_smiles" ] }, { "cell_type": "code", "execution_count": 7, "id": "842d22c4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/nfs/staff-hdd/hetzell/miniconda3/envs/chemical_CPA/lib/python3.7/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. 
To make them unique, call `.obs_names_make_unique`.\n", " utils.warn_names_duplicates(\"obs\")\n" ] } ], "source": [ "adata = sc.read(DATA_DIR/ \"adata_biolord_split_30.h5ad\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "734a3db6", "metadata": {}, "outputs": [], "source": [ "smiles_list = adata.obs[\"smiles\"].unique()\n", "# exclude nan from smiles_list \n", "smiles_list = [canonicalize_smiles(s) for s in smiles_list if s !=\"nan\"]" ] }, { "cell_type": "code", "execution_count": 9, "id": "653d99cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of smiles strings: 186\n" ] } ], "source": [ "print(f'Number of smiles strings: {len(smiles_list)}')" ] }, { "cell_type": "code", "execution_count": 10, "id": "a5dc19a2-d321-49e6-a62d-e6024073146e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RDKit2D_calculated(bool)\n", "BalabanJ(float64)\n", "BertzCT(float64)\n", "Chi0(float64)\n", "Chi0n(float64)\n", "Chi0v(float64)\n", "Chi1(float64)\n", "Chi1n(float64)\n", "Chi1v(float64)\n", "Chi2n(float64)\n", "Chi2v(float64)\n", "Chi3n(float64)\n", "Chi3v(float64)\n", "Chi4n(float64)\n", "Chi4v(float64)\n", "EState_VSA1(float64)\n", "EState_VSA10(float64)\n", "EState_VSA11(float64)\n", "EState_VSA2(float64)\n", "EState_VSA3(float64)\n", "EState_VSA4(float64)\n", "EState_VSA5(float64)\n", "EState_VSA6(float64)\n", "EState_VSA7(float64)\n", "EState_VSA8(float64)\n", "EState_VSA9(float64)\n", "ExactMolWt(float64)\n", "FpDensityMorgan1(float64)\n", "FpDensityMorgan2(float64)\n", "FpDensityMorgan3(float64)\n", "FractionCSP3(float64)\n", "HallKierAlpha(float64)\n", "HeavyAtomCount(float64)\n", "HeavyAtomMolWt(float64)\n", "Ipc(float64)\n", "Kappa1(float64)\n", "Kappa2(float64)\n", "Kappa3(float64)\n", "LabuteASA(float64)\n", "MaxAbsEStateIndex(float64)\n", "MaxAbsPartialCharge(float64)\n", "MaxEStateIndex(float64)\n", "MaxPartialCharge(float64)\n", "MinAbsEStateIndex(float64)\n", 
"MinAbsPartialCharge(float64)\n", "MinEStateIndex(float64)\n", "MinPartialCharge(float64)\n", "MolLogP(float64)\n", "MolMR(float64)\n", "MolWt(float64)\n", "NHOHCount(float64)\n", "NOCount(float64)\n", "NumAliphaticCarbocycles(float64)\n", "NumAliphaticHeterocycles(float64)\n", "NumAliphaticRings(float64)\n", "NumAromaticCarbocycles(float64)\n", "NumAromaticHeterocycles(float64)\n", "NumAromaticRings(float64)\n", "NumHAcceptors(float64)\n", "NumHDonors(float64)\n", "NumHeteroatoms(float64)\n", "NumRadicalElectrons(float64)\n", "NumRotatableBonds(float64)\n", "NumSaturatedCarbocycles(float64)\n", "NumSaturatedHeterocycles(float64)\n", "NumSaturatedRings(float64)\n", "NumValenceElectrons(float64)\n", "PEOE_VSA1(float64)\n", "PEOE_VSA10(float64)\n", "PEOE_VSA11(float64)\n", "PEOE_VSA12(float64)\n", "PEOE_VSA13(float64)\n", "PEOE_VSA14(float64)\n", "PEOE_VSA2(float64)\n", "PEOE_VSA3(float64)\n", "PEOE_VSA4(float64)\n", "PEOE_VSA5(float64)\n", "PEOE_VSA6(float64)\n", "PEOE_VSA7(float64)\n", "PEOE_VSA8(float64)\n", "PEOE_VSA9(float64)\n", "RingCount(float64)\n", "SMR_VSA1(float64)\n", "SMR_VSA10(float64)\n", "SMR_VSA2(float64)\n", "SMR_VSA3(float64)\n", "SMR_VSA4(float64)\n", "SMR_VSA5(float64)\n", "SMR_VSA6(float64)\n", "SMR_VSA7(float64)\n", "SMR_VSA8(float64)\n", "SMR_VSA9(float64)\n", "SlogP_VSA1(float64)\n", "SlogP_VSA10(float64)\n", "SlogP_VSA11(float64)\n", "SlogP_VSA12(float64)\n", "SlogP_VSA2(float64)\n", "SlogP_VSA3(float64)\n", "SlogP_VSA4(float64)\n", "SlogP_VSA5(float64)\n", "SlogP_VSA6(float64)\n", "SlogP_VSA7(float64)\n", "SlogP_VSA8(float64)\n", "SlogP_VSA9(float64)\n", "TPSA(float64)\n", "VSA_EState1(float64)\n", "VSA_EState10(float64)\n", "VSA_EState2(float64)\n", "VSA_EState3(float64)\n", "VSA_EState4(float64)\n", "VSA_EState5(float64)\n", "VSA_EState6(float64)\n", "VSA_EState7(float64)\n", "VSA_EState8(float64)\n", "VSA_EState9(float64)\n", "fr_Al_COO(float64)\n", "fr_Al_OH(float64)\n", "fr_Al_OH_noTert(float64)\n", "fr_ArN(float64)\n", 
"fr_Ar_COO(float64)\n", "fr_Ar_N(float64)\n", "fr_Ar_NH(float64)\n", "fr_Ar_OH(float64)\n", "fr_COO(float64)\n", "fr_COO2(float64)\n", "fr_C_O(float64)\n", "fr_C_O_noCOO(float64)\n", "fr_C_S(float64)\n", "fr_HOCCN(float64)\n", "fr_Imine(float64)\n", "fr_NH0(float64)\n", "fr_NH1(float64)\n", "fr_NH2(float64)\n", "fr_N_O(float64)\n", "fr_Ndealkylation1(float64)\n", "fr_Ndealkylation2(float64)\n", "fr_Nhpyrrole(float64)\n", "fr_SH(float64)\n", "fr_aldehyde(float64)\n", "fr_alkyl_carbamate(float64)\n", "fr_alkyl_halide(float64)\n", "fr_allylic_oxid(float64)\n", "fr_amide(float64)\n", "fr_amidine(float64)\n", "fr_aniline(float64)\n", "fr_aryl_methyl(float64)\n", "fr_azide(float64)\n", "fr_azo(float64)\n", "fr_barbitur(float64)\n", "fr_benzene(float64)\n", "fr_benzodiazepine(float64)\n", "fr_bicyclic(float64)\n", "fr_diazo(float64)\n", "fr_dihydropyridine(float64)\n", "fr_epoxide(float64)\n", "fr_ester(float64)\n", "fr_ether(float64)\n", "fr_furan(float64)\n", "fr_guanido(float64)\n", "fr_halogen(float64)\n", "fr_hdrzine(float64)\n", "fr_hdrzone(float64)\n", "fr_imidazole(float64)\n", "fr_imide(float64)\n", "fr_isocyan(float64)\n", "fr_isothiocyan(float64)\n", "fr_ketone(float64)\n", "fr_ketone_Topliss(float64)\n", "fr_lactam(float64)\n", "fr_lactone(float64)\n", "fr_methoxy(float64)\n", "fr_morpholine(float64)\n", "fr_nitrile(float64)\n", "fr_nitro(float64)\n", "fr_nitro_arom(float64)\n", "fr_nitro_arom_nonortho(float64)\n", "fr_nitroso(float64)\n", "fr_oxazole(float64)\n", "fr_oxime(float64)\n", "fr_para_hydroxylation(float64)\n", "fr_phenol(float64)\n", "fr_phenol_noOrthoHbond(float64)\n", "fr_phos_acid(float64)\n", "fr_phos_ester(float64)\n", "fr_piperdine(float64)\n", "fr_piperzine(float64)\n", "fr_priamide(float64)\n", "fr_prisulfonamd(float64)\n", "fr_pyridine(float64)\n", "fr_quatN(float64)\n", "fr_sulfide(float64)\n", "fr_sulfonamd(float64)\n", "fr_sulfone(float64)\n", "fr_term_acetylene(float64)\n", "fr_tetrazole(float64)\n", "fr_thiazole(float64)\n", 
"fr_thiocyan(float64)\n", "fr_thiophene(float64)\n", "fr_unbrch_alkane(float64)\n", "fr_urea(float64)\n", "qed(float64)\n" ] } ], "source": [ "from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator\n", "generator = MakeGenerator((\"RDKit2D\",))\n", "for name, numpy_type in generator.GetColumns():\n", " print(f\"{name}({numpy_type.__name__})\")" ] }, { "cell_type": "code", "execution_count": 11, "id": "003cc588-e4dd-4dcc-98ec-4d3fcdd5432b", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3cdbc7fd2ace48aa960a1f81738956bc", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/186 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latent_1latent_2latent_3latent_4latent_5latent_6latent_7latent_8latent_9latent_10...latent_187latent_189latent_190latent_191latent_192latent_194latent_196latent_197latent_198latent_199
O=C([O-])CCCc1ccccc1.[Na+]-1.666455-1.600675-1.385933-1.134785-1.605154-1.460731-1.536649-1.465486-1.543553-1.452940...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.121276
CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)cccn4n3)c2)CC10.8630330.3766320.4551780.4756030.4632470.4623480.7775060.5032430.8794240.503081...1.457503-0.19722-0.2733873.977884-0.073324-0.199302-0.19722-0.270527-0.2845310.060135
COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccncc3)ccc2c10.9447860.2480640.2955010.3187750.4127920.2686120.5063800.1784870.3841140.284651...1.457503-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.496321
COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc23)c(Cl)cc1Cl0.6035070.7465550.7898360.9205480.7839270.7011640.7002730.5829360.5733940.651072...1.457503-0.19722-0.273387-0.250038-0.073324-0.199302-0.197220.628009-0.284531-0.508369
COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O-0.393464-0.067916-0.163103-0.268156-0.140663-0.313933-0.456744-0.450171-0.621424-0.534237...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.379172
..................................................................
CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c3[nH]c4ncc(C)cc4c23)c11.2863440.7404280.8878420.9005470.7320540.9106491.1360281.0148561.4267701.140227...1.457503-0.19722-0.2733873.977884-0.073324-0.199302-0.19722-0.270527-0.284531-0.325822
CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2c1.Cl.O0.1231240.2673210.4533490.4738070.2890930.2956290.1172620.090182-0.1306640.137535...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.746358
Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2ccccc21-0.028239-0.0113720.2308440.3917840.1118040.2004410.0276260.214944-0.0173530.364936...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.495217
CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4)cc3)c3ccccc3)c2c11.1152220.7707630.8474260.8608520.8841720.9975701.2178790.8696611.2107110.882431...-0.535847-0.197223.638151-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-1.140448
CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(O)C1.Cl0.252492-0.041707-0.0569390.099009-0.050176-0.053351-0.1107970.0955470.0024730.210616...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.466665
\n", "

186 rows × 177 columns

\n", "" ], "text/plain": [ " latent_1 latent_2 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.666455 -1.600675 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.863033 0.376632 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.944786 0.248064 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.603507 0.746555 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.393464 -0.067916 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.286344 0.740428 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.123124 0.267321 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.028239 -0.011372 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.115222 0.770763 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 0.252492 -0.041707 \n", "\n", " latent_3 latent_4 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.385933 -1.134785 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.455178 0.475603 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.295501 0.318775 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.789836 0.920548 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.163103 -0.268156 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 0.887842 0.900547 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.453349 0.473807 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.230844 0.391784 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 0.847426 0.860852 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.056939 0.099009 \n", "\n", " latent_5 latent_6 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.605154 -1.460731 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.463247 0.462348 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.412792 0.268612 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.783927 0.701164 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.140663 -0.313933 \n", "... ... ... 
\n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 0.732054 0.910649 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.289093 0.295629 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.111804 0.200441 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 0.884172 0.997570 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.050176 -0.053351 \n", "\n", " latent_7 latent_8 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.536649 -1.465486 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.777506 0.503243 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.506380 0.178487 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.700273 0.582936 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.456744 -0.450171 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.136028 1.014856 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.117262 0.090182 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.027626 0.214944 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.217879 0.869661 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.110797 0.095547 \n", "\n", " latent_9 latent_10 ... \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.543553 -1.452940 ... \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.879424 0.503081 ... \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.384114 0.284651 ... \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.573394 0.651072 ... \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.621424 -0.534237 ... \n", "... ... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.426770 1.140227 ... \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.130664 0.137535 ... \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.017353 0.364936 ... \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.210711 0.882431 ... \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 0.002473 0.210616 ... 
\n", "\n", " latent_187 latent_189 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.535847 -0.19722 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 1.457503 -0.19722 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 1.457503 -0.19722 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 1.457503 -0.19722 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.535847 -0.19722 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.457503 -0.19722 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.535847 -0.19722 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.535847 -0.19722 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.535847 -0.19722 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.535847 -0.19722 \n", "\n", " latent_190 latent_191 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.273387 -0.250038 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.273387 3.977884 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.273387 -0.250038 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.273387 -0.250038 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.273387 -0.250038 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.273387 3.977884 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.273387 -0.250038 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.273387 -0.250038 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 3.638151 -0.250038 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.273387 -0.250038 \n", "\n", " latent_192 latent_194 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.073324 -0.199302 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.073324 -0.199302 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.073324 -0.199302 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.073324 -0.199302 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.073324 -0.199302 \n", "... ... ... 
\n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.073324 -0.199302 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.073324 -0.199302 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.073324 -0.199302 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.073324 -0.199302 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.073324 -0.199302 \n", "\n", " latent_196 latent_197 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.19722 -0.270527 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.19722 -0.270527 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.19722 -0.270527 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.19722 0.628009 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.19722 -0.270527 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.19722 -0.270527 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.19722 -0.270527 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.19722 -0.270527 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.19722 -0.270527 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.19722 -0.270527 \n", "\n", " latent_198 latent_199 \n", "O=C([O-])CCCc1ccccc1.[Na+] -0.284531 0.121276 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.284531 0.060135 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.284531 -0.496321 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.284531 -0.508369 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.284531 0.379172 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.284531 -0.325822 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.284531 -0.746358 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.284531 -0.495217 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.284531 -1.140448 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 
-0.284531 0.466665 \n", "\n", "[186 rows x 177 columns]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalized_df" ] }, { "cell_type": "markdown", "id": "7c216a71-a174-4f0b-8fea-0fe4a1f47fcb", "metadata": {}, "source": [ "Check destination folder" ] }, { "cell_type": "code", "execution_count": 23, "id": "39bfb2ce", "metadata": {}, "outputs": [], "source": [ "model_name = \"rdkit2D\"\n", "dataset_name= \"biolord\"\n", "fname = f'{model_name}_embedding_{dataset_name}.parquet'\n", "\n", "directory = EMBEDDING_DIR /'rdkit' / 'data' /'embeddings'\n", "directory.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "markdown", "id": "ee361ecd-2224-4c58-b3fc-5879cb3a6488", "metadata": {}, "source": [ "Save normalised version" ] }, { "cell_type": "code", "execution_count": 24, "id": "f330b59f-798b-420f-9ca8-07d6049dc26a", "metadata": {}, "outputs": [], "source": [ "normalized_df.to_parquet(directory / fname)" ] }, { "cell_type": "markdown", "id": "85180ed5", "metadata": {}, "source": [ "Check that it worked" ] }, { "cell_type": "code", "execution_count": 25, "id": "9620dae5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latent_1latent_2latent_3latent_4latent_5latent_6latent_7latent_8latent_9latent_10...latent_187latent_189latent_190latent_191latent_192latent_194latent_196latent_197latent_198latent_199
O=C([O-])CCCc1ccccc1.[Na+]-1.666455-1.600675-1.385933-1.134785-1.605154-1.460731-1.536649-1.465486-1.543553-1.452940...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.121276
CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)cccn4n3)c2)CC10.8630330.3766320.4551780.4756030.4632470.4623480.7775060.5032430.8794240.503081...1.457503-0.19722-0.2733873.977884-0.073324-0.199302-0.19722-0.270527-0.2845310.060135
COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccncc3)ccc2c10.9447860.2480640.2955010.3187750.4127920.2686120.5063800.1784870.3841140.284651...1.457503-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.496321
COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc23)c(Cl)cc1Cl0.6035070.7465550.7898360.9205480.7839270.7011640.7002730.5829360.5733940.651072...1.457503-0.19722-0.273387-0.250038-0.073324-0.199302-0.197220.628009-0.284531-0.508369
COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O-0.393464-0.067916-0.163103-0.268156-0.140663-0.313933-0.456744-0.450171-0.621424-0.534237...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.379172
..................................................................
CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c3[nH]c4ncc(C)cc4c23)c11.2863440.7404280.8878420.9005470.7320540.9106491.1360281.0148561.4267701.140227...1.457503-0.19722-0.2733873.977884-0.073324-0.199302-0.19722-0.270527-0.284531-0.325822
CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2c1.Cl.O0.1231240.2673210.4533490.4738070.2890930.2956290.1172620.090182-0.1306640.137535...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.746358
Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2ccccc21-0.028239-0.0113720.2308440.3917840.1118040.2004410.0276260.214944-0.0173530.364936...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-0.495217
CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4)cc3)c3ccccc3)c2c11.1152220.7707630.8474260.8608520.8841720.9975701.2178790.8696611.2107110.882431...-0.535847-0.197223.638151-0.250038-0.073324-0.199302-0.19722-0.270527-0.284531-1.140448
CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(O)C1.Cl0.252492-0.041707-0.0569390.099009-0.050176-0.053351-0.1107970.0955470.0024730.210616...-0.535847-0.19722-0.273387-0.250038-0.073324-0.199302-0.19722-0.270527-0.2845310.466665
\n", "

186 rows × 177 columns

\n", "
" ], "text/plain": [ " latent_1 latent_2 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.666455 -1.600675 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.863033 0.376632 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.944786 0.248064 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.603507 0.746555 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.393464 -0.067916 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.286344 0.740428 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.123124 0.267321 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.028239 -0.011372 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.115222 0.770763 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 0.252492 -0.041707 \n", "\n", " latent_3 latent_4 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.385933 -1.134785 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.455178 0.475603 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.295501 0.318775 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.789836 0.920548 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.163103 -0.268156 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 0.887842 0.900547 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.453349 0.473807 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.230844 0.391784 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 0.847426 0.860852 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.056939 0.099009 \n", "\n", " latent_5 latent_6 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.605154 -1.460731 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.463247 0.462348 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.412792 0.268612 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.783927 0.701164 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.140663 -0.313933 \n", "... ... ... 
\n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 0.732054 0.910649 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.289093 0.295629 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.111804 0.200441 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 0.884172 0.997570 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.050176 -0.053351 \n", "\n", " latent_7 latent_8 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.536649 -1.465486 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.777506 0.503243 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.506380 0.178487 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.700273 0.582936 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.456744 -0.450171 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.136028 1.014856 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... 0.117262 0.090182 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... 0.027626 0.214944 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.217879 0.869661 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.110797 0.095547 \n", "\n", " latent_9 latent_10 ... \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -1.543553 -1.452940 ... \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 0.879424 0.503081 ... \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 0.384114 0.284651 ... \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 0.573394 0.651072 ... \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.621424 -0.534237 ... \n", "... ... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.426770 1.140227 ... \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.130664 0.137535 ... \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.017353 0.364936 ... \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 1.210711 0.882431 ... \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 0.002473 0.210616 ... 
\n", "\n", " latent_187 latent_189 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.535847 -0.19722 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... 1.457503 -0.19722 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... 1.457503 -0.19722 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... 1.457503 -0.19722 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.535847 -0.19722 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... 1.457503 -0.19722 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.535847 -0.19722 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.535847 -0.19722 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.535847 -0.19722 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.535847 -0.19722 \n", "\n", " latent_190 latent_191 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.273387 -0.250038 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.273387 3.977884 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.273387 -0.250038 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.273387 -0.250038 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.273387 -0.250038 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.273387 3.977884 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.273387 -0.250038 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.273387 -0.250038 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... 3.638151 -0.250038 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.273387 -0.250038 \n", "\n", " latent_192 latent_194 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.073324 -0.199302 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.073324 -0.199302 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.073324 -0.199302 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.073324 -0.199302 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.073324 -0.199302 \n", "... ... ... 
\n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.073324 -0.199302 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.073324 -0.199302 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.073324 -0.199302 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.073324 -0.199302 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.073324 -0.199302 \n", "\n", " latent_196 latent_197 \\\n", "O=C([O-])CCCc1ccccc1.[Na+] -0.19722 -0.270527 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.19722 -0.270527 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.19722 -0.270527 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.19722 0.628009 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.19722 -0.270527 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.19722 -0.270527 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.19722 -0.270527 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.19722 -0.270527 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.19722 -0.270527 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... -0.19722 -0.270527 \n", "\n", " latent_198 latent_199 \n", "O=C([O-])CCCc1ccccc1.[Na+] -0.284531 0.121276 \n", "CN1CCN(c2cccc(Nc3nc4c(-c5ccc(S(C)(=O)=O)cc5)ccc... -0.284531 0.060135 \n", "COc1ccc2cc(-c3nc(-c4ccc(S(C)=O)cc4)[nH]c3-c3ccn... -0.284531 -0.496321 \n", "COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc2... -0.284531 -0.508369 \n", "COc1cc(C=CC(=O)CC(=O)C=Cc2ccc(O)c(OC)c2)ccc1O -0.284531 0.379172 \n", "... ... ... \n", "CCS(=O)(=O)c1cccc(-c2cc(C(=O)NC3CCN(C)CC3)c(C)c... -0.284531 -0.325822 \n", "CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2... -0.284531 -0.746358 \n", "Cl.Cl.Cn1cc(CNCC2CCN(c3ncc(C(=O)NO)cn3)CC2)c2cc... -0.284531 -0.495217 \n", "CCS(=O)(=O)Nc1ccc2[nH]c(O)c(C(=Nc3ccc(CN4CCCCC4... -0.284531 -1.140448 \n", "CN1CCC(c2c(O)cc(O)c3c(=O)cc(-c4ccccc4Cl)oc23)C(... 
-0.284531 0.466665 \n", "\n", "[186 rows x 177 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_parquet(directory/ fname)\n", "df" ] }, { "cell_type": "code", "execution_count": 26, "id": "201edd5b-832b-4350-b82d-8e6a4da099ac", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/embeddings/rdkit/data/embeddings/rdkit2D_embedding_biolord.parquet')" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "directory/ fname" ] }, { "cell_type": "code", "execution_count": null, "id": "8494453a", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "ad25c9354f8cefdf5a943c25e67813a21d2807e3af4d6d0915e47390a83b57ce" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" } }, "nbformat": 4, "nbformat_minor": 5 }