{ "cells": [ { "cell_type": "markdown", "id": "45d54f76-45c5-46c1-aeb2-dde40c63e8fc", "metadata": {}, "source": [ "**Requirements** \n", "* According to this [paper](https://arxiv.org/pdf/1904.01561.pdf), features are computed with [descriptastorus](https://github.com/bp-kelley/descriptastorus) package\n", "* Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`" ] }, { "cell_type": "markdown", "id": "fa137ded", "metadata": {}, "source": [ "## General imports" ] }, { "cell_type": "code", "execution_count": 5, "id": "6c950b63", "metadata": {}, "outputs": [ { "ename": "ModuleNotFoundError", "evalue": "No module named 'compert'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[5], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m \n\u001b[1;32m 2\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39minsert(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;66;03m# this depends on the notebook depth and must be adapted per notebook\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcompert\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpaths\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DATA_DIR, EMBEDDING_DIR\n", "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'compert'" ] } ], "source": [ "import sys \n", "sys.path.insert(0, \"/\") # this depends on the notebook depth and must be adapted per notebook\n", "from compert.paths import DATA_DIR, EMBEDDING_DIR" ] }, { "cell_type": "code", "execution_count": 18, "id": "4643260d", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from joblib import Parallel, delayed\n", 
"from tqdm.notebook import tqdm" ] }, { "cell_type": "markdown", "id": "db8608d6", "metadata": {}, "source": [ "## Load Smiles list" ] }, { "cell_type": "code", "execution_count": 3, "id": "db1601d1", "metadata": {}, "outputs": [], "source": [ "dataset_name = 'lincs_trapnell'" ] }, { "cell_type": "code", "execution_count": 7, "id": "734a3db6", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd\n", "\n", "# Location of the SMILES table. Set the SMILES_CSV environment variable to point at\n", "# another copy; the fallback is the path this notebook was originally run with.\n", "smiles_csv = os.environ.get('SMILES_CSV', '/home/user/app/output_smiles.csv')\n", "\n", "smiles_df = pd.read_csv(smiles_csv)\n", "# Plain NumPy array of the values in the `SMILES` column.\n", "smiles_list = smiles_df['SMILES'].to_numpy()" ] }, { "cell_type": "code", "execution_count": 8, "id": "653d99cf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of smiles strings: 840677\n" ] } ], "source": [ "print(f'Number of smiles strings: {len(smiles_list)}')" ] }, { "cell_type": "code", "execution_count": 10, "id": "a5dc19a2-d321-49e6-a62d-e6024073146e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RDKit2D_calculated(bool)\n", "BalabanJ(float64)\n", "BertzCT(float64)\n", "Chi0(float64)\n", "Chi0n(float64)\n", "Chi0v(float64)\n", "Chi1(float64)\n", "Chi1n(float64)\n", "Chi1v(float64)\n", "Chi2n(float64)\n", "Chi2v(float64)\n", "Chi3n(float64)\n", "Chi3v(float64)\n", "Chi4n(float64)\n", "Chi4v(float64)\n", "EState_VSA1(float64)\n", "EState_VSA10(float64)\n", "EState_VSA11(float64)\n", "EState_VSA2(float64)\n", "EState_VSA3(float64)\n", "EState_VSA4(float64)\n", "EState_VSA5(float64)\n", "EState_VSA6(float64)\n", "EState_VSA7(float64)\n", "EState_VSA8(float64)\n", "EState_VSA9(float64)\n", "ExactMolWt(float64)\n", "FpDensityMorgan1(float64)\n", "FpDensityMorgan2(float64)\n", "FpDensityMorgan3(float64)\n", "FractionCSP3(float64)\n", "HallKierAlpha(float64)\n", "HeavyAtomCount(float64)\n", "HeavyAtomMolWt(float64)\n", "Ipc(float64)\n", "Kappa1(float64)\n", "Kappa2(float64)\n", "Kappa3(float64)\n", "LabuteASA(float64)\n", "MaxAbsEStateIndex(float64)\n", "MaxAbsPartialCharge(float64)\n", 
"MaxEStateIndex(float64)\n", "MaxPartialCharge(float64)\n", "MinAbsEStateIndex(float64)\n", "MinAbsPartialCharge(float64)\n", "MinEStateIndex(float64)\n", "MinPartialCharge(float64)\n", "MolLogP(float64)\n", "MolMR(float64)\n", "MolWt(float64)\n", "NHOHCount(float64)\n", "NOCount(float64)\n", "NumAliphaticCarbocycles(float64)\n", "NumAliphaticHeterocycles(float64)\n", "NumAliphaticRings(float64)\n", "NumAromaticCarbocycles(float64)\n", "NumAromaticHeterocycles(float64)\n", "NumAromaticRings(float64)\n", "NumHAcceptors(float64)\n", "NumHDonors(float64)\n", "NumHeteroatoms(float64)\n", "NumRadicalElectrons(float64)\n", "NumRotatableBonds(float64)\n", "NumSaturatedCarbocycles(float64)\n", "NumSaturatedHeterocycles(float64)\n", "NumSaturatedRings(float64)\n", "NumValenceElectrons(float64)\n", "PEOE_VSA1(float64)\n", "PEOE_VSA10(float64)\n", "PEOE_VSA11(float64)\n", "PEOE_VSA12(float64)\n", "PEOE_VSA13(float64)\n", "PEOE_VSA14(float64)\n", "PEOE_VSA2(float64)\n", "PEOE_VSA3(float64)\n", "PEOE_VSA4(float64)\n", "PEOE_VSA5(float64)\n", "PEOE_VSA6(float64)\n", "PEOE_VSA7(float64)\n", "PEOE_VSA8(float64)\n", "PEOE_VSA9(float64)\n", "RingCount(float64)\n", "SMR_VSA1(float64)\n", "SMR_VSA10(float64)\n", "SMR_VSA2(float64)\n", "SMR_VSA3(float64)\n", "SMR_VSA4(float64)\n", "SMR_VSA5(float64)\n", "SMR_VSA6(float64)\n", "SMR_VSA7(float64)\n", "SMR_VSA8(float64)\n", "SMR_VSA9(float64)\n", "SlogP_VSA1(float64)\n", "SlogP_VSA10(float64)\n", "SlogP_VSA11(float64)\n", "SlogP_VSA12(float64)\n", "SlogP_VSA2(float64)\n", "SlogP_VSA3(float64)\n", "SlogP_VSA4(float64)\n", "SlogP_VSA5(float64)\n", "SlogP_VSA6(float64)\n", "SlogP_VSA7(float64)\n", "SlogP_VSA8(float64)\n", "SlogP_VSA9(float64)\n", "TPSA(float64)\n", "VSA_EState1(float64)\n", "VSA_EState10(float64)\n", "VSA_EState2(float64)\n", "VSA_EState3(float64)\n", "VSA_EState4(float64)\n", "VSA_EState5(float64)\n", "VSA_EState6(float64)\n", "VSA_EState7(float64)\n", "VSA_EState8(float64)\n", "VSA_EState9(float64)\n", 
"fr_Al_COO(float64)\n", "fr_Al_OH(float64)\n", "fr_Al_OH_noTert(float64)\n", "fr_ArN(float64)\n", "fr_Ar_COO(float64)\n", "fr_Ar_N(float64)\n", "fr_Ar_NH(float64)\n", "fr_Ar_OH(float64)\n", "fr_COO(float64)\n", "fr_COO2(float64)\n", "fr_C_O(float64)\n", "fr_C_O_noCOO(float64)\n", "fr_C_S(float64)\n", "fr_HOCCN(float64)\n", "fr_Imine(float64)\n", "fr_NH0(float64)\n", "fr_NH1(float64)\n", "fr_NH2(float64)\n", "fr_N_O(float64)\n", "fr_Ndealkylation1(float64)\n", "fr_Ndealkylation2(float64)\n", "fr_Nhpyrrole(float64)\n", "fr_SH(float64)\n", "fr_aldehyde(float64)\n", "fr_alkyl_carbamate(float64)\n", "fr_alkyl_halide(float64)\n", "fr_allylic_oxid(float64)\n", "fr_amide(float64)\n", "fr_amidine(float64)\n", "fr_aniline(float64)\n", "fr_aryl_methyl(float64)\n", "fr_azide(float64)\n", "fr_azo(float64)\n", "fr_barbitur(float64)\n", "fr_benzene(float64)\n", "fr_benzodiazepine(float64)\n", "fr_bicyclic(float64)\n", "fr_diazo(float64)\n", "fr_dihydropyridine(float64)\n", "fr_epoxide(float64)\n", "fr_ester(float64)\n", "fr_ether(float64)\n", "fr_furan(float64)\n", "fr_guanido(float64)\n", "fr_halogen(float64)\n", "fr_hdrzine(float64)\n", "fr_hdrzone(float64)\n", "fr_imidazole(float64)\n", "fr_imide(float64)\n", "fr_isocyan(float64)\n", "fr_isothiocyan(float64)\n", "fr_ketone(float64)\n", "fr_ketone_Topliss(float64)\n", "fr_lactam(float64)\n", "fr_lactone(float64)\n", "fr_methoxy(float64)\n", "fr_morpholine(float64)\n", "fr_nitrile(float64)\n", "fr_nitro(float64)\n", "fr_nitro_arom(float64)\n", "fr_nitro_arom_nonortho(float64)\n", "fr_nitroso(float64)\n", "fr_oxazole(float64)\n", "fr_oxime(float64)\n", "fr_para_hydroxylation(float64)\n", "fr_phenol(float64)\n", "fr_phenol_noOrthoHbond(float64)\n", "fr_phos_acid(float64)\n", "fr_phos_ester(float64)\n", "fr_piperdine(float64)\n", "fr_piperzine(float64)\n", "fr_priamide(float64)\n", "fr_prisulfonamd(float64)\n", "fr_pyridine(float64)\n", "fr_quatN(float64)\n", "fr_sulfide(float64)\n", "fr_sulfonamd(float64)\n", 
"fr_sulfone(float64)\n", "fr_term_acetylene(float64)\n", "fr_tetrazole(float64)\n", "fr_thiazole(float64)\n", "fr_thiocyan(float64)\n", "fr_thiophene(float64)\n", "fr_unbrch_alkane(float64)\n", "fr_urea(float64)\n", "qed(float64)\n" ] } ], "source": [ "from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator\n", "generator = MakeGenerator((\"RDKit2D\",))\n", "for name, numpy_type in generator.GetColumns():\n", " print(f\"{name}({numpy_type.__name__})\")" ] }, { "cell_type": "code", "execution_count": 19, "id": "003cc588-e4dd-4dcc-98ec-4d3fcdd5432b", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4e2bbbdb449d4c62b2d08af1525e2124", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/17767 [00:00],\n", " ['MaxPartialCharge', ],\n", " ['MinAbsPartialCharge', ],\n", " ['MinPartialCharge', ]], dtype=object)" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.array(generator.GetColumns())[np.unique(feature_idx)]" ] }, { "cell_type": "markdown", "id": "a851a96f-27d3-42b6-a74c-db5c6fa6a257", "metadata": {}, "source": [ "Set values to `0`" ] }, { "cell_type": "code", "execution_count": 24, "id": "0a7fb00f-468b-4957-96cc-ae0974c54780", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n", " nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n", " nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n", " nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n", " inf, inf, inf, inf, inf, inf])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "embedding[drug_idx, feature_idx] " ] }, { "cell_type": "code", "execution_count": 25, "id": "1ee85449-5515-4d93-94ae-b3a4845e088b", "metadata": {}, "outputs": [], "source": [ "embedding[drug_idx, feature_idx] = 0" ] }, { "cell_type": "markdown", 
"id": "cf768d83", "metadata": {}, "source": [ "## Save" ] }, { "cell_type": "code", "execution_count": 27, "id": "a6291a01", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Deleting columns with std<=0.01: ['latent_90', 'latent_103', 'latent_152', 'latent_164', 'latent_176', 'latent_187', 'latent_196']\n" ] } ], "source": [ "import pandas as pd\n", "\n", "# Feature table: rows labelled by `unique_smiles_list`, columns named `latent_<i>` by position.\n", "df = pd.DataFrame(\n", "    data=embedding,\n", "    index=unique_smiles_list,\n", "    columns=[f'latent_{i}' for i in range(embedding.shape[1])],\n", ")\n", "\n", "# Drop first feature from generator (RDKit2D_calculated)\n", "df = df.drop(columns=['latent_0'])\n", "\n", "# Drop (near-)constant columns. They are selected by label from a single std()\n", "# computation, so the printed list is exactly what gets removed and the result does\n", "# not depend on column positions lining up with the `latent_<i>` numbering.\n", "threshold = 0.01\n", "column_std = df.std()\n", "columns = column_std[column_std <= threshold].index.tolist()\n", "print(f'Deleting columns with std<={threshold}: {columns}')\n", "df = df.drop(columns=columns)" ] }, { "cell_type": "markdown", "id": "0f14068e-51b8-40a9-b9c6-043c12b082ee", "metadata": {}, "source": [ "Check that correct columns were deleted: " ] }, { "cell_type": "code", "execution_count": 28, "id": "c92f8a87-ce38-4309-a6c4-4e5b828b59c7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([], dtype=int64),)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.where(df.std() <= threshold)" ] }, { "cell_type": "markdown", "id": "2d0d8c60-79ae-4451-ae5c-321f533bed48", "metadata": {}, "source": [ "### Normalise dataframe" ] }, { "cell_type": "code", "execution_count": 29, "id": "721590cd-67bb-4c70-bef4-268fbfa9a7cc", "metadata": {}, "outputs": [], "source": [ "normalized_df=(df-df.mean())/df.std()" ] }, { "cell_type": "code", "execution_count": 30, "id": "f4b63954-a11e-4384-945d-c94e2b629026", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latent_1latent_2latent_3latent_4latent_5latent_6latent_7latent_8latent_9latent_10...latent_190latent_191latent_192latent_193latent_194latent_195latent_197latent_198latent_199latent_200
C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc4ccncc4)[C@H]2N3CCC(F)(F)F)c1=O.C\\C=C/c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc4ccncc4)[C@H]2N3CCC(F)(F)F)c1=O-3.7963633.7108083.8996393.4989973.4393293.9216903.6264133.4188643.3187302.970395...-0.163324-0.456443-0.071887-0.049825-0.069332-0.15545-0.156925-0.144541-0.465136-2.514093
Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)-c1ccccc11.0187810.479258-0.181363-0.637532-0.708845-0.300968-0.716196-0.843850-0.750336-0.912162...-0.163324-0.456443-0.071887-0.049825-0.069332-0.15545-0.156925-0.144541-0.465136-0.870167
C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc(F)cc2)OCCCC[C@H](C)Oc3ccc(NC(=O)Nc4ccccc4)cc3C1=O-0.1722621.5632121.6940441.5927341.6688821.6167981.4907771.7334021.2790271.706711...-0.1633241.957854-0.071887-0.049825-0.069332-0.15545-0.156925-0.1445411.906191-1.589019
Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1-0.5073040.099467-0.467475-0.391040-0.461660-0.415835-0.398121-0.531627-0.435896-0.612135...-0.163324-0.456443-0.071887-0.049825-0.069332-0.15545-0.156925-0.144541-0.4651360.698201
COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(=O)N(C[C@H]1C)[C@@H](C)CO0.7637010.4286190.3750340.3125630.3851070.2665440.0885120.3569360.0214560.444731...-0.1633241.957854-0.071887-0.049825-0.069332-0.15545-0.156925-0.144541-0.4651360.109546
\n", "

5 rows × 193 columns

\n", "
" ], "text/plain": [ " latent_1 latent_2 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... -3.796363 3.710808 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... 1.018781 0.479258 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... -0.172262 1.563212 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.507304 0.099467 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... 0.763701 0.428619 \n", "\n", " latent_3 latent_4 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... 3.899639 3.498997 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.181363 -0.637532 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... 1.694044 1.592734 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.467475 -0.391040 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... 0.375034 0.312563 \n", "\n", " latent_5 latent_6 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... 3.439329 3.921690 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.708845 -0.300968 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... 1.668882 1.616798 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.461660 -0.415835 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... 0.385107 0.266544 \n", "\n", " latent_7 latent_8 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... 3.626413 3.418864 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.716196 -0.843850 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... 1.490777 1.733402 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.398121 -0.531627 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... 0.088512 0.356936 \n", "\n", " latent_9 latent_10 ... \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... 3.318730 2.970395 ... \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.750336 -0.912162 ... \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... 1.279027 1.706711 ... 
\n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.435896 -0.612135 ... \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... 0.021456 0.444731 ... \n", "\n", " latent_190 latent_191 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... -0.163324 -0.456443 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.163324 -0.456443 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... -0.163324 1.957854 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.163324 -0.456443 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... -0.163324 1.957854 \n", "\n", " latent_192 latent_193 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... -0.071887 -0.049825 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.071887 -0.049825 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... -0.071887 -0.049825 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.071887 -0.049825 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... -0.071887 -0.049825 \n", "\n", " latent_194 latent_195 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... -0.069332 -0.15545 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.069332 -0.15545 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... -0.069332 -0.15545 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.069332 -0.15545 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... -0.069332 -0.15545 \n", "\n", " latent_197 latent_198 \\\n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... -0.156925 -0.144541 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.156925 -0.144541 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... -0.156925 -0.144541 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.156925 -0.144541 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... -0.156925 -0.144541 \n", "\n", " latent_199 latent_200 \n", "C\\C=C\\c1ccc2n(C[C@@H]3[C@@H](CO)[C@H](C(=O)NCCc... 
-0.465136 -2.514093 \n", "Oc1ccc(cc1)-c1c(nn2c(cc(nc12)C(F)(F)F)C(F)(F)F)... -0.465136 -0.870167 \n", "C[C@H](CO)N1C[C@@H](C)[C@H](CN(C)S(=O)(=O)c2ccc... 1.906191 -1.589019 \n", "Cc1cc(C)c2c(ccn(CC(=O)N3CCN(CC3)c3ccccc3)c2=O)n1 -0.465136 0.698201 \n", "COCC(=O)N(C)C[C@H]1Oc2cc(C#Cc3cccnc3)ccc2S(=O)(... -0.465136 0.109546 \n", "\n", "[5 rows x 193 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "normalized_df.head()" ] }, { "cell_type": "markdown", "id": "7c216a71-a174-4f0b-8fea-0fe4a1f47fcb", "metadata": {}, "source": [ "Check destination folder" ] }, { "cell_type": "code", "execution_count": 31, "id": "39bfb2ce", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'EMBEDDING_DIR' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[31], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m model_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrdkit2D\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 2\u001b[0m fname \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m_embedding_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 4\u001b[0m directory \u001b[38;5;241m=\u001b[39m \u001b[43mEMBEDDING_DIR\u001b[49m \u001b[38;5;241m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrdkit\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m'\u001b[39m 
\u001b[38;5;241m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124membeddings\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[1;32m 5\u001b[0m directory\u001b[38;5;241m.\u001b[39mmkdir(parents\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", "\u001b[0;31mNameError\u001b[0m: name 'EMBEDDING_DIR' is not defined" ] } ], "source": [ "# Output file name follows <model>_embedding_<dataset>.parquet\n", "model_name = 'rdkit2D'\n", "fname = f'{model_name}_embedding_{dataset_name}.parquet'\n", "\n", "# Destination folder below EMBEDDING_DIR; created together with any missing parents.\n", "directory = EMBEDDING_DIR.joinpath('rdkit', 'data', 'embeddings')\n", "directory.mkdir(parents=True, exist_ok=True)" ] }, { "cell_type": "markdown", "id": "ee361ecd-2224-4c58-b3fc-5879cb3a6488", "metadata": {}, "source": [ "Save normalised version" ] }, { "cell_type": "code", "execution_count": 20, "id": "f330b59f-798b-420f-9ca8-07d6049dc26a", "metadata": {}, "outputs": [], "source": [ "# Write the normalised feature table to <directory>/<fname>.\n", "normalized_df.to_parquet(directory.joinpath(fname))" ] }, { "cell_type": "markdown", "id": "85180ed5", "metadata": {}, "source": [ "Check that it worked" ] }, { "cell_type": "code", "execution_count": 21, "id": "9620dae5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latent_1latent_2latent_3latent_4latent_5latent_6latent_7latent_8latent_9latent_10...latent_190latent_191latent_192latent_193latent_194latent_195latent_197latent_198latent_199latent_200
C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc10.987011-0.770585-0.997189-1.132568-0.931373-1.050912-1.233309-1.125313-1.354848-1.217915...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.311624
Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1-0.2833020.6973920.0003300.0017860.0625900.113854-0.020283-0.053318-0.087372-0.134349...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.708922
Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12-0.5350980.9095970.1011000.0428700.1037900.2298040.0804700.045557-0.049695-0.098408...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-1.298021
Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1-3.751746-1.807921-1.731786-1.388457-1.179856-1.629691-1.306154-1.423073-1.218564-1.359103...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.4641391.725542
O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1-0.457417-0.326019-0.838592-0.796089-0.868320-0.678709-0.703193-0.831348-0.833087-0.991385...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.4641391.084554
..................................................................
CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C)CN(Cc1ccc(-c3ccccn3)cc1)[C@@H](C)CO2-0.2513720.7043310.9132191.0835331.0166210.9448810.9993820.8394970.8449280.609324...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.874405
Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H]3CO)cc2)c1-0.7047240.3433520.0447820.0799490.1514520.1067830.1234770.4513690.0867280.550145...-0.163202-0.45557912.591213-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.381994
CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc10.715589-0.332284-0.352032-0.237123-0.176995-0.271070-0.283603-0.267059-0.466131-0.515176...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-1.432809
CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=O)[C@]5(CCCN5C3=O)C[C@H]4C2(C)C)O1-0.4505990.7154670.0259830.1381250.0685380.0183750.2496870.1037731.0406140.795995...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.450464
C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c11.616096-1.246753-1.377020-1.519188-1.593465-1.509452-1.613052-1.724251-1.527325-1.653639...-0.163202-0.455579-0.075138-0.050245-0.069133-0.156047-0.157586-0.145213-0.464139-0.606612
\n", "

17869 rows × 194 columns

\n", "
" ], "text/plain": [ " latent_1 latent_2 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 0.987011 -0.770585 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.283302 0.697392 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.535098 0.909597 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -3.751746 -1.807921 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.457417 -0.326019 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.251372 0.704331 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... -0.704724 0.343352 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 0.715589 -0.332284 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... -0.450599 0.715467 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 1.616096 -1.246753 \n", "\n", " latent_3 latent_4 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.997189 -1.132568 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... 0.000330 0.001786 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... 0.101100 0.042870 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -1.731786 -1.388457 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.838592 -0.796089 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... 0.913219 1.083533 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... 0.044782 0.079949 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.352032 -0.237123 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... 0.025983 0.138125 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -1.377020 -1.519188 \n", "\n", " latent_5 latent_6 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.931373 -1.050912 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... 0.062590 0.113854 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... 0.103790 0.229804 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -1.179856 -1.629691 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.868320 -0.678709 \n", "... ... ... 
\n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... 1.016621 0.944881 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... 0.151452 0.106783 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.176995 -0.271070 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... 0.068538 0.018375 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -1.593465 -1.509452 \n", "\n", " latent_7 latent_8 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -1.233309 -1.125313 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.020283 -0.053318 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... 0.080470 0.045557 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -1.306154 -1.423073 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.703193 -0.831348 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... 0.999382 0.839497 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... 0.123477 0.451369 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.283603 -0.267059 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... 0.249687 0.103773 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -1.613052 -1.724251 \n", "\n", " latent_9 latent_10 ... \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -1.354848 -1.217915 ... \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.087372 -0.134349 ... \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.049695 -0.098408 ... \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -1.218564 -1.359103 ... \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.833087 -0.991385 ... \n", "... ... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... 0.844928 0.609324 ... \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... 0.086728 0.550145 ... \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.466131 -0.515176 ... \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... 1.040614 0.795995 ... \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -1.527325 -1.653639 ... 
\n", "\n", " latent_190 latent_191 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.163202 -0.455579 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.163202 -0.455579 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.163202 -0.455579 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -0.163202 -0.455579 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.163202 -0.455579 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.163202 -0.455579 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... -0.163202 -0.455579 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.163202 -0.455579 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... -0.163202 -0.455579 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -0.163202 -0.455579 \n", "\n", " latent_192 latent_193 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.075138 -0.050245 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.075138 -0.050245 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.075138 -0.050245 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -0.075138 -0.050245 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.075138 -0.050245 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.075138 -0.050245 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... 12.591213 -0.050245 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.075138 -0.050245 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... -0.075138 -0.050245 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -0.075138 -0.050245 \n", "\n", " latent_194 latent_195 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.069133 -0.156047 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.069133 -0.156047 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.069133 -0.156047 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -0.069133 -0.156047 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.069133 -0.156047 \n", "... 
... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.069133 -0.156047 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... -0.069133 -0.156047 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.069133 -0.156047 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... -0.069133 -0.156047 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -0.069133 -0.156047 \n", "\n", " latent_197 latent_198 \\\n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.157586 -0.145213 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.157586 -0.145213 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.157586 -0.145213 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -0.157586 -0.145213 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.157586 -0.145213 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.157586 -0.145213 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... -0.157586 -0.145213 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.157586 -0.145213 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... -0.157586 -0.145213 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -0.157586 -0.145213 \n", "\n", " latent_199 latent_200 \n", "C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1 -0.464139 -0.311624 \n", "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n... -0.464139 -0.708922 \n", "Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4... -0.464139 -1.298021 \n", "Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1 -0.464139 1.725542 \n", "O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1 -0.464139 1.084554 \n", "... ... ... \n", "CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C... -0.464139 -0.874405 \n", "Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H... -0.464139 -0.381994 \n", "CN(C)CCOc1ccc(/C(=C(\\CCCl)c2ccccc2)c2ccccc2)cc1 -0.464139 -1.432809 \n", "CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=... 
-0.464139 -0.450464 \n", "C[C@@H]1CC(=O)NN=C1c1ccc(N)c([N+](=O)[O-])c1 -0.464139 -0.606612 \n", "\n", "[17869 rows x 194 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the file back under its own name: rebinding `df` here would replace the\n", "# un-normalised feature table, so re-running the normalisation cell afterwards\n", "# would normalise already-normalised values.\n", "saved_df = pd.read_parquet(directory / fname)\n", "\n", "# Fail loudly if the file on disk is not the table that was just written\n", "# (for example a stale file left over from an earlier run).\n", "assert saved_df.shape == normalized_df.shape, (saved_df.shape, normalized_df.shape)\n", "assert saved_df.columns.equals(normalized_df.columns)\n", "saved_df" ] }, { "cell_type": "code", "execution_count": null, "id": "201edd5b-832b-4350-b82d-8e6a4da099ac", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "ad25c9354f8cefdf5a943c25e67813a21d2807e3af4d6d0915e47390a83b57ce" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.20" } }, "nbformat": 4, "nbformat_minor": 5 }