File size: 72,070 Bytes
a48f0ae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e4ac8068-6c28-44dd-bd7e-a2fc37938f23",
   "metadata": {},
   "source": [
    "# 4 SCIPLEX SMILES\n",
    "\n",
    "This is an updated version of `sciplex_SMILES.ipynb` which relies on a `drug_dict` to assign SMILES strings.  \n",
    "The `sciplex_SMILES.ipynb` notebook is not applicable to the full sciplex data as it relies on the `.obs_names`.  \n",
    "Hence, the second half of the dataset (left out in the original CPA publication) would be left without SMILES entries. \n",
    "\n",
    "**Requires**\n",
    "* `'sciplex3_matched_genes_lincs.h5ad'`\n",
    "* `'sciplex3_lincs_genes.h5ad'`\n",
    "* `'trapnell_final_V7.h5ad'`\n",
    "\n",
    "**Output**\n",
    "* `'trapnell_cpa(_lincs_genes).h5ad'`\n",
    "* `'trapnell_cpa_subset(_lincs_genes).h5ad'`\n",
    "\n",
    "\n",
    "## Description\n",
    "This script assigns SMILES strings to drug conditions in the sciplex dataset, serving as a counterpart to `2_lincs_SMILES.py` but handling sciplex data. Below is a summary of its key steps:\n",
    "\n",
    "1. **Load Data**: The script uses either `sciplex3_lincs_genes.h5ad` or `sciplex3_matched_genes_lincs.h5ad` as the target dataset to which SMILES strings are added. The choice depends on the `LINCS_GENES` flag: if `LINCS_GENES` is `True`, the dataset with the LINCS gene subset (`sciplex3_lincs_genes.h5ad`) is used; if `False`, the matched genes dataset (`sciplex3_matched_genes_lincs.h5ad`) is used.\n",
    "\n",
    "2. **Create and Assign SMILES**: A dictionary (`drug_dict`) is created by zipping the `condition` and `SMILES` columns from `trapnell_final_V7.h5ad`. The script assigns SMILES to the target dataset by applying `drug_dict` to the `condition` column.\n",
    "\n",
    "3. **Canonicalization and Validation**: The SMILES strings are validated using `rdkit` to make them canonical. The notebook also checks that each drug condition is assigned a unique SMILES string.\n",
    "\n",
    "4. **Subset Creation**: A subset of the target dataset is created by sampling up to 50 observations per drug condition to reduce data size. The subsets are concatenated into `adata_cpa_subset`.\n",
    "\n",
    "5. **Output**: Depending on the `LINCS_GENES` flag, the script creates two output files:\n",
    "\n",
    "   - If `LINCS_GENES` is `True`, the produced files are the whole set `trapnell_cpa_lincs_genes.h5ad` and the subset as `trapnell_cpa_subset_lincs_genes.h5ad`.\n",
    "   - Analogously, if `LINCS_GENES` is `False`, the produced files are `trapnell_cpa.h5ad` and `trapnell_cpa_subset.h5ad`.\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef4b99de",
   "metadata": {},
   "source": [
    "\n",
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "852f177f-f3f8-4674-a869-44dfa85359f8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.6 scipy==1.7.3 pandas==1.3.5 scikit-learn==1.0.2 statsmodels==0.13.2 pynndescent==0.5.6\n"
     ]
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import rdkit\n",
    "import scanpy as sc\n",
    "from rdkit import Chem\n",
    "\n",
    "import warnings\n",
    "from chemCPA.paths import DATA_DIR, PROJECT_DIR\n",
    "\n",
    "import os\n",
    "import sys\n",
    "root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))\n",
    "sys.path.append(root_dir)\n",
    "import raw_data.datasets as datasets\n",
    "import logging\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "from notebook_utils import suppress_output\n",
    "\n",
    "with suppress_output():\n",
    "    sc.set_figure_params(dpi=80, frameon=False)\n",
    "    sc.logging.print_header()\n",
    "    warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "a2d6d8f7-ed22-4fac-843e-4406af91d91c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The autoreload extension is already loaded. To reload it, use:\n",
      "  %reload_ext autoreload\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f729ae3a-6303-4e3d-acd3-cbe1b6ab7f5e",
   "metadata": {},
   "source": [
    "## Load data\n",
    "Note: Run this notebook once for each adata object, i.e. once with `LINCS_GENES` set to `True` and once with it set to `False`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "121ef798-2dce-4984-b1db-981bf7312b10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch between 977 (True) and 2000 (False) gene set. Defaults to true. Env variable to allow for run_notebooks.py to change this.\n",
    "LINCS_GENES = os.environ.get('LINCS_GENES', 'True').lower() == 'true'\n",
    "\n",
    "adata_cpa = sc.read(DATA_DIR/f\"sciplex3_{'matched_genes_lincs' if not LINCS_GENES else 'lincs_genes'}.h5ad\") \n",
    "adata_cpi = sc.read(datasets.trapnell_final_v7())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f102d651",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 581777 × 977\n",
       "    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split_ho_pathway', 'split_tyrosine_ood', 'split_epigenetic_ood', 'split_cellcycle_ood'\n",
       "    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n",
       "    uns: 'all_DEGs', 'hvg', 'lincs_DEGs', 'log1p'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_cpa"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b45f3d4-dc24-41d3-842d-8264cc491aaa",
   "metadata": {},
   "source": [
    "Determine the output file paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "392d448a-8ec5-41bc-90b3-63ac8b8ce3b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_out = DATA_DIR / f\"trapnell_cpa{'_lincs_genes' if LINCS_GENES else ''}.h5ad\"\n",
    "adata_out_subset = DATA_DIR / f\"trapnell_cpa_subset{'_lincs_genes' if LINCS_GENES else ''}.h5ad\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "836e940e-5db4-4523-833f-143aa9979acf",
   "metadata": {},
   "source": [
    "Overview of the adata files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "df885b5a-e8d7-4cff-8825-94c498045458",
   "metadata": {},
   "outputs": [],
   "source": [
    "# adata_cpa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "f0ebec71-307a-4628-a0f8-f8e7ce77a55a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# adata_cpi"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea2a522d-60a4-4f5b-baf4-f9c4e5da1486",
   "metadata": {},
   "source": [
    "__________\n",
    "### Drug is combined with acid"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "788a0202-6880-45d3-8ac7-a65a8612d665",
   "metadata": {},
   "source": [
    "In the `adata_cpi` we distinguish between `'ENMD-2076'` and `'ENMD-2076 L-(+)-Tartaric acid'`.  \n",
    "They have different also different SMILES strings in `.obs.SMILES`. \n",
    "Since we do not keep this different in the `.obs.condition` columns,  \n",
    "which is a copy of `.obs.product_name` for `adata_cpa`, see `'lincs_sciplex_gene_matching.ipynb'`,  \n",
    "I am ignoring this. As result we only have 188 drugs in the sciplex dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "95947e2c-e9b6-40cb-b9ff-5dc90d6db493",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index\n",
       "G02_E09_RT_BC_104_Lig_BC_173-1-0    ENMD-2076 L-(+)-Tartaric acid \n",
       "G06_F10_RT_BC_354_Lig_BC_250-1-0    ENMD-2076 L-(+)-Tartaric acid \n",
       "B08_E09_RT_BC_174_Lig_BC_28-1       ENMD-2076 L-(+)-Tartaric acid \n",
       "D06_F10_RT_BC_338_Lig_BC_205-1-0    ENMD-2076 L-(+)-Tartaric acid \n",
       "G11_E09_RT_BC_175_Lig_BC_88-1-0     ENMD-2076 L-(+)-Tartaric acid \n",
       "                                                 ...              \n",
       "C05_F10_RT_BC_117_Lig_BC_88-1-0     ENMD-2076 L-(+)-Tartaric acid \n",
       "D04_F10_RT_BC_300_Lig_BC_255-1-0    ENMD-2076 L-(+)-Tartaric acid \n",
       "G02_E09_RT_BC_182_Lig_BC_172-1-0    ENMD-2076 L-(+)-Tartaric acid \n",
       "E06_E09_RT_BC_195_Lig_BC_268-1      ENMD-2076 L-(+)-Tartaric acid \n",
       "C01_E09_RT_BC_100_Lig_BC_55-1-0     ENMD-2076 L-(+)-Tartaric acid \n",
       "Name: product_name, Length: 1343, dtype: category\n",
       "Categories (189, object): ['2-Methoxyestradiol (2-MeOE2)', '(+)-JQ1', 'A-366', 'ABT-737', ..., 'XAV-939', 'YM155 (Sepantronium Bromide)', 'ZM 447439', 'Zileuton']"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_cpi.obs.product_name[adata_cpi.obs.SMILES == 'O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\\\C=C\\\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1 |r,c:24,26,28,36,38,t:17,22,32|']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "53c695f2-de28-486f-a31c-c405ec45865d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deVxV5dYH8N9hngQEBwYRVJxSQSWVwwGupGIpoqCVduWGmZp9nPLVVxuMsqxeNBNzblA0xdREEBXBCTkDkEdQTPSCJTEIMioynuF5/9iEZJZyBg4c1/dzP/eDe5+znrW5uu6z934GHmMMhBBCVGWg6wQIIaRzozJKCCFqoTJKCCFqoTJKCCFqoTJKCCFqoTJKiIp+/x2VlQCgUODWLV1nQ3SHyighKoqMxL//DQA1NVi5UtfZEN2hMkqI6pydERPT/HN1NRSKp/3imTOorgaAkhLcvImUlObjFRXIztZ0lkTLjHSdACGd2LJlWLQIfD4ATJyIjAx07QpHR3TtCicnODo2/3fLHx0cYGAAAB98AC8vbN2KzExkZ+P4caSmAsDNmzh2DJGRurwo0lZURglRnYkJPvgAn3wCAAoFeDxUVaGq6m8/b2qKHj3g5wdraygUkEjaLVOiRVRGCVHL+PHYvRsALl0CgKoqFBfjzh0UF6OqqvmHlj+WlKCgAHfvgsfDJ5/g1VexfDkAVFcjLAwAysrg4aG7iyEqoTJKiIpcXGBrCwAbNiAtrflg167o2hVDhjz+K/X1KC2FXI6330b37pg5E7t2wdcXtrbYtw8AxGIcO9Yu2RPNoVdMhKgiIwOrV2PiRABwdERIyFN9y9wcbm5wd2/+45tv4u5dbWVI2g31RglRRVQUgOYyqoIvvgCAigosWAA+H35+zccHD4adnSbyI+2IRwvlEdJWd+7AzQ0KBfLy4OamepxRo3DpEs6dQ0CAxnIj7Y9u6glps+3b0dSEkBC4uSEnB7t3o7FRlTjcSCl6X9/ZURklpG0aG7FrFwAsWQIAGzfijTewZo0qoaiM6ge6qSekbaKjER6OESNw+TIqK+Higvp65ORg4MA2h8rPh5sb7O1RVgYeTwu5knZBvVFC2mbLFuCPruiOHairw6RJqtRQAK6ucHZGRQX++19NZkjaGZVRQtpAJBI1NS0cP75g5kzI5dixAwCWLlU9oLc3QPf1nRyVUULaICoq6urVHXz+N2ZmOHHiVm2tcvBgjB+vekB6PKoHqIwS8rSKioqOHTtmbGy8YMECAOvXv15X123VqnR1Hmv6+KB799Ly8jMay5K0OyqjhDytLVu2yGSyl19+2dnZ+fLlyyKRyMyMzZgxVJ2YI0c23b/veuzYxHv37mkqT9LOqIwS8lTq6+u/+eYbAEuWLAEQFRUFYN68eZaWluqENTU1GTlypFKpTE9P10iepP1RGSXkqezfv7+iosLLy2vMmDF37949dOiQoaHhwoUL1Y/M5/MBiMVi9UMRnaAySshT+frrrwEsX74cwP79+xsaGqZMmdKnTx/1I3NlVEKvmTotGn5PyOPJ5fKbN2+KRCKhUHjx4sUHDx7U1dUVFBTY29srlcrExEQHB4eRI0eq31BxcbGzs7O1tXVVVZWBAfVsOh8qo4Q8VFyM9PTjIlGKRCKRSqWNrabK29raVldXDx8+/MSJE05OTmo2dP78+W+//Xbv3r2GhoYA3Nzc8vPzs7Ozhw5V64UV0QlaKI/orfXrmzfsvHkT//0vpFL8+9/o3x9ZWaisxAsvAIBCgRs3IBJBKIRUiuvX4en55ZUrzTvM9e3bVyAQeHl5+fr62traBgUFZWVljRo16uTJk56enqplVVBQsGLFikOHDgEIDAx8/fXXAfD5/Pz8fLFYTGW0M6LeKNFbvr4QCgHg3DmkpCApCY6OOHoUsbH45RfU1kIsxqVLqKt7+BVra8yatdvBId/b25vP59vY2LQOWFVVNW3atIsXL9ra2sbGxo4dO7ZN+chksm3btq1Zs6ampsbCwmLlypWrV682MzOrrKwMDQ2VSqVmZmYrVqyYP39+165d1b980m6ojBK9NWoUtm0DgEuX
UFICiQQBAXB1hbk5rl3Dhx82f8zREb6+EAjg64sRI/DPDycbGxvDw8MPHjxoYmKye/fu11577SmTOXPmzJIlS3JycgAEBQVt2bLF1dVVoVDs3LlzzZo1lZWVZmZmDQ0NALp06RIeHr5o0aIBAwaofvGkPTFC9NSwYSwqikVFsUWL2IcfsgkTWEMD8/dne/awqCj22Wfs+HFWXt7msEqlMiIiAgCPx4uIiHji5/Py8l5++WXun9vAgQMTExO54xkZGaNHj+aOv/DCC1evXk1OTg4KCuLxeFzw8ePHx8fHK5XKNqdI2heVUaK3BILmH86ebS6jjLGkJPbccywqSt3gmzZt4t6qv/nmmzKZ7LGfqauri4iIMDMzA2BpaRkREdHY2MgYKy4uDgsL48plr169oqOjW3/r5s2bS5YsaRnV7+7uvmnTppqaGnUzJlpDZZTorceWUcbYK69ooIwyxo4ePWpubg5g6tSptbW1j5yNj493c3Pj+pVhYWElJSWMsaampk2bNllbWwMwNzdftWrV39XH6urqTZs2ubq6csXU2tp6/vz5N27c0EDeRNOojBK91dTU/INSyeRytnw527GD/aXcqUUikXTv3h3A6NGjS0tLW45fu3aN66t6eXmJxWLu4Llz54b8sfNyUFDQr7/++sT4CoUiPj5+/B9LSBkYGNCdfgdEZZQ8E0pKmIEBMzdnDx5oOHJeXl7//v0B9O3bt3VvceXKlTt27FAoFIyxwsLCsLAwrhT279//xIkTbW0lMzNz/vz5XOeXe8a6adOmv3aBiU5QGSXPhO3bGcCmTdNK8PLych8fHwB2dnapqamtTzU2Nm7atMnKygqAhYVFREREQ0ODyg2VlpauXbvW0dGRK6b29varV6+urKxU+wqIWqiMkmfCiy8ygO3era34Dx48CAoKAmBqavrjjz9yB+Pj4/v27dtyF5+fn6+Rtpqamg4dOiQQCABYWVlVV1drJCxRGY0bJfqvpgbdu0MuR0kJunXTVisKhWLx4sXbt283NDR8//33pVLpiRMnAAwaNGjz5s0TJkzQeIvp6ek5OTnh4eEaj0zahMoo0X8HD2LWLIwdi/Pntd7WunXr1qxZY2pq2tDQ0LVr148//njhwoVGRjTrWp/R/7pE/8XFAcDUqe3R1vvvv+/q6mpjY5OQkPDpp59y7/GJfqPeKNFzMhl69EB1NX79FZpYHZSQR1FvlOi5CxfOurklDB4c3qePimsyEfLPaI1YoueOHj2SlbVpwIBYXSdC9Bbd1BN9xhjr3bt3YWFhZmbm8OHDdZ0O0U/UGyX6LD09vbCw0NXVVeVVlgl5IiqjRJ/FxcUBCAkJ4ZZTIkQbqIwSfXbs2DEAU9tnrBN5VtGzUaK3cnNzBwwYYG9vX1JSQgPgifbQ3y2ib2QyWVZWlkQiiY6OBuDn50c1lGgV9UaJPrh3797PP/8sFApFIpFYLK77Y5s6W1tbExOTtLS0PjTynmgNlVHScdXVISsLPj4AcP06+vTBH+ttQqHAtWsQiyGR4NatRrHYrOVbPB5v4MCB3t7eo0ePPnjw4MWLF/v16ycUCh0cHHRxEUT/URnt3BQKxY0bN6RSqUgkEgqFCxcudHJyCg0N1XVempGXh2HDcPo0/P3xxhtYuhRFRZBIIJEgIwM1NQ8/6ebm06ePmY+PD5/P9/b2tre3547X1dVNmDBBLBYPGzYsJSWFNi4m2kBltPOpqKhIS0uTSCRisfjnn39+8OBByylTU1PGWEJCgjaWZWt/eXn46CPcuYNTp/DWW7C3x4YND8+2bIzs5YXRo2Fi8vDUrVu3GhoauB07ysvL/f39c3Jy+Hx+cnJyy1ZxhGgKldHO4ddffxUKhVyvMzMzU6lUtpxydHT09fUVCAReXl6xsbEbN260sLBISkrilvXt1PLysHEjvL1RVITcXEyahKgo+PjAxwfe3ujZ8+En6+sbpNJLYrFYLBanpaWVlpZOnjw5ISGBO1tYWOjr65uf
nx8UFBQbG0tvnIhm0d+nDurBgwdZWVncrbpEIqmoqGg5ZWxsPGLECIFA4Ovr+69//atHjx4tpwQCwb1797777rugoKALFy7ox9SdsDBMngylEs8/j9TUh8eLiyGVQiSCUIiGhkap1K/lVI8ePVqvUNerV6/k5GQ/P7+EhIQ5c+ZER0dz+80Rohm6WXSf/L07d+4MHTr0kX/nvXv3njVrVlRUVEZGRlPLjpePI5fLZ8yYAcDZ2fm3335rr6y1IjeXLVzIGGPZ2czYmLVcTVERc3ZmwMP/WFoyLy/vhQsX7t27Nzc397HRMjIyunTpAmDRokXtdAHk2UBltGMpKioaOHCgmZmZkZGRl5fXkiVLoqOjn2YnXsZYenr6p59+yhhrbGzkno26u7tz26N3UmVl7Msvm7dEPn/+4aaeSiWzs2NdurDx41lEBIuPZ0+5q9u5c+fMzMwAfPbZZ9pKmjx7qIx2LNu2bQMwYcKEtu4fWV1dzb2G3rBhA2Ps3r17Xl5eADw8PKqqqrSTrNYdP84A5uv7mFMFBUy1rdqPHTtmZGTE4/F27typZnqEcKiMdiwTJ04EEB0drcJ3Dxw4YGBgwOPxvv32W8ZYWVnZoEGDAIwdO7a+vl7TmbaHuXMZwNat03DYPXv28Hg8AwODli08CVEHldEOpLq62sTExNDQsKysTLUIXGfW0NDwyJEjjLGCgoLevXsDCA4OlslkGk1W6xQK5uDAAPbLL5oPvm7dOgAmJiaJiYmaj06eMVRGO5ADBw4ACAgIYIxduXJl1KhRW7dubWuQiIgIrkAkJSUxxq5du8aNRQ8LC1OqdhusIxcvMoC5u2sr/ooVKwBYWFiIRCJttUGeDTTsowPhFsfkVnU7evTozz//nJ2d3dYgH3300TvvvNPU1DRjxgypVDpkyJCTJ09aWVnt27dv2bJlqiUml8szMzO3bt06e/bsy5cvl5WVcceVSuXSpUsPHz6sWth/xm3nqb0JWZGRkXPnzq2rqwsODs7JydFWM+RZoOs6Tpo1NDRYW1sD4N7LcztenDp1SoVQSqUyPDwcQPfu3W/cuMEYO3PmjKmpKYDIyMinDHLv3r3k5OSIiIigoCBbW9uWvzB2dnbPP//8/fv3GWPcap4tPV/N6t+fAUyrPUW5XD59+nQAzs7Ot2/f1mJLRK9RGe0oTp48CWDEiBGMsdu3b/N4vC5durT1fX2LpqamSZMmAXBxccnPz2eMHT161NDQkMfjffPNN4/9ikKhyM7O3rVrV3h4+KBBgx5ZLr5///5hYWGRkZH9+vUDEBAQwL22WrlyJbRwa3z1KgNYz55ModBg1Meoq6vz9vbu0qWLUCjUbktEf1EZ7SgWLFgA4KOPPmKMffXVVwBmzpypTsC6ujpfX18AQ4YMqaioYIxt374dgKGh4eHDh7nP1NTUpKamfvHFF0FBQS3LeXCMjY25gauHDh0qLS1tCfv777+7uLgAmDp1qkwmUyqVc+fOBWBvb/+L5l4GRUUd9fGJWbTonqYC/oPPPvsMwOzZs9uhLaKXqIx2CAqFwtHREUBWVhZjbOzYsQBiYmLUDFtdXc09HBg9enRNTQ1jbO3atQBMTU2nTJmi8lypa9eu2dnZAfjPf/6jVCpb3xprat4UN+j1xIkTGon2CIVCERwcvGHDBm70gr+/PwAa/ERURmW0QxCLxQDc3NwYYxUVFUZGRsbGxhoZNl9UVMStWLxnzx7uSHh4eMuzThXmSnHS0tKsrKwArF69mml63lRhYSGPx7OystLScFehUAigX79+jLHy8nIjIyNTU9N799qj50v0EpXRDmHVqlUA3nnnHcbYnj17ALz44ouaCp6bm9t64NTHH38MgM/ni0QilZ+9MsaSk5O511br169nreZNeXp6qvl/AJs3bwbwyiuvqBPkH3DPc1esWMEY++677wBMmjRJS22RZwGV0Q5h4MCBAC5cuMAYCwkJAbB9+3YttaXB++WYmJjW86bu3r3LXUjL
CyjVjBs3DsD+/fvVz/CxBgwYACA1NZUxFhwcDIAmhhJ1UBnVvevXr3OvaGQyWV1dnaWlJY/HKygo0EZbGr9ffmTe1O+//67mvKmqqipjY2NjY+PKp1xupI2uXbsGoEePHnK5vLa21sLCwsDAoLi4WBttkWcEDb/XPW70ZXBwsJGRUVJSUm1t7ZgxY3r16qWNto4ePcoYmzRpErfQkfoWLlwYERGhUChee+215ORkFxeXkydP2tnZxcfHz507l7V9UfDjx4/LZLKAgICWDT/kcrlGUuW0/LYNDQ0TExO5AU/c+z1CVENlVPdaT15q/bO229KU1vOmLl++PGTIkISEBEtLy+Tk5Dt37jx9HG6uFPdgtCXDmJgYT0/P0tJSTWXbnr9t8qzQdXf4WVdUVMTj8SwsLGpra+VyObdme05Ojjba0t79slKpfP3119Fq3tTZs2ef5tX/X+dK8Xg8W1vbNWvWMMZkMtnIkSMBjBo1ips3pSbumYalpWV9fb1cLu/WrRsALmFCVEZlVMe4Z4shISGMsZqamnfffTcoKEhLbe3duxdAYGCgNoL/dd7UYz1xrpS/vz/32opbLVBTr604W7ZsATB9+nTG2NmzZwE899xzasYkhMqojnELjHJvurWNGySvwqpRT+mv86Y4bZ0rtXXrVgCGhoY//fQT+8u8KXUy5Aa37t27lzG2ZMkSAO+99546AQlhVEZ17vjx4xYWFqGhoQotzx5vaGjo0qWL9sYAcFrmTY0YMWLHjh3z589Xba7UmjVrAJibm6ekpDDGsrOz7ezs/P0Phoervtoft5yrkZERV+K5WQnp6emqXishzaiM6tjVq1e5V9ILuc3btCY+Ph7A6NGjtdoK+2PeFDdblKPaXKmlS5cCsLa2lkqljLG0tEILCwawd99VMbEffvgBwLhx4xhjUqkUgJOTU+dag5V0TFRGdU8sFltaWgKIiIjQXivcAiLrNL4jx+Pk5uaeP38+NDT0yy+/VHmulEKhePXVVwE4OfXKy3vAGEtOZqamDGDr16uS1csvvwxg8+bNjLEPP/wQwNtvv61KIEL+jMpohxAfH29kZARg48aN2oivUCgcHBwAaHARpnbQ1NQUFDTN1ze3Tx9WVMQYYwcOMAMDxuMxFR4mr1mzpm/fvtzrLw8PDwCnT5/WdMrkWURltKPYt28f94Z69+7dGg9+8eJFAO7a25FDax48YN7eDGDDhjXvovz11wxgn3yieszffvsNgI2NTWNjo6byJM8yGn7fUcyePTsqKooxNn/+fG4JZw3ixpmHam9HDq2xtERiIjw9kZ2Nl15CbS0WLUJGBj74QPWYsbGxACZPnmxiYqKxRMkzjMpoB7Jo0aJ3331XJpPNmDGDW8xNBbdv3z5w4MCyZctaz6Hk3i910uk6NjY4eRJubkhPx7RpaGzEqFGqR5PL5YcOHUKn/W2QDojH2j7rmWgPY+ytt97atWuXjY3NhQsXuMFD/0wul1+5ckUoFEql0tTU1Nu3b3PHL1++PGLECADZ2dkeHh49e/YsLi5+ZOxRJ5KXBz8/lJRg1iz88APadB3379/PyMjgfkVCoVCpVMpkMolE4unpqbV8yTPESNcJkD/h8Xjbt2+vqqo6fPjw5MmThUIhN7zxEUVFRWKxWCKRSCSSy5cvNzU1tZzq1q2bt7c3n8/n5pXij3vYqVOndt4aCsDdHSdOICAAMTGYMAFz5vzTh5VKXL+OzMzEc+d+TEtLu3nzZuvugq2tbX19/fTp01NTU2lREqIBun00Sx6rsbGRm93Ur1+/O3futD61c+dObkpPC0NDw2HDhi1YsGDPnj1cvWihUCiuXr3q7u4OrW3I0c7On2eLF7P161l0NGOMVVayTz9tPlVTw1JT2RdfsKAgZm/PADZ27Efcr+iRuVK1tbUCgQDA0KFDW8+2IkQ1dFPfQdXV1U2YMEEsFg8bNiwlJaVl1bjvv/9+7ty5Xbp08fDw8PX1FQgEAoGg9Vj3mpqa
K1euiEQioVAoFosrKyutrKwCAwP379+vqcXxdG7ePFy9imPHYGCApUthYwOxGNevQ6l8+BlXV0ydeqVfvxQ+nz98+HBjY+PWESoqKvz9/a9fv+7t7X3mzBlu3C4hqqEy2nGVl5f7+/vn5OTw+fzk5GTun3plZeWdO3cGDx7ccofOGLtx40ZaWhp3m5+Tk6NsVU7c3Nx8fHx27tzJbZ2kH+bNw5QpOHQIX36JZcuQlYUbN2BkBE9PCATw8oK/P9zcnhCkqKjI19f39u3bEyZMSEhIoLf2RGVURju0wsJCX1/f/Pz8oKCg2NhYbog+gNra2szMTKlUKhKJzp8/X15e3vIVIyMjT09PgUDg5eXl7+/v9sRy0gnNm4cPPsDGjRgzBsePIzwc1tYYORKmpm2Lc/PmTT8/v7KystmzZ0dH7zUw4D35O4T8BZXRji43N9fPz6+0tDQkJCQkJITrdWZnZysUipbPODs78/l8Hx8fb29vLy8vve9YcWXU1haBgejbFzExqoe6cuVKQECAh8fqIUP+d+tWzaVIniVURjuBjIyMcePGmZqaVlRUcEeMjIwGDBjAPRv18vIaMmSIbjNsZ0uXYsUKuLjgwAGcO4dvv1UrWmrq3cDAHg0N+OQTtUb1k2cWldHOQSQSFRcX79+/38fHh8/nP//88+bm5rpOSn/Ex2P6dMjl2LQJS5fqOhvS2VAZJQQA9u3D66+Dx8OBA3j1VV1nQzoVKqOENPu//8Pq1TA2RlwcXnqpzV9vPVfKzc1t0KBBb7/99iO7pBC9RLOYCGm2ahXKy7FhA155BUIhnjhTVKnE9es5EolQLBY/MlfK0tKytrb21q1bGzdu1HreRNeojBLyUGQkqqpw9y6kUtjawtUVv/+OggIIBM0fePAAWVkQiSAUQiJBnz7LL11K5E4ZGxt7eHgIBAJfX18jI6PXXnvtq6++6tat23vvvaez6yHtgm7qCfkTbmGs8ePRowcOHcKFC0hKwoABEIshkTw6V2ratC3m5mJuEYNH5krFxcXNmDFDLpdHRUVx2+cRfUVllJDHCAyEnx8GD0a3bkhMxPr1zdWz9VwpPz88bt2Yh6Kjo+fMmcPj8WJiYl555ZX2yZy0PyqjhDxGYCDi4zFxIlauhFSK8nL06QNvb3h5tW2u1Oeff/7ee++ZmJjEx8dzy80Q/UNllJDHCAxEUhJOncK6dZgwARERqodauXLlhg0bLCwskpOTfXx8NJcj6Sg68QKUhGjbSy+hZ091g0RGRs6dO7euri44OPj69euayIt0LNQbJeQx1q/H0KEYNw4yGZqa8Mc6hSriNov+6aefnJ2dhUKhXq4X8yyjMkrIoxob0b07HjxAfj7+vEa26urr61966aWUlBR3d3ehUNhT/V4u6TDopp6QRyUno6YGXl4aq6EAzM3N4+PjR44cmZeXN3HixOrqao2FJrpGZZSQR8XFAYDGdw61trZOTEwcOHDglStXQkNDGxoaNNwA0RG6qSfkT5RKODujpATZ2Rg6VPPxCwoKBAJBQUHB1KlTjxw50rIUN+m8qDdKyJ+IxSgpQb9+WqmhAFxcXE6dOmVnZxcXFzd37lzqx+gBKqOE/Al3Rx8SosUmhgwZkpCQYGFhsXfv3vfff1+LLZF2QWWUkD+Jjwe08GD0EXw+Py4uzsTERKFQyLlp/KTTojJKyEO//JLTvfvOceOK+Hytt+Xu7t7U1LRz587WO7mSzojKKCEPxcb+JBK91afPR4aG7dBWLIBJkybp/RaEeo/KKCEPxcXFAZiq7Vv6dm+LaBUNeCKkWVFRkYuLi6WlZVlZmZmZmVbbqqiocHBwMDQ0vHv3rrW1tVbbItpGvVFCmsXGxjLGXnzxRW3XUADx8fFyufyFF16gGqoHqIwS0ozu6Ilq6KaeEACorq7u2bMnY6y0tLSrmgs6PUldXV337t0bGhoKCwsdHR212hZpBzQRjRAASEhIaGpq8vPz03YNBXD2rNGIEb+4up6h
Gqof6KaeEAAIDAwcPnx4bm7unTt3tN3WTz+ZiERuHh5varsh0j6ojBICAFZWVlZWViUlJYGBgZWVldprSKHAiRMAMG2a9hoh7YrKKCEAYGFhkZCQ4Onpee3atUmTJtXW1mqpoYsXUV6OwYMxcKCWWiDtjcooIc1sbGxOnjzp5uaWnp4+bdq0xsZGbbTCLX1CXVF9QmWUkIecnJySk5MdHBzOnDkzZ84cbcx2b5+lT0h7ojJKyJ+4u7ufPn3a1tY2JiZm8eLFmg2emYnffoOTE0aP1mxgoktURgl5lIeHR2xsrJmZ2bZt29auXavByFxXNDgYPJ4GoxIdo+H3hDze8ePHQ0ND5XL5V199tWzZMjWjicXw8oJcjh9/RK9eCAzUSI6kQ6DeKCGPN2XKlO+//57H4y1fvjw6OlrNaG++ichIWFrC0hK3bmkkQdJRUBkl5G+FhYVFRUUxxubNm3fq1Cl1QvXqhatXkZenqdRIB0JllJB/snjx4lWrVslkshkzZgiFwqf/ImPIycHu3Zg3D7t2AcBnn2H5cm3lSXSI5tQT8gSff/55ZWXlN998ExQUlJKS4unp+XefrK1FZiakUohEOH8e5eXNxwsKAKB/f4wYgbg4+Pm1S96kvdArJkKeTKFQzJo16/Dhw05OTkKhsE+fPi2nbt26JZFIJBKJWCy+f//Qr7/2bznl7AwfH/D58PfHu+8iKQn19Rg2DP/zP5g1C7a2urgSogXUGyXkyQwNDX/44Yd79+4lJSWNHz9+48aNN2/eFIvFaWlppaWlLR/z9xfb2/fn88HnQyCAi8vDCEuXAoC5OY4cwY0bGDAAmzdj5sx2vxKiBdQbJeRp1dTUBAQEZGVlKRSKloM2NjajRo0SCAS+vr4+Pj4WFhZPjPPll1ixAiYmOH6cRj7pAyqjhLRBWVnZpUuX1q5dO3LkSG9vbz6f7+7urkKcVasQGQkLCyQlQSDQeJqkXVEZJUQHGMP8+fj2W9ja4sIF/P1bK9IJUBklRDcUCsycicC7gv8AAAEFSURBVCNH4OQEkQhubrpOiKiKxo0SohuGhti3DwEBKC5GaKji7t0yXWdEVES9UUJ06f59hIY2FRXNMTG5lpKSYkvDoDoh6o0SokvW1jh48D5w+erVqyEhIQ0NDbrOiLQZlVFCdKxbt27Jycm9e/e+cOHCq6++KpfLdZ0RaRsqo4ToXq9evU6dOmVvbx8fH//GG2/Qo7bOhcooIR3Cc889d/LkSSsrq3379q1evVrX6ZA2oFdMhHQgp0+fDg4OlslkEolkzJgxuk6HPBUqo4R0LAcPHqypqZk3b56uEyFPi8ooIYSohZ6NEkKIWqiMEkKIWqiMEkKIWqiMEkKIWqiMEkKIWv4fMJuS9+yw9/AAAAIYelRYdHJka2l0UEtMIHJka2l0IDIwMjEuMDkuMgAAeJx7v2/tPQYg4GVAABkglgfiBkY2hgQgzcjMzqAApJkhXCYmGM3OoAESxhBncwCLs7A5ZIDlGWEC7AxgASaEAIRmZnewANGMzHCtUJvhRsBUIhTAzGZAswTJVsIMmLHcDIwMjEwMTMxAAxhYWBlY2RhY2BXYOTKYODgTOLkUuLgVuHkymHh4E3j5Evj4GfgEMpg4BTOYBIUShIRVmIVFtJiYmYRERURVmEXFMpjExBPEJTKYJCQTJKUymISkM5jYGBmk2RMEuBOkRBNEWNgY2VhZmJnYODgFhaTZWXm4Bfh42cTEJSSlRMWjGIEeh8dFlMkihz0rVx8AcUrWNzrkRneA2cabVR1u3J0EZp8NOmkvLB8PZgd4MzrMq2UGs1m/xzn0f7yzH8TOYJjgkCpqChbnaG6wZ2opBovfOrbTfj7zNzuw+Nl7dsq7TtmD2BrvEu3b/jk4gNj26mIOscxhYPYvExsHQeHpYPbzJ70Oe2dMArOD0mc4HG11B7O3VBxx4KpQBLOjzeMc1KaJgdkaS5r3l21YADb/1/M9+2dONtkHYm+Stzng
aMEIdk+dZu0BMytJsJpZWWsPbPPtArut/v3ZAyvPrAeLRx39c+DPoVYwW72b9eDW/0fB5mROfHVAc+c+sDl9u3cfqF4/BczuDHy9b/3pxWC2GADiUYxFhEuX8wAAApt6VFh0TU9MIHJka2l0IDIwMjEuMDkuMgAAeJx9VVtuGzEM/PcpdIEV+Jb0mcRBURRxgDbtHfqf+6NDGckqgNB1ROzKYy41nGEuJa+f1x9/38vnJdfLpRT6z98Yo/xRIrq8lLwpj8/fvt/K09vD48fO0+vv29uvIr0o4zf4fMU+vL2+fOxweSpeKXofVA6vGmYxClWa1/lTKbdilVwiejmsSjAHbYCKjFId+UISiDvvvAEagFy9SddRDq3WQ5psgI5XSyUhEWTEnZvbDhjImHm0SysH14YiZQdsAKIy7Dllxi7DdFdjnzUSadJ+UB3OoroBjgk0Z51ntU42YoNjtCe/V2fveXwUS74DMk5NtXdxA6mV2F23QJmEqwWbA6imw7dAnTwOo4YjGNhham0HtNlrU+MhyVPnTL0B+uSxRfKMVzO3LHYDzM5EZR1NLdUhMnzXGG73FqI2GXkql4FyNsCOGtFikm7WQKSAnr4tMlsDpDVHm7OJ3EJtl1SyOZDDUMjBE+pkLXZZJY0D0YLq8CTRR/d90uwPnJVkRkpHGrFskWmdI1Ic8ALOpGBrqzZJ7xyt9jFocCrUnHiLzBYdPeVhTfNI2qPxru0S96SqJGkLUDaIZKdiafcz9YiAMgDlwDTYEjUbBcX7CIqEShvhu1qfb9cvs+k+rR5fb9dzWuVHzpmEh6Ln5GEsO+cLY/k5RRgrzlnBWO2cCPltP31vWON0t2Hx6mKegRe78gyy+NIysC4GtAxsi9MsA/tiKZ4hFu/Y3GmLSWy+vS9usAw8FtVzBlnFLTPwomHOILJo1TKILpq0DGKL9iyD+KKxfISAFilZBmmLYmzW0xdhMGg/350kY/MTcG9hWyjVbCgynQWnZFaB5PPHv0LcX/4BujZd9NBEj0EAAAFeelRYdFNNSUxFUyByZGtpdCAyMDIxLjA5LjIAAHicHZE7juQwDESvsmEbUGv4/6CxkZON+gKDiZz3CebwW7SdCE8lslg8L76ux/uSz/X4Ov+eX5de8+kB8NbzfD/O4zz1uOT4fH/+/fCf30ds1k5dsl2kfb1sZ0jw0s2cbuul21J4FOrSvF44WLBDoaYd60W7SiDVTeyuA0ydvRZvLy28efImKXMQgaRvYonb9aTNGWpo9ZTd2nYzJ8uAn6fudo9YQF1+y3wbtwySJJZBMS2tUV8TtkByVzcNIXMaDzXGLae6ViTnrVIl8TV2mkjv6hURNogDPxAG8g6KYZI9vlCWbEq1s6hOLCQksjAEuU1ymD4x9MIIViE5Gq+eMjYHBIS4ySVyCFJnpOmboqoXfGgY7ELjRORTuWTimZ0gVx07iWcyvcxZC5FYkc1WkKURNouFCtPdyhS5renNbOv4/Q+Ce3OznRP1XwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<rdkit.Chem.rdchem.Mol at 0x7fe0a0f85bc0>"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from rdkit import Chem\n",
    "from rdkit.Chem.Draw import IPythonConsole\n",
    "from rdkit.Chem import Draw\n",
    "\n",
    "\n",
    "def mol_with_atom_index(mol):\n",
    "    for atom in mol.GetAtoms():\n",
    "        atom.SetAtomMapNum(atom.GetIdx())\n",
    "    return mol\n",
    "\n",
    "# Test in a kinase inhibitor\n",
    "mol = Chem.MolFromSmiles(\"CN1CCN(CC1)C1=CC(NC2=NNC(C)=C2)=NC(\\\\C=C\\\\C2=CC=CC=C2)=N1\")\n",
    "# Default\n",
    "mol"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "4d52cf91-145b-44c9-b3ae-45a412a4dddd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAeu0lEQVR4nO3deVRU5f8H8PdsDJuMbCqgaIqiKYpiLrixKYOCuWGCodm3xoNbhhkmplb6C5fj129mhmXkkhQiooigLDMouCSGmmGKS2AEgoCyDTAz3N8fV4lMWWbAceDzOp3TOHOfZz73NL6797nPfS6HYRgQQghRF1fbBRBCiG6jGCWEEI1QjBJCiEYoRgkhRCMUo4QQohGKUdIu3LqFyEjk5qK2FtevA0BeHh480HZZpEOgGCW679o1fP01hgzBxo24fh2RkQBw9iyysrRdGekQKEaJ7jt1CgsXwt4e8+YhPR1nzmDjRkRHa7ss0lFQjBLdZ2qKwkIAKCyEqSnGjUNICGbM0HZZpKPga7sA0rFlZmLfPohEMDPDsmVqdvLGG5g6FQ8eoLAQ776LTp0AQCiEnl4rVkrI83DoZlCiTYGB2LEDfD6WL8fGjTAyUqeTkyfh5YX6X/Kvv2LQoFaskZDG0Uk90TY+HwA6dUJVlZo9/PknGh4N1NS0QlWENBvFKNGq4cMRHo6LF5GfD0tLNTvx8wOP9/cfFYpWKY2QZqIYJVoVEAATE9y+DXt7rFypZieGhnB2/vuPtbWtUhohzUQxSrRq2jTMmgUDA6xfj61bUV6uZj9Tpvz9mmKUvFgUo0Sr+vYFgFu30KfP4xfqmTSJ/XcdkFdcrFFJ5eVIScHduwBw8yYA1NQgJ0ejPkm7RjFKtIqN0ezsv1+ox9Fxg7FxBrAIuGRgoH49NTVYuhT6+vjuO6SmYscOACgsxI8/qt8nae8oRolW1aennd3jF+rhcK5PnfoaEAYoNLnElJkJV1c4O2PNGkRH4+5dbN2K3bvV75B0ABSjRKvqY7RfPxgYlN+/r3ZPEydOZF+oH6MKBYyN8egRAJSVwcgIvXph6VK89ZbaVZGOgGKUaFXPnj+5uq6wsEjt2ZNXXS2+dEntnsRiMYfDgdox+vvvcHREbi7y8vD551i7FgsXgseDUAihEFz6m0Kei+5iIlrWv3//GzduJCcnu7u7W1paFrJ3x6ulV69eOTk5fn5+O3fuNDU1bUHLgwchkaCyEhMn4tQpKJWPbwogpBno/7FEy/r27Qvg0aNHnTp1KioqKi0tVaMTpVK5atWqnJwcMzOziIgIKyur2bNnJyUlNX2UoFRi1SrMnYvKSgQEICYGAGUoaRGKUaJlbIxmZ2f36dMHwK2Wz3kqKiry9PTctGmTUCicNm2ap6enQqE4dOjQxIkTBw0a9L///a+kpOSZDQsKCtbNnYsdOyAU4uuvsW8fDA013B3SAVGMEi1jY/T69evdunUDkN3Ci/UZGRnDhw9PSUmxsbGRyWR79uxJSEjIzc0NDQ21tbXNyspavny5tbX1vw9Oz507N3z48E8jI8NdXZGaioULW3e/SAfCEKJVMpmse/fuRkZGNjY2hoaG/v7+V69ebWbbsLAwPT09AOPGjcvPz3/qU6VSmZiY6Ovry3tyx32/fv1CQ0OLiooab0hIi1CMEm2Sy+ULFixgM6579+7sCw6HM2HChAMHDsjl8kYa/uc//2G3l0gktbW1jXzL3bt3Q0JCrKys2O0FAgH7YtmyZQqFog12i3QsFKNEa3Jzc0eMGAHAwMDg+++/ZxgmIyNDIpF0YtddBjp37iyRSP59cPrvhs2hUqn2799vb29vYmJiZ2cnEAi2b9/eyrtEOiSKUaIliYk3PD0B2NnZPRWUZWVlYWFhTk5O9UNPTk5OYWFhlZWVDMNIpdIuXbo8s2GTLl++DKB///7btm0DsGTJktbcI9JRUYySF66ujgkNZXg8BkhcvLi0tPR5Gz51cGpmZjZ+/Hh2oNPb27uRhs9TUVHB4XCEQuGxY8cAeHp6arYnhDAMxSh50crLmVmzGIDhcJjgYEalarJFVVVVZGSkh4cHAHNz
cw6HExwcrGpGw2eytrYGIJVKAfTu3Vu9TghpiCY8kbahUj1ePLSqCrW1qK1FTg6USixbhqgodO6M2FiEhjbnJksDAwNfX9/ExMRvvvmmuLiYz+d//PHHXHXvzmTnV1VXV/P5/JycnFpanJRojGKUtI38fHz1FQAcPozkZCxahDNn8O67WL0anp64ePEfCy03z8CBAwEoFIrTp0+rXVe/fv0GWVrW/fXXnilT0saNU9y5o3ZXhLDopjfSZrKyEBODX34Bj4elSzFkCKytcekSEhLU60/vyQOTExMTvby81Ovk6379uEVFuHYNNTWQyXDrFvr3V68rQlh0NErajLk5+vZF167gcFqlv/r5nqdOnVK7Ey67zL7mC0UT8gTFKGkzXbti4EDY2MDNDTt2IDISBw5g8mS1+9PT05MAJcCm334rUftkvLUWiibkCVooj7QNlQoVFRCJUFkJPh8Mg5wc9O6NJ0eUapDfuqXft+/jI9vwcDVXU5bLYWwMPh+HD8PHBx4eSExUuyRCQEejpK3weBCJAMDICEIh9PVhb69JhgIwEAr/Hh1Q+7zewAA2NqithZERAGj4/DtC6GiU6JKCAjy5Lx4iEUpK1FyUfu9eMAzKyyEQ4O238eTKFSHqoaNRojsa5t2jR7h8Wc1+5s/HhQuYPBljxmDVqlYpjXRkFKNEdzx12Lhtm/pdMQz69IGDA6qqNCyKEIpRojv09P6RpJMno6YGABgGLb0ZiX3sHcNApWq9+kgHRTFKdIeeHk6dQp8+cHODqyuGD8cHHwBAXl6Lj0z9/bFmDT76CBJJW1RKOhS6i4nolM6d8f77WLwYjx5hwwaUluL0aRQVtbgfd3e4u7dBfaQjoqNRolNqayEUAoCe3uMTcw6nte6SIkQ9FKNEpwwdivR0nDqF9evh5wdTU4wbhxEjtF0W6dBo3ijRNTU1uHwZPXuiWzfcu4cePaBQoKQEXbtquzLSQVGMEkKIRuiknhBCNEIxSgghGqEYJYQQjVCMkvYvNxcODqipwZYtyMtrbMsTJ3DzJgDExiIuDgCqq5GS8iKKJLqLpt+TDmH8eGzbBj4f4eGIiYGpKfh8dOoEfX0YGMDYGAIBOnfGvXsA8OWXSE6GUokpUyCXQyaDm5u2d4C8xChGSYfQty/KypCbi9paXLr03M0WLICfHw4cAIB797BxI6qraXY/aQLFKOkoPvgAjo44cACBgXj4EAoFKiogl6O6GuXlUCpRWorKSnh4ICgIcjl69EBICEpL8d//art08nKjGCXtn1SK4mI8eoSUFHTt2tga/Lt3A8DKldixA9bWACAQgH0IHiHPQ9PvSfs3ZAiuXkVMDF5/vemNi4sxZgxKS5Gfr+bi+qSjoZ8JaeeuXcPVqzA3h6Mj5PKmtzc3h0KBwkJkZrZ9caRdoBgl7dz33wOAvz9WrUK3bo+nMTVu0iQAiI9v07pI+0ExStozlQoHDwLAjBk4ehQVFRg8uOlWXl4AkJDQtrWRdoNilLRnUukvvXvfHjas7tYtyOVwc0OPHk23cneHUIjz51FS0vYlEt1HMUras+++25qebjdr1ub09Ihhw0rmzWvWBVUjIwQExA0fPi0l5WhbV0jaAYpR0m6VlZUdPXqUy+W6uEzYu3fujRu206dXNrOtvf31CxeOnjhBMUqaRjFK2q1Dhw5VVVW5urqeOnWKYZiZM2caGxs3s61YLAYQHx9PMwJJkyhGSbu1b98+AAEBAREREeyL5rQqKyurqakZNGiQra1tQUHBlStX2rZKovsoRkm7olKpLl26tH79eicnp99++43P5zs4OEil0h07drg1Y32RxMREBweH9evXA/D09ATw008/tXXNROcxhOi+3Fxm925mzpy5JiYm9b9tAwMDACYmJtHR0U32UFZWtnDhQg6HA2D06NF5eXnOzs5mZmYAnJyctm/f/uDBgxewI0QXUYwSHXDo0OMXUVHMhg1MTQ2Tk8OkpTFnzjDBwYyTE8PhMADj
5OQJoHfv3hKJ5NixYw8fPnzrrbcAcDicZcuWKRSK5/V/5swZOzs7AAKBIDg4+ODBgxYWFmwQs1nMvvbz8ztx4oRSqXxBu010BMUo0QGLFz9+sWQJM20as2ULk5HBzJ3LAI//MTFhZsxgDhy4lpub+1TbsLAwgUAAYMKECQUFBU99WllZGRwczOVyAQwePDg5OXnWrFlsbk6cODEnJ6eqqioyMtLb25vH47HvW1lZLVu2LCMj4wXsONEJtDQJ0QFvvYW+fQHgjz8gEsHYGIMG4c4dRERALIZYDGfnxtZtSktLmz17dn5+fvfu3aOiokaOHMm+n56evmDBguzsbIFAEBQUNGrUqEWLFuXn5xsaGq5du3blypXcBmuT5OXlRUVFhYeH1190evXVV+fNmzd//vxu3bq13b4THaDtHCekaQ2PRlesYMrLGTc3Zt++FvTAjnUCEAqFu3fvZt9cvnw5AAcHB5lMJpFI2L8RY8eOzc7ObqSrCxcuLFmyhD3rB8Dj8ZYuXarmjpF2gWKU6IBvvnn84ttvme3bGYZhTpxgUlNb1olCoQgODmazLyAgoKqqqrKyctu2bUePHrW2tgZgaGgYGhqqUqma05tSqUxMTAwICDAwMNi0aVPLSiHtC53UE11SWQkzM1hb4+bNxs7iG/HDDz9IJJKqqqphw4aFh4fv3Llz9+7dAMaMGRMeHt6XHTtoiZKSEi6X27lzZ3WqIe0CxSjRJadOwdMTI0bgwgX1O8nMzJw5c+bdu3eFQmFNTQ17ELp48WIurdJM1EIPESG6RCYDAFdXjToZOnRoRkZGQEDAsGHDUlJSwsPD+/Xr1xrVkQ6KjkaJLhk9GufPIz4eYnHrdMgwDIee/Ek0QzFKdEZFBczMwDAoKUGnTtquhpAn6KSe6IyzZ2WOjqcHDPDp1Gmotmsh5G80pk50RkpKwsWL62xsDmm7EEL+gWKU6AypVArAxcVF24UQ8g80Nkp0Q3l5uZmZGYfDKS0tNTIy0nY5hPyNjkaJboiPj1cqlSNGjKAMJS8busREXmq//fbb8ePHk5KSZDJZ9+7dKysrlUoln0+/W/ISoZN68tJ58OBBYmJifHz8yZMnCwsL2Tf19PS4XG51dfWcOXP2799PSUpeHhSj5KVQV1eXmZmZlJTEHngqlUr2/W7duk2cONHb29vT0/PmzZtisbikpMTb2zsqKkooFGq3ZkJYFKNE+4KCgvbv3//gwQP2j/r6+uPHjxeLxSNHjszLy4uNjY2Li7t69aqNjc3Fixc9PT1LS0unTZv+44/RFKTkZUAxSrTso48+iouL+/XXX3v37u3h4eHq6ioSic6cORMfH3/lypX632dERMScOXMAXL58WSwW9+mzTU/PPzYWzX5kMiFthWKUaBPDMF26dHnw4EFKSsqQIUMkEklSUtKjR4/YT42MjNzc3MRisVgs7t27d32rrKwKNzfj+/fh6orYWNCle6JdFKNEm65cueLo6Ghra5uTk1NXV2dlZVVYWMgelnp7e0+aNOl5A6A3bsDDA3/+iTFjcOIEGjwPlJAXjS53Em1ib0xinyDP5XIjIiLs7OxsbW2bbGhvD6kU7u5IT8e774IeJk+0iKbfE22SyWRocH+nm5tbczKUZWeH1FRMmAAHB2zcCAAffdQWNRLSBIpRojV1dXVpaWnQ4Db5Xr0gk8HYGAUFuHEDcnlrlkdIM1GMkharqalplX6uXLlSXFz8yiuv9OzZU8Ouli3Dtm2gcX6iFRSjpLnu3r27e/fu2bNnd+nSZd26dRERERp2yA6Mumr4SBAAgL4+pk3DkwfIE/JC0SUm0hi5XJ6amhofH5+QkHDz5k32TQ6H89lnn3G53Nra2vnz56vd+VMDo2oTiXDyJHx88PnnGvZEiDpowhN5hjt37rD3ZSYkJJSXl7NvmpmZubu7e3h4TJ48+Ycffli1ahWHw/niiy+WLFmixleoVCoLC4uHDx/m5OQ0/7JSQ1FRuH8fr7+ORYsQ
G4t9+xAQoEY3hGistR98T3SYQqFYvHhxw4nuXC73tdde+/jjj8+ePatUKhtuvHXrVg6Hw+FwvvzyKzW+KyMjA4CdnZ3a1bq4MABz+DDTuTMDMDk5avdEiEbopJ78bc2aNTExMXl5eebm5m5ubuwceGtr62duvGLFCmNj4/feW7l378yHDxES0rLv0nAp+5oanD8PLhedO+PhQ/TpA7WOaAlpBRSj5LHCwsLNmzfr6+unp6ePGjWKy2368uPChQsNDWe89ZblxYvgcLB6dXO/q6qqKiYmBoCLiwvDMEuWLJkyZcrkyZObX+25c6iuxtCh+OUXQOMn1xOiCbpSTx6TSqUMw0yYMMHZ2bk5GcoKCLA8cAB8PkJCsGpVExvfuXOHvdbftWvXCxcuiESiu3fvHjp06Kuvvpo+fTobrM109uyXI0eGzpyZ9euvZRwOJkxoflNCWpuWBxXIS2PhwoUAQkND1Wh78CDD5zMA89//Pv1RRQVz9CgTGMh4eb1d/6vjcrl9+vQBwF6kCgoKAiAQCI4cSW/mN44fPx7AkSNHRCKRmVm/vLyHapRNSKugo1HyGDv9SL1ZnH5+OHIEgwbhzz+RmgoAn3yCrVvh4QFzc7z+OnbtQkWFk4WFha+vb1hY2L17927durVr1y4Oh7Ns2TIzM7N169Y5OLz/5pvOe/c2/XXV1dU///wzj8czMTF59OiRhQWsrUVqlE1I69B2jpOXQn5+pb29o0gkUigUaneiUDBBQUxgIFNTwyxYwAAMwPB4jLMz89lnTEZGjUqleqpJWFgYO4CwfXv0hg1KgOFymW+/beKLzp8/z+PxnJycNm3aBEAikahdMyGao3mjBAAiIuDvj+nTK6OjNVq884MPsGAB4uKQnw+BAE5OmDgRZmaNf3XEpk2pt2/vWryYY2qKVatgbo7sbJiaNtaqrKwsLy9vxYoV8fHx9Ss6E6IVdFJPAEAqBYDRo1thAeSBAyGXQ6HA5s14440mMhSAn5/fqlW7qqs5mzahpARffon4+CYyFACXy83KymIHIthxUkK0hWKUAIBMBrTGtKEePZCRgeXL8cYbLWg1Zw7nyBEIhdi8GdeuYfjw52557dq1LVu2uLu7m5ubz5o1SyAQ8Hi88PBwTesmRAN0Uk/w11+wsYFIhOJi8HgaddWvH7Kz8fPPeO21FreNjYWvL2pqcPgwZsyASgWlEkIhKitx7hxiYxETA6FwcHb2rwB4PN6oUaMsLS2PHTtWV1e3cePG1c2ftkpIq6Lp9wQpKQAwbpymGfrXX8jOhkiEYcPUae7jg6NHcfEitmxB9+4wNsa33+KXX3D2LBSK+m3mjh17w8vLy8PDw9TUFMDBgwfnz58fEhJSVlYWGhqq0Q4QohaK0Y4uLg5TpyI+Hvfva9qV5nHs6QlPT5SUYM8esAuepKaCx4OTE7y94eODYcOCOZx/NPH39+fz+W+++eamTZtUKtWWLVsa/4rKykqpVBofH5+fnx8dHa1moYQ0QDHa0R04gKwszJuH777TtCt2gFXjde/A5eKddxAWhgEDcPgw3N0hanRW6OzZsw0MDHx9fbdu3VpZWblz507OU1n7ZM2q2NjYpKSk6upqABwOJz8/38rKStNySYdHMdpB3b+PhAQUFcHaGtXVuH27Ffo8f14OGGgeowBeew379oHPx4wZzdrex8cnOjp65syZu3btUiqVX3/9NTsdNSsr64svvkhISMjJyWG3ZAdVvby8xowZk5aWdvLkSQcHB0tLS39//1aom3RM2p64Sl4chUKRlqZavZoZNozhcBiAMTZm3nuPqaxk3niD+b//Y/65El7L5Obm6unpOTt7a9IJ6949hmGYigqmsLBlDRMSEgwMDAD4+/uz9xGkp6ezv3NLS0v2BiqZTBYaGurh4aGnp8d+ZGZmBmDt2rWa1k06Kjoabf8KCwtTU1NjY2OPHz/eq9ftzExTAIaGcHWFlxeKimBoCH9/ZGZi4EDs3YuRI9X5FqlU
Wltba2HB1fA6FYDu3QHAyAhGLZzG6unpmZCQ4O3tffDgwbq6uv37948cOTI0NHTkyJEFBQUnT55ct25dQUEBu7FAIHBxcfHy8uJwOCEhIZ9++qlcLt+8ebOm1ZMOSNs5TtqQTCYbMmRIw//cvr7H33+fOXmSkcuf3njhQgZgRCImvbnLg/zDggULAGzbtk3zsjWUlpZmYmICYNKkSevWrRs9ejSvQbTb2tpKJJLo6OiysrL6JseOHRMKhQACAwPr6uq0WDzRRRSj7dnVq1cBGBoaenh4bN++/e7du41srFQyAQEMwBgZMcnJLf6uXr16AcjMzFS72lZ07ty5zp07d+vWjY1OPp8/ZsyY0NDQjIyM56VkXFycvr4+gHffffff9/4T0giK0XZOJpPV1NQ0c2Olkpk3jwEYQ0NGKi1oZiuFQhEZGQnAzMzs5QmgzMxMqVQaGBh47NixioqK5jSpH1qdO3dui5Zoqaur02RJF6Lr6C4m8g8Mg6VLceXKzYwMx59++nHq1KnP2/L+/fsnT548fvx4YmLiw4cPHR0d/fz8PvzwwxdZbas7ffq0t7d3eXn5nDlz9u/fz+c3dvGgpKQkOTk5KSkpLi7u7bffBvDpp5++qErJy0TbOU5eOnV1zOLFSwHo6enFxMQ0/KimpiY5OXnlypUODg4Nf0Wvvvrq+vXrtVVw6zpz5gw7tOrr61tbW/vUpyqV6sKFC5988smoUaMaDrmyAwLLly+nodUOiGKUPENdXR27Ij2Px9u/f39+fv7evXt9fX1FDebB1w+5/vHHH9qut5VlZGSws6CmTJkil8sZhikqKoqMjJRIJA0f8NdwyPX48eNskkokkpdnZIO8GHRST55r9erVn3/+uUAgUNTf0w4MHjxYLBaLxeKxY8cKBAItltemLl265OnpWVxcPGDAAGNj40uXLtXV1bEf9e7dWywWe3l5ubq6Gj2Zk3Xu3Lni4uLZs2fL5fK5c+fu3buXp/nML6IjKEZJYzZs2FBdXb19+3ZXV1cfHx8vL68ePXpou6gXJCsry9nZWSQS5ebm6uvrjx071sPDw8PDw8nJ6aktExMTfXx8XF1dg4KCZs6c2cyhVdJuUIySJigUCg6H0zETwdbW9t69ezt37lywYAF7Ef+Zrl+/7u7unp+fP378+JCQEF9f37KyMl9f3x9++KEdH7CTehSjhDxbbm5uz549zczMCgsLmzxDv3Hjhru7e15e3tixYzdu3Dh9+vSSkpIpU6ZERUWxY6akHaPV7wl5tpSUFADjx48PCgoaOnRoYmJiIxvb29unpaW98soraWlpQUFBhw8ftrCwiIuLmz59ulwuf1ElE+2gGCXk2aRSKQBXV9fk5OTLly8bGho2vn2vXr1kMlmfPn0uXboUFBQUExNjZWWVkJDg5eVVUVHxQkom2kExSsizpaamAhgyZEhWVpahoeFrzXguiq2trVQq7du3b2ZmpkQiiYiIsLKySk1NXbRoUdvXS7SGYpSQZ7hz505OTo65ufn9+/cZhhk7dmz9wnqN69GjR2pq6oABA3Jzcw0MDGQymYmJSXV1dVlZWVvXTLSlI15+JaRJ7Bm9i4sLe0zq0pLFqK2srGQy2e3bt0eMGHHnzp2ysrKUlBRjY+M2KpVoHR2NEvIMMpkMgIuLS32etqh5ly5dRo8ejQZxzK7GT9ol+k9LyDOcPn0awJAhQ37//XdjY+Phw4er1099HLdeaeSlQzFKyNOys7Nzc3O7dOny559/sgOjas+iZ+OYYrR9oxgl5Gn1h5BqDIw2dOvWrdzcXEtLy4EDB7ZedeSlQ5eYCHla/YDmiBEjbGxsvL29Nezn3w98Ju0JxSghT7OzszMxMVEqlU5OTv9eiKT5aGC0g6CTekKeZmdnV1ZWtmLFiqioKE36YccEXF1dW6ku8pKiGCXkafPmzVu7dq1CoZgzZ87evXvV6+TmzZt5eXldunTp379/65ZHXjYUo4Q8wyeffBIaGqpSqd5+
++3vvvuuma2ys7N37drFLvBcf0s+DYy2ezQ2SsizBQcHczic4ODgd955p7KycunSpc/cTC6Xp6enJyUlxcbGZmVlARgxYoSTkxMNjHYcFKOEPNeHH35obGy8ZMmS9957T6VSLV++vP6j33//PT4+PiEh4fTp09XV1eybFhYWEydOFAgE165dYxfWo4HRjoBilJDGLFq0iM/nBwYGvv/++xUVFWvWrAGQl5c3YMAAdgMulzty5EixWOzi4qJQKI4fPz516tScnBxTU9PAwEB7e3utlk9eBFr9npCm7dmzRyKR1NXVBQcHh4aGAvDw8LC2thaLxTY2NufPn09ISEhPT69/9l+3bt28vLx27NhR/8w70o5RjBLSLD/++GNAQIBSqQwODv74449TUlKOHz8eHx9/7949dgMej+fo6Ojt7e3j4zNs2DC6stRxUIwS0lw//fRTQECAQqHg8XgqlYp9s3v37uwTpz08PEQikXYrJFpBMUpICxw5cuTcuXPbt28fPHgwHXgSFsUoIS1WVVXV5KOZSMdBMUoIIRqhu5gIIUQjFKOEEKIRilFCCNEIxSghhGjk/wFBkMf0J/xcKAAAAp16VFh0cmRraXRQS0wgcmRraXQgMjAyMS4wOS4yAAB4nIWRXUgUURTH770zzmj74a6rs98747arIunTPgTG7l1CEsOgwofephKZ6CHFIrAoIZJqJYJCjIpErHxIfIiw2NgZl5DyKwskQujFh7ICI2oNwmjmzK6GQl24nN8553//9x7uSmb0PdKXDW2sKn3X6LsHFyNZjxhzUkyPBLMmYMyud0DJ6LliRKwfgQL5RwNOMjySjNxMCSlE3pRtqXPU9ONpvp/POdOfwRtQ6ND1wv9hsy1vuhG8+V6e7oR5/nqAOc9WTwtGLMYEEYbFhEVskcRyNQTziC+W+JIass2CLFZktSGbHdlLUakD2Z2Ss0whZS7ZVR5hyitqCUNcQoUQYQS3Qtwe2eNViNcn+/wKcQUUEgjKwZAUEiVRUohUKVeG5fB2FI4oJBhViMOCok7ZL8gRUS5nOYuj1G6zcmWuQDDq5Nwer88vFEliJFzpfoP1Cda//3i/GhezUtJIFueERA8uKbD6suU7NVjXZFIzz4EDT9bUD1YGNGd3N2v9bR+hvnrbql2c9SXzmsQ5PypoqC9nT+Y1NPN4HvTV1qKJkXevVIPx50WtvuMW8MG6eS23cCpuMIeGtM6mqoTBc9J57dPJqYzBwr6UVv3iC+iPnnmqvR5waAbHJsNaWmzfZfDKbFqN/bgAmrH6yWddbhFYvdebqGtZAO4eFemjpUHg8b42ev2AC3wOOVtoidIBvPfnJXpk+CbwQHScMlf6gPcsZem13w3AI12D9GHDN/BZOz0Wb4+3wptzWoN6X/4KzDxIZ6YaW2H2runLid7uE8B8bjqRmhkGvrHcSLNrQ8D7QzJdHjwG3Bm6Q++mmoB/HXbQq7Ux4LfNFm3Hahr8hT//X7zo2o8UfAAAA2B6VFh0TU9MIHJka2l0IDIwMjEuMDkuMgAAeJx9VkmOFDEQvPcr/IGxnIu3AwdmYYQQPRIM/IE7/xcRbsZVIyX0LCpbWZnhiMxwXxI/3x6//Pqd9kcfL5eUyn9+55zpp5VSLl8TH9L90/Pna3p4/Xj/tvPw8uP6+j3ZSF7S+nkf+/H15evbjqSXVLJ1l+lJSi7ebRq2yvocr2p6wG4dsyBw5Np0tB7EGeLudmDPzdxnDQIdle926ZbF52wzCKzMKHkOlSEoLbO0WoLAxoyWpRVDop61qfYRBHYGKhL1MkqaufUqvQWBA6VPlUtpXTSIm0i4C488rZYRkSgFgbtwy1WROSJHhIce2aQ6aMbxm9qw6NQAdE13LY+ubitUxa1JFLrEQdlSqxGfeindo0hnZM2laS2MbBDRwyNVljeQ1Kz3pSgiNQxtTOrZqulQIh0+R40Ukn6r32czHBBiaZ2mUcMJRUIA4JmvAPHikZoyCVUyGFVd
9YuVqRGp2FxtXBTdqXxqYLVG2qusJilVJjilUq25RIdSzpBmn45TLaFqw3MUSaEs1y5lDB7OikyJGkV9RWpBSacOvTYLp0M5R5518NCUYRQxiXhS6gTu1cxWpE0kj7hXytRymeDJiXNynEOWxsoJbQTcUPqJV8LqVAld5HqbtWE6w2Gyvxp16I7ORsreYzpNKDxyNhjHBF1mmKao701vYvZWoSvYBK01LG5IKRlNyVPUf7uXOQINFKGikgG8EXqN1ZvmBRWxyoAaE2RtCQl/na3RdszgI1FgX/2mtU64jeYhOH3o2X9HCCboTia9DR+RgzxdH99dH7cL5f7l+nhcKIK7AjPx6fmDHJcHN+24I8AtLoB3Mb426+H6gr92eDvIS/1wcCYch0/DntM87Fi5dbZdoaWKnvyVSzjiyUaFBil+ckuhEcrGxAUNTzYsLuhrspFxsbY3OHiZ06Vk48PC6UZ6tiah6ehBGVlDD+rGrItabG7MWDi9QjdmJa3wBN2YseASQ30yAueM68aMhXOWdWNWYiaEjVmJGbNp50GVtKbrNJHCYbONGQvnUNmhPTFjemxjNmLGmNjGbKsXKjr9NBy+djZmI+bVu6d2F5J9qMxvQPx3aAE/cx7l4NDZeXj1wMweP3c0129fr/B8+QObQcyEXh/x7wAAAcB6VFh0U01JTEVTIHJka2l0IDIwMjEuMDkuMgAAeJxFkk1u21AMhK/SpQTIDP8eHwnDQAFtsooPEGSlvU+Qw3copa0XtvCR5gyH2g85juXj0P62ff9Y9nXfbX0dy9v+2N8OO/oDoOvr8/X+JfR87Mtz/dx/v39dv9fDvjye6/PX9+JUXDU3IzEbut2NBksOgBjsY7sr6RiVm1LKkO0uxDPQaTS9G+83Jgs0F5CZWp5omrnYJpRzVncpyQx3EI/0PMlwiG5MLA6tCWZUGRjRQ2u4GtggDu22cKAmAZNjGJA68/QT5VSMvwmpuAVQksnw2Shg6xo1K0y2W29V1sjJhml2VzrEgdriVG3ExqXnQqwu1RA+oNBZoTyk7BSIcO2wvLzidDGi90aeU5ANFI2lpIkyktuw6xyB/e9OmtyunJLFMAc+FQE2sTpTDuKChZ5ThuS7B7HJ6TJL+1TIzHUCMaVp9cZCWBO1QeJ9Z6hjIQSMf6NwnUFqcm5FMYfkuX+l4ilR4RjnWSTY5jZJQ/W68Ei8Og4ULXC+BvAFEn+1/vUkjdA8yU+PIFDvw/3XYo4pHeFlBzdWlWrHl3Zi8YHC+v0HNUGfy8UF3dYAAAAASUVORK5CYII=",
      "text/plain": [
       "<rdkit.Chem.rdchem.Mol at 0x7fe01c79b7b0>"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test in a kinase inhibitor\n",
    "mol = Chem.MolFromSmiles(\"O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\\\\C=C\\\\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1\")\n",
    "# Default\n",
    "mol"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a506e269-702b-41fd-9e6d-7409a6b10dce",
   "metadata": {},
   "source": [
    "___________"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "13744dab-0de6-4bfb-8ddb-0b858c9c3e69",
   "metadata": {},
   "source": [
    "## Create drug SMILES dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "1e0f4ec2-23f9-48d9-91df-d3f420a22f5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_dict = dict(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f46640c-cf9f-462c-96ff-fe6f6c75c1cf",
   "metadata": {},
   "source": [
    "The dict has 188 different entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "b820dbf4-c441-46bf-9e32-aee10366c83a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "188"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(drug_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73dfbd1d-87aa-487c-812a-7d32479b512b",
   "metadata": {},
   "source": [
    "Checking that the `'ENMD-2076'` entry does not include the acid:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "0be9e830-9da9-4978-a4f2-1953c2cac198",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAcIAAACWCAIAAADCEh9HAAAABmJLR0QA/wD/AP+gvaeTAAAgAElEQVR4nO3deVxV5dYH8N9hngQEBwYRVJxSQSWVwwGupGIpoqCVduWGmZp9nPLVVxuMsqxeNBNzblA0xdREEBXBCTkDkEdQTPSCJTEIMioynuF5/9iEZJZyBg4c1/dzP/eDe5+znrW5uu6z934GHmMMhBBCVGWg6wQIIaRzozJKCCFqoTJKCCFqoTJKCCFqoTJKCCFqoTJKiIp+/x2VlQCgUODWLV1nQ3SHyighKoqMxL//DQA1NVi5UtfZEN2hMkqI6pydERPT/HN1NRSKp/3imTOorgaAkhLcvImUlObjFRXIztZ0lkTLjHSdACGd2LJlWLQIfD4ATJyIjAx07QpHR3TtCicnODo2/3fLHx0cYGAAAB98AC8vbN2KzExkZ+P4caSmAsDNmzh2DJGRurwo0lZURglRnYkJPvgAn3wCAAoFeDxUVaGq6m8/b2qKHj3g5wdraygUkEjaLVOiRVRGCVHL+PHYvRsALl0CgKoqFBfjzh0UF6OqqvmHlj+WlKCgAHfvgsfDJ5/g1VexfDkAVFcjLAwAysrg4aG7iyEqoTJKiIpcXGBrCwAbNiAtrflg167o2hVDhjz+K/X1KC2FXI6330b37pg5E7t2wdcXtrbYtw8AxGIcO9Yu2RPNoVdMhKgiIwOrV2PiRABwdERIyFN9y9wcbm5wd2/+45tv4u5dbWVI2g31RglRRVQUgOYyqoIvvgCAigosWAA+H35+zccHD4adnSbyI+2IRwvlEdJWd+7AzQ0KBfLy4OamepxRo3DpEs6dQ0CAxnIj7Y9u6glps+3b0dSEkBC4uSEnB7t3o7FRlTjcSCl6X9/ZURklpG0aG7FrFwAsWQIAGzfijTewZo0qoaiM6ge6qSekbaKjER6OESNw+TIqK+Higvp65ORg4MA2h8rPh5sb7O1RVgYeTwu5knZBvVFC2mbLFuCPruiOHairw6RJqtRQAK6ucHZGRQX++19NZkjaGZVRQtpAJBI1NS0cP75g5kzI5dixAwCWLlU9oLc3QPf1nRyVUULaICoq6urVHXz+N2ZmOHHiVm2tcvBgjB+vekB6PKoHqIwS8rSKioqOHTtmbGy8YMECAOvXv15X123VqnR1Hmv6+KB799Ly8jMay5K0OyqjhDytLVu2yGSyl19+2dnZ+fLlyyKRyMyMzZgxVJ2YI0c23b/veuzYxHv37mkqT9LOqIwS8lTq6+u/+eYbAEuWLAEQFRUFYN68eZaWluqENTU1GTlypFKpTE9P10iepP1RGSXkqezfv7+iosLLy2vMmDF37949dOiQoaHhwoUL1Y/M5/MBiMVi9UMRnaAySshT+frrrwEsX74cwP79+xsaGqZMmdKnTx/1I3NlVEKvmTotGn5PyOPJ5fKbN2+KRCKhUHjx4sUHDx7U1dUVFBTY29srlcrExEQHB4eRI0eq31BxcbGzs7O1tXVVVZWBAfVsOh8qo4Q8VFyM9PTjIlGKRCKRSqWNrabK29raVldXDx8+/MSJE05OTmo2dP78+W+//Xbv3r2GhoYA3Nzc8vPzs7Ozhw5V64UV0QlaKI/orfXrmzfsvHkT//0vpFL8+9/o3x9ZWaisxAsvAIBCgRs3IBJBKIRUiuvX4en55ZUrzTvM9e3bVyAQeHl5+fr62traBgUFZWVljRo16uTJk56enqplVVBQsGLFikOHDgEIDAx8/fXXAfD5/Pz8fLFYTGW0M6LeKNFbvr4QCgHg3DmkpCApCY6OOHoUsbH45RfU1kIsxqVLqKt7+BVra8yatdvBId/b25vP59vY2LQOWFVVNW3atIsXL9ra2sbGxo4dO7ZN+chksm3btq1Zs6ampsbCwmLlypWrV682MzOrrKwMDQ2VSqVmZmYrVqyYP39+165d1b980m6ojBK9NWoUtm0DgEuX
UFICiQQBAXB1hbk5rl3Dhx82f8zREb6+EAjg64sRI/DPDycbGxvDw8MPHjxoYmKye/fu11577SmTOXPmzJIlS3JycgAEBQVt2bLF1dVVoVDs3LlzzZo1lZWVZmZmDQ0NALp06RIeHr5o0aIBAwaofvGkPTFC9NSwYSwqikVFsUWL2IcfsgkTWEMD8/dne/awqCj22Wfs+HFWXt7msEqlMiIiAgCPx4uIiHji5/Py8l5++WXun9vAgQMTExO54xkZGaNHj+aOv/DCC1evXk1OTg4KCuLxeFzw8ePHx8fHK5XKNqdI2heVUaK3BILmH86ebS6jjLGkJPbccywqSt3gmzZt4t6qv/nmmzKZ7LGfqauri4iIMDMzA2BpaRkREdHY2MgYKy4uDgsL48plr169oqOjW3/r5s2bS5YsaRnV7+7uvmnTppqaGnUzJlpDZZTorceWUcbYK69ooIwyxo4ePWpubg5g6tSptbW1j5yNj493c3Pj+pVhYWElJSWMsaampk2bNllbWwMwNzdftWrV39XH6urqTZs2ubq6csXU2tp6/vz5N27c0EDeRNOojBK91dTU/INSyeRytnw527GD/aXcqUUikXTv3h3A6NGjS0tLW45fu3aN66t6eXmJxWLu4Llz54b8sfNyUFDQr7/++sT4CoUiPj5+/B9LSBkYGNCdfgdEZZQ8E0pKmIEBMzdnDx5oOHJeXl7//v0B9O3bt3VvceXKlTt27FAoFIyxwsLCsLAwrhT279//xIkTbW0lMzNz/vz5XOeXe8a6adOmv3aBiU5QGSXPhO3bGcCmTdNK8PLych8fHwB2dnapqamtTzU2Nm7atMnKygqAhYVFREREQ0ODyg2VlpauXbvW0dGRK6b29varV6+urKxU+wqIWqiMkmfCiy8ygO3era34Dx48CAoKAmBqavrjjz9yB+Pj4/v27dtyF5+fn6+Rtpqamg4dOiQQCABYWVlVV1drJCxRGY0bJfqvpgbdu0MuR0kJunXTVisKhWLx4sXbt283NDR8//33pVLpiRMnAAwaNGjz5s0TJkzQeIvp6ek5OTnh4eEaj0zahMoo0X8HD2LWLIwdi/Pntd7WunXr1qxZY2pq2tDQ0LVr148//njhwoVGRjTrWp/R/7pE/8XFAcDUqe3R1vvvv+/q6mpjY5OQkPDpp59y7/GJfqPeKNFzMhl69EB1NX79FZpYHZSQR1FvlOi5CxfOurklDB4c3qePimsyEfLPaI1YoueOHj2SlbVpwIBYXSdC9Bbd1BN9xhjr3bt3YWFhZmbm8OHDdZ0O0U/UGyX6LD09vbCw0NXVVeVVlgl5IiqjRJ/FxcUBCAkJ4ZZTIkQbqIwSfXbs2DEAU9tnrBN5VtGzUaK3cnNzBwwYYG9vX1JSQgPgifbQ3y2ib2QyWVZWlkQiiY6OBuDn50c1lGgV9UaJPrh3797PP/8sFApFIpFYLK77Y5s6W1tbExOTtLS0PjTynmgNlVHScdXVISsLPj4AcP06+vTBH+ttQqHAtWsQiyGR4NatRrHYrOVbPB5v4MCB3t7eo0ePPnjw4MWLF/v16ycUCh0cHHRxEUT/URnt3BQKxY0bN6RSqUgkEgqFCxcudHJyCg0N1XVempGXh2HDcPo0/P3xxhtYuhRFRZBIIJEgIwM1NQ8/6ebm06ePmY+PD5/P9/b2tre3547X1dVNmDBBLBYPGzYsJSWFNi4m2kBltPOpqKhIS0uTSCRisfjnn39+8OBByylTU1PGWEJCgjaWZWt/eXn46CPcuYNTp/DWW7C3x4YND8+2bIzs5YXRo2Fi8vDUrVu3GhoauB07ysvL/f39c3Jy+Hx+cnJyy1ZxhGgKldHO4ddffxUKhVyvMzMzU6lUtpxydHT09fUVCAReXl6xsbEbN260sLBISkrilvXt1PLysHEjvL1RVITcXEyahKgo+PjAxwfe3ujZ8+En6+sbpNJLYrFYLBanpaWVlpZOnjw5ISGBO1tYWOjr65uf
nx8UFBQbG0tvnIhm0d+nDurBgwdZWVncrbpEIqmoqGg5ZWxsPGLECIFA4Ovr+69//atHjx4tpwQCwb1797777rugoKALFy7ox9SdsDBMngylEs8/j9TUh8eLiyGVQiSCUIiGhkap1K/lVI8ePVqvUNerV6/k5GQ/P7+EhIQ5c+ZER0dz+80Rohm6WXSf/L07d+4MHTr0kX/nvXv3njVrVlRUVEZGRlPLjpePI5fLZ8yYAcDZ2fm3335rr6y1IjeXLVzIGGPZ2czYmLVcTVERc3ZmwMP/WFoyLy/vhQsX7t27Nzc397HRMjIyunTpAmDRokXtdAHk2UBltGMpKioaOHCgmZmZkZGRl5fXkiVLoqOjn2YnXsZYenr6p59+yhhrbGzkno26u7tz26N3UmVl7Msvm7dEPn/+4aaeSiWzs2NdurDx41lEBIuPZ0+5q9u5c+fMzMwAfPbZZ9pKmjx7qIx2LNu2bQMwYcKEtu4fWV1dzb2G3rBhA2Ps3r17Xl5eADw8PKqqqrSTrNYdP84A5uv7mFMFBUy1rdqPHTtmZGTE4/F27typZnqEcKiMdiwTJ04EEB0drcJ3Dxw4YGBgwOPxvv32W8ZYWVnZoEGDAIwdO7a+vl7TmbaHuXMZwNat03DYPXv28Hg8AwODli08CVEHldEOpLq62sTExNDQsKysTLUIXGfW0NDwyJEjjLGCgoLevXsDCA4OlslkGk1W6xQK5uDAAPbLL5oPvm7dOgAmJiaJiYmaj06eMVRGO5ADBw4ACAgIYIxduXJl1KhRW7dubWuQiIgIrkAkJSUxxq5du8aNRQ8LC1OqdhusIxcvMoC5u2sr/ooVKwBYWFiIRCJttUGeDTTsowPhFsfkVnU7evTozz//nJ2d3dYgH3300TvvvNPU1DRjxgypVDpkyJCTJ09aWVnt27dv2bJlqiUml8szMzO3bt06e/bsy5cvl5WVcceVSuXSpUsPHz6sWth/xm3nqb0JWZGRkXPnzq2rqwsODs7JydFWM+RZoOs6Tpo1NDRYW1sD4N7LcztenDp1SoVQSqUyPDwcQPfu3W/cuMEYO3PmjKmpKYDIyMinDHLv3r3k5OSIiIigoCBbW9uWvzB2dnbPP//8/fv3GWPcap4tPV/N6t+fAUyrPUW5XD59+nQAzs7Ot2/f1mJLRK9RGe0oTp48CWDEiBGMsdu3b/N4vC5durT1fX2LpqamSZMmAXBxccnPz2eMHT161NDQkMfjffPNN4/9ikKhyM7O3rVrV3h4+KBBgx5ZLr5///5hYWGRkZH9+vUDEBAQwL22WrlyJbRwa3z1KgNYz55ModBg1Meoq6vz9vbu0qWLUCjUbktEf1EZ7SgWLFgA4KOPPmKMffXVVwBmzpypTsC6ujpfX18AQ4YMqaioYIxt374dgKGh4eHDh7nP1NTUpKamfvHFF0FBQS3LeXCMjY25gauHDh0qLS1tCfv777+7uLgAmDp1qkwmUyqVc+fOBWBvb/+L5l4GRUUd9fGJWbTonqYC/oPPPvsMwOzZs9uhLaKXqIx2CAqFwtHREUBWVhZjbOzYsQBiYmLUDFtdXc09HBg9enRNTQ1jbO3atQBMTU2nTJmi8lypa9eu2dnZAfjPf/6jVCpb3xprat4UN+j1xIkTGon2CIVCERwcvGHDBm70gr+/PwAa/ERURmW0QxCLxQDc3NwYYxUVFUZGRsbGxhoZNl9UVMStWLxnzx7uSHh4eMuzThXmSnHS0tKsrKwArF69mml63lRhYSGPx7OystLScFehUAigX79+jLHy8nIjIyNTU9N799qj50v0EpXRDmHVqlUA3nnnHcbYnj17ALz44ouaCp6bm9t64NTHH38MgM/ni0QilZ+9MsaSk5O511br169nreZNeXp6qvl/AJs3bwbwyiuvqBPkH3DPc1esWMEY++677wBMmjRJS22RZwGV0Q5h4MCBAC5cuMAYCwkJAbB9+3YttaXB++WYmJjW86bu3r3LXUjL
CyjVjBs3DsD+/fvVz/CxBgwYACA1NZUxFhwcDIAmhhJ1UBnVvevXr3OvaGQyWV1dnaWlJY/HKygo0EZbGr9ffmTe1O+//67mvKmqqipjY2NjY+PKp1xupI2uXbsGoEePHnK5vLa21sLCwsDAoLi4WBttkWcEDb/XPW70ZXBwsJGRUVJSUm1t7ZgxY3r16qWNto4ePcoYmzRpErfQkfoWLlwYERGhUChee+215ORkFxeXkydP2tnZxcfHz507l7V9UfDjx4/LZLKAgICWDT/kcrlGUuW0/LYNDQ0TExO5AU/c+z1CVENlVPdaT15q/bO229KU1vOmLl++PGTIkISEBEtLy+Tk5Dt37jx9HG6uFPdgtCXDmJgYT0/P0tJSTWXbnr9t8qzQdXf4WVdUVMTj8SwsLGpra+VyObdme05Ojjba0t79slKpfP3119Fq3tTZs2ef5tX/X+dK8Xg8W1vbNWvWMMZkMtnIkSMBjBo1ips3pSbumYalpWV9fb1cLu/WrRsALmFCVEZlVMe4Z4shISGMsZqamnfffTcoKEhLbe3duxdAYGCgNoL/dd7UYz1xrpS/vz/32opbLVBTr604W7ZsATB9+nTG2NmzZwE899xzasYkhMqojnELjHJvurWNGySvwqpRT+mv86Y4bZ0rtXXrVgCGhoY//fQT+8u8KXUy5Aa37t27lzG2ZMkSAO+99546AQlhVEZ17vjx4xYWFqGhoQotzx5vaGjo0qWL9sYAcFrmTY0YMWLHjh3z589Xba7UmjVrAJibm6ekpDDGsrOz7ezs/P0Phoervtoft5yrkZERV+K5WQnp6emqXishzaiM6tjVq1e5V9ILuc3btCY+Ph7A6NGjtdoK+2PeFDdblKPaXKmlS5cCsLa2lkqljLG0tEILCwawd99VMbEffvgBwLhx4xhjUqkUgJOTU+dag5V0TFRGdU8sFltaWgKIiIjQXivcAiLrNL4jx+Pk5uaeP38+NDT0yy+/VHmulEKhePXVVwE4OfXKy3vAGEtOZqamDGDr16uS1csvvwxg8+bNjLEPP/wQwNtvv61KIEL+jMpohxAfH29kZARg48aN2oivUCgcHBwAaHARpnbQ1NQUFDTN1ze3Tx9WVMQYYwcOMAMDxuMxFR4mr1mzpm/fvtzrLw8PDwCnT5/WdMrkWURltKPYt28f94Z69+7dGg9+8eJFAO7a25FDax48YN7eDGDDhjXvovz11wxgn3yieszffvsNgI2NTWNjo6byJM8yGn7fUcyePTsqKooxNn/+fG4JZw3ixpmHam9HDq2xtERiIjw9kZ2Nl15CbS0WLUJGBj74QPWYsbGxACZPnmxiYqKxRMkzjMpoB7Jo0aJ3331XJpPNmDGDW8xNBbdv3z5w4MCyZctaz6Hk3i910uk6NjY4eRJubkhPx7RpaGzEqFGqR5PL5YcOHUKn/W2QDojH2j7rmWgPY+ytt97atWuXjY3NhQsXuMFD/0wul1+5ckUoFEql0tTU1Nu3b3PHL1++PGLECADZ2dkeHh49e/YsLi5+ZOxRJ5KXBz8/lJRg1iz88APadB3379/PyMjgfkVCoVCpVMpkMolE4unpqbV8yTPESNcJkD/h8Xjbt2+vqqo6fPjw5MmThUIhN7zxEUVFRWKxWCKRSCSSy5cvNzU1tZzq1q2bt7c3n8/n5pXij3vYqVOndt4aCsDdHSdOICAAMTGYMAFz5vzTh5VKXL+OzMzEc+d+TEtLu3nzZuvugq2tbX19/fTp01NTU2lREqIBun00Sx6rsbGRm93Ur1+/O3futD61c+dObkpPC0NDw2HDhi1YsGDPnj1cvWihUCiuXr3q7u4OrW3I0c7On2eLF7P161l0NGOMVVayTz9tPlVTw1JT2RdfsKAgZm/PADZ27Efcr+iRuVK1tbUCgQDA0KFDW8+2IkQ1dFPfQdXV1U2YMEEsFg8bNiwlJaVl1bjvv/9+7ty5Xbp08fDw8PX1FQgEAoGg9Vj3mpqa
K1euiEQioVAoFosrKyutrKwCAwP379+vqcXxdG7ePFy9imPHYGCApUthYwOxGNevQ6l8+BlXV0ydeqVfvxQ+nz98+HBjY+PWESoqKvz9/a9fv+7t7X3mzBlu3C4hqqEy2nGVl5f7+/vn5OTw+fzk5GTun3plZeWdO3cGDx7ccofOGLtx40ZaWhp3m5+Tk6NsVU7c3Nx8fHx27tzJbZ2kH+bNw5QpOHQIX36JZcuQlYUbN2BkBE9PCATw8oK/P9zcnhCkqKjI19f39u3bEyZMSEhIoLf2RGVURju0wsJCX1/f/Pz8oKCg2NhYbog+gNra2szMTKlUKhKJzp8/X15e3vIVIyMjT09PgUDg5eXl7+/v9sRy0gnNm4cPPsDGjRgzBsePIzwc1tYYORKmpm2Lc/PmTT8/v7KystmzZ0dH7zUw4D35O4T8BZXRji43N9fPz6+0tDQkJCQkJITrdWZnZysUipbPODs78/l8Hx8fb29vLy8vve9YcWXU1haBgejbFzExqoe6cuVKQECAh8fqIUP+d+tWzaVIniVURjuBjIyMcePGmZqaVlRUcEeMjIwGDBjAPRv18vIaMmSIbjNsZ0uXYsUKuLjgwAGcO4dvv1UrWmrq3cDAHg0N+OQTtUb1k2cWldHOQSQSFRcX79+/38fHh8/nP//88+bm5rpOSn/Ex2P6dMjl2LQJS5fqOhvS2VAZJQQA9u3D66+Dx8OBA3j1VV1nQzoVKqOENPu//8Pq1TA2RlwcXnqpzV9vPVfKzc1t0KBBb7/99iO7pBC9RLOYCGm2ahXKy7FhA155BUIhnjhTVKnE9es5EolQLBY/MlfK0tKytrb21q1bGzdu1HreRNeojBLyUGQkqqpw9y6kUtjawtUVv/+OggIIBM0fePAAWVkQiSAUQiJBnz7LL11K5E4ZGxt7eHgIBAJfX18jI6PXXnvtq6++6tat23vvvaez6yHtgm7qCfkTbmGs8ePRowcOHcKFC0hKwoABEIshkTw6V2ratC3m5mJuEYNH5krFxcXNmDFDLpdHRUVx2+cRfUVllJDHCAyEnx8GD0a3bkhMxPr1zdWz9VwpPz88bt2Yh6Kjo+fMmcPj8WJiYl555ZX2yZy0PyqjhDxGYCDi4zFxIlauhFSK8nL06QNvb3h5tW2u1Oeff/7ee++ZmJjEx8dzy80Q/UNllJDHCAxEUhJOncK6dZgwARERqodauXLlhg0bLCwskpOTfXx8NJcj6Sg68QKUhGjbSy+hZ091g0RGRs6dO7euri44OPj69euayIt0LNQbJeQx1q/H0KEYNw4yGZqa8Mc6hSriNov+6aefnJ2dhUKhXq4X8yyjMkrIoxob0b07HjxAfj7+vEa26urr61966aWUlBR3d3ehUNhT/V4u6TDopp6QRyUno6YGXl4aq6EAzM3N4+PjR44cmZeXN3HixOrqao2FJrpGZZSQR8XFAYDGdw61trZOTEwcOHDglStXQkNDGxoaNNwA0RG6qSfkT5RKODujpATZ2Rg6VPPxCwoKBAJBQUHB1KlTjxw50rIUN+m8qDdKyJ+IxSgpQb9+WqmhAFxcXE6dOmVnZxcXFzd37lzqx+gBKqOE/Al3Rx8SosUmhgwZkpCQYGFhsXfv3vfff1+LLZF2QWWUkD+Jjwe08GD0EXw+Py4uzsTERKFQyLlp/KTTojJKyEO//JLTvfvOceOK+Hytt+Xu7t7U1LRz587WO7mSzojKKCEPxcb+JBK91afPR4aG7dBWLIBJkybp/RaEeo/KKCEPxcXFAZiq7Vv6dm+LaBUNeCKkWVFRkYuLi6WlZVlZmZmZmVbbqqiocHBwMDQ0vHv3rrW1tVbbItpGvVFCmsXGxjLGXnzxRW3XUADx8fFyufyFF16gGqoHqIwS0ozu6Ilq6KaeEACorq7u2bMnY6y0tLSrmgs6PUldXV337t0bGhoKCwsdHR212hZpBzQRjRAASEhIaGpq8vPz03YNBXD2rNGIEb+4up6h
Gqof6KaeEAAIDAwcPnx4bm7unTt3tN3WTz+ZiERuHh5varsh0j6ojBICAFZWVlZWViUlJYGBgZWVldprSKHAiRMAMG2a9hoh7YrKKCEAYGFhkZCQ4Onpee3atUmTJtXW1mqpoYsXUV6OwYMxcKCWWiDtjcooIc1sbGxOnjzp5uaWnp4+bdq0xsZGbbTCLX1CXVF9QmWUkIecnJySk5MdHBzOnDkzZ84cbcx2b5+lT0h7ojJKyJ+4u7ufPn3a1tY2JiZm8eLFmg2emYnffoOTE0aP1mxgoktURgl5lIeHR2xsrJmZ2bZt29auXavByFxXNDgYPJ4GoxIdo+H3hDze8ePHQ0ND5XL5V199tWzZMjWjicXw8oJcjh9/RK9eCAzUSI6kQ6DeKCGPN2XKlO+//57H4y1fvjw6OlrNaG++ichIWFrC0hK3bmkkQdJRUBkl5G+FhYVFRUUxxubNm3fq1Cl1QvXqhatXkZenqdRIB0JllJB/snjx4lWrVslkshkzZgiFwqf/ImPIycHu3Zg3D7t2AcBnn2H5cm3lSXSI5tQT8gSff/55ZWXlN998ExQUlJKS4unp+XefrK1FZiakUohEOH8e5eXNxwsKAKB/f4wYgbg4+Pm1S96kvdArJkKeTKFQzJo16/Dhw05OTkKhsE+fPi2nbt26JZFIJBKJWCy+f//Qr7/2bznl7AwfH/D58PfHu+8iKQn19Rg2DP/zP5g1C7a2urgSogXUGyXkyQwNDX/44Yd79+4lJSWNHz9+48aNN2/eFIvFaWlppaWlLR/z9xfb2/fn88HnQyCAi8vDCEuXAoC5OY4cwY0bGDAAmzdj5sx2vxKiBdQbJeRp1dTUBAQEZGVlKRSKloM2NjajRo0SCAS+vr4+Pj4WFhZPjPPll1ixAiYmOH6cRj7pAyqjhLRBWVnZpUuX1q5dO3LkSG9vbz6f7+7urkKcVasQGQkLCyQlQSDQeJqkXVEZJUQHGMP8+fj2W9ja4sIF/P1bK9IJUBklRDcUCsycicC7gv8AAAEFSURBVCNH4OQEkQhubrpOiKiKxo0SohuGhti3DwEBKC5GaKji7t0yXWdEVES9UUJ06f59hIY2FRXNMTG5lpKSYkvDoDoh6o0SokvW1jh48D5w+erVqyEhIQ0NDbrOiLQZlVFCdKxbt27Jycm9e/e+cOHCq6++KpfLdZ0RaRsqo4ToXq9evU6dOmVvbx8fH//GG2/Qo7bOhcooIR3Cc889d/LkSSsrq3379q1evVrX6ZA2oFdMhHQgp0+fDg4OlslkEolkzJgxuk6HPBUqo4R0LAcPHqypqZk3b56uEyFPi8ooIYSohZ6NEkKIWqiMEkKIWqiMEkKIWqiMEkKIWqiMEkKIWv4fMJuS9+yw9/AAAAIYelRYdHJka2l0UEtMIHJka2l0IDIwMjEuMDkuMgAAeJx7v2/tPQYg4GVAABkglgfiBkY2hgQgzcjMzqAApJkhXCYmGM3OoAESxhBncwCLs7A5ZIDlGWEC7AxgASaEAIRmZnewANGMzHCtUJvhRsBUIhTAzGZAswTJVsIMmLHcDIwMjEwMTMxAAxhYWBlY2RhY2BXYOTKYODgTOLkUuLgVuHkymHh4E3j5Evj4GfgEMpg4BTOYBIUShIRVmIVFtJiYmYRERURVmEXFMpjExBPEJTKYJCQTJKUymISkM5jYGBmk2RMEuBOkRBNEWNgY2VhZmJnYODgFhaTZWXm4Bfh42cTEJSSlRMWjGIEeh8dFlMkihz0rVx8AcUrWNzrkRneA2cabVR1u3J0EZp8NOmkvLB8PZgd4MzrMq2UGs1m/xzn0f7yzH8TOYJjgkCpqChbnaG6wZ2opBovfOrbTfj7zNzuw+Nl7dsq7TtmD2BrvEu3b/jk4gNj26mIOscxhYPYvExsHQeHpYPbzJ70Oe2dMArOD0mc4HG11B7O3VBxx4KpQBLOjzeMc1KaJgdkaS5r3l21YADb/1/M9+2dONtkHYm+Stzng
aMEIdk+dZu0BMytJsJpZWWsPbPPtArut/v3ZAyvPrAeLRx39c+DPoVYwW72b9eDW/0fB5mROfHVAc+c+sDl9u3cfqF4/BczuDHy9b/3pxWC2GADiUYxFhEuX8wAAApt6VFh0TU9MIHJka2l0IDIwMjEuMDkuMgAAeJx9VVtuGzEM/PcpdIEV+Jb0mcRBURRxgDbtHfqf+6NDGckqgNB1ROzKYy41nGEuJa+f1x9/38vnJdfLpRT6z98Yo/xRIrq8lLwpj8/fvt/K09vD48fO0+vv29uvIr0o4zf4fMU+vL2+fOxweSpeKXofVA6vGmYxClWa1/lTKbdilVwiejmsSjAHbYCKjFId+UISiDvvvAEagFy9SddRDq3WQ5psgI5XSyUhEWTEnZvbDhjImHm0SysH14YiZQdsAKIy7Dllxi7DdFdjnzUSadJ+UB3OoroBjgk0Z51ntU42YoNjtCe/V2fveXwUS74DMk5NtXdxA6mV2F23QJmEqwWbA6imw7dAnTwOo4YjGNhham0HtNlrU+MhyVPnTL0B+uSxRfKMVzO3LHYDzM5EZR1NLdUhMnzXGG73FqI2GXkql4FyNsCOGtFikm7WQKSAnr4tMlsDpDVHm7OJ3EJtl1SyOZDDUMjBE+pkLXZZJY0D0YLq8CTRR/d90uwPnJVkRkpHGrFskWmdI1Ic8ALOpGBrqzZJ7xyt9jFocCrUnHiLzBYdPeVhTfNI2qPxru0S96SqJGkLUDaIZKdiafcz9YiAMgDlwDTYEjUbBcX7CIqEShvhu1qfb9cvs+k+rR5fb9dzWuVHzpmEh6Ln5GEsO+cLY/k5RRgrzlnBWO2cCPltP31vWON0t2Hx6mKegRe78gyy+NIysC4GtAxsi9MsA/tiKZ4hFu/Y3GmLSWy+vS9usAw8FtVzBlnFLTPwomHOILJo1TKILpq0DGKL9iyD+KKxfISAFilZBmmLYmzW0xdhMGg/350kY/MTcG9hWyjVbCgynQWnZFaB5PPHv0LcX/4BujZd9NBEj0EAAAFeelRYdFNNSUxFUyByZGtpdCAyMDIxLjA5LjIAAHicHZE7juQwDESvsmEbUGv4/6CxkZON+gKDiZz3CebwW7SdCE8lslg8L76ux/uSz/X4Ov+eX5de8+kB8NbzfD/O4zz1uOT4fH/+/fCf30ds1k5dsl2kfb1sZ0jw0s2cbuul21J4FOrSvF44WLBDoaYd60W7SiDVTeyuA0ydvRZvLy28efImKXMQgaRvYonb9aTNGWpo9ZTd2nYzJ8uAn6fudo9YQF1+y3wbtwySJJZBMS2tUV8TtkByVzcNIXMaDzXGLae6ViTnrVIl8TV2mkjv6hURNogDPxAG8g6KYZI9vlCWbEq1s6hOLCQksjAEuU1ymD4x9MIIViE5Gq+eMjYHBIS4ySVyCFJnpOmboqoXfGgY7ELjRORTuWTimZ0gVx07iWcyvcxZC5FYkc1WkKURNouFCtPdyhS5renNbOv4/Q+Ce3OznRP1XwAAAABJRU5ErkJggg==",
      "text/plain": [
       "<rdkit.Chem.rdchem.Mol at 0x7fe0a0f88760>"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Chem.MolFromSmiles(drug_dict['ENMD-2076'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e054ed1d-9257-469c-a59c-f2dbacf35999",
   "metadata": {},
   "source": [
    "This is a good way to check the unique `(drug, smiles)` combinations that exist in `adata_cpi`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "9413be52-5a23-4fe8-8343-572cc09a612f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# np.unique([f'{condition}_{smiles}' for condition, smiles in list(zip(adata_cpi.obs.condition, adata_cpi.obs.SMILES))])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2eb256be",
   "metadata": {},
   "source": [
    "## Rename drug `(+)-JQ1`\n",
    "This had a different name in the old Sciplex dataset, where it was called `JQ1`. We rename it for consistency."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "70c32cd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_cpa.obs[\"condition\"] = adata_cpa.obs[\"condition\"].cat.rename_categories({\"(+)-JQ1\": \"JQ1\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3eae9f1-e48a-4ad3-8300-c4c23ea0ca54",
   "metadata": {},
   "source": [
    "## Add SMILES to `adata_cpa`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "094213b7-2c85-41a4-be90-a822f449facc",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_cpa.obs['SMILES'] = adata_cpa.obs.condition.map(drug_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "aacd9954",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CC1=NN=C2[C@H](CC(=O)OC(C)(C)C)N=C(C3=C(SC(C)...]\n",
       "Categories (1, object): ['CC1=NN=C2[C@H](CC(=O)OC(C)(C)C)N=C(C3=C(SC(C)...]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_cpa[adata_cpa.obs[\"condition\"] == \"JQ1\"].obs[\"SMILES\"].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5790f363-e2b4-440b-9e28-ad17645cb3b6",
   "metadata": {},
   "source": [
    "## Check that SMILES match `obs.condition` data\n",
    "\n",
    "Print some stats on the `condition` columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "867bbc79",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "We have 188 drug names in adata_cpa: \n",
      "\n",
      "\t['control', 'ENMD-2076', 'PD98059', 'Tranylcypromine', 'WP1066', 'RG108', 'Tubastatin', 'Curcumin', 'GSK-LSD1', 'SRT2104', 'Busulfan', 'Baricitinib', 'Tacedinaline', 'Capecitabine', 'Tofacitinib', 'Tazemetostat', 'Entacapone', 'BRD4770', 'WHI-P154', 'Mesna', 'Cerdulatinib', 'CEP-33779', 'Anacardic', 'GSK', 'Daphnetin', 'Altretamine', 'PJ34', 'Filgotinib', 'Ofloxacin', 'Triamcinolone', 'UNC0631', 'Clevudine', 'Sirtinol', 'AICAR', 'Valproic', 'AG-490', 'NVP-BSK805', 'MK-0752', 'IOX2', 'Sodium', 'Zileuton', 'Fasudil', 'Meprednisone', 'S3I-201', 'Ramelteon', 'Fulvestrant', 'JNJ-26854165', 'Aminoglutethimide', 'Andarine', 'S-Ruxolitinib', 'MC1568', 'Costunolide', 'INO-1001', 'AG-14361', 'A-366', 'Fluorouracil', 'Streptozotocin', 'Entinostat', 'Selisistat', 'UNC1999', 'Motesanib', 'TGX-221', 'Ki8751', 'Resminostat', 'Quercetin', 'Maraviroc', 'Ki16425', 'PD173074', 'UNC0379', 'BMS-265246', 'EED226', 'Amisulpride', 'Tie2', 'Thiotepa', 'Roscovitine', 'AC480', 'Carmofur', 'Thalidomide', 'Avagacestat', 'Divalproex', 'Ruxolitinib', 'CUDC-101', 'PFI-1', 'Gandotinib', 'SL-327', 'Droxinostat', 'Glesatinib?(MGCD265)', 'Veliparib', 'Linifanib', 'Momelotinib', 'M344', 'Azacitidine', 'Lomustine', 'Obatoclax', 'Lenalidomide', 'SGI-1776', 'Givinostat', 'Prednisone', 'Disulfiram', 'Nilotinib', 'Trichostatin', 'Sorafenib', 'Cyclocytidine', 'SRT1720', 'Mercaptopurine', 'Cediranib', 'AZ', 'Celecoxib', 'Cimetidine', 'Lapatinib', 'JQ1', 'Aurora', 'KW-2449', 'Navitoclax', 'Belinostat', 'FLLL32', 'SRT3025', 'SB431542', 'MK-5108', 'Toremifene', 'TG101209', 'Nintedanib', 'JNJ-7706621', 'Resveratrol', 'G007-LK', 'CYC116', 'Pracinostat', 'Roxadustat', 'PCI-34051', 'Pelitinib', 'Abexinostat', 'AR-42', 'BMS-536924', 'Ellagic', 'PF-3845', 'Vandetanib', '2-Methoxyestradiol', 'ITSA-1', 'Fedratinib', 'XAV-939', 'Crizotinib', 'ABT-737', 'Enzastaurin', 'AZD1480', 'ZM', 'AMG-900', 'Regorafenib', 'BMS-754807', 'Alendronate', 'BMS-911543', 'TMP195', 'Panobinostat', 'Dasatinib', 'Dacinostat', 
'Raltitrexed', 'GSK1070916', 'Ivosidenib', 'Bisindolylmaleimide', 'Trametinib', 'PF-573228', 'Bosutinib', 'Barasertib', 'CUDC-907', 'Rucaparib', 'Danusertib', 'Pirarubicin', 'Decitabine', 'Quisinostat', 'MLN8054', 'Tucidinostat', 'SNS-314', 'Temsirolimus', 'Tanespimycin', 'PHA-680632', 'Iniparib', 'Alisertib', 'TAK-901', 'Tozasertib', 'Luminespib', 'Mocetinostat', 'Hesperadin', 'Rigosertib', 'Alvespimycin', 'AT9283', 'Patupilone', 'Flavopiridol', 'Epothilone', 'YM155']\n",
      "\n",
      "\n",
      "We have 188 drug names in adata_cpi: \n",
      "\n",
      "\t['control', 'ENMD-2076', 'GSK-LSD1', 'BRD4770', 'Baricitinib', 'Entacapone', 'RG108', 'WP1066', 'Curcumin', 'Capecitabine', 'Mesna', 'Tubastatin', 'Tranylcypromine', 'Busulfan', 'Cerdulatinib', 'Tofacitinib', 'PD98059', 'AICAR', 'Tacedinaline', 'SRT2104', 'Valproic', 'Triamcinolone', 'CEP-33779', 'Clevudine', 'Anacardic', 'MK-0752', 'Filgotinib', 'Altretamine', 'GSK', 'NVP-BSK805', 'UNC0631', 'Tazemetostat', 'WHI-P154', 'Sirtinol', 'PJ34', 'Daphnetin', 'Sodium', 'Ofloxacin', 'Meprednisone', 'AG-490', 'S3I-201', 'IOX2', 'JNJ-26854165', 'Motesanib', 'Zileuton', 'MC1568', 'Streptozotocin', 'INO-1001', 'Fasudil', 'Ramelteon', 'Entinostat', 'Selisistat', 'Aminoglutethimide', 'S-Ruxolitinib', 'Costunolide', 'A-366', 'Andarine', 'Quercetin', 'Fluorouracil', 'Fulvestrant', 'TGX-221', 'UNC1999', 'Tie2', 'Ki8751', 'UNC0379', 'Resminostat', 'AC480', 'AG-14361', 'PD173074', 'Divalproex', 'EED226', 'Maraviroc', 'Ki16425', 'CUDC-101', 'Carmofur', 'SL-327', 'Thalidomide', 'Thiotepa', 'Roscovitine', 'Glesatinib?(MGCD265)', 'Avagacestat', 'Amisulpride', 'BMS-265246', 'Linifanib', 'PFI-1', 'Droxinostat', 'Gandotinib', 'Veliparib', 'M344', 'Obatoclax', 'Azacitidine', 'Prednisone', 'SGI-1776', 'Ruxolitinib', 'Lomustine', 'Givinostat', 'Nilotinib', 'Belinostat', 'FLLL32', 'Momelotinib', 'AZ', 'Mercaptopurine', 'Trichostatin', 'Disulfiram', 'MK-5108', 'Cyclocytidine', 'Sorafenib', 'KW-2449', 'TG101209', 'Cediranib', 'Celecoxib', 'Navitoclax', 'JNJ-7706621', 'Lenalidomide', 'SB431542', 'SRT1720', 'SRT3025', 'JQ1', 'Toremifene', 'Resveratrol', 'Lapatinib', 'Aurora', '2-Methoxyestradiol', 'G007-LK', 'Cimetidine', 'Pelitinib', 'Roxadustat', 'CYC116', 'Ellagic', 'PCI-34051', 'Nintedanib', 'Pracinostat', 'Abexinostat', 'BMS-536924', 'PF-3845', 'XAV-939', 'Crizotinib', 'ITSA-1', 'Fedratinib', 'AR-42', 'ZM', 'ABT-737', 'Enzastaurin', 'Vandetanib', 'Regorafenib', 'AZD1480', 'AMG-900', 'BMS-754807', 'Alendronate', 'Panobinostat', 'BMS-911543', 'Raltitrexed', 'TMP195', 'Dasatinib', 
'GSK1070916', 'CUDC-907', 'Bisindolylmaleimide', 'Trametinib', 'PF-573228', 'Bosutinib', 'Dacinostat', 'Danusertib', 'Ivosidenib', 'Rucaparib', 'Pirarubicin', 'Barasertib', 'Quisinostat', 'Decitabine', 'MLN8054', 'Tucidinostat', 'Tanespimycin', 'SNS-314', 'Temsirolimus', 'Iniparib', 'PHA-680632', 'Alisertib', 'TAK-901', 'Hesperadin', 'Rigosertib', 'Luminespib', 'Tozasertib', 'Mocetinostat', 'Alvespimycin', 'AT9283', 'Patupilone', 'Flavopiridol', 'Epothilone', 'YM155']\n"
     ]
    }
   ],
   "source": [
    "print(f'We have {len(list(adata_cpa.obs.condition.value_counts().index))} drug names in adata_cpa: \\n\\n\\t{list(adata_cpa.obs.condition.value_counts().index)}\\n\\n')\n",
    "print(f'We have {len(list(adata_cpi.obs.condition.value_counts().index))} drug names in adata_cpi: \\n\\n\\t{list(adata_cpi.obs.condition.value_counts().index)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82f81005-0b9f-4c58-9c95-9657f389c3f7",
   "metadata": {},
   "source": [
    "Check that the assigned SMILES match the condition:  \n",
    "there should be exactly one SMILES string per condition."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "14e573d3-7094-46f4-8d08-19281af939cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(adata_cpa.obs.condition=='nan').sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2071fdb-3994-4429-9f31-a0d7d25e987f",
   "metadata": {},
   "source": [
    "### Check for nans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "d3316c26",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(adata_cpa.obs.condition=='nan').sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "75a1f6a1-5d38-4da0-908b-8ebbfa512ca2",
   "metadata": {},
   "source": [
    "### Take care of `control` SMILES"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "f19c63df",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['']"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counts = adata_cpa[adata_cpa.obs.condition=='control'].obs.SMILES.value_counts()\n",
    "list(counts.index[counts>0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e60d483-21d8-44bd-9477-18e96202f983",
   "metadata": {},
   "source": [
    "The `control` cells have an empty SMILES string, so assign them the DMSO SMILES: `CS(C)=O`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "329cde95-df2a-4e46-a969-dda70c32eb94",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_cpa.obs[\"SMILES\"] = adata_cpa.obs[\"SMILES\"].astype(\"category\").cat.rename_categories({\"\": \"CS(C)=O\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "59cff3e8-2922-4988-91f2-0c44c3766810",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CS(C)=O                                                                                                                13004\n",
       "Cl.Cl.CN1C=C(CNCC2CCN(CC2)C2=NC=C(C=N2)C(=O)NO)C2=CC=CC=C12 |c:16,18,27,t:2,14,25,29|                                      0\n",
       "CCC1=CC=CC(CC)=C1NC(=O)N1CC2=C(C1)C(NC(=O)C1=CC=C(C=C1)N1CCN(C)CC1)=NN2 |c:4,8,16,26,28,38,t:2,24|                         0\n",
       "CN(C)CC(=O)NC1=CC2=C(NC(=O)C3=C2C=CC=C3)C=C1 |c:14,17,19,22,t:7,9|                                                         0\n",
       "CC1=C(CCNCC2=CC=C(\\C=C\\C(=O)NO)C=C2)C2=C(N1)C=CC=C2 |c:1,17,20,24,26,t:7,9|                                                0\n",
       "                                                                                                                       ...  \n",
       "CN1C=C(C2=CC=CC=C12)C1=C(C(=O)NC1=O)C1=CN(C2CCN(CC3=NC=CC=C3)CC2)C2=CC=CC=C12 |c:2,6,30,32,40,t:4,8,12,20,28,38,42|        0\n",
       "CC1CCCC2OC2CC(OC(=O)CC(O)C(C)(C)C(=O)C(C)C1O)\\C(C)=C\\C3=CSC(=N3)C                                                          0\n",
       "COC1=CC=C(\\C=C\\C(=O)C2(CCCCC2)C(=O)\\C=C\\C2=CC(OC)=C(OC)C=C2)C=C1OC |c:29,32,t:2,4,21,25|                                   0\n",
       "Cl.O=S(=O)(N1CCCNCC1)C1=C2C=CN=CC2=CC=C1 |c:11,13,15,18,20|                                                                0\n",
       "ClCCN(N=O)C(=O)NC1CCCCC1                                                                                                   0\n",
       "Name: SMILES, Length: 188, dtype: int64"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_cpa.obs.loc[adata_cpa.obs.condition=='control', 'SMILES'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c21f5b4-4e49-45f2-96e0-d87edac2f935",
   "metadata": {},
   "source": [
    "### Check for doubly assigned conditions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "3f481a8e-0e72-4433-b994-b34a7696d609",
   "metadata": {},
   "outputs": [],
   "source": [
    "for pert, df in adata_cpa.obs.groupby('condition'):\n",
    "    n_smiles = (df.SMILES.value_counts()!=0).sum()\n",
    "    print(f\"{pert}: {n_smiles}\") if n_smiles > 1 else None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec83a754-01eb-4c2c-8767-28c66ae6d1b0",
   "metadata": {},
   "source": [
    "Check that each condition aligns with a single SMILES string.\n",
    "\n",
    "If everything is correct, there should be no output."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "2625673c",
   "metadata": {},
   "outputs": [],
   "source": [
    "for pert, df in adata_cpa.obs.groupby('condition'):\n",
    "    n_smiles = (df.SMILES.value_counts()!=0).sum()\n",
    "    print(f\"{pert}: {n_smiles}\") if n_smiles > 1 else None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa1c21f0-a9f5-4d7c-adf3-72cd83813f84",
   "metadata": {},
   "source": [
    "## Make SMILES canonical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "145cb77d-c8e7-4857-b5f6-6432ce5aec5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "rdkit version: 2021.09.2\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(f'rdkit version: {rdkit.__version__}\\n')\n",
    "\n",
    "adata_cpa.obs.SMILES = adata_cpa.obs.SMILES.apply(Chem.CanonSmiles)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaf3d01c-49b9-4afd-a444-4b316d2f82d3",
   "metadata": {},
   "source": [
    "## Add a random split to adata_cpa (currently disabled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "18d445a8-408e-4a57-ba36-dcdd858d9a83",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This does not make sense\n",
    "\n",
    "# from sklearn.model_selection import train_test_split\n",
    "\n",
    "# if 'split' not in list(adata_cpa.obs):\n",
    "#     print(\"Addig 'split' to 'adata_cpa.obs'.\")\n",
    "#     unique_drugs = np.unique(adata_cpa.obs.SMILES)\n",
    "#     drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2)\n",
    "#     drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5)\n",
    "\n",
    "#     adata_cpa.obs['split'] = 'train'\n",
    "#     adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_val), 'split'] = 'test'\n",
    "#     adata_cpa.obs.loc[adata_cpa.obs.SMILES.isin(drugs_test), 'split'] = 'ood'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf25f7a9-4d72-45e2-836d-dfb45bc891c0",
   "metadata": {},
   "source": [
    "## Create subset `adata_cpa_subset` from `adata_cpa`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "e9656ff9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/nfs/staff-hdd/hetzell/miniconda3/envs/chemical_CPA/lib/python3.7/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
      "  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 9400 × 977\n",
       "    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split_ho_pathway', 'split_tyrosine_ood', 'split_epigenetic_ood', 'split_cellcycle_ood', 'SMILES'\n",
       "    var: 'id', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1'\n",
       "    uns: 'all_DEGs', 'hvg', 'lincs_DEGs', 'log1p'"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adatas = []\n",
    "\n",
    "for drug in np.unique(adata_cpa.obs.condition): \n",
    "    tmp = adata_cpa[adata_cpa.obs.condition == drug].copy()\n",
    "    tmp = sc.pp.subsample(tmp, n_obs=50, copy=True, random_state=42)\n",
    "    adatas.append(tmp)\n",
    "\n",
    "adata_cpa_subset = adatas[0].concatenate(adatas[1:])\n",
    "adata_cpa_subset.uns = adata_cpa.uns.copy()\n",
    "\n",
    "adata_cpa_subset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7bdb55ef-15a9-431f-991b-29e467aa9020",
   "metadata": {},
   "source": [
    "## Save both adata objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "7afe61ee-cb21-4c3d-a1a1-f0a3fd10ecc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_cpa.write(adata_out)\n",
    "adata_cpa_subset.write(adata_out_subset)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bfc9546e",
   "metadata": {},
   "source": [
    "### Loading the result for `adata_out_subset`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "ad5d19e9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100.0      2469\n",
       "10.0       2447\n",
       "1000.0     2346\n",
       "10000.0    2088\n",
       "0.0          50\n",
       "Name: dose, dtype: int64"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata = sc.read(adata_out_subset)\n",
    "adata.obs.dose.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "7f0ffd85-e875-48fd-8c6a-6a5023139463",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{}"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_cpa.uns[\"log1p\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c626753",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chemical_CPA",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}