# 6 BASELINE SCIPLEX DATASET

**Requires**
sciplex_complete_middle_subset_lincs_genes.h5ad

**Outputs**
adata_baseline_high_dose.h5ad



In [1]:
import pandas as pd
import scanpy as sc

from chemCPA.paths import DATA_DIR

# Let pandas render up to 200 columns so wide tables are not truncated in the output.
pd.options.display.max_columns = 200

In [2]:
# Show what is currently stored in the dataset directory.
[entry for entry in DATA_DIR.iterdir()]

[PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/datasets/sciplex_complete_middle_subset_lincs_genes.h5ad'),
 PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/datasets/sciplex_complete_middle_subset.h5ad'),
 PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/datasets/adata_baseline.h5ad'),
 PosixPath('/nfs/staff-ssd/hetzell/code/chemCPA_v2/project_folder/datasets/preds_scgen_A549.h5ad')]

In [3]:
# Read the input AnnData file named under "Requires" at the top of the notebook.
adata_sciplex = sc.read(
    DATA_DIR / "sciplex_complete_middle_subset_lincs_genes.h5ad"
)

In [4]:
# Show the names of the annotation columns stored in `.obs`.
adata_sciplex.obs.columns

Index(['cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score',
       'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2',
       'product_dose', 'product_name', 'proliferation_index', 'replicate',
       'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val',
       'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug',
       'control', 'split_ho_pathway', 'split_tyrosine_ood',
       'split_epigenetic_ood', 'split_cellcycle_ood', 'SMILES',
       'split_ood_finetuning', 'split_ho_epigenetic',
       'split_ho_epigenetic_all', 'split_random', 'split_ood_multi_task'],
      dtype='object')

In [5]:
# Which conditions are labelled 'ood' in the existing multi-task split?
ood_mask = adata_sciplex.obs['split_ood_multi_task'] == 'ood'
adata_sciplex.obs.loc[ood_mask, 'condition'].unique()

['Quisinostat', 'Hesperadin', 'Flavopiridol', 'Belinostat', 'Alvespimycin', 'TAK-901', 'Dacinostat', 'Tanespimycin', 'Givinostat']
Categories (188, object): ['2-Methoxyestradiol', 'JQ1', 'A-366', 'ABT-737', ..., 'YM155', 'ZM', 'Zileuton', 'control']

In [6]:
# Subset to the largest dose (1e4). Rows with dose 0 are kept as well
# (presumably the controls -- confirm against the `control` column).

# Print the dose levels present before filtering, for reference.
print(adata_sciplex.obs.dose.unique())
high_dose_mask = adata_sciplex.obs.dose.isin([0., 1e4])
# .copy() materialises the subset as its own AnnData instead of a view of the full object.
adata_sciplex = adata_sciplex[high_dose_mask].copy()

[ 1000.     0.   100. 10000.    10.]


In [7]:
# Add one split column per cell type (A549, MCF7, K562) on the data kept above
# (doses 0 and 1e4). In `split_baseline_<cell_type>` the OOD drugs stay 'ood'
# only for that cell type, i.e. that drug/cell-type combination is unseen.
# OOD-drug rows from the other cell types are divided evenly between 'train' and 'test'.

# Fixed seed so the random train/test assignment is identical on every run.
SPLIT_SEED = 0

for cell_type in adata_sciplex.obs.cell_type.unique():
    print(cell_type)
    split_key = f'split_baseline_{cell_type}'
    # Start from the existing multi-task split and relabel only the rows selected below.
    adata_sciplex.obs[split_key] = adata_sciplex.obs['split_ood_multi_task']

    # Rows of OOD drugs that were measured in any *other* cell type.
    is_ood = adata_sciplex.obs[split_key] == 'ood'
    is_other_cell_type = adata_sciplex.obs.cell_type != cell_type
    sub_df = adata_sciplex.obs.loc[is_ood & is_other_cell_type]

    # Draw the random half that becomes 'test'; the remainder stays 'train'.
    test = sub_df.sample(frac=0.5, random_state=SPLIT_SEED).index

    # Label all selected rows 'train' first, then overwrite the sampled half with 'test'.
    adata_sciplex.obs.loc[sub_df.index, split_key] = 'train'
    adata_sciplex.obs.loc[test, split_key] = 'test'

A549
MCF7
K562


In [8]:
# Count the rows of `.obs` per split label in the A549 split column.
adata_sciplex.obs['split_baseline_A549'].value_counts()

train    107544
test      12008
ood         775
Name: split_baseline_A549, dtype: int64

In [9]:
# Tabulate the original multi-task split label against condition.
pd.crosstab(
    index=adata_sciplex.obs['split_ood_multi_task'],
    columns=adata_sciplex.obs['condition'],
)

condition,2-Methoxyestradiol,JQ1,A-366,ABT-737,AC480,AG-490,AG-14361,AICAR,AMG-900,AR-42,AT9283,AZ,AZD1480,Abexinostat,Alendronate,Alisertib,Altretamine,Alvespimycin,Aminoglutethimide,Amisulpride,Anacardic,Andarine,Aurora,Avagacestat,Azacitidine,BMS-265246,BMS-536924,BMS-754807,BMS-911543,BRD4770,Barasertib,Baricitinib,Belinostat,Bisindolylmaleimide,Bosutinib,Busulfan,CEP-33779,CUDC-101,CUDC-907,CYC116,Capecitabine,Carmofur,Cediranib,Celecoxib,Cerdulatinib,Cimetidine,Clevudine,Costunolide,Crizotinib,Curcumin,Cyclocytidine,Dacinostat,Danusertib,Daphnetin,Dasatinib,Decitabine,Disulfiram,Divalproex,Droxinostat,EED226,ENMD-2076,Ellagic,Entacapone,Entinostat,Enzastaurin,Epothilone,FLLL32,Fasudil,Fedratinib,Filgotinib,Flavopiridol,Fluorouracil,Fulvestrant,G007-LK,GSK,GSK1070916,GSK-LSD1,Gandotinib,Givinostat,Glesatinib?(MGCD265),Hesperadin,INO-1001,IOX2,ITSA-1,Iniparib,Ivosidenib,JNJ-7706621,JNJ-26854165,KW-2449,Ki8751,Ki16425,Lapatinib,Lenalidomide,Linifanib,Lomustine,Luminespib,M344,MC1568,MK-0752,MK-5108,MLN8054,Maraviroc,Meprednisone,Mercaptopurine,Mesna,Mocetinostat,Momelotinib,Motesanib,NVP-BSK805,Navitoclax,Nilotinib,Nintedanib,Obatoclax,Ofloxacin,PCI-34051,PD98059,PD173074,PF-3845,PF-573228,PFI-1,PHA-680632,PJ34,Panobinostat,Patupilone,Pelitinib,Pirarubicin,Pracinostat,Prednisone,Quercetin,Quisinostat,RG108,Raltitrexed,Ramelteon,Regorafenib,Resminostat,Resveratrol,Rigosertib,Roscovitine,Roxadustat,Rucaparib,Ruxolitinib,S3I-201,S-Ruxolitinib,SB431542,SGI-1776,SL-327,SNS-314,SRT1720,SRT2104,SRT3025,Selisistat,Sirtinol,Sodium,Sorafenib,Streptozotocin,TAK-901,TG101209,TGX-221,TMP195,Tacedinaline,Tanespimycin,Tazemetostat,Temsirolimus,Thalidomide,Thiotepa,Tie2,Tofacitinib,Toremifene,Tozasertib,Trametinib,Tranylcypromine,Triamcinolone,Trichostatin,Tubastatin,Tucidinostat,UNC0379,UNC0631,UNC1999,Valproic,Vandetanib,Veliparib,WHI-P154,WP1066,XAV-939,YM155,ZM,Zileuton,control
split_ood_multi_task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 
99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1
ood,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,175,0,0,0,0,0,0,0,0,0,0,0,0,0,0,536,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,517,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,0,0,0,0,0,0,0,530,0,372,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,525,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,195,0,0,0,0,320,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
test,18,298,35,27,24,31,25,35,20,221,17,14,9,277,27,139,29,0,35,23,26,34,22,32,24,20,25,162,26,28,217,26,0,1,9,23,30,150,102,25,25,24,18,23,26,23,24,32,170,30,17,0,19,25,165,30,29,33,23,39,324,34,22,249,26,125,14,25,10,16,0,31,306,23,11,195,41,20,0,26,0,23,27,22,15,15,19,25,176,23,29,17,26,23,19,144,209,30,26,14,20,25,33,16,38,161,11,22,23,12,21,153,14,30,25,27,26,25,24,28,10,30,256,149,17,104,233,28,19,0,30,181,27,16,221,25,51,31,36,26,29,34,26,32,14,33,14,34,52,25,29,32,26,15,27,0,14,21,23,305,0,40,15,27,30,20,24,13,19,139,30,37,212,22,244,19,32,26,30,16,24,30,18,10,45,16,29,1132
train,385,362,686,528,538,725,620,690,572,281,363,392,322,306,565,211,618,0,762,732,716,802,555,693,602,493,499,224,631,689,281,802,0,52,365,753,735,350,210,419,799,628,431,724,582,450,704,713,224,744,464,0,380,653,237,545,688,687,664,713,415,724,716,330,564,182,492,796,257,675,0,597,405,557,692,227,749,457,0,412,0,731,749,542,412,325,466,573,245,563,719,390,655,584,529,200,306,722,594,397,555,669,781,379,713,200,403,692,625,320,594,211,329,738,517,767,611,527,398,609,320,593,336,185,365,183,326,673,588,0,752,397,672,225,335,670,65,657,726,512,789,740,674,746,380,578,451,530,748,618,756,728,674,372,735,0,300,621,629,389,0,759,376,668,660,707,726,362,440,328,768,767,280,738,265,420,704,674,704,318,605,630,710,397,51,502,670,11872


In [10]:
# Sanity check for one held-out cell type: tabulate its split label against cell type.
# Given how the split columns are built, 'ood' should only be populated in the
# column of the chosen cell type.
# (Swap 'cell_type' for 'condition' in the `columns` argument to break it down per drug.)

cell_type = 'K562'

pd.crosstab(
    index=adata_sciplex.obs[f'split_baseline_{cell_type}'],
    columns=adata_sciplex.obs['cell_type'],
)

cell_type,A549,K562,MCF7
split_baseline_K562,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ood,0,543,0
test,3108,2386,6630
train,26941,26527,54192


In [11]:
# Save the annotated subset, gzip-compressed, to the file named under "Outputs".

adata_sciplex.write(
    DATA_DIR / 'adata_baseline_high_dose.h5ad',
    compression="gzip",
)