chemCPA / preprocessing /6_baseline_sciplex_dataset.py
github-actions[bot]
HF snapshot
a48f0ae
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.1
# kernelspec:
# display_name: Python 3.7.12 ('chemical_CPA')
# language: python
# name: python3
# ---
# # 6 BASELINE SCIPLEX DATASET
# **Requires**
# sciplex_complete_middle_subset_lincs_genes.h5ad
#
# **Outputs**
# adata_baseline_high_dose.h5ad
#
#
# +
import pandas as pd
import scanpy as sc
from chemCPA.paths import DATA_DIR
# Let pandas display up to 200 columns before truncating wide frames
pd.options.display.max_columns = 200
# -
# List the contents of the data directory
list(DATA_DIR.iterdir())
# Load the sciplex AnnData object that this notebook requires
sciplex_path = DATA_DIR / "sciplex_complete_middle_subset_lincs_genes.h5ad"
adata_sciplex = sc.read(sciplex_path)
# Columns available in the per-cell annotations
adata_sciplex.obs.columns
# Conditions that carry the 'ood' label in the multi-task split
ood_mask = adata_sciplex.obs['split_ood_multi_task'] == 'ood'
adata_sciplex.obs.loc[ood_mask, 'condition'].unique()
# +
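# Subset to dose 0 and dose 1e4 (the "high dose" subset that this notebook writes out)
# NOTE(review): this comment used to say "second largest dose"; confirm against the printed dose values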
# Show the dose values present, then keep only the cells whose dose is 0 or 1e4
dose_values = adata_sciplex.obs['dose']
print(dose_values.unique())
keep_dose = dose_values.isin([0.0, 1e4])
adata_sciplex = adata_sciplex[keep_dose].copy()
# +
# Add one split column per cell type, `split_baseline_<cell_type>`.
# Each column starts as a copy of `split_ood_multi_task`. Cells labelled 'ood'
# that belong to a *different* cell type are then relabelled: a random half
# becomes 'test' and the rest 'train'. Only the 'ood' cells of `cell_type`
# itself keep the 'ood' label, so the ood drugs stay unseen for that cell type.
# The data was already restricted to doses 0 and 1e4 above.

# Fixed seed so that re-running the notebook reproduces the same train/test
# assignment in the written file.
BASELINE_SPLIT_SEED = 42

for cell_type in adata_sciplex.obs.cell_type.unique():
    print(cell_type)
    split_key = f'split_baseline_{cell_type}'
    adata_sciplex.obs[split_key] = adata_sciplex.obs['split_ood_multi_task']

    # ood cells of every cell type except the current one
    is_ood = adata_sciplex.obs[split_key] == 'ood'
    is_other_cell_type = adata_sciplex.obs.cell_type != cell_type
    sub_df = adata_sciplex.obs.loc[is_ood & is_other_cell_type]

    train_test = sub_df.index
    test = sub_df.sample(frac=0.5, random_state=BASELINE_SPLIT_SEED).index

    # Mark all selected cells as 'train' first, then overwrite the sampled half
    # with 'test'; the order of these two assignments matters.
    adata_sciplex.obs.loc[train_test, split_key] = 'train'
    adata_sciplex.obs.loc[test, split_key] = 'test'
# -
# Number of cells per split label in the A549 baseline column
a549_split = adata_sciplex.obs['split_baseline_A549']
a549_split.value_counts()
# Contingency table: original multi-task split (rows) by condition (columns)
pd.crosstab(index=adata_sciplex.obs['split_ood_multi_task'], columns=adata_sciplex.obs['condition'])
# +
# Sanity check for a single cell type: tabulate its baseline split against cell_type.
# Given how the split columns are built, 'ood' rows are expected only for this cell type.
cell_type = 'K562'
baseline_split = adata_sciplex.obs[f'split_baseline_{cell_type}']
# Alternative view: tabulate baseline_split against adata_sciplex.obs['condition'] instead
pd.crosstab(baseline_split, adata_sciplex.obs['cell_type'])
# +
# Save the dose-subset AnnData, including the new split_baseline_* columns, gzip-compressed
output_path = DATA_DIR / 'adata_baseline_high_dose.h5ad'
adata_sciplex.write(output_path, compression="gzip")
# -