|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import sys |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
import sfaira |
|
import warnings |
|
# Notebook leftover: the working directory is evaluated but the result is unused.
os.getcwd()

from chemCPA.paths import DATA_DIR, PROJECT_DIR

# Allow up to 100 columns when pandas renders a DataFrame.
pd.set_option('display.max_columns', 100)

# Append the directory two levels above this file to sys.path before the
# imports below (presumably so that the `raw_data` package resolves --
# confirm against the repository layout).
this_file = os.path.abspath(__file__)
root_dir = os.path.dirname(os.path.dirname(this_file))
sys.path.append(root_dir)
import raw_data.datasets as datasets
import logging

logging.basicConfig(level=logging.INFO)
from notebook_utils import suppress_output

import scanpy as sc

# NOTE(review): the indentation of this section was lost in the source; all
# three statements are assumed to belong to the `with` body -- confirm.
with suppress_output():
    sc.set_figure_params(dpi=80, frameon=False)
    sc.logging.print_header()
    warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the LINCS AnnData file from the data directory.
lincs_path = DATA_DIR / 'lincs_full_smiles.h5ad'
adata_lincs = sc.read(lincs_path)
|
|
|
|
|
|
|
|
|
from tqdm import tqdm |
|
from chemCPA.paths import DATA_DIR, PROJECT_DIR |
|
from raw_data.datasets import sciplex |
|
|
|
|
|
logging.info("Starting to load in sciplex data")

# `sciplex()` supplies the paths of the individual chunk files.
chunk_paths = sciplex()

# Read every chunk, reporting the file name through tqdm so that the
# progress bar is not broken by the messages.
adatas_sciplex = []
for chunk_path in tqdm(chunk_paths, desc="Loading sciplex chunks"):
    chunk_name = os.path.basename(chunk_path)
    tqdm.write(f"Loading {chunk_name}")
    adatas_sciplex.append(sc.read(chunk_path))

# Stitch the remaining chunks onto the first one.
first_chunk = adatas_sciplex[0]
remaining_chunks = adatas_sciplex[1:]
adata_sciplex = first_chunk.concatenate(remaining_chunks)
logging.info("Sciplex data loaded")
|
|
|
|
|
|
|
|
|
# Keep only the text before the first '.' of each `id` entry
# (presumably this drops an Ensembl version suffix -- confirm).
id_parts = adata_sciplex.var['id'].str.split('.')
adata_sciplex.var['gene_id'] = id_parts.str.get(0)
adata_sciplex.var['gene_id'].head()
|
|
|
|
|
|
|
|
|
|
|
import json

# Build the gene-symbol -> gene-id mapping: use the cached JSON file from the
# data directory when it can be read, otherwise query sfaira.
try:
    with open(DATA_DIR / 'symbols_dict.json') as json_file:
        symbols_dict = json.load(json_file)
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: covers
    # json.JSONDecodeError for a corrupt file. Anything else (including
    # KeyboardInterrupt) is no longer swallowed.
    logging.info("No symbols_dict.json found, falling back to sfaira")
    genome_container = sfaira.versions.genomes.GenomeContainer(organism="homo_sapiens", release="82")
    symbols_dict = genome_container.symbol_to_id_dict

# Add the one mapping that is set explicitly. The update is idempotent, so it
# is applied regardless of which branch produced the dictionary.
# NOTE(review): the original indentation was lost -- confirm that applying
# this on the JSON path as well is intended.
symbols_dict.update({'PLSCR3': 'ENSG00000187838'})
|
|
|
|
|
|
|
|
|
# Translate the LINCS variable names into gene ids via the symbol mapping.
adata_lincs.var['gene_id'] = adata_lincs.var_names.map(symbols_dict)

# Flag, in each dataset, the genes whose id also occurs in the other one.
adata_lincs.var['in_sciplex'] = adata_lincs.var['gene_id'].isin(adata_sciplex.var['gene_id'])
adata_sciplex.var['in_lincs'] = adata_sciplex.var['gene_id'].isin(adata_lincs.var['gene_id'])

n_lincs_genes = adata_lincs.shape[1]
n_sciplex_genes = adata_sciplex.shape[1]
n_shared_genes = sum(adata_sciplex.var['in_lincs'])

print("\nGene matching statistics:")
print(f"Number of genes in LINCS: {n_lincs_genes}")
print(f"Number of genes in sciplex: {n_sciplex_genes}")
print(f"Number of shared genes: {n_shared_genes}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Switch for working on a random half of the cells.
SUBSET = False
if SUBSET:
    sc.pp.subsample(adata_sciplex, fraction=0.5, random_state=42)

# Per-cell normalisation followed by log1p.
# NOTE(review): scanpy documents `normalize_per_cell` as a legacy function;
# it is kept here so that results stay unchanged.
sc.pp.normalize_per_cell(adata_sciplex)
sc.pp.log1p(adata_sciplex)

# Annotate (without subsetting) the 1032 most variable genes.
sc.pp.highly_variable_genes(
    adata_sciplex,
    n_top_genes=1032,
    subset=False,
)
|
|
|
|
|
|
|
|
|
|
|
# A gene is kept when it is shared with LINCS or flagged as highly variable.
keep_gene = adata_sciplex.var['in_lincs'] | adata_sciplex.var['highly_variable']

# Number of genes that survive the filter (notebook display).
keep_gene.sum()

adata_sciplex = adata_sciplex[:, keep_gene].copy()
|
|
|
|
|
|
|
|
|
|
|
# Express each dose as a fraction of the largest dose in the dataset.
dose_as_float = adata_sciplex.obs['dose'].astype(float)
adata_sciplex.obs['dose_val'] = dose_as_float / np.max(dose_as_float)

# Cells whose product name contains 'Vehicle' get a dose value of 1.0.
is_vehicle = adata_sciplex.obs['product_name'].str.contains('Vehicle')
adata_sciplex.obs.loc[is_vehicle, 'dose_val'] = 1.0

adata_sciplex.obs['dose_val'].value_counts()
|
|
|
|
|
|
|
# Reduce every product name to the text before its first space.
short_names = [name.partition(' ')[0] for name in adata_sciplex.obs['product_name']]
adata_sciplex.obs['product_name'] = short_names

# Names that contain 'Vehicle' are relabelled as 'control'.
is_vehicle = adata_sciplex.obs['product_name'].str.contains('Vehicle')
adata_sciplex.obs.loc[is_vehicle, 'product_name'] = 'control'

# `condition` starts out as an independent copy of the cleaned product name.
adata_sciplex.obs['condition'] = adata_sciplex.obs['product_name'].copy()
|
|
|
|
|
|
|
|
|
# Make `condition` categorical and rename the "(+)-JQ1" category to "JQ1".
condition_cat = adata_sciplex.obs["condition"].astype('category')
adata_sciplex.obs["condition"] = condition_cat.cat.rename_categories({"(+)-JQ1": "JQ1"})

# String versions of the columns that are combined into the labels below.
cell_type_str = adata_sciplex.obs['cell_type'].astype(str)
condition_str = adata_sciplex.obs['condition'].astype(str)
dose_val_str = adata_sciplex.obs['dose_val'].astype(str)

# "<drug>_<dose>", "<cell_type>_<drug>_<dose>" and "<cell_type>_<drug>".
adata_sciplex.obs['drug_dose_name'] = condition_str + '_' + dose_val_str
adata_sciplex.obs['cov_drug_dose_name'] = cell_type_str + '_' + adata_sciplex.obs['drug_dose_name'].astype(str)
adata_sciplex.obs['cov_drug'] = cell_type_str + '_' + condition_str

# 1 for cells labelled exactly 'control_1.0', 0 for every other cell.
is_control = adata_sciplex.obs['drug_dose_name'] == 'control_1.0'
adata_sciplex.obs['control'] = is_control.astype('int64')
|
|
|
|
|
|
|
|
|
from chemCPA.helper import rank_genes_groups_by_cov

# Both rankings use the same grouping; only the target AnnData and the key
# under which the result is stored differ.
deg_kwargs = dict(groupby='cov_drug', covariate='cell_type', control_group='control')

# Ranking over all genes currently in the dataset.
rank_genes_groups_by_cov(adata_sciplex, key_added='all_DEGs', **deg_kwargs)

# Same ranking restricted to the genes shared with LINCS; the result is
# copied back onto the full object.
adata_subset = adata_sciplex[:, adata_sciplex.var['in_lincs']].copy()
rank_genes_groups_by_cov(adata_subset, key_added='lincs_DEGs', **deg_kwargs)
adata_sciplex.uns['lincs_DEGs'] = adata_subset.uns['lincs_DEGs']
|
|
|
|
|
|
|
|
|
|
|
# Every distinct `cov_drug_dose_name` label present in obs.
cov_drug_dose_unique = adata_sciplex.obs.cov_drug_dose_name.unique()


def remove_dose(s):
    """Return *s* without its last '_'-separated field ('' if *s* has no '_')."""
    return s.rpartition('_')[0]


# Map each label that includes the dose to the same label without the dose.
cov_drug = pd.Series(cov_drug_dose_unique).apply(remove_dose)
dose_no_dose_dict = dict(zip(cov_drug_dose_unique, cov_drug))

# Re-key the stored DEG entries from the dose-free label to every label that
# includes a dose, leaving out any label that contains 'control'.
uns_keys = ['all_DEGs', 'lincs_DEGs']
for uns_key in uns_keys:
    df_DEGs = pd.Series(adata_sciplex.uns[uns_key])
    new_DEGs_dict = {
        with_dose: df_DEGs.loc[without_dose]
        for with_dose, without_dose in dose_no_dose_dict.items()
        if 'control' not in with_dose
    }
    adata_sciplex.uns[uns_key] = new_DEGs_dict

adata_sciplex
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Every cell starts in the training split.
adata_sciplex.obs['split_ho_pathway'] = 'train'

# Drugs that are held out for this split.
ho_drugs = [
    "Azacitidine",
    "Carmofur",
    "Pracinostat",
    "Cediranib",
    "Luminespib",
    "Crizotinib",
    "SNS-314",
    "Obatoclax",
    "Momelotinib",
    "AG-14361",
    "Entacapone",
    "Fulvestrant",
    "Mesna",
    "Zileuton",
    "Enzastaurin",
    "IOX2",
    "Alvespimycin",
    "XAV-939",
    "Fasudil",
]

# Boolean mask of the cells treated with one of the held-out drugs.
ho_drug_pathway = adata_sciplex.obs['condition'].isin(ho_drugs)
adata_sciplex.obs.loc[ho_drug_pathway, 'pathway_level_1'].value_counts()

ho_drug_pathway.sum()

# Held-out drugs with a dose value of 1.0 form the 'ood' set.
is_dose_one = adata_sciplex.obs['dose_val'] == 1.0
adata_sciplex.obs.loc[ho_drug_pathway & is_dose_one, 'split_ho_pathway'] = 'ood'

# Draw 15% of the cells that are not 'ood' and mark them as 'test'.
not_ood = adata_sciplex.obs['split_ho_pathway'] != 'ood'
test_idx = sc.pp.subsample(adata_sciplex[not_ood], fraction=.15, copy=True).obs.index
adata_sciplex.obs.loc[test_idx, 'split_ho_pathway'] = 'test'

# Notebook displays that summarise the resulting split.
pd.crosstab(adata_sciplex.obs['pathway_level_1'], adata_sciplex.obs['condition'][ho_drug_pathway])

adata_sciplex.obs['split_ho_pathway'].value_counts()

adata_sciplex[adata_sciplex.obs['split_ho_pathway'] == 'ood'].obs.condition.value_counts()

adata_sciplex[adata_sciplex.obs['split_ho_pathway'] == 'test'].obs.condition.value_counts()
|
|
|
|
|
|
|
adata_sciplex.obs['pathway_level_1'].value_counts()

# Cells whose pathway annotation is "Tyrosine kinase signaling".
in_tyrosine_pathway = adata_sciplex.obs['pathway_level_1'].isin(["Tyrosine kinase signaling"])
adata_sciplex.obs.loc[in_tyrosine_pathway, 'condition'].value_counts()

tyrosine_drugs = adata_sciplex.obs.loc[in_tyrosine_pathway, 'condition'].unique()

# Default to 'train', then mark 20% of the pathway's cells as 'test'.
adata_sciplex.obs['split_tyrosine_ood'] = 'train'
test_idx = sc.pp.subsample(adata_sciplex[in_tyrosine_pathway], fraction=.20, copy=True).obs.index
adata_sciplex.obs.loc[test_idx, 'split_tyrosine_ood'] = 'test'

# All cells of these drugs become 'ood'; this runs last, so it overrides
# any 'test' label assigned just above.
tyrosine_ood_drugs = ["Cediranib", "Crizotinib", "Motesanib", "BMS-754807", "Nintedanib"]
is_tyrosine_ood = adata_sciplex.obs['condition'].isin(tyrosine_ood_drugs)
adata_sciplex.obs.loc[is_tyrosine_ood, 'split_tyrosine_ood'] = 'ood'

# Notebook displays that summarise the resulting split.
adata_sciplex.obs['split_tyrosine_ood'].value_counts()

pd.crosstab(
    adata_sciplex.obs['split_tyrosine_ood'],
    adata_sciplex.obs['condition'][adata_sciplex.obs['condition'].isin(tyrosine_drugs)],
)

pd.crosstab(adata_sciplex.obs['split_tyrosine_ood'], adata_sciplex.obs['dose_val'])
|
|
|
|
|
|
|
|
|
|
|
# Cells whose pathway annotation is "Epigenetic regulation".
in_epigenetic_pathway = adata_sciplex.obs['pathway_level_1'].isin(["Epigenetic regulation"])
adata_sciplex.obs.loc[in_epigenetic_pathway, 'condition'].value_counts()

epigenetic_drugs = adata_sciplex.obs.loc[in_epigenetic_pathway, 'condition'].unique()

# Default to 'train', then mark 20% of the pathway's cells as 'test'.
adata_sciplex.obs['split_epigenetic_ood'] = 'train'
test_idx = sc.pp.subsample(adata_sciplex[in_epigenetic_pathway], fraction=.20, copy=True).obs.index
adata_sciplex.obs.loc[test_idx, 'split_epigenetic_ood'] = 'test'

# All cells of these drugs become 'ood'; this runs last, so it overrides
# any 'test' label assigned just above.
epigenetic_ood_drugs = ["Azacitidine", "Pracinostat", "Trichostatin", "Quisinostat", "Tazemetostat"]
is_epigenetic_ood = adata_sciplex.obs['condition'].isin(epigenetic_ood_drugs)
adata_sciplex.obs.loc[is_epigenetic_ood, 'split_epigenetic_ood'] = 'ood'

# Notebook displays that summarise the resulting split.
adata_sciplex.obs['split_epigenetic_ood'].value_counts()

pd.crosstab(
    adata_sciplex.obs['split_epigenetic_ood'],
    adata_sciplex.obs['condition'][adata_sciplex.obs['condition'].isin(epigenetic_drugs)],
)

# Fixed: this summary previously tabulated `split_tyrosine_ood` (copied from
# the tyrosine section) instead of the split built in this section.
pd.crosstab(adata_sciplex.obs['split_epigenetic_ood'], adata_sciplex.obs['dose_val'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Cells whose pathway annotation is "Cell cycle regulation".
in_cell_cycle_pathway = adata_sciplex.obs['pathway_level_1'].isin(["Cell cycle regulation"])
adata_sciplex.obs.loc[in_cell_cycle_pathway, 'condition'].value_counts()

cell_cycle_drugs = adata_sciplex.obs.loc[in_cell_cycle_pathway, 'condition'].unique()

# Default to 'train', then mark 20% of the pathway's cells as 'test'.
adata_sciplex.obs['split_cellcycle_ood'] = 'train'
test_idx = sc.pp.subsample(adata_sciplex[in_cell_cycle_pathway], fraction=.20, copy=True).obs.index
adata_sciplex.obs.loc[test_idx, 'split_cellcycle_ood'] = 'test'

# All cells of these drugs become 'ood'; this runs last, so it overrides
# any 'test' label assigned just above.
cell_cycle_ood_drugs = ["SNS-314", "Flavopiridol", "Roscovitine"]
is_cell_cycle_ood = adata_sciplex.obs['condition'].isin(cell_cycle_ood_drugs)
adata_sciplex.obs.loc[is_cell_cycle_ood, 'split_cellcycle_ood'] = 'ood'

# Notebook displays that summarise the resulting split.
adata_sciplex.obs['split_cellcycle_ood'].value_counts()

pd.crosstab(
    adata_sciplex.obs['split_cellcycle_ood'],
    adata_sciplex.obs['condition'][adata_sciplex.obs['condition'].isin(cell_cycle_drugs)],
)

pd.crosstab(adata_sciplex.obs['split_cellcycle_ood'], adata_sciplex.obs['dose_val'])

# Names of all obs columns that contain 'split'.
[c for c in adata_sciplex.obs.columns if 'split' in c]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Reorder the sciplex genes: first the genes shared with LINCS, in the order
# in which they appear in `adata_lincs`, then all remaining sciplex genes in
# their current order.
sciplex_ids = pd.Index(adata_sciplex.var.gene_id)

# Column positions of the shared genes, following the LINCS ordering.
lincs_idx = [sciplex_ids.get_loc(_id) for _id in adata_lincs.var.gene_id[adata_lincs.var.in_sciplex]]

# Build the LINCS id lookup once; the previous per-gene
# `adata_lincs.var.gene_id.isin([_id]).any()` rescanned the whole LINCS
# column for every sciplex gene. Unmapped (NaN) LINCS ids never match a
# gene id string, so they are dropped from the lookup.
lincs_gene_ids = set(adata_lincs.var.gene_id.dropna())
non_lincs_idx = [sciplex_ids.get_loc(_id) for _id in adata_sciplex.var.gene_id if _id not in lincs_gene_ids]

lincs_idx.extend(non_lincs_idx)

adata_sciplex = adata_sciplex[:, lincs_idx].copy()
|
|
|
|
|
# Persist the gene-reordered sciplex data under the project's datasets folder.
datasets_dir = PROJECT_DIR / 'datasets'
fname = datasets_dir / 'sciplex3_matched_genes_lincs.h5ad'
sc.write(fname, adata_sciplex)

# Read the file back (notebook display / sanity check; result is unused).
sc.read(fname)
|
|
|
|
|
|
|
|
|
|
|
# Restrict both datasets to the genes flagged as shared.
lincs_shared = adata_lincs.var['in_sciplex']
adata_lincs = adata_lincs[:, lincs_shared].copy()

sciplex_shared = adata_sciplex.var['in_lincs']
adata_sciplex = adata_sciplex[:, sciplex_shared].copy()

# Notebook displays of the remaining variable names.
adata_lincs.var_names

adata_sciplex.var_names

n_lincs_genes = adata_lincs.shape[1]
n_sciplex_genes = adata_sciplex.shape[1]
n_shared_genes = sum(adata_sciplex.var['in_lincs'])

print("\nGene matching statistics:")
print(f"Number of genes in LINCS: {n_lincs_genes}")
print(f"Number of genes in combinatorial sciplex: {n_sciplex_genes}")
print(f"Number of shared genes: {n_shared_genes}")
|
|
|
|
|
|
|
|
|
|
|
# Write both gene-matched datasets into the project's datasets folder.
datasets_dir = PROJECT_DIR / 'datasets'

fname = datasets_dir / 'sciplex3_lincs_genes.h5ad'
sc.write(fname, adata_sciplex)

fname_lincs = datasets_dir / 'lincs_full_smiles_sciplex_genes.h5ad'
sc.write(fname_lincs, adata_lincs)
|
|
|
|
|
|
|
|
|
|