|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import warnings |
|
|
|
import numpy as np |
|
import pandas as pd |
|
|
|
from scipy import sparse |
|
from tqdm.auto import tqdm |
|
|
|
from chemCPA.helper import rank_genes_groups_by_cov |
|
from chemCPA.paths import DATA_DIR |
|
from pathlib import Path |
|
import sys |
|
import logging |
|
from notebook_utils import suppress_output |
|
# Put the directory two levels above this file on sys.path before importing
# ``raw_data`` (presumably that is where the ``raw_data`` package lives --
# verify against the repository layout).
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(root_dir)
import raw_data.datasets as datasets

import scanpy as sc

# Configure scanpy and print its version header with output suppressed.
# NOTE(review): the extracted source lost its indentation; the warnings filter
# is kept inside the ``with`` block because it directly follows the two scanpy
# calls without a blank line -- confirm against the original file.
with suppress_output():
    sc.set_figure_params(dpi=100, frameon=False)
    sc.logging.print_header()
    warnings.filterwarnings('ignore')
|
|
|
|
|
# Install the root logging configuration only when no command-line argument
# mentions 'ipykernel', i.e. when the file is not being run by a Jupyter kernel.
if not any('ipykernel' in arg for arg in sys.argv):
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
# full:       choose the path from datasets.lincs_full() instead of
#             DATA_DIR / datasets.lincs(); it also selects the DEG strategy
#             further down in this file.
# load_adata: when False, ``adata`` is set to None. The rest of this file
#             accesses ``adata.obs`` and therefore needs it to be True.
full = True
load_adata = True

adata_path = Path(datasets.lincs_full()) if full else DATA_DIR / datasets.lincs()
logging.info(f"Starting to load in data from {adata_path}")
adata = sc.read(adata_path) if load_adata else None
logging.info(f"Data loaded from {adata_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.info("Renaming and cleaning up columns")
import re


def remove_non_alphanumeric(input_string: str) -> str:
    """Return ``input_string`` with every character outside ``[a-zA-Z0-9]`` removed.

    Only ASCII letters and digits are kept; whitespace, punctuation,
    underscores and non-ASCII characters are dropped.
    """
    return re.sub(r'[^a-zA-Z0-9]', '', input_string)
|
|
|
# Derive the observation columns used by the rest of this file.
# condition: perturbation name reduced to ASCII letters and digits.
adata.obs['condition'] = adata.obs['pert_iname'].apply(remove_non_alphanumeric)
adata.obs['cell_type'] = adata.obs['cell_id']
# dose_val: dose as a float divided by the largest dose in the dataset.
adata.obs['dose_val'] = adata.obs['pert_dose'].astype(float) / np.max(adata.obs['pert_dose'].astype(float))
# Underscore-joined group labels: "<cell_type>_<condition>_<dose_val>" and
# "<cell_type>_<condition>".
adata.obs['cov_drug_dose_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str) + '_' + adata.obs.dose_val.astype(str)
adata.obs['cov_drug_name'] = adata.obs.cell_type.astype(str) + '_' + adata.obs.condition.astype(str)
adata.obs['eval_category'] = adata.obs['cov_drug_name']
# control: 1 for rows whose condition is exactly 'DMSO', 0 otherwise.
adata.obs['control'] = (adata.obs['condition'] == 'DMSO').astype(int)
|
|
|
|
|
|
|
|
|
# Bare expression: the table is computed but its value is discarded when this
# file runs as a plain script.
pd.crosstab(adata.obs.condition, adata.obs.cell_type)

# Keep only conditions that occur in more than 5 observations.
drug_abundance = adata.obs.condition.value_counts()
suff_drug_abundance = drug_abundance.index[drug_abundance > 5]

adata = adata[adata.obs.condition.isin(suff_drug_abundance)].copy()
adata
logging.info("Finished cleaning up columns")
|
|
|
|
|
|
|
|
|
logging.info("Processing DEGs")

# de_genes_quick: condition -> the 50 genes whose mean expression differs most
#                 (in absolute value) from the DMSO mean.
# de_genes:       the same mapping when ``full`` is True, otherwise scanpy's
#                 rank_genes_groups result against the DMSO reference.
de_genes = {}
de_genes_quick = {}

# Expression matrix as a DataFrame with the condition label joined on the index.
adata_df = adata.to_df()
adata_df = adata_df.join(adata.obs['condition'])
dmso = adata_df[adata_df.condition == "DMSO"].mean(numeric_only=True)

for cond, df in tqdm(adata_df.groupby('condition')):
    if cond != 'DMSO':
        drug_mean = df.mean(numeric_only=True)
        # argsort is ascending, so the last 50 positions are the largest
        # absolute differences.
        de_50_idx = np.argsort(abs(drug_mean - dmso))[-50:]
        de_genes_quick[cond] = drug_mean.index[de_50_idx].values

if full:
    de_genes = de_genes_quick
else:
    sc.tl.rank_genes_groups(
        adata,
        groupby='condition',
        reference='DMSO',
        rankby_abs=True,
        n_genes=50
    )
    for cond in tqdm(np.unique(adata.obs['condition'])):
        if cond != 'DMSO':
            df = sc.get.rank_genes_groups_df(adata, group=cond)
            de_genes[cond] = df['names'][:50].values

logging.info("Completed processing DEGs")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_drug(cond: str) -> str:
    """Return the drug part of an underscore-separated category label.

    A two-field label ``"<cov>_<drug>"`` yields ``<drug>``. For any other
    number of fields the first and last fields are dropped and the remainder
    is re-joined with ``_``, so ``"<cov>_<drug>_<dose>"`` also yields
    ``<drug>`` and a label without an underscore yields ``""``.
    """
    parts = cond.split('_')
    if len(parts) == 2:
        return parts[-1]
    return '_'.join(parts[1:-1])
|
|
|
# Diagnostics: how many underscore-separated fields the group labels have.
# Bare expressions; their values are discarded when run as a plain script.
adata.obs['cov_drug_dose_name'].apply(lambda s: len(s.split('_'))).value_counts()
adata.obs['eval_category'].apply(lambda s: len(s.split('_'))).value_counts()

# Map every non-DMSO eval_category to the DE genes of its drug.
# NOTE(review): this always reads de_genes_quick, so the scanpy-based
# ``de_genes`` computed when ``full`` is False is never stored -- confirm that
# this is intended.
adata.uns['rank_genes_groups_cov'] = {cat: de_genes_quick[extract_drug(cat)] for cat in adata.obs.eval_category.unique() if extract_drug(cat) != 'DMSO'}

adata.uns['rank_genes_groups_cov']
|
|
|
|
|
# Every observation starts in the training split.
adata.obs['split'] = 'train'

# OOD split: a 10% subsample of the observations whose condition is among
# entries 1..49 of the abundance ranking (index[1:50] skips the most frequent
# condition).
ood_idx = sc.pp.subsample(
    adata[adata.obs.condition.isin(list(adata.obs.condition.value_counts().index[1:50]))],
    .1,
    copy=True
).obs.index
# Assign through a single .loc call on the DataFrame. The chained form
# ``adata.obs['split'].loc[...] = ...`` writes into a temporary Series and is
# silently lost when pandas Copy-on-Write is active.
adata.obs.loc[ood_idx, 'split'] = 'ood'

# Test split: a 16% subsample of everything that is not OOD.
test_idx = sc.pp.subsample(
    adata[adata.obs.split != 'ood'],
    .16,
    copy=True
).obs.index
adata.obs.loc[test_idx, 'split'] = 'test'

# Bare expression; the table is discarded when run as a plain script.
pd.crosstab(adata.obs['split'], adata.obs['condition'])
|
|
|
# Remove the 'rank_genes_groups' entry from adata.uns if it exists. Only a
# missing key is tolerated; any other error now propagates.
try:
    del adata.uns['rank_genes_groups']
except KeyError:
    print('All good.')
|
|
|
logging.info("Converting to sparse matrix")

adata.X = sparse.csr_matrix(adata.X)
logging.info("Finished converting to sparse matrix")

# Write next to the input file as "<input stem>_pp.h5ad".
output_path = adata_path.with_name(adata_path.stem + "_pp.h5ad")
logging.info(f"Writing file to disk at {output_path}")
output_path.parent.mkdir(parents=True, exist_ok=True)
sc.write(output_path, adata)
logging.info(f"File was written successfully at {output_path}.")
|
|
|
|
|
|
|
# Report every eval_category without an entry in rank_genes_groups_cov,
# ignoring labels that contain 'DMSO'.
for i, k in enumerate(adata.obs.eval_category.unique()):
    try:
        adata.uns['rank_genes_groups_cov'][k]
    except KeyError:
        if 'DMSO' not in k:
            print(f"{i}: {k}")
|
|
|
|
|
|
|
# Read the written file back and repeat the coverage check on it.
adata_2 = sc.read(output_path)

for i, k in enumerate(adata_2.obs.eval_category.unique()):
    try:
        adata_2.uns['rank_genes_groups_cov'][k]
    except KeyError:
        if 'DMSO' not in k:
            print(f"{i}: {k}")

# Bare diagnostic expressions; their values are discarded when run as a plain
# script. The two set differences compare the rank_genes_groups_cov keys of the
# in-memory object and the re-read object.
set(list(adata.uns['rank_genes_groups_cov'])) - set((list(adata_2.uns['rank_genes_groups_cov'])))

set((list(adata_2.uns['rank_genes_groups_cov']))) - set(list(adata.uns['rank_genes_groups_cov']))

len(list(adata_2.uns["rank_genes_groups_cov"].keys()))

adata.obs["dose_val"].value_counts()
|
|
|
|
|
|