|
from pathlib import Path |
|
|
|
import numpy as np |
|
import dask.distributed as dd |
|
import scanpy as sc |
|
import anndata as ad |
|
import h5py |
|
import dask |
|
|
|
from collections import Counter |
|
import pandas as pd |
|
from tqdm import tqdm |
|
import dask |
|
import time |
|
from dask.distributed import Client, LocalCluster |
|
|
|
import dask |
|
|
|
sc.logging.print_header() |
|
|
|
if use_gpu: |
|
import rapids_singlecell as rsc |
|
SPARSE_CHUNK_SIZE = 100_000 |
|
from dask_cuda import LocalCUDACluster |
|
|
|
import cupy as cp |
|
|
|
|
|
preprocessing_gpus="0,1,2,3,4,5,6,7" |
|
cluster = LocalCUDACluster(CUDA_VISIBLE_DEVICES=preprocessing_gpus, |
|
threads_per_worker=25, |
|
protocol="tcp") |
|
else: |
|
SPARSE_CHUNK_SIZE = 100_000 |
|
cluster = LocalCluster(n_workers=16) |
|
|
|
client = Client(cluster) |
|
if use_gpu: |
|
mod = rsc |
|
else: |
|
mod = sc |
|
|
|
from packaging.version import parse as parse_version |
|
import gc |
|
|
|
adatas = [] |
|
all_highly_variable_genes = [] |
|
|
|
if parse_version(ad.__version__) < parse_version("0.12.0rc1"): |
|
from anndata.experimental import read_elem_as_dask as read_dask |
|
else: |
|
from anndata.experimental import read_elem_lazy as read_dask |
|
|
|
|
|
def filter_protein_coding_genes(gene_list): |
|
""" |
|
Filter a list of gene names to keep only protein-coding genes. |
|
|
|
Args: |
|
gene_list: A list of gene names as strings |
|
|
|
Returns: |
|
A list containing only likely protein-coding gene names |
|
""" |
|
|
|
non_coding_patterns = [ |
|
'LOC', 'LINC', 'MIR', 'mir-', 'SNOR', 'SNHG', |
|
'RNU', 'tRNA', 'rRNA', 'snoR', 'snR', 'lncRNA', |
|
'pseudo', 'NEAT', 'XIST', 'MALAT', 'HOTAIR' |
|
] |
|
|
|
protein_coding_genes = [] |
|
|
|
for gene in gene_list: |
|
|
|
if not any(pattern in gene.upper() for pattern in non_coding_patterns): |
|
|
|
if not gene.startswith('ENSG'): |
|
|
|
if not gene.upper().startswith('MT-'): |
|
protein_coding_genes.append(gene) |
|
|
|
return protein_coding_genes |
|
|
|
for i in tqdm(range(4)): |
|
id = str(i + 1) |
|
PATH = f"data/raw/plate{id}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad" |
|
|
|
with h5py.File(PATH, "r") as f: |
|
adata = ad.AnnData( |
|
obs=ad.io.read_elem(f["obs"]), |
|
var=ad.io.read_elem(f["var"]), |
|
) |
|
adata.X = read_dask( |
|
f["X"], chunks=(SPARSE_CHUNK_SIZE, adata.shape[1]) |
|
) |
|
|
|
if use_gpu: |
|
rsc.get.anndata_to_GPU(adata) |
|
|
|
pass_filter_mask = adata.obs["pass_filter"] == "full" |
|
adata = adata[pass_filter_mask, :].copy() |
|
|
|
|
|
protein_coding_genes = filter_protein_coding_genes(adata.var_names) |
|
adata = adata[:, protein_coding_genes].copy() |
|
|
|
mod.pp.normalize_total(adata, target_sum=10_000) |
|
mod.pp.log1p(adata) |
|
mod.pp.highly_variable_genes(adata, n_top_genes=8_000) |
|
|
|
highly_variable_genes = set(adata.var_names[adata.var["highly_variable"]]) |
|
all_highly_variable_genes.append(highly_variable_genes) |
|
adatas.append(adata) |
|
|
|
|
|
gene_counts = Counter(gene for genes in all_highly_variable_genes for gene in genes) |
|
selected_genes = {gene for gene, count in gene_counts.items() if count > 2} |
|
|
|
with open('selected_genes.txt', 'w') as f: |
|
for gene in selected_genes: |
|
f.write(f"{gene}\n") |
|
selected_genes = set([x.strip() for x in open("tahoe/selected_genes.txt")]) |
|
|
|
adatas_to_merge = [] |
|
for i in tqdm(range(4)): |
|
adata = sc.read_h5ad(f"/home/ubuntu/data/raw/plate{i+1}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad") |
|
|
|
common_genes = list(set(adata.var_names) & selected_genes) |
|
adata = adata[:, common_genes].copy() |
|
adatas_to_merge.append(adata) |
|
|
|
|
|
merged_adata = adatas_to_merge[0].concatenate(adatas_to_merge[1:], join='inner') |
|
|
|
|
|
merged_adata.write_h5ad("/home/ubuntu/data/merged.h5ad") |
|
|