# emdann's picture
# Upload folder using huggingface_hub
# 032c0ea verified
from pathlib import Path
import numpy as np
import dask.distributed as dd
import scanpy as sc
import anndata as ad
import h5py
import dask
from collections import Counter
import pandas as pd
from tqdm import tqdm
import dask
import time
from dask.distributed import Client, LocalCluster
import dask
# Print scanpy's version header so the run environment is recorded in the log.
sc.logging.print_header()

# `use_gpu` is read throughout this script but never assigned in it. Respect a
# value that already exists in the namespace (e.g. set in an earlier notebook
# cell) and otherwise fall back to the CPU path instead of raising NameError.
use_gpu = globals().get("use_gpu", False)

# Number of rows per dask chunk when X is read lazily (same value on CPU and GPU).
SPARSE_CHUNK_SIZE = 100_000

if use_gpu:
    # GPU-only dependencies are imported lazily so the CPU path does not need them.
    import rapids_singlecell as rsc
    from dask_cuda import LocalCUDACluster
    import cupy as cp

    preprocessing_gpus = "0,1,2,3,4,5,6,7"
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=preprocessing_gpus,
        threads_per_worker=25,
        protocol="tcp",
    )
    # `mod` is the preprocessing namespace used below (`mod.pp.*`).
    mod = rsc
else:
    cluster = LocalCluster(n_workers=16)
    mod = sc

client = Client(cluster)
from packaging.version import parse as parse_version
import gc
# Per-plate accumulators: one set of highly variable gene names and one
# preprocessed AnnData object are appended for every plate processed below.
all_highly_variable_genes = []
adatas = []

# Expose the lazy element reader under a single alias, `read_dask`, whichever
# name the installed anndata version provides.
if parse_version(ad.__version__) >= parse_version("0.12.0rc1"):
    from anndata.experimental import read_elem_lazy as read_dask
else:
    from anndata.experimental import read_elem_as_dask as read_dask
# Heuristic, non-comprehensive filter: it works on gene-name patterns only.
def filter_protein_coding_genes(gene_list):
    """
    Return the names in ``gene_list`` that pass a name-based heuristic for
    protein-coding genes, in their original order.

    A name is dropped when any of the following holds:
      * its upper-cased form contains one of ``non_coding_patterns``;
      * it starts with ``'ENSG'`` (case-sensitive check);
      * its upper-cased form starts with ``'MT-'``.

    Args:
        gene_list: An iterable of gene names as strings
    Returns:
        A list with the names that were not dropped
    """
    # NOTE(review): names are upper-cased before the substring test, so the
    # entries below that contain lowercase letters ('mir-', 'tRNA', 'rRNA',
    # 'snoR', 'snR', 'lncRNA', 'pseudo') can never match as written. Also,
    # matching is by substring anywhere in the name (e.g. 'LOC' is found in
    # 'CLOCK'). Confirm whether either is intended before changing it.
    non_coding_patterns = [
        'LOC', 'LINC', 'MIR', 'mir-', 'SNOR', 'SNHG',
        'RNU', 'tRNA', 'rRNA', 'snoR', 'snR', 'lncRNA',
        'pseudo', 'NEAT', 'XIST', 'MALAT', 'HOTAIR'
    ]

    def _passes_filter(gene):
        upper_name = gene.upper()
        # Non-coding name pattern present anywhere in the upper-cased name.
        if any(pattern in upper_name for pattern in non_coding_patterns):
            return False
        # Bare Ensembl gene IDs.
        if gene.startswith('ENSG'):
            return False
        # Mitochondrial genes (MT-*).
        return not upper_name.startswith('MT-')

    return [gene for gene in gene_list if _passes_filter(gene)]
# Per-plate pass: read each plate lazily, keep cells that passed the upstream
# filter, restrict to heuristically protein-coding genes, normalise, and record
# the plate's highly variable genes.
for plate_number in tqdm(range(1, 5)):
    PATH = f"data/raw/plate{plate_number}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"

    # obs/var are read eagerly; X is wrapped as a lazy dask array chunked by
    # rows (SPARSE_CHUNK_SIZE rows per chunk, all columns in each chunk).
    # NOTE(review): the file is closed before X is computed; this assumes the
    # lazy reader re-opens the file by path when chunks are materialised.
    with h5py.File(PATH, "r") as f:
        adata = ad.AnnData(
            obs=ad.io.read_elem(f["obs"]),
            var=ad.io.read_elem(f["var"]),
        )
        adata.X = read_dask(
            f["X"], chunks=(SPARSE_CHUNK_SIZE, adata.shape[1])
        )

    if use_gpu:
        rsc.get.anndata_to_GPU(adata)

    # 100m filtering: keep only cells whose "pass_filter" annotation is "full".
    pass_filter_mask = adata.obs["pass_filter"] == "full"
    adata = adata[pass_filter_mask, :].copy()

    # Filter to keep only protein coding genes before HVG selection
    protein_coding_genes = filter_protein_coding_genes(adata.var_names)
    adata = adata[:, protein_coding_genes].copy()

    # `mod` is scanpy or rapids_singlecell depending on `use_gpu`.
    mod.pp.normalize_total(adata, target_sum=10_000)
    mod.pp.log1p(adata)
    mod.pp.highly_variable_genes(adata, n_top_genes=8_000)

    highly_variable_genes = set(adata.var_names[adata.var["highly_variable"]])
    all_highly_variable_genes.append(highly_variable_genes)
    adatas.append(adata)
# Count, for every gene, in how many plates it was flagged as highly variable,
# and keep the genes flagged in more than two plates.
gene_counts = Counter()
for plate_genes in all_highly_variable_genes:
    gene_counts.update(plate_genes)

selected_genes = set()
for gene, count in gene_counts.items():
    if count > 2:
        selected_genes.add(gene)

# Persist the selection, one gene name per line.
with open('selected_genes.txt', 'w') as f:
    f.writelines(f"{gene}\n" for gene in selected_genes)
# Reload the selected gene names (one per line) and close the file afterwards.
# NOTE(review): this reads "tahoe/selected_genes.txt", while the selection step
# earlier in this file writes 'selected_genes.txt' in the working directory;
# confirm which location is intended.
with open("tahoe/selected_genes.txt") as f:
    selected_genes = {line.strip() for line in f}

adatas_to_merge = []
for plate_number in tqdm(range(1, 5)):
    adata = sc.read_h5ad(f"/home/ubuntu/data/raw/plate{plate_number}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad")
    # Filter to only include selected genes. A boolean mask keeps the plate's
    # own gene order, so the result does not depend on set iteration order.
    is_selected = adata.var_names.isin(selected_genes)
    adata = adata[:, is_selected].copy()
    adatas_to_merge.append(adata)

# Merge all datasets at the end, keeping only genes present in every plate.
# NOTE(review): `AnnData.concatenate` is deprecated in favour of
# `anndata.concat`; it is kept here because the two differ in how they label
# batches and merge `var`, which would change the written file.
merged_adata = adatas_to_merge[0].concatenate(adatas_to_merge[1:], join='inner')

# Save the merged dataset
merged_adata.write_h5ad("/home/ubuntu/data/merged.h5ad")