File size: 4,231 Bytes
032c0ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from pathlib import Path

import numpy as np
import dask.distributed as dd
import scanpy as sc
import anndata as ad
import h5py
import dask

from collections import Counter
import pandas as pd
from tqdm import tqdm
import dask
import time
from dask.distributed import Client, LocalCluster

import dask

sc.logging.print_header()

# `use_gpu` is not assigned anywhere in this file, so running it as a plain
# script used to stop at the first `if use_gpu:` with a NameError.  Keep a
# value that the surrounding session has already defined (for example an
# interactive run that set it beforehand) and otherwise fall back to the CPU
# backend so the script can run stand-alone.
use_gpu = globals().get("use_gpu", False)

# Number of rows per chunk when X is loaded through `read_dask` further down.
# Both backends use the same value, so it is set once here.
SPARSE_CHUNK_SIZE = 100_000

if use_gpu:
    # GPU-only packages are imported inside the branch so that the CPU path
    # does not require them to be installed.
    import rapids_singlecell as rsc
    from dask_cuda import LocalCUDACluster

    import cupy as cp

    # Device ids handed to the CUDA cluster via CUDA_VISIBLE_DEVICES.
    preprocessing_gpus = "0,1,2,3,4,5,6,7"
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=preprocessing_gpus,
        threads_per_worker=25,
        protocol="tcp",
    )
    # `mod` is the module whose `pp.*` functions the plate loop calls.
    mod = rsc
else:
    cluster = LocalCluster(n_workers=16)
    mod = sc

client = Client(cluster)

from packaging.version import parse as parse_version
import gc

# Per-plate accumulators: the processed AnnData objects and, for each plate,
# the set of gene names flagged as highly variable.
adatas, all_highly_variable_genes = [], []

# Bind `read_dask` to the reader matching the installed anndata release:
# `read_elem_lazy` from 0.12.0rc1 onwards, `read_elem_as_dask` before that.
if parse_version(ad.__version__) >= parse_version("0.12.0rc1"):
    from anndata.experimental import read_elem_lazy as read_dask
else:
    from anndata.experimental import read_elem_as_dask as read_dask

# Heuristic, name-based filter -- not a comprehensive biotype lookup.
def filter_protein_coding_genes(gene_list):
    """
    Filter a list of gene names to keep only likely protein-coding genes.

    The decision is made from the gene name alone. A gene is dropped when
    any of the following holds:

      * its name contains one of the non-coding markers listed below,
      * its name starts with ``ENSG`` (case-sensitive),
      * its name starts with ``MT-`` (case-insensitive).

    Args:
        gene_list: An iterable of gene names as strings

    Returns:
        A list containing the retained gene names, in input order
    """
    # Common non-coding gene identifiers/patterns
    non_coding_patterns = (
        'LOC', 'LINC', 'MIR', 'mir-', 'SNOR', 'SNHG',
        'RNU', 'tRNA', 'rRNA', 'snoR', 'snR', 'lncRNA',
        'pseudo', 'NEAT', 'XIST', 'MALAT', 'HOTAIR',
    )

    protein_coding_genes = []

    for gene in gene_list:
        gene_upper = gene.upper()

        # Each marker is tested against the upper-cased name (so all-caps
        # markers match regardless of the name's case) and against the name
        # exactly as written.  The second test is what allows mixed-case
        # markers such as 'tRNA' or 'pseudo' to match at all: an upper-cased
        # name can never contain a lowercase letter.
        if any(
            pattern in gene_upper or pattern in gene
            for pattern in non_coding_patterns
        ):
            continue

        # Skip ENSG IDs (uncharacterized Ensembl genes)
        if gene.startswith('ENSG'):
            continue

        # Skip mitochondrial genes (MT-*)
        if gene_upper.startswith('MT-'):
            continue

        protein_coding_genes.append(gene)

    return protein_coding_genes

# For each of the four plates: load it, filter cells and genes, normalise,
# and record which genes are flagged as highly variable.
# The plate number is used directly as the loop variable; the previous
# version stored it in a module-level name `id`, shadowing the builtin.
for plate_number in tqdm(range(1, 5)):
    plate_path = f"data/raw/plate{plate_number}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad"

    with h5py.File(plate_path, "r") as f:
        # obs and var are read with `read_elem`; X goes through `read_dask`
        # in blocks of SPARSE_CHUNK_SIZE rows, each spanning every column.
        adata = ad.AnnData(
            obs=ad.io.read_elem(f["obs"]),
            var=ad.io.read_elem(f["var"]),
        )
        adata.X = read_dask(
            f["X"], chunks=(SPARSE_CHUNK_SIZE, adata.shape[1])
        )

    if use_gpu:
        rsc.get.anndata_to_GPU(adata)

    # Keep only the cells whose `pass_filter` annotation equals "full".
    pass_filter_mask = adata.obs["pass_filter"] == "full"
    adata = adata[pass_filter_mask, :].copy()

    # Filter to keep only protein coding genes before HVG selection
    protein_coding_genes = filter_protein_coding_genes(adata.var_names)
    adata = adata[:, protein_coding_genes].copy()

    # `mod` is either scanpy or rapids_singlecell, chosen during setup.
    mod.pp.normalize_total(adata, target_sum=10_000)
    mod.pp.log1p(adata)
    mod.pp.highly_variable_genes(adata, n_top_genes=8_000)

    # Collect this plate's highly variable gene names as a set so they can
    # be counted across plates afterwards.
    highly_variable_genes = set(adata.var_names[adata.var["highly_variable"]])
    all_highly_variable_genes.append(highly_variable_genes)
    adatas.append(adata)

## Select the genes that were flagged highly variable on more than two plates
gene_counts = Counter(gene for genes in all_highly_variable_genes for gene in genes)
selected_genes = {gene for gene, count in gene_counts.items() if count > 2}

# Write the list and read it back from the *same* file.  Previously the list
# was written to 'selected_genes.txt' but re-read from
# 'tahoe/selected_genes.txt', so the re-read either failed or picked up an
# unrelated file; the read handle was also never closed.
selected_genes_path = "selected_genes.txt"
with open(selected_genes_path, "w") as f:
    # Sorted so the file content is identical from run to run.
    f.writelines(f"{gene}\n" for gene in sorted(selected_genes))
with open(selected_genes_path) as f:
    selected_genes = {line.strip() for line in f}

# Reload every plate, keep only the selected genes, then merge and save.
adatas_to_merge = []
for plate_number in tqdm(range(1, 5)):
    adata = sc.read_h5ad(f"/home/ubuntu/data/raw/plate{plate_number}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad")
    # Filter to only include selected genes.  The names are taken in the
    # order they appear in `var_names` (duplicates collapsed) instead of the
    # arbitrary order of a set intersection, so the column order of the
    # subset no longer varies between runs.
    common_genes = list(
        dict.fromkeys(gene for gene in adata.var_names if gene in selected_genes)
    )
    adata = adata[:, common_genes].copy()
    adatas_to_merge.append(adata)

# Merge all datasets at the end
# NOTE(review): `AnnData.concatenate` appears to be deprecated in favour of
# `anndata.concat`; it is kept here so the merged object is built exactly as
# before -- confirm against the installed anndata version.
merged_adata = adatas_to_merge[0].concatenate(adatas_to_merge[1:], join='inner')

# Save the merged dataset
merged_adata.write_h5ad("/home/ubuntu/data/merged.h5ad")