Spaces:

maomlab
/

ToxoCEN-Network

Sleeping

File size: 5,662 Bytes


import numpy as np
import pandas as pd
import datasets
import streamlit as st
from streamlit_cytoscapejs import st_cytoscapejs
import networkx as nx

# Render the page with Streamlit's wide layout.
st.set_page_config(layout = "wide")

# Seed gene IDs can be passed through the URL query string (?gene_ids=A,B) so
# that it's possible to link to this page with the input pre-filled; without
# that argument a default pair of genes is used.
query_params = st.query_params
input_gene_ids = (
    query_params["gene_ids"]
    if "gene_ids" in query_params
    else "TGME49_231630,TGME49_230210")

# The text area shows one gene ID per row, so swap the "," separators for "\n".
input_gene_ids = input_gene_ids.replace(",", "\n")
    


# Introductory Markdown shown at the top of the page: what ToxoCEN is, how to
# cite it, where the code and data live, and how to seed a network.
page_intro = """
# ToxoCEN Network
**ToxoCEN** is a co-expression network for *Toxoplasma gondii* built on 719 RNA-seq runs across 39 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 
To Cite:

    CS Arnold, Y Wang, VB Carruthers, MJ O'Meara
    ToxoCEN: A Co-Expression Network for Toxoplasma gondii

* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/ToxoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/ToxoCEN
## Plot a network for a set of genes
Put a ``TGME49_######`` gene_id, one each row to seed the network
"""
st.markdown(page_intro)

@st.cache_data
def load_toxocen_table(table_name):
    """Load one TSV table of the maomlab/ToxoCEN dataset as a pandas DataFrame.

    Streamlit re-executes this script on every user interaction; caching the
    result means the table is fetched and converted only once instead of on
    each rerun.

    Args:
        table_name: base name of the table; the file ``<table_name>.tsv`` is
            read from the ``maomlab/ToxoCEN`` dataset repository.

    Returns:
        The table as a ``pandas.DataFrame``.
    """
    dataset = datasets.load_dataset(
        path = "maomlab/ToxoCEN",
        data_files = {table_name: f"{table_name}.tsv"})
    return dataset[table_name].to_pandas()


# Per-gene annotations (gene_id, TGME49_id, gene_name, description, ...)
TGME49_transcript_annotations = load_toxocen_table("TGME49_transcript_annotations")

# Co-expression hits (gene_id_1, gene_id_2, coexp_score, ...)
top_coexp_hits = load_toxocen_table("top_coexp_hits")


# One row of three columns: the gene ID input goes in the first, the second
# (col3) is filled in further down the script, and the last one is a spacer.
col1, col3, padding = st.columns(spec = [0.2, 0.2, 0.6])

# Pre-fill the text area with the IDs taken from the URL (or the defaults);
# whatever the user types replaces them.
input_gene_ids = col1.text_area(
    label = "Gene IDs",
    value = input_gene_ids,
    help = "TGME49 Gene IDs e.g. TGME49_231630")

# Only hits whose co-expression score is above this value are kept as neighbors
coexp_score_threshold = 0.85
    
##################################
# Parse and check the user input #
##################################

# One gene ID per row (commas are accepted as separators too). Blank rows and
# surrounding whitespace are ignored, and repeated IDs are dropped while
# keeping the order in which they were entered, so that no bogus or duplicate
# seed nodes are created.
seed_gene_ids = list(dict.fromkeys(
    gene_id.strip()
    for gene_id in input_gene_ids.replace(",", "\n").splitlines()
    if gene_id.strip()))

if not seed_gene_ids:
    # Nothing to look up: tell the user and end this script run here
    st.warning("Enter at least one TGME49 gene ID to plot a network.")
    st.stop()

# For each seed (in input order) keep its hits above the score threshold
neighbors = pd.concat([
    top_coexp_hits[
        (top_coexp_hits.gene_id_1 == seed_gene_id) &
        (top_coexp_hits.coexp_score > coexp_score_threshold)]
    for seed_gene_id in seed_gene_ids])

# A hit that is itself a seed is already listed as a seed; leaving it out of
# the neighbor list keeps every entry of gene_ids unique. Sorting gives the
# neighbors the same order on every rerun.
neighbor_gene_ids = sorted(set(neighbors.gene_id_2.dropna()) - set(seed_gene_ids))
gene_ids = seed_gene_ids + neighbor_gene_ids
gene_types = ['seed'] * len(seed_gene_ids) + ['neighbor'] * len(neighbor_gene_ids)

# Look up the annotation of every gene in the network. The annotation table is
# indexed by gene_id once (keeping the first row of any repeated gene_id)
# instead of being scanned three times for each gene.
annotation_columns = ["TGME49_id", "gene_name", "description"]
annotations_by_gene_id = (
    TGME49_transcript_annotations
    .drop_duplicates(subset = "gene_id")
    .set_index("gene_id")[annotation_columns])

TGME49_ids = []
gene_names = []
descriptions = []

for gene_id in gene_ids:
    if gene_id in annotations_by_gene_id.index:
        TGME49_id, gene_name, description = (
            annotations_by_gene_id.at[gene_id, column]
            for column in annotation_columns)
    else:
        # Unknown gene: report it, but keep a placeholder row so the lists stay
        # aligned with gene_ids
        st.error(f"Unable to locate TGME49_id for Gene ID: {gene_id}, it should be of the form 'TGME49_######'")
        TGME49_id = gene_name = description = None

    TGME49_ids.append(TGME49_id)
    gene_names.append(gene_name)
    descriptions.append(description)

# One row per node; gene_index is the node's position in gene_ids
node_info = pd.DataFrame({
    "gene_index": range(len(gene_ids)),
    "gene_id" : gene_ids,
    "gene_type" : gene_types,
    "TGME49_id": TGME49_ids,
    "gene_name": gene_names,
    # the per-gene list, not the scalar left over from the last loop iteration
    "description": descriptions})

# Attach the node attributes of both endpoints to every edge: first those of
# gene_id_1, then those of gene_id_2. In the second merge the node columns
# collide, so they come out suffixed "_a" (gene_id_1 side) and "_b"
# (gene_id_2 side), e.g. gene_index_a / gene_index_b.
neighbors = (
    neighbors
    .merge(right = node_info, left_on = "gene_id_1", right_on = "gene_id")
    .merge(
        right = node_info,
        left_on = "gene_id_2",
        right_on = "gene_id",
        suffixes = ("_a", "_b")))

################################
# Use NetworkX to layout graph #
################################
# NOTE: CytoscapeJS can probably lay out graphs itself, but it is unclear how
# to request that through the streamlit-cytoscapejs interface.

# Show the edge table (with the attributes of both endpoints) on the page
st.write(neighbors)


G = nx.Graph()
# Register every node up front: a gene without any edge (e.g. an unknown ID or
# a seed with no hit above the threshold) would otherwise be missing from the
# layout and break the position lookup when the elements are built.
G.add_nodes_from(node_info["gene_index"].tolist())
G.add_weighted_edges_from(zip(
    neighbors["gene_index_a"].tolist(),
    neighbors["gene_index_b"].tolist(),
    neighbors["coexp_score"].tolist()))

# Fixed seed so the same input gives the same picture on every script rerun
layout = nx.spring_layout(G, seed = 0)



# Build the Cytoscape.js element list: one entry per node, then one per edge.
# Layout coordinates are scaled and shifted by the centre of the 1500 px canvas.
layout_scale = 1200
canvas_center = 1500 / 2

elements = []
for gene_index, gene_id, gene_name in zip(
        node_info["gene_index"], node_info["gene_id"], node_info["gene_name"]):
    # A node that is absent from the layout is placed at the canvas centre
    # rather than aborting the whole page with a KeyError
    x, y = layout.get(gene_index, (0.0, 0.0))
    elements.append({
        "data": {
            "id": gene_id,
            # fall back to the gene ID when no gene name is known (None or NaN)
            "label": gene_name if pd.notna(gene_name) else gene_id},
        "position": {
            # plain Python floats, so the elements hold only built-in types
            "x" : float(x) * layout_scale + canvas_center,
            "y" : float(y) * layout_scale + canvas_center}})

for source_id, target_id, coexp_score in zip(
        neighbors["gene_id_1"], neighbors["gene_id_2"], neighbors["coexp_score"]):
    elements.append({
        "data" : {
            "source" : source_id,
            "target" : target_id,
            "label" : float(coexp_score)}})


# Offer the edge table (with the attributes of both endpoints) as a download
with col3:
    st.text('') # help alignment with input box
    st.download_button(
        label = "Download as TSV",
        data = neighbors.to_csv(sep = '\t').encode('utf-8'),
        file_name = "ToxoCEN_network.tsv",
        # registered media type for tab-separated files (the data is not CSV)
        mime = "text/tab-separated-values")

##########################################################

# Appearance of the rendered network: wide rectangular nodes and thick edges.
# NOTE(review): "labelFontSize" does not look like a Cytoscape.js style
# property (the documented name is "font-size"), and no "label" mapping is set
# here - confirm against how streamlit-cytoscapejs renders labels.
node_style = {
    "width": 200,
    "height": 75,
    "shape": "rectangle",
    "labelFontSize": 100}
edge_style = {"width": 10}

stylesheet = [
    {"selector": "node", "style": node_style},
    {"selector": "edge", "style": edge_style}]

# Heading followed by the interactive network on a 1500 x 1500 canvas; the
# component's return value is kept in clicked_elements.
st.title("ToxoCEN Network")
clicked_elements = st_cytoscapejs(
    elements = elements,
    stylesheet = stylesheet,
    width = 1500,
    height = 1500)