Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,22 +2,28 @@ import gradio as gr
|
|
2 |
import sqlite3
|
3 |
import json
|
4 |
import numpy as np
|
5 |
-
import subprocess # To run OntoGPT as a CLI command
|
6 |
from numpy.linalg import norm
|
7 |
from huggingface_hub import hf_hub_download
|
8 |
from sentence_transformers import SentenceTransformer
|
9 |
import os
|
|
|
10 |
|
11 |
# Get Hugging Face Token from Environment Variables
|
12 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
13 |
if not HF_TOKEN:
|
14 |
-
raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable
|
15 |
|
16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
|
18 |
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
|
19 |
|
20 |
-
# Download database from Hugging Face
|
21 |
db_filename = "hpo_genes.db"
|
22 |
db_repo = "UoS-HGIG/hpo_genes"
|
23 |
db_path = os.path.join(os.getcwd(), db_filename)
|
@@ -25,6 +31,7 @@ db_path = os.path.join(os.getcwd(), db_filename)
|
|
25 |
if not os.path.exists(db_path):
|
26 |
db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
|
27 |
|
|
|
28 |
def find_best_hpo_match(finding, region, threshold):
|
29 |
"""Finds the best HPO match using semantic similarity."""
|
30 |
query_text = f"{finding} in {region}"
|
@@ -45,7 +52,8 @@ def find_best_hpo_match(finding, region, threshold):
|
|
45 |
best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
|
46 |
|
47 |
conn.close()
|
48 |
-
return best_match if best_score > threshold else None
|
|
|
49 |
|
50 |
def get_genes_for_hpo(hpo_id):
|
51 |
"""Retrieves associated genes for a given HPO ID."""
|
@@ -56,46 +64,35 @@ def get_genes_for_hpo(hpo_id):
|
|
56 |
conn.close()
|
57 |
return result[0].split(", ") if result else []
|
58 |
|
59 |
-
def extract_with_ontogpt(finding, region):
    """Extract ontology terms for a finding by calling the OntoGPT CLI.

    The sentence ``"<finding> observed in <region>."`` is piped to
    ``ontogpt extract -t hpo`` on standard input.

    Args:
        finding: Free-text description of the finding.
        region: Free-text region the finding was observed in.

    Returns:
        The command's standard output with surrounding whitespace removed,
        or a string starting with ``"Error running OntoGPT: "`` when the
        command cannot be started or exits with a non-zero status.
    """
    input_text = f"{finding} observed in {region}."
    command = ["ontogpt", "extract", "-t", "hpo", "-m", "meta-llama/Llama-3.1-70B-Instruct"]

    try:
        # Argument list (no shell), so the user-supplied text is never
        # interpreted by a shell; it only travels over stdin.
        result = subprocess.run(command, input=input_text, text=True, capture_output=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError: executable missing / not runnable.
        # ValueError: includes output that cannot be decoded as text.
        return f"Error running OntoGPT: {e}"

    if result.returncode != 0:
        # A failed run usually leaves stdout empty; report stderr instead of
        # returning an empty "result" that looks like a successful extraction.
        detail = result.stderr.strip() or f"exit status {result.returncode}"
        return f"Error running OntoGPT: {detail}"

    return result.stdout.strip()
|
74 |
|
75 |
def get_hpo_for_finding(finding, region, threshold):
|
76 |
-
"""Finds the best HPO term and retrieves associated genes
|
77 |
hpo_match = find_best_hpo_match(finding, region, threshold)
|
78 |
-
|
79 |
if hpo_match:
|
80 |
-
|
81 |
-
hpo_match["genes"] = get_genes_for_hpo(hpo_id)
|
82 |
-
|
83 |
-
# Use OntoGPT to refine the mapping
|
84 |
-
enriched_description = extract_with_ontogpt(finding, region)
|
85 |
-
hpo_match["description"] = enriched_description
|
86 |
-
|
87 |
else:
|
88 |
-
hpo_match = {"hpo_id": "NA", "hpo_term": "NA", "genes": []
|
89 |
-
|
90 |
return hpo_match
|
91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
def hpo_mapper_ui(finding, region, threshold):
|
93 |
-
"""Function for Gradio UI to get HPO mappings."""
|
94 |
if not finding or not region:
|
95 |
-
return "Please enter both finding and region.", "", ""
|
|
|
|
|
|
|
96 |
|
97 |
-
|
98 |
-
return result["hpo_id"], result["hpo_term"], ", ".join(result["genes"]), result["description"]
|
99 |
|
100 |
# Create Gradio UI
|
101 |
demo = gr.Interface(
|
@@ -109,7 +106,7 @@ demo = gr.Interface(
|
|
109 |
gr.Textbox(label="HPO ID"),
|
110 |
gr.Textbox(label="HPO Term"),
|
111 |
gr.Textbox(label="Associated Genes"),
|
112 |
-
gr.Textbox(label="OntoGPT
|
113 |
],
|
114 |
title="HPO Mapper with OntoGPT",
|
115 |
description=(
|
@@ -120,6 +117,7 @@ demo = gr.Interface(
|
|
120 |
"Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
|
121 |
"medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
|
122 |
)
|
|
|
123 |
)
|
124 |
|
125 |
if __name__ == "__main__":
|
|
|
2 |
import sqlite3
|
3 |
import json
|
4 |
import numpy as np
|
|
|
5 |
from numpy.linalg import norm
|
6 |
from huggingface_hub import hf_hub_download
|
7 |
from sentence_transformers import SentenceTransformer
|
8 |
import os
|
9 |
+
import subprocess
|
10 |
|
11 |
# Get Hugging Face Token from Environment Variables
|
12 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
13 |
if not HF_TOKEN:
|
14 |
+
raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable.")
|
15 |
|
16 |
+
# Set Hugging Face API key for OntoGPT
|
17 |
+
subprocess.run(["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN], check=True)
|
18 |
+
|
19 |
+
# Define OntoGPT model
|
20 |
+
ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"
|
21 |
+
|
22 |
+
# Load the Nomic-Embed Model
|
23 |
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
|
24 |
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
|
25 |
|
26 |
+
# Download database from Hugging Face if not exists
|
27 |
db_filename = "hpo_genes.db"
|
28 |
db_repo = "UoS-HGIG/hpo_genes"
|
29 |
db_path = os.path.join(os.getcwd(), db_filename)
|
|
|
31 |
if not os.path.exists(db_path):
|
32 |
db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
|
33 |
|
34 |
+
|
35 |
def find_best_hpo_match(finding, region, threshold):
|
36 |
"""Finds the best HPO match using semantic similarity."""
|
37 |
query_text = f"{finding} in {region}"
|
|
|
52 |
best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
|
53 |
|
54 |
conn.close()
|
55 |
+
return best_match if best_score > threshold else None
|
56 |
+
|
57 |
|
58 |
def get_genes_for_hpo(hpo_id):
|
59 |
"""Retrieves associated genes for a given HPO ID."""
|
|
|
64 |
conn.close()
|
65 |
return result[0].split(", ") if result else []
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
def get_hpo_for_finding(finding, region, threshold):
    """Return the best HPO match for a finding together with its genes.

    Args:
        finding: Free-text description of the finding.
        region: Free-text region the finding relates to.
        threshold: Similarity cut-off forwarded to ``find_best_hpo_match``.

    Returns:
        A dict with the keys ``"hpo_id"``, ``"hpo_term"`` and ``"genes"``.
        When ``find_best_hpo_match`` yields nothing, both ID and term are
        the string ``"NA"`` and ``"genes"`` is an empty list.
    """
    match = find_best_hpo_match(finding, region, threshold)

    # No usable match: hand back the placeholder record.
    if not match:
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": []}

    # Enrich the match in place with the genes linked to its HPO ID.
    match["genes"] = get_genes_for_hpo(match["hpo_id"])
    return match
|
76 |
|
77 |
+
|
78 |
+
def run_ontogpt(finding, region):
    """Run ``ontogpt complete`` on a finding/region pair.

    The prompt ``"<finding> in <region>"`` is passed to the CLI via ``-i``
    and the model named by the module-level ``ONTOGPT_MODEL`` via ``-m``.

    Args:
        finding: Free-text description of the finding.
        region: Free-text region the finding relates to.

    Returns:
        The command's standard output with surrounding whitespace removed,
        or a string starting with ``"Error running OntoGPT: "`` when the
        command cannot be started or exits with a non-zero status.
    """
    input_text = f"{finding} in {region}"
    command = ["ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", input_text]

    try:
        # Argument list (no shell): the user-supplied text is a single argv
        # entry and is never parsed by a shell.
        result = subprocess.run(command, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: executable missing / not runnable.
        # ValueError: e.g. an embedded NUL byte in the user-supplied text,
        # or output that cannot be decoded as text.
        return f"Error running OntoGPT: {exc}"

    if result.returncode != 0:
        # Surface the failure instead of returning an empty string that is
        # indistinguishable from "the model produced no output".
        detail = result.stderr.strip() or f"exit status {result.returncode}"
        return f"Error running OntoGPT: {detail}"

    return result.stdout.strip()
|
85 |
+
|
86 |
+
|
87 |
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding to an HPO term and run OntoGPT on it.

    Args:
        finding: Text entered for the finding.
        region: Text entered for the region.
        threshold: Similarity cut-off forwarded to ``get_hpo_for_finding``.

    Returns:
        A 4-tuple ``(hpo_id, hpo_term, genes, ontogpt_output)`` where
        ``genes`` is a comma-separated string. When either text input is
        empty or whitespace-only, the first element carries a prompt to
        fill in both fields and the remaining three are empty strings.
    """
    # The interface declares four output components, so every return path
    # must yield exactly four values (the validation path previously gave 3).
    if any(not value or not str(value).strip() for value in (finding, region)):
        return "Please enter both finding and region.", "", "", ""

    hpo_result = get_hpo_for_finding(finding, region, threshold)
    ontogpt_output = run_ontogpt(finding, region)

    return (
        hpo_result["hpo_id"],
        hpo_result["hpo_term"],
        ", ".join(hpo_result["genes"]),
        ontogpt_output,
    )
|
|
|
96 |
|
97 |
# Create Gradio UI
|
98 |
demo = gr.Interface(
|
|
|
106 |
gr.Textbox(label="HPO ID"),
|
107 |
gr.Textbox(label="HPO Term"),
|
108 |
gr.Textbox(label="Associated Genes"),
|
109 |
+
gr.Textbox(label="OntoGPT Output")
|
110 |
],
|
111 |
title="HPO Mapper with OntoGPT",
|
112 |
description=(
|
|
|
117 |
"Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
|
118 |
"medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
|
119 |
)
|
120 |
+
|
121 |
)
|
122 |
|
123 |
if __name__ == "__main__":
|