Update app.py
app.py CHANGED
@@ -3,35 +3,18 @@ import sqlite3
 import json
 import numpy as np
 from numpy.linalg import norm
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 import os
-import logging
-
-# Set up logging in a hidden directory
-log_dir = os.path.expanduser("~/.logs")
-os.makedirs(log_dir, exist_ok=True)
-log_file = os.path.join(log_dir, "hpo_mapper.log")
-
-logging.basicConfig(
-    filename=log_file,
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s"
-)
 
 # Get Hugging Face Token from Environment Variables
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
     raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")
 
-# Load the Nomic-Embed Model from Hugging Face
+# Load the Nomic-Embed Model from Hugging Face
 EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
-
-    model_path = snapshot_download(EMBEDDING_MODEL, trust_remote_code=True, force_download=True)
-    embedder = SentenceTransformer(model_path, trust_remote_code=True)
-except Exception as e:
-    logging.error(f"Failed to load model: {e}")
-    raise
+embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
 
 # Download database from Hugging Face Datasets if it does not exist
 db_filename = "hpo_genes.db"
@@ -41,6 +24,7 @@ db_path = os.path.join(os.getcwd(), db_filename)
 if not os.path.exists(db_path):
     db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
 
+
 def find_best_hpo_match(finding, region, threshold):
     query_text = f"{finding} in {region}" if region else finding
     query_embedding = embedder.encode(query_text)
@@ -62,6 +46,7 @@ def find_best_hpo_match(finding, region, threshold):
     conn.close()
     return best_match if best_score >= threshold else None
 
+
 def get_genes_for_hpo(hpo_id):
     conn = sqlite3.connect(db_path)
     cursor = conn.cursor()
@@ -70,6 +55,7 @@ def get_genes_for_hpo(hpo_id):
     conn.close()
     return result[0].split(", ") if result else []
 
+
 def hpo_mapper_ui(finding, region, threshold):
     if not finding:
         return "Please enter a pathological finding.", "", ""
@@ -78,19 +64,17 @@ def hpo_mapper_ui(finding, region, threshold):
 
     if match:
         genes = get_genes_for_hpo(match["hpo_id"])
-
-
-        return output
-
-    logging.info(f"Input: Finding='{finding}', Region='{region}', Threshold={threshold} -> No match found.")
+        return match["hpo_id"], match["hpo_term"], ", ".join(genes)
+
     return "No match found.", "", ""
 
+
 demo = gr.Interface(
     fn=hpo_mapper_ui,
     inputs=[
         gr.Textbox(label="Pathological Finding"),
         gr.Textbox(label="Anatomical Region (optional)"),
-        gr.Slider(0.
+        gr.Slider(0.0, 1.0, step=0.01, value=0.76, label="Similarity Threshold")
     ],
     outputs=[
         gr.Textbox(label="HPO ID"),
@@ -101,9 +85,17 @@ demo = gr.Interface(
     description=(
         'Enter a pathological finding (e.g., "chronic inflammation") and anatomical region '
         '(e.g., "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
-        'term and retrieve genes annotated as being associated with this HPO term
+        'term and retrieve genes annotated as being associated with this HPO term.\n\n'
+        '**References:**\n'
+        'Kadhim, A. Z., Green, Z., Nazari, I., Baker, J., George, M., Heinson, A., Stammers, M., Kipps, C., Beattie, R. M., Ashton, J. J., & Ennis, S. (2025).\n'
+        'Application of generative artificial intelligence to utilise unstructured clinical data for acceleration of inflammatory bowel disease research.\n'
+        '*medRxiv*. [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n\n'
+        'Gargano, M. A., Matentzoglu, N., Coleman, B., Addo-Lartey, E. B., Anagnostopoulos, A. V., Anderton, J., Avillach, P., Bagley, A. M., Bakštein, E., Balhoff, J. P., Baynam, G., Bello, S. M., Berk, M., Bertram, H., Bishop, S., Blau, H., Bodenstein, D. F., Botas, P., Boztug, K., Čady, J., … Robinson, P. N. (2024)\n'
+        'The Human Phenotype Ontology in 2024: phenotypes around the world.\n'
+        '*Nucleic Acids Research* [https://doi.org/10.1093/nar/gkad1005](https://doi.org/10.1093/nar/gkad1005)\n\n'
+        'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
     )
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
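The body of find_best_hpo_match is outside the changed hunks, so the similarity computation itself is not visible in this diff. The sketch below is a rough reconstruction of that flow based only on the visible imports (json, numpy, norm, sqlite3), the encode call, and the return statement; the hpo_terms table name, its columns, and the JSON-encoded embedding storage are illustrative assumptions, not taken from the commit.

```python
# Hypothetical reconstruction of the elided matching loop (not from the diff).
# Assumes a table "hpo_terms(hpo_id, hpo_term, embedding)" with embeddings stored as JSON arrays.
import json
import sqlite3
import numpy as np
from numpy.linalg import norm


def find_best_hpo_match_sketch(finding, region, threshold, embedder, db_path):
    # Build the query text the same way the app does and embed it.
    query_text = f"{finding} in {region}" if region else finding
    query_embedding = embedder.encode(query_text)

    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT hpo_id, hpo_term, embedding FROM hpo_terms")  # assumed schema

    best_match, best_score = None, -1.0
    for hpo_id, hpo_term, embedding_json in cursor.fetchall():
        term_embedding = np.array(json.loads(embedding_json))
        # Cosine similarity between the query and the stored HPO term embedding.
        score = float(
            np.dot(query_embedding, term_embedding)
            / (norm(query_embedding) * norm(term_embedding))
        )
        if score > best_score:
            best_match, best_score = {"hpo_id": hpo_id, "hpo_term": hpo_term}, score

    conn.close()
    return best_match if best_score >= threshold else None
```

Under those assumptions, the returned dictionary carries the "hpo_id" and "hpo_term" keys that hpo_mapper_ui reads, and the threshold argument corresponds to the slider whose default of 0.76 is set in this commit.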