Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -2,15 +2,26 @@ import gradio as gr
|
|
2 |
import sqlite3
|
3 |
import json
|
4 |
import numpy as np
|
5 |
-
import
|
6 |
from sentence_transformers import SentenceTransformer
|
|
|
7 |
|
8 |
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
|
9 |
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
|
10 |
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
def find_best_hpo(finding, region, threshold):
|
16 |
query = f"{finding} {region}" if region else finding
|
@@ -18,21 +29,28 @@ def find_best_hpo(finding, region, threshold):
|
|
18 |
|
19 |
conn = sqlite3.connect(db_path)
|
20 |
cursor = conn.cursor()
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
best_match, best_score = None, -1
|
24 |
|
25 |
for hpo_id, hpo_name, embedding_str in cursor.fetchall():
|
26 |
hpo_embedding = np.array(json.loads(embedding_str))
|
27 |
-
|
|
|
|
|
28 |
|
29 |
-
if
|
30 |
-
best_score =
|
31 |
best_match = {"hpo_id": hpo_id, "hpo_name": hpo_name}
|
32 |
|
|
|
33 |
return best_match if best_score >= threshold else None
|
34 |
|
35 |
-
|
36 |
def get_genes(hpo_id):
|
37 |
conn = sqlite3.connect(db_path)
|
38 |
cursor = conn.cursor()
|
@@ -41,26 +59,27 @@ def get_genes(hpo_id):
|
|
41 |
conn.close()
|
42 |
return genes
|
43 |
|
44 |
-
|
45 |
def hpo_mapper(finding, region, threshold):
|
46 |
if not finding:
|
47 |
return "Please provide a pathological finding.", "", ""
|
48 |
|
49 |
-
|
|
|
|
|
|
|
50 |
|
51 |
if match:
|
52 |
genes = get_genes(match["hpo_id"])
|
53 |
return match["hpo_id"], match["hpo_name"], ", ".join(genes)
|
54 |
|
55 |
-
return "No match found", "", ""
|
56 |
-
|
57 |
|
58 |
demo = gr.Interface(
|
59 |
fn=hpo_mapper,
|
60 |
inputs=[
|
61 |
gr.Textbox(label="Pathological Finding"),
|
62 |
gr.Textbox(label="Anatomical Region (optional)"),
|
63 |
-
gr.Slider(0.5, 1.0, step=0.01, value=0.74, label="Similarity Threshold")
|
64 |
],
|
65 |
outputs=[
|
66 |
gr.Textbox(label="HPO ID"),
|
@@ -73,12 +92,15 @@ demo = gr.Interface(
|
|
73 |
'(e.g. "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
|
74 |
'term and retrieve associated genes.\n\n'
|
75 |
'**Reference:**\n'
|
76 |
-
'Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data
|
77 |
-
'
|
78 |
-
'
|
|
|
|
|
|
|
79 |
'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
|
80 |
-
)
|
81 |
)
|
82 |
|
83 |
if __name__ == "__main__":
|
84 |
-
demo.launch()
|
|
|
2 |
import sqlite3
|
3 |
import json
|
4 |
import numpy as np
|
5 |
+
import os
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
|
9 |
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
|
10 |
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
|
11 |
|
12 |
+
# Access token for the Hugging Face Hub, read from the environment so that it
# never has to be committed alongside the code.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token.")

# Hub dataset repository and file name of the SQLite database that the
# lookup functions open via ``db_path``.
db_repo = "UoS-HGIG/hpo_genes"
db_filename = "hpo_genes.db"
db_path = db_filename

# Download database if not present locally. ``hf_hub_download`` returns the
# path of the downloaded file, which then replaces the relative default above.
if not os.path.exists(db_path):
    db_path = hf_hub_download(
        repo_id=db_repo,
        filename=db_filename,
        repo_type="dataset",
        # ``token`` is the documented keyword; ``use_auth_token`` is its
        # deprecated predecessor.
        token=HF_TOKEN,
    )
|
25 |
|
26 |
def find_best_hpo(finding, region, threshold):
|
27 |
query = f"{finding} {region}" if region else finding
|
|
|
29 |
|
30 |
conn = sqlite3.connect(db_path)
|
31 |
cursor = conn.cursor()
|
32 |
+
|
33 |
+
try:
|
34 |
+
cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
|
35 |
+
except sqlite3.OperationalError as e:
|
36 |
+
conn.close()
|
37 |
+
raise ValueError(f"Database error: {e}")
|
38 |
|
39 |
best_match, best_score = None, -1
|
40 |
|
41 |
for hpo_id, hpo_name, embedding_str in cursor.fetchall():
|
42 |
hpo_embedding = np.array(json.loads(embedding_str))
|
43 |
+
similarity = np.dot(query_embedding, hpo_embedding) / (
|
44 |
+
np.linalg.norm(query_embedding) * np.linalg.norm(hpo_embedding)
|
45 |
+
)
|
46 |
|
47 |
+
if similarity > best_score:
|
48 |
+
best_score = similarity
|
49 |
best_match = {"hpo_id": hpo_id, "hpo_name": hpo_name}
|
50 |
|
51 |
+
conn.close()
|
52 |
return best_match if best_score >= threshold else None
|
53 |
|
|
|
54 |
def get_genes(hpo_id):
|
55 |
conn = sqlite3.connect(db_path)
|
56 |
cursor = conn.cursor()
|
|
|
59 |
conn.close()
|
60 |
return genes
|
61 |
|
|
|
62 |
def hpo_mapper(finding, region, threshold):
    """Map a pathological finding to an HPO term and its associated genes.

    Args:
        finding: Free-text pathological finding entered by the user.
        region: Optional anatomical region; passed through to
            ``find_best_hpo`` unchanged.
        threshold: Minimum similarity score; passed through to
            ``find_best_hpo`` unchanged.

    Returns:
        A 3-tuple of strings. On success it is
        ``(hpo_id, hpo_name, genes)`` with the genes joined by ``", "``.
        Otherwise the first element carries a message (missing input, the
        text of a ``ValueError`` raised by ``find_best_hpo``, or
        ``"No match found."``) and the other two elements are empty strings.
    """
    # Treat None, "" and whitespace-only input alike: there is nothing
    # meaningful to search for in any of these cases.
    if not finding or not finding.strip():
        return "Please provide a pathological finding.", "", ""

    try:
        match = find_best_hpo(finding, region, threshold)
    except ValueError as e:
        # find_best_hpo reports database problems as ValueError; show the
        # message in the first output field instead of failing the request.
        return str(e), "", ""

    if not match:
        return "No match found.", "", ""

    genes = get_genes(match["hpo_id"])
    return match["hpo_id"], match["hpo_name"], ", ".join(genes)
|
|
|
76 |
|
77 |
demo = gr.Interface(
|
78 |
fn=hpo_mapper,
|
79 |
inputs=[
|
80 |
gr.Textbox(label="Pathological Finding"),
|
81 |
gr.Textbox(label="Anatomical Region (optional)"),
|
82 |
+
gr.Slider(0.5, 1.0, step=0.01, value=0.74, label="Similarity Threshold"),
|
83 |
],
|
84 |
outputs=[
|
85 |
gr.Textbox(label="HPO ID"),
|
|
|
92 |
'(e.g. "terminal ileum") to map it to the closest Human Phenotype Ontology (HPO) '
|
93 |
'term and retrieve associated genes.\n\n'
|
94 |
'**Reference:**\n'
|
95 |
+
'Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data '
|
96 |
+
'for Acceleration of Inflammatory Bowel Disease Research\n'
|
97 |
+
'Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, '
|
98 |
+
'Matt Stammers, Christopher M Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n'
|
99 |
+
'medRxiv 2025.03.07.25323569; doi: '
|
100 |
+
'[https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n\n'
|
101 |
'HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)'
|
102 |
+
),
|
103 |
)
|
104 |
|
105 |
if __name__ == "__main__":
|
106 |
+
demo.launch()
|