parsboy1987 committed on
Commit
5aebf41
·
verified ·
1 Parent(s): 106bedc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -37
app.py CHANGED
@@ -2,18 +2,22 @@ import gradio as gr
2
  import sqlite3
3
  import json
4
  import numpy as np
 
5
  from numpy.linalg import norm
6
  from huggingface_hub import hf_hub_download
7
  from sentence_transformers import SentenceTransformer
8
  import os
9
 
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN")
11
  if not HF_TOKEN:
12
- raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable.")
13
 
 
14
  EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
15
  embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
16
 
 
17
  db_filename = "hpo_genes.db"
18
  db_repo = "UoS-HGIG/hpo_genes"
19
  db_path = os.path.join(os.getcwd(), db_filename)
@@ -22,72 +26,101 @@ if not os.path.exists(db_path):
22
  db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
23
 
24
  def find_best_hpo_match(finding, region, threshold):
25
- query = f"{finding} {region}" if region else finding
26
- query_embedding = embedder.encode(query)
 
27
 
28
  conn = sqlite3.connect(db_path)
29
  cursor = conn.cursor()
30
  cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
31
-
32
  best_match, best_score = None, -1
33
-
34
  for hpo_id, hpo_name, embedding_str in cursor.fetchall():
35
  hpo_embedding = np.array(json.loads(embedding_str))
36
  similarity = np.dot(query_embedding, hpo_embedding) / (norm(query_embedding) * norm(hpo_embedding))
37
-
38
  if similarity > best_score:
39
  best_score = similarity
40
- best_match = {"hpo_id": hpo_id, "hpo_name": hpo_name}
41
-
42
  conn.close()
43
-
44
- return best_match if best_score >= threshold else None
45
 
46
  def get_genes_for_hpo(hpo_id):
 
47
  conn = sqlite3.connect(db_path)
48
  cursor = conn.cursor()
49
  cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
50
  result = cursor.fetchone()
51
  conn.close()
52
-
53
- return result[0].split(',') if result else []
54
-
55
- def hpo_mapper_ui(finding, region, threshold):
56
- if not finding:
57
- return "Please enter a clinical pathological finding.", "", ""
58
-
59
- match = find_best_hpo_match(finding, region, threshold)
60
- if match:
61
- genes = get_genes_for_hpo(match['hpo_id'])
62
- return match['hpo_id'], match['hpo_name'], ", ".join(genes)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  else:
64
- return "", "No match found.", ""
65
-
66
- image_path = "https://huggingface.co/UoS-HGIG/MIMIC/resolve/main/images/hpo.png"
67
 
 
 
 
 
 
 
 
 
 
68
  demo = gr.Interface(
69
  fn=hpo_mapper_ui,
70
  inputs=[
71
- gr.Textbox(label="Pathological Finding"),
72
- gr.Textbox(label="Anatomical Region (optional)"),
73
- gr.Slider(0.5, 1.0, 0.01, value=0.74, label="Similarity Threshold")
74
  ],
75
  outputs=[
76
  gr.Textbox(label="HPO ID"),
77
  gr.Textbox(label="HPO Term"),
78
- gr.Textbox(label="HPO-associated Genes")
 
79
  ],
80
- title="Human Phenotype Ontology (HPO) Mapper",
81
  description=(
82
- f"![Workflow](https://huggingface.co/UoS-HGIG/MIMIC/blob/main/images/hpo.png)\n\n"
83
- "Enter a pathological finding and optionally a region to map it to the closest Human Phenotype Ontology (HPO) term and retrieve associated genes.\n\n"
84
- "**Reference:**\n"
85
- "Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research\n"
86
- "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher M Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
87
- "medRxiv 2025.03.07.25323569; doi: [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n"
88
- "HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)"
89
  )
90
  )
91
 
92
  if __name__ == "__main__":
93
- demo.launch()
 
2
  import sqlite3
3
  import json
4
  import numpy as np
5
+ import subprocess # To run OntoGPT as a CLI command
6
  from numpy.linalg import norm
7
  from huggingface_hub import hf_hub_download
8
  from sentence_transformers import SentenceTransformer
9
  import os
10
 
11
+ # Get Hugging Face Token from Environment Variables
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
  if not HF_TOKEN:
14
+ raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")
15
 
16
+ # Load the Nomic-Embed Model from Hugging Face with trust_remote_code=True
17
  EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
18
  embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
19
 
20
+ # Download database from Hugging Face Datasets if not exists
21
  db_filename = "hpo_genes.db"
22
  db_repo = "UoS-HGIG/hpo_genes"
23
  db_path = os.path.join(os.getcwd(), db_filename)
 
26
  db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
27
 
def find_best_hpo_match(finding, region, threshold):
    """Find the HPO term whose stored embedding is closest to the query.

    The query text is embedded with the module-level ``embedder`` and compared
    by cosine similarity against every row of the ``hpo_embeddings`` table in
    the SQLite database at ``db_path``.

    Args:
        finding: Free-text clinical finding.
        region: Anatomical region; when empty or ``None`` only ``finding`` is
            embedded instead of producing a dangling "... in " suffix.
        threshold: Similarity the best match must strictly exceed.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the best-scoring term, or
        ``None`` when nothing scores above ``threshold``.
    """
    query_text = f"{finding} in {region}" if region else finding
    query_embedding = embedder.encode(query_text)

    # The query norm is loop-invariant; a zero vector cannot match anything.
    query_norm = norm(query_embedding)
    if query_norm == 0:
        return None

    best_match, best_score = None, -1

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")

        # Iterate the cursor directly so the whole table is never held in memory.
        for hpo_id, hpo_name, embedding_str in cursor:
            # Embeddings are stored as JSON text (they are parsed with json.loads).
            hpo_embedding = np.array(json.loads(embedding_str))
            denominator = query_norm * norm(hpo_embedding)
            if denominator == 0:
                # Skip degenerate stored vectors instead of dividing by zero.
                continue

            similarity = np.dot(query_embedding, hpo_embedding) / denominator
            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Close the connection even if a row fails to parse.
        conn.close()

    return best_match if best_score > threshold else None
 
49
 
def get_genes_for_hpo(hpo_id):
    """Return the genes associated with an HPO ID.

    Looks up the ``genes`` column of the ``hpo_gene`` table in the SQLite
    database at ``db_path``.

    Args:
        hpo_id: HPO identifier to look up.

    Returns:
        A list of gene symbols; empty when the ID is unknown or the stored
        value is NULL/empty.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
        row = cursor.fetchone()
    finally:
        # Release the connection even if the query raises.
        conn.close()

    if not row or not row[0]:
        return []

    # NOTE(review): the stored delimiter is not visible here -- the previous
    # revision split on "," and this one on ", ". Splitting on "," and
    # stripping each piece handles both layouts; confirm against the data.
    return [gene.strip() for gene in row[0].split(",") if gene.strip()]
58
+
59
def extract_with_ontogpt(finding, region, timeout=300):
    """Run the OntoGPT CLI on a finding/region sentence and return its output.

    The sentence ``"<finding> observed in <region>."`` is passed to the
    ``ontogpt extract`` command on standard input.

    Args:
        finding: Free-text clinical finding.
        region: Anatomical region the finding was observed in.
        timeout: Seconds to wait for the CLI before giving up, so a stuck
            process cannot block the caller indefinitely.

    Returns:
        The command's stripped standard output on success, otherwise a string
        starting with ``"Error running OntoGPT: "`` describing the failure.
        This function reports failures in the return value instead of raising.
    """
    input_text = f"{finding} observed in {region}."
    command = ["ontogpt", "extract", "-t", "hpo", "-m", "meta-llama/Llama-3.1-70B-Instruct"]

    try:
        result = subprocess.run(
            command,
            input=input_text,
            text=True,
            capture_output=True,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: executable missing / not runnable; SubprocessError: timeout;
        # ValueError: arguments or input that cannot be passed to the process.
        return f"Error running OntoGPT: {str(e)}"

    if result.returncode != 0:
        # A failed run must not be mistaken for an (empty) extraction result.
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        return f"Error running OntoGPT: {detail}"

    return result.stdout.strip()
74
+
75
def get_hpo_for_finding(finding, region, threshold):
    """Map a finding to its best HPO term, then attach genes and OntoGPT output.

    Returns a dict with the keys ``hpo_id``, ``hpo_term``, ``genes`` and
    ``description``. When no term clears ``threshold`` a placeholder record
    with "NA" identifiers is returned and OntoGPT is not invoked.
    """
    match = find_best_hpo_match(finding, region, threshold)

    # Nothing matched: hand back the placeholder record straight away.
    if not match:
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": [], "description": "No match found."}

    # Look up the genes first, then ask OntoGPT for its extraction.
    match["genes"] = get_genes_for_hpo(match["hpo_id"])
    match["description"] = extract_with_ontogpt(finding, region)
    return match
91
 
92
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding and region to an HPO term.

    Args:
        finding: Text entered in the "Finding" box.
        region: Text entered in the "Region" box.
        threshold: Similarity threshold chosen on the slider.

    Returns:
        A 4-tuple ``(hpo_id, hpo_term, genes, description)`` where ``genes``
        is a comma-separated string. When either text input is missing or
        blank, the first element carries a prompt and the rest are empty.
    """
    # Normalise first so whitespace-only (or None) inputs count as missing
    # instead of being embedded and sent to OntoGPT.
    finding = (finding or "").strip()
    region = (region or "").strip()
    if not finding or not region:
        return "Please enter both finding and region.", "", "", ""

    result = get_hpo_for_finding(finding, region, threshold)
    return result["hpo_id"], result["hpo_term"], ", ".join(result["genes"]), result["description"]
99
+
100
# Assemble the Gradio interface: two text inputs plus a threshold slider in,
# four text fields out (the last one carries the OntoGPT output).
_input_components = [
    gr.Textbox(label="Finding"),
    gr.Textbox(label="Region"),
    gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold"),
]

_output_components = [
    gr.Textbox(label=field_label)
    for field_label in ("HPO ID", "HPO Term", "Associated Genes", "OntoGPT Description")
]

# Markdown shown under the title, including the citation for the method.
_description_text = (
    "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
    "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
    "### Reference:\n"
    "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
    "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
    "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
)

demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=_input_components,
    outputs=_output_components,
    title="HPO Mapper with OntoGPT",
    description=_description_text,
)

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()