akadhim committed on
Commit
d60b7c4
·
verified ·
1 Parent(s): 9ec518b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -68
app.py CHANGED
@@ -6,28 +6,14 @@ from numpy.linalg import norm
6
  from huggingface_hub import hf_hub_download
7
  from sentence_transformers import SentenceTransformer
8
  import os
9
- import subprocess
10
- from huggingface_hub import login
11
 
12
-
13
- # Get Hugging Face Token from Environment Variables
14
  HF_TOKEN = os.environ.get("HF_TOKEN")
15
  if not HF_TOKEN:
16
- raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")
17
-
18
-
19
- # Set Hugging Face API key for OntoGPT
20
- subprocess.run(["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN], check=True)
21
-
22
 
23
- # Define OntoGPT model
24
- ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"
25
-
26
- # Load the Nomic-Embed Model
27
  EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
28
  embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
29
 
30
- # Download database from Hugging Face if not exists
31
  db_filename = "hpo_genes.db"
32
  db_repo = "UoS-HGIG/hpo_genes"
33
  db_path = os.path.join(os.getcwd(), db_filename)
@@ -35,94 +21,73 @@ db_path = os.path.join(os.getcwd(), db_filename)
35
  if not os.path.exists(db_path):
36
  db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
37
 
38
-
39
  def find_best_hpo_match(finding, region, threshold):
40
- """Finds the best HPO match using semantic similarity."""
41
- query_text = f"{finding} in {region}"
42
- query_embedding = embedder.encode(query_text)
43
 
44
  conn = sqlite3.connect(db_path)
45
  cursor = conn.cursor()
46
  cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
47
-
48
  best_match, best_score = None, -1
49
-
50
  for hpo_id, hpo_name, embedding_str in cursor.fetchall():
51
  hpo_embedding = np.array(json.loads(embedding_str))
52
  similarity = np.dot(query_embedding, hpo_embedding) / (norm(query_embedding) * norm(hpo_embedding))
53
-
54
  if similarity > best_score:
55
  best_score = similarity
56
- best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
57
-
58
  conn.close()
59
- return best_match if best_score > threshold else None
60
 
 
61
 
62
  def get_genes_for_hpo(hpo_id):
63
- """Retrieves associated genes for a given HPO ID."""
64
  conn = sqlite3.connect(db_path)
65
  cursor = conn.cursor()
66
  cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
67
  result = cursor.fetchone()
68
  conn.close()
69
- return result[0].split(", ") if result else []
70
 
 
71
 
72
- def get_hpo_for_finding(finding, region, threshold):
73
- """Finds the best HPO term and retrieves associated genes."""
74
- hpo_match = find_best_hpo_match(finding, region, threshold)
75
- if hpo_match:
76
- hpo_match["genes"] = get_genes_for_hpo(hpo_match["hpo_id"])
77
- else:
78
- hpo_match = {"hpo_id": "NA", "hpo_term": "NA", "genes": []}
79
- return hpo_match
80
-
81
 
82
- def run_ontogpt(finding, region):
83
- """Runs OntoGPT to extract information."""
84
- input_text = f"{finding} in {region}"
85
- result = subprocess.run([
86
- "ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", input_text
87
- ], capture_output=True, text=True)
88
- return result.stdout.strip()
89
 
 
90
 
91
- def hpo_mapper_ui(finding, region, threshold):
92
- """Function for Gradio UI to get HPO mappings and OntoGPT results."""
93
- if not finding or not region:
94
- return "Please enter both finding and region.", "", ""
95
-
96
- hpo_result = get_hpo_for_finding(finding, region, threshold)
97
- ontogpt_output = run_ontogpt(finding, region)
98
-
99
- return hpo_result["hpo_id"], hpo_result["hpo_term"], ", ".join(hpo_result["genes"]), ontogpt_output
100
-
101
- # Create Gradio UI
102
  demo = gr.Interface(
103
  fn=hpo_mapper_ui,
104
  inputs=[
105
- gr.Textbox(label="Finding"),
106
- gr.Textbox(label="Region"),
107
- gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold")
108
  ],
109
  outputs=[
110
  gr.Textbox(label="HPO ID"),
111
  gr.Textbox(label="HPO Term"),
112
- gr.Textbox(label="Associated Genes"),
113
- gr.Textbox(label="OntoGPT Output")
114
  ],
115
- title="HPO Mapper with OntoGPT",
116
  description=(
117
- "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
118
- "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
119
- "### Reference:\n"
120
- "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
121
- "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
122
- "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
 
123
  )
124
-
125
  )
126
 
127
  if __name__ == "__main__":
128
- demo.launch()
 
6
  from huggingface_hub import hf_hub_download
7
  from sentence_transformers import SentenceTransformer
8
  import os
 
 
9
 
 
 
10
  HF_TOKEN = os.environ.get("HF_TOKEN")
11
  if not HF_TOKEN:
12
+ raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable.")
 
 
 
 
 
13
 
 
 
 
 
14
  EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
15
  embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
16
 
 
17
  db_filename = "hpo_genes.db"
18
  db_repo = "UoS-HGIG/hpo_genes"
19
  db_path = os.path.join(os.getcwd(), db_filename)
 
21
  if not os.path.exists(db_path):
22
  db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
23
 
 
24
  def find_best_hpo_match(finding, region, threshold):
25
+ query = f"{finding} {region}" if region else finding
26
+ query_embedding = embedder.encode(query)
 
27
 
28
  conn = sqlite3.connect(db_path)
29
  cursor = conn.cursor()
30
  cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
31
+
32
  best_match, best_score = None, -1
33
+
34
  for hpo_id, hpo_name, embedding_str in cursor.fetchall():
35
  hpo_embedding = np.array(json.loads(embedding_str))
36
  similarity = np.dot(query_embedding, hpo_embedding) / (norm(query_embedding) * norm(hpo_embedding))
37
+
38
  if similarity > best_score:
39
  best_score = similarity
40
+ best_match = {"hpo_id": hpo_id, "hpo_name": hpo_name}
41
+
42
  conn.close()
 
43
 
44
+ return best_match if best_score >= threshold else None
45
 
46
  def get_genes_for_hpo(hpo_id):
 
47
  conn = sqlite3.connect(db_path)
48
  cursor = conn.cursor()
49
  cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
50
  result = cursor.fetchone()
51
  conn.close()
 
52
 
53
+ return result[0].split(',') if result else []
54
 
55
def hpo_mapper_ui(finding, region, threshold):
    """Map a finding (and optional region) to an HPO term for the UI.

    Args:
        finding: Text of the pathological finding; required.
        region: Optional anatomical region text; may be empty or ``None``.
        threshold: Minimum similarity passed on to ``find_best_hpo_match``.

    Returns:
        A 3-tuple of strings ``(hpo_id, hpo_term, genes)``:

        * a prompt message and two empty strings when no finding is given,
        * ``("", "No match found.", "")`` when nothing reaches the threshold,
        * otherwise the matched id, its name, and the genes joined by ", ".
    """
    # Normalise the inputs so that whitespace-only text counts as empty and
    # stray spaces do not end up in the query that gets embedded.
    finding = (finding or "").strip()
    region = (region or "").strip()

    if not finding:
        return "Please enter a clinical pathological finding.", "", ""

    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        return "", "No match found.", ""

    genes = get_genes_for_hpo(match["hpo_id"])
    return match["hpo_id"], match["hpo_name"], ", ".join(genes)
 
65
 
66
# Direct ("resolve") URL of the workflow figure shown at the top of the
# description. The "/blob/" form of this URL is the Hub's file-viewer page
# rather than the image file, so it cannot be embedded as an image.
image_path = "https://huggingface.co/UoS-HGIG/MIMIC/resolve/main/images/hpo.png"

# Gradio interface: two text inputs and a similarity-threshold slider feed
# hpo_mapper_ui, whose three return values fill the three output textboxes.
demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=[
        gr.Textbox(label="Pathological Finding"),
        gr.Textbox(label="Anatomical Region (optional)"),
        # Keyword arguments on purpose: the third positional parameter of
        # gr.Slider is `value`, not `step`, so passing 0.01 positionally
        # together with value=0.74 supplies `value` twice.
        gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Similarity Threshold"),
    ],
    outputs=[
        gr.Textbox(label="HPO ID"),
        gr.Textbox(label="HPO Term"),
        gr.Textbox(label="HPO-associated Genes"),
    ],
    title="Human Phenotype Ontology (HPO) Mapper",
    description=(
        f"![Workflow]({image_path})\n\n"
        "Enter a pathological finding and optionally a region to map it to the closest Human Phenotype Ontology (HPO) term and retrieve associated genes.\n\n"
        "**Reference:**\n"
        "Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research\n"
        "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher M Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
        "medRxiv 2025.03.07.25323569; doi: [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n"
        "HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)"
    ),
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()