Spaces:

UoS-HGIG
/

HPO_Mapper

Running

App Files Files Community

parsboy1987 committed on Mar 12

Commit

5aebf41

verified ·

1 Parent(s): 106bedc

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -37

app.py CHANGED Viewed

@@ -2,18 +2,22 @@ import gradio as gr
 import sqlite3
 import json
 import numpy as np
 from numpy.linalg import norm
 from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 import os
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
-    raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable.")
 EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
 embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
 db_filename = "hpo_genes.db"
 db_repo = "UoS-HGIG/hpo_genes"
 db_path = os.path.join(os.getcwd(), db_filename)
@@ -22,72 +26,101 @@ if not os.path.exists(db_path):
     db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
 def find_best_hpo_match(finding, region, threshold):
-    query = f"{finding} {region}" if region else finding
-    query_embedding = embedder.encode(query)
     conn = sqlite3.connect(db_path)
     cursor = conn.cursor()
     cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
     best_match, best_score = None, -1
     for hpo_id, hpo_name, embedding_str in cursor.fetchall():
         hpo_embedding = np.array(json.loads(embedding_str))
         similarity = np.dot(query_embedding, hpo_embedding) / (norm(query_embedding) * norm(hpo_embedding))
         if similarity > best_score:
             best_score = similarity
-            best_match = {"hpo_id": hpo_id, "hpo_name": hpo_name}
     conn.close()
-    return best_match if best_score >= threshold else None
 def get_genes_for_hpo(hpo_id):
     conn = sqlite3.connect(db_path)
     cursor = conn.cursor()
     cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
     result = cursor.fetchone()
     conn.close()
-    return result[0].split(',') if result else []
-def hpo_mapper_ui(finding, region, threshold):
-    if not finding:
-        return "Please enter a clinical pathological finding.", "", ""
-    match = find_best_hpo_match(finding, region, threshold)
-    if match:
-        genes = get_genes_for_hpo(match['hpo_id'])
-        return match['hpo_id'], match['hpo_name'], ", ".join(genes)
     else:
-        return "", "No match found.", ""
-image_path = "https://huggingface.co/UoS-HGIG/MIMIC/resolve/main/images/hpo.png"
 demo = gr.Interface(
     fn=hpo_mapper_ui,
     inputs=[
-        gr.Textbox(label="Pathological Finding"),
-        gr.Textbox(label="Anatomical Region (optional)"),
-        gr.Slider(0.5, 1.0, 0.01, value=0.74, label="Similarity Threshold")
     ],
     outputs=[
         gr.Textbox(label="HPO ID"),
         gr.Textbox(label="HPO Term"),
-        gr.Textbox(label="HPO-associated Genes")
     ],
-    title="Human Phenotype Ontology (HPO) Mapper",
     description=(
-        f"![Workflow](https://huggingface.co/UoS-HGIG/MIMIC/blob/main/images/hpo.png)\n\n"
-        "Enter a pathological finding and optionally a region to map it to the closest Human Phenotype Ontology (HPO) term and retrieve associated genes.\n\n"
-        "**Reference:**\n"
-        "Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research\n"
-        "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher M Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
-        "medRxiv 2025.03.07.25323569; doi: [https://doi.org/10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)\n"
-        "HPO to gene mappings obtained from [Jax](https://hpo.jax.org/data/annotations)"
     )
 )
 if __name__ == "__main__":
-    demo.launch()

 import sqlite3
 import json
 import numpy as np
+import subprocess  # To run OntoGPT as a CLI command
 from numpy.linalg import norm
 from huggingface_hub import hf_hub_download
 from sentence_transformers import SentenceTransformer
 import os
+# Get Hugging Face Token from Environment Variables
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
+    raise ValueError("Missing Hugging Face API token. Please set HF_TOKEN as an environment variable in Hugging Face Secrets.")
+# Load the Nomic-Embed Model from Hugging Face with trust_remote_code=True
 EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
 embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)
+# Download database from Hugging Face Datasets if not exists
 db_filename = "hpo_genes.db"
 db_repo = "UoS-HGIG/hpo_genes"
 db_path = os.path.join(os.getcwd(), db_filename)
     db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)
 def find_best_hpo_match(finding, region, threshold):
     """Return the stored HPO term most similar to the query, or ``None``.

     The text ``"{finding} in {region}"`` is embedded with the module-level
     ``embedder`` and compared, by cosine similarity, against every row of the
     ``hpo_embeddings`` table in the SQLite file at ``db_path``. The
     ``embedding`` column is decoded with ``json.loads``.

     Args:
         finding: Finding text placed first in the query.
         region: Region text placed after " in " in the query.
         threshold: Score the best candidate must strictly exceed.

     Returns:
         ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
         ``None`` when no row scores above ``threshold``.
     """
     query_text = f"{finding} in {region}"
     query_embedding = embedder.encode(query_text)
     # Loop-invariant: compute the query norm once instead of once per row.
     query_norm = norm(query_embedding)
     if query_norm == 0:
         # Cosine similarity is undefined for a zero vector, so nothing can match.
         return None
     best_match, best_score = None, -1
     conn = sqlite3.connect(db_path)
     try:
         cursor = conn.cursor()
         cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
         # Iterate the cursor directly so rows are streamed rather than all
         # held in memory at once.
         for hpo_id, hpo_name, embedding_str in cursor:
             hpo_embedding = np.array(json.loads(embedding_str))
             denominator = query_norm * norm(hpo_embedding)
             if denominator == 0:
                 # A zero stored vector would only produce NaN; skip it.
                 continue
             similarity = np.dot(query_embedding, hpo_embedding) / denominator
             if similarity > best_score:
                 best_score = similarity
                 best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
     finally:
         # Close the connection even if decoding or the similarity maths raises.
         conn.close()
     return best_match if best_score > threshold else None
 def get_genes_for_hpo(hpo_id):
     """Return the genes stored for ``hpo_id`` as a list of strings.

     Reads the ``genes`` column of the first ``hpo_gene`` row whose ``hpo_id``
     matches, from the SQLite file at ``db_path``. The value is treated as a
     comma-separated list; surrounding whitespace is trimmed and empty entries
     are dropped, so both ``"A,B"`` and ``"A, B"`` give ``["A", "B"]``.

     Args:
         hpo_id: Identifier to look up in ``hpo_gene``.

     Returns:
         A list of gene strings, or an empty list when there is no matching
         row or the stored value is NULL or empty.
     """
     conn = sqlite3.connect(db_path)
     try:
         cursor = conn.cursor()
         cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
         row = cursor.fetchone()
     finally:
         # Release the connection even when the query fails.
         conn.close()
     if not row or not row[0]:
         return []
     return [gene.strip() for gene in row[0].split(",") if gene.strip()]
def extract_with_ontogpt(finding, region, timeout=300):
    """Run the OntoGPT CLI on a one-sentence description and return its output.

    The sentence ``"{finding} observed in {region}."`` is passed as standard
    input to ``ontogpt extract -t hpo -m meta-llama/Llama-3.1-70B-Instruct``.
    NOTE(review): confirm that this OntoGPT version reads its text from stdin
    and accepts these flags; the invocation is kept exactly as it was.

    Args:
        finding: Finding text placed first in the sentence.
        region: Region text placed after "observed in".
        timeout: Seconds to wait for the subprocess before giving up. The
            default is an arbitrary upper bound so one stuck call cannot block
            the caller indefinitely; pass ``None`` to wait without limit.

    Returns:
        The command's stripped standard output on success. On any failure
        (command missing, timeout, non-zero exit status) a string beginning
        with ``"Error running OntoGPT: "`` is returned instead of raising.
    """
    input_text = f"{finding} observed in {region}."
    command = ["ontogpt", "extract", "-t", "hpo", "-m", "meta-llama/Llama-3.1-70B-Instruct"]
    try:
        result = subprocess.run(
            command,
            input=input_text,
            text=True,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return f"Error running OntoGPT: timed out after {timeout} seconds"
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        return f"Error running OntoGPT: {str(e)}"
    if result.returncode != 0:
        # A failed run must not be presented as an extraction result; surface
        # what the tool printed on stderr, or the exit status if it was silent.
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        return f"Error running OntoGPT: {detail}"
    return result.stdout.strip()
def get_hpo_for_finding(finding, region, threshold):
    """Look up the best HPO term and attach its genes and OntoGPT output.

    Returns a dict with the keys ``hpo_id``, ``hpo_term``, ``genes`` and
    ``description``. When ``find_best_hpo_match`` yields nothing, a placeholder
    record is returned and neither the gene lookup nor OntoGPT is run.
    """
    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": [], "description": "No match found."}

    # Gene lookup first, then the OntoGPT call, both stored on the match record.
    match["genes"] = get_genes_for_hpo(match["hpo_id"])
    match["description"] = extract_with_ontogpt(finding, region)
    return match
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding and region to an HPO term.

    Both text inputs are trimmed first, so blank or whitespace-only values are
    rejected here instead of being sent on to the lookup.

    Args:
        finding: Text from the "Finding" box (may be ``None`` or blank).
        region: Text from the "Region" box (may be ``None`` or blank).
        threshold: Similarity threshold forwarded to the lookup.

    Returns:
        A 4-tuple ``(hpo_id, hpo_term, genes, description)`` where ``genes``
        is a ", "-joined string. When either input is missing, the first
        element carries the prompt message and the rest are empty strings.
    """
    finding = (finding or "").strip()
    region = (region or "").strip()
    if not finding or not region:
        return "Please enter both finding and region.", "", "", ""
    result = get_hpo_for_finding(finding, region, threshold)
    return result["hpo_id"], result["hpo_term"], ", ".join(result["genes"]), result["description"]
# Build the Gradio interface: three inputs, four text outputs.
_input_widgets = [
    gr.Textbox(label="Finding"),
    gr.Textbox(label="Region"),
    gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold"),
]
_output_widgets = [
    gr.Textbox(label="HPO ID"),
    gr.Textbox(label="HPO Term"),
    gr.Textbox(label="Associated Genes"),
    # Carries the text returned by the OntoGPT step.
    gr.Textbox(label="OntoGPT Description"),
]
_page_description = (
    "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
    "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
    "### Reference:\n"
    "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
    "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
    "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
)
demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="HPO Mapper with OntoGPT",
    description=_page_description,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()