File size: 4,674 Bytes
b71fded
 
 
 
 
 
f5a3220
b71fded
a2e895f
e8e9ee3
f201229
b71fded
5aebf41
e8e9ee3
 
d6e0d2b
a2e895f
d6e0d2b
a2e895f
 
 
f201229
a2e895f
021035e
a2e895f
 
c3ec480
5ad5796
2ce8d11
a2e895f
c3ec480
 
b71fded
 
 
c3ec480
b71fded
a2e895f
f28a058
5aebf41
 
 
b71fded
 
 
c3ec480
5aebf41
b71fded
5aebf41
b71fded
 
 
5aebf41
b71fded
 
5aebf41
 
b71fded
a2e895f
 
b71fded
 
5aebf41
b71fded
 
c3ec480
b71fded
 
5aebf41
 
 
 
a2e895f
5aebf41
 
a2e895f
b71fded
a2e895f
5aebf41
c10c141
a2e895f
 
 
 
 
 
 
 
 
 
5aebf41
a2e895f
5aebf41
a2e895f
 
 
 
5aebf41
a2e895f
5aebf41
 
b71fded
 
f28a058
5aebf41
 
 
f28a058
b71fded
c3ec480
 
5aebf41
a2e895f
b71fded
5aebf41
f28a058
5aebf41
 
 
 
 
 
f0f8dd4
a2e895f
b71fded
 
c3ec480
5aebf41
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import gradio as gr
import sqlite3
import json
import numpy as np
from numpy.linalg import norm
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
import os
import subprocess
from huggingface_hub import login


# Get Hugging Face token from environment variables.
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

if not HF_TOKEN:
    # BUG FIX: the message previously told users to set HF_TOKEN, but the code
    # actually reads the HUGGINGFACE_API_KEY environment variable.
    raise ValueError(
        "Missing Hugging Face API token. Please set HUGGINGFACE_API_KEY as an environment variable."
    )

# Register the key with OAK so the OntoGPT CLI can authenticate against Hugging Face.
subprocess.run(["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN], check=True)


# Model passed to the `ontogpt complete` CLI (see run_ontogpt).
ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"

# Sentence-transformer used to embed "<finding> in <region>" query strings.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

# Download the HPO/gene SQLite database from the Hub unless a local copy exists.
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)

if not os.path.exists(db_path):
    # NOTE(review): `use_auth_token` is deprecated in recent huggingface_hub
    # releases in favour of `token=` — confirm against the pinned version.
    db_path = hf_hub_download(repo_id=db_repo, filename=db_filename, repo_type="dataset", use_auth_token=HF_TOKEN)


def find_best_hpo_match(finding, region, threshold):
    """Find the best-matching HPO term for a finding/region pair.

    Embeds the query "<finding> in <region>" and scans every stored HPO
    embedding in the `hpo_embeddings` table, scoring by cosine similarity.

    Args:
        finding: Clinical finding text.
        region: Anatomical region text.
        threshold: Minimum cosine similarity for a match to be accepted.

    Returns:
        dict with keys "hpo_id" and "hpo_term" for the best-scoring term,
        or None if no candidate scores above `threshold`.
    """
    query_text = f"{finding} in {region}"
    query_embedding = embedder.encode(query_text)
    # Hoisted out of the loop: the query norm is loop-invariant and was
    # previously recomputed once per database row.
    query_norm = norm(query_embedding)

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")

        best_match, best_score = None, -1

        for hpo_id, hpo_name, embedding_str in cursor.fetchall():
            # Embeddings are stored as JSON-encoded lists of floats.
            hpo_embedding = np.array(json.loads(embedding_str))
            # Cosine similarity between the query and the stored embedding.
            similarity = np.dot(query_embedding, hpo_embedding) / (query_norm * norm(hpo_embedding))

            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Close even if decoding/scoring raises (connection was leaked before).
        conn.close()
    return best_match if best_score > threshold else None


def get_genes_for_hpo(hpo_id):
    """Return the gene symbols associated with *hpo_id* (empty list if none)."""
    connection = sqlite3.connect(db_path)
    cur = connection.cursor()
    cur.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
    row = cur.fetchone()
    connection.close()
    # Genes are stored as a single comma-separated string per HPO ID.
    if row:
        return row[0].split(", ")
    return []


def get_hpo_for_finding(finding, region, threshold):
    """Map a finding/region pair to its best HPO term plus associated genes.

    Returns an "NA" placeholder record when no term clears `threshold`.
    """
    match = find_best_hpo_match(finding, region, threshold)
    if match is None:
        # No term scored above the threshold — return an empty placeholder.
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": []}
    match["genes"] = get_genes_for_hpo(match["hpo_id"])
    return match


def run_ontogpt(finding, region):
    """Run the OntoGPT CLI to generate an ontology-based completion.

    Args:
        finding: Clinical finding text.
        region: Anatomical region text.

    Returns:
        OntoGPT's stdout (stripped) on success, or a short error message
        containing stderr when the CLI exits non-zero.
    """
    input_text = f"{finding} in {region}"
    result = subprocess.run(
        ["ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", input_text],
        capture_output=True,
        text=True,
    )
    # A failed run previously yielded "" silently; surface the error instead
    # so the UI shows why no output was produced.
    if result.returncode != 0:
        return f"OntoGPT error: {result.stderr.strip()}"
    return result.stdout.strip()


def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback mapping UI inputs to the four output textboxes.

    Args:
        finding: Clinical finding text from the UI.
        region: Anatomical region text from the UI.
        threshold: Similarity threshold from the UI slider.

    Returns:
        A 4-tuple of strings: (HPO ID, HPO term, comma-joined genes,
        OntoGPT output) — matching the interface's four declared outputs.
    """
    if not finding or not region:
        # BUG FIX: the early return previously yielded only 3 values while
        # the Gradio interface declares 4 outputs, breaking the UI whenever
        # either input was left empty.
        return "Please enter both finding and region.", "", "", ""

    hpo_result = get_hpo_for_finding(finding, region, threshold)
    ontogpt_output = run_ontogpt(finding, region)

    return hpo_result["hpo_id"], hpo_result["hpo_term"], ", ".join(hpo_result["genes"]), ontogpt_output

# Create Gradio UI
# NOTE: four outputs are declared, so the callback must always return a
# 4-tuple — including on its early-return validation path.
demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=[
        gr.Textbox(label="Finding"),
        gr.Textbox(label="Region"),
        # Cosine-similarity cutoff passed through to find_best_hpo_match.
        gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold")
    ],
    outputs=[
        gr.Textbox(label="HPO ID"),
        gr.Textbox(label="HPO Term"),
        gr.Textbox(label="Associated Genes"),
        gr.Textbox(label="OntoGPT Output")
    ],
    title="HPO Mapper with OntoGPT",
    description=(
        "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
        "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
        "### Reference:\n"
        "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
        "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
        "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
    )

)

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()