Spaces:
Running
Running
File size: 4,674 Bytes
b71fded f5a3220 b71fded a2e895f e8e9ee3 f201229 b71fded 5aebf41 e8e9ee3 d6e0d2b a2e895f d6e0d2b a2e895f f201229 a2e895f 021035e a2e895f c3ec480 5ad5796 2ce8d11 a2e895f c3ec480 b71fded c3ec480 b71fded a2e895f f28a058 5aebf41 b71fded c3ec480 5aebf41 b71fded 5aebf41 b71fded 5aebf41 b71fded 5aebf41 b71fded a2e895f b71fded 5aebf41 b71fded c3ec480 b71fded 5aebf41 a2e895f 5aebf41 a2e895f b71fded a2e895f 5aebf41 c10c141 a2e895f 5aebf41 a2e895f 5aebf41 a2e895f 5aebf41 a2e895f 5aebf41 b71fded f28a058 5aebf41 f28a058 b71fded c3ec480 5aebf41 a2e895f b71fded 5aebf41 f28a058 5aebf41 f0f8dd4 a2e895f b71fded c3ec480 5aebf41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import gradio as gr
import sqlite3
import json
import numpy as np
from numpy.linalg import norm
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer
import os
import subprocess
from huggingface_hub import login
# Read the Hugging Face token from the environment; the OntoGPT key setup and
# the dataset download below both use it.
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")
if not HF_TOKEN:
    # The message names the variable that is actually read above, so the
    # instruction can be followed as written.
    raise ValueError(
        "Missing Hugging Face API token. "
        "Please set HUGGINGFACE_API_KEY as an environment variable."
    )

# Register the token with the `runoak` CLI under the "huggingface-key" endpoint.
# check=True raises subprocess.CalledProcessError if the command exits non-zero.
# NOTE(review): the token is passed as a command-line argument, which may be
# visible in process listings on shared hosts - confirm this is acceptable.
subprocess.run(["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN], check=True)

# Model identifier passed to `ontogpt complete -m`.
ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"

# Sentence-embedding model used to encode the free-text query.
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

# Use a local copy of the SQLite database when one exists in the working
# directory; otherwise download it from the Hugging Face dataset repository.
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)
if not os.path.exists(db_path):
    # `token` replaces the deprecated `use_auth_token` keyword.
    db_path = hf_hub_download(
        repo_id=db_repo,
        filename=db_filename,
        repo_type="dataset",
        token=HF_TOKEN,
    )
def find_best_hpo_match(finding, region, threshold):
    """Find the HPO term whose stored embedding is most similar to the query.

    The query text is ``"{finding} in {region}"``, encoded with the
    module-level ``embedder``. Every row of the ``hpo_embeddings`` table in
    the database at ``db_path`` is scored by cosine similarity against it.

    Args:
        finding: Free-text clinical finding.
        region: Free-text anatomical region.
        threshold: Minimum cosine similarity the best row must exceed.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
        ``None`` when no row scores strictly above ``threshold``.
    """
    query_text = f"{finding} in {region}"
    query_embedding = embedder.encode(query_text)
    # Loop-invariant: the query norm does not change from row to row.
    query_norm = norm(query_embedding)

    best_match, best_score = None, -1
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")
        # Iterate the cursor directly so rows are streamed instead of being
        # materialised all at once with fetchall().
        for hpo_id, hpo_name, embedding_str in cursor:
            # Embeddings are stored as JSON text.
            hpo_embedding = np.array(json.loads(embedding_str))
            denominator = query_norm * norm(hpo_embedding)
            if denominator == 0:
                # Cosine similarity is undefined for a zero-length vector;
                # skip the row rather than dividing by zero.
                continue
            similarity = np.dot(query_embedding, hpo_embedding) / denominator
            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
    finally:
        # Close the connection even if parsing or scoring a row raises.
        conn.close()

    return best_match if best_score > threshold else None
def get_genes_for_hpo(hpo_id):
    """Return the gene symbols stored for an HPO ID.

    Looks up the ``genes`` column of the ``hpo_gene`` table in the database
    at ``db_path``. The column is treated as a comma-separated list.

    Args:
        hpo_id: HPO identifier to look up.

    Returns:
        A list of gene symbols with surrounding whitespace removed. The list
        is empty when the ID is absent or its ``genes`` value is NULL or blank.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
        row = cursor.fetchone()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    # A missing row, a NULL column and an empty string all mean "no genes".
    if not row or not row[0]:
        return []
    # Split on the comma and strip, so ", " and "," separators both work and
    # blank fragments never reach the caller.
    return [gene.strip() for gene in row[0].split(",") if gene.strip()]
def get_hpo_for_finding(finding, region, threshold):
    """Map a finding/region pair to an HPO term plus its associated genes.

    Delegates the term lookup to ``find_best_hpo_match``. When a term is
    found, its genes are fetched with ``get_genes_for_hpo`` and stored under
    the ``"genes"`` key of the same dict.

    Returns:
        A dict with keys ``"hpo_id"``, ``"hpo_term"`` and ``"genes"``. When no
        term matches, the two string fields are ``"NA"`` and ``"genes"`` is an
        empty list.
    """
    match = find_best_hpo_match(finding, region, threshold)
    if not match:
        # Placeholder record so callers can always index the same keys.
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": []}

    match["genes"] = get_genes_for_hpo(match["hpo_id"])
    return match
def run_ontogpt(finding, region):
    """Run ``ontogpt complete`` on the query text and return its output.

    The prompt is ``"{finding} in {region}"`` and the model is the
    module-level ``ONTOGPT_MODEL``.

    Args:
        finding: Free-text clinical finding.
        region: Free-text anatomical region.

    Returns:
        The command's stripped standard output on success. If the executable
        cannot be started or exits non-zero, a human-readable error message is
        returned instead, so the failure is visible to the caller rather than
        surfacing as an empty string or an unhandled exception.
    """
    input_text = f"{finding} in {region}"
    command = ["ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", input_text]
    try:
        # Argument-list form (no shell), so the user text is passed verbatim.
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the executable is missing or cannot be launched.
        return f"OntoGPT could not be started: {exc}"

    if result.returncode != 0:
        details = result.stderr.strip() or "no error output"
        return f"OntoGPT failed (exit code {result.returncode}): {details}"
    return result.stdout.strip()
def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback: map a finding/region to HPO data and OntoGPT output.

    Args:
        finding: Text from the "Finding" box.
        region: Text from the "Region" box.
        threshold: Similarity threshold forwarded to ``get_hpo_for_finding``.

    Returns:
        A 4-tuple ``(hpo_id, hpo_term, genes, ontogpt_output)`` where ``genes``
        is a ", "-joined string. When either text input is missing or blank,
        the first element carries a validation message and the remaining three
        are empty strings.
    """
    finding = (finding or "").strip()
    region = (region or "").strip()
    if not finding or not region:
        # The validation branch must return the same number of values as the
        # success branch below (four), one per declared output component.
        return "Please enter both finding and region.", "", "", ""

    hpo_result = get_hpo_for_finding(finding, region, threshold)
    ontogpt_output = run_ontogpt(finding, region)
    return (
        hpo_result["hpo_id"],
        hpo_result["hpo_term"],
        ", ".join(hpo_result["genes"]),
        ontogpt_output,
    )
# Assemble the Gradio interface: two text inputs plus a threshold slider feed
# hpo_mapper_ui, and its results are shown in four text boxes.
_input_widgets = [
    gr.Textbox(label="Finding"),
    gr.Textbox(label="Region"),
    gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold"),
]

_output_widgets = [
    gr.Textbox(label="HPO ID"),
    gr.Textbox(label="HPO Term"),
    gr.Textbox(label="Associated Genes"),
    gr.Textbox(label="OntoGPT Output"),
]

# Markdown text shown under the title, including the paper reference.
_description = (
    "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
    "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
    "### Reference:\n"
    "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
    "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
    "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
)

demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="HPO Mapper with OntoGPT",
    description=_description,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|