"""Gradio app mapping a clinical finding to an HPO term, genes and OntoGPT text.

Importing this module has side effects: it reads the Hugging Face token from
the environment, registers it with ``runoak``, loads the sentence-embedding
model and, if ``hpo_genes.db`` is not in the current working directory,
downloads it from the Hugging Face Hub.
"""

import json
import os
import sqlite3
import subprocess
from contextlib import closing

import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download
from huggingface_hub import login
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer

# Get Hugging Face token from environment variables. HUGGINGFACE_API_KEY is
# the original variable; HF_TOKEN is accepted as a fallback because the error
# message historically told users to set that name.
HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY") or os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "Missing Hugging Face API token. Please set HUGGINGFACE_API_KEY "
        "(or HF_TOKEN) as an environment variable."
    )

# Set Hugging Face API key for OntoGPT
subprocess.run(
    ["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN],
    check=True,
)

# Define OntoGPT model
ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"

# Load the Nomic-Embed model
EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

# Download database from Hugging Face if it does not exist locally
db_filename = "hpo_genes.db"
db_repo = "UoS-HGIG/hpo_genes"
db_path = os.path.join(os.getcwd(), db_filename)
if not os.path.exists(db_path):
    # `token` is the current keyword for authentication; `use_auth_token`
    # is the deprecated spelling of the same argument.
    db_path = hf_hub_download(
        repo_id=db_repo,
        filename=db_filename,
        repo_type="dataset",
        token=HF_TOKEN,
    )


def find_best_hpo_match(finding, region, threshold):
    """Find the best HPO match using cosine similarity of embeddings.

    The query text ``"<finding> in <region>"`` is embedded and compared with
    every row of the ``hpo_embeddings`` table, whose ``embedding`` column is
    parsed as JSON.

    Args:
        finding: Clinical finding text.
        region: Anatomical region text.
        threshold: Similarity the best match must strictly exceed.

    Returns:
        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
        ``None`` when no row scores above ``threshold``.
    """
    query_text = f"{finding} in {region}"
    query_embedding = embedder.encode(query_text)

    # Loop-invariant: compute the query norm once rather than per row.
    query_norm = norm(query_embedding)
    if query_norm == 0:
        # Cosine similarity is undefined for a zero vector; nothing can match.
        return None

    best_match, best_score = None, -1
    # closing() guarantees the connection is released even if a row fails
    # to parse or the similarity computation raises.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute(
            "SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings"
        )
        # Iterate the cursor directly instead of materialising every row.
        for hpo_id, hpo_name, embedding_str in rows:
            hpo_embedding = np.array(json.loads(embedding_str))
            hpo_norm = norm(hpo_embedding)
            if hpo_norm == 0:
                # Skip degenerate rows instead of dividing by zero.
                continue
            similarity = np.dot(query_embedding, hpo_embedding) / (
                query_norm * hpo_norm
            )
            if similarity > best_score:
                best_score = similarity
                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}

    return best_match if best_score > threshold else None


def get_genes_for_hpo(hpo_id):
    """Retrieve the genes associated with an HPO ID.

    Reads the ``genes`` column of the ``hpo_gene`` table, which is treated as
    a comma-separated list.

    Args:
        hpo_id: HPO identifier to look up.

    Returns:
        List of gene names; empty when the ID is unknown or the stored value
        is NULL or blank.
    """
    with closing(sqlite3.connect(db_path)) as conn:
        row = conn.execute(
            "SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,)
        ).fetchone()

    if row is None or not row[0]:
        return []
    # Split on "," and strip so that both ", " and "," separators work and
    # blank entries are never returned.
    return [gene.strip() for gene in row[0].split(",") if gene.strip()]


def get_hpo_for_finding(finding, region, threshold):
    """Find the best HPO term and attach its associated genes.

    Returns:
        Dict with keys ``hpo_id``, ``hpo_term`` and ``genes``. When no term
        scores above ``threshold`` the ID and term are ``"NA"`` and ``genes``
        is an empty list.
    """
    hpo_match = find_best_hpo_match(finding, region, threshold)
    if not hpo_match:
        return {"hpo_id": "NA", "hpo_term": "NA", "genes": []}
    hpo_match["genes"] = get_genes_for_hpo(hpo_match["hpo_id"])
    return hpo_match


def run_ontogpt(finding, region):
    """Run ``ontogpt complete`` on ``"<finding> in <region>"``.

    Returns:
        The command's stripped standard output on success. If the executable
        cannot be started or exits with a non-zero status, a human-readable
        error message is returned instead so the caller can still display
        the other results.
    """
    input_text = f"{finding} in {region}"
    command = ["ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", input_text]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the executable is missing or cannot be launched.
        return f"OntoGPT could not be started: {exc}"

    if result.returncode != 0:
        details = result.stderr.strip() or result.stdout.strip()
        return f"OntoGPT failed (exit code {result.returncode}): {details}"
    return result.stdout.strip()


def hpo_mapper_ui(finding, region, threshold):
    """Gradio callback returning HPO ID, HPO term, genes and OntoGPT output.

    Always returns exactly four strings, matching the four output components
    of the interface.
    """
    finding = (finding or "").strip()
    region = (region or "").strip()
    if not finding or not region:
        # Four values are required: one per declared output component.
        return "Please enter both finding and region.", "", "", ""

    hpo_result = get_hpo_for_finding(finding, region, threshold)
    ontogpt_output = run_ontogpt(finding, region)
    return (
        hpo_result["hpo_id"],
        hpo_result["hpo_term"],
        ", ".join(hpo_result["genes"]),
        ontogpt_output,
    )


# Create Gradio UI
demo = gr.Interface(
    fn=hpo_mapper_ui,
    inputs=[
        gr.Textbox(label="Finding"),
        gr.Textbox(label="Region"),
        gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold"),
    ],
    outputs=[
        gr.Textbox(label="HPO ID"),
        gr.Textbox(label="HPO Term"),
        gr.Textbox(label="Associated Genes"),
        gr.Textbox(label="OntoGPT Output"),
    ],
    title="HPO Mapper with OntoGPT",
    description=(
        "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
        "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
        "### Reference:\n"
        "**Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research**\n"
        "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, "
        "Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
        "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
    ),
)

if __name__ == "__main__":
    demo.launch()