Spaces:

UoS-HGIG
/

HPO_Mapper

Running

App Files Files Community

HPO_Mapper / app.py

parsboy1987

Update app.py

e8e9ee3 verified 5 months ago

raw

history blame

4.67 kB

	import gradio as gr
	import sqlite3
	import json
	import numpy as np
	from numpy.linalg import norm
	from huggingface_hub import hf_hub_download
	from sentence_transformers import SentenceTransformer
	import os
	import subprocess
	from huggingface_hub import login


	# Read the Hugging Face token from the HUGGINGFACE_API_KEY environment variable.
	HF_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

	if not HF_TOKEN:
	    # The message names the variable that is actually read above, so the
	    # instruction it gives matches what the code checks.
	    raise ValueError(
	        "Missing Hugging Face API token. "
	        "Please set HUGGINGFACE_API_KEY as an environment variable."
	    )

	# Set Hugging Face API key for OntoGPT via `runoak set-apikey`.
	# check=True makes startup fail if the command exits with a non-zero status.
	subprocess.run(["runoak", "set-apikey", "-e", "huggingface-key", HF_TOKEN], check=True)


	# Model identifier passed to the `ontogpt` command line via `-m`.
	ONTOGPT_MODEL = "huggingface/WizardLM/WizardCoder-Python-34B-V1.0"

	# Load the Nomic-Embed model used to embed query text.
	EMBEDDING_MODEL = "nomic-ai/nomic-embed-text-v1.5"
	embedder = SentenceTransformer(EMBEDDING_MODEL, trust_remote_code=True)

	# Use a local copy of the database in the working directory when present;
	# otherwise download it from the Hugging Face dataset repository.
	db_filename = "hpo_genes.db"
	db_repo = "UoS-HGIG/hpo_genes"
	db_path = os.path.join(os.getcwd(), db_filename)

	if not os.path.exists(db_path):
	    # `token` replaces the deprecated `use_auth_token` keyword.
	    # hf_hub_download returns the path of the downloaded file.
	    db_path = hf_hub_download(
	        repo_id=db_repo,
	        filename=db_filename,
	        repo_type="dataset",
	        token=HF_TOKEN,
	    )


	def find_best_hpo_match(finding, region, threshold):
	    """Find the HPO term whose stored embedding best matches the query.

	    The query text ``"{finding} in {region}"`` is embedded with the
	    module-level ``embedder`` and compared, by cosine similarity, against
	    every row of the ``hpo_embeddings`` table in the database at ``db_path``.
	    Stored embeddings are read as JSON-encoded arrays.

	    Args:
	        finding: Finding text inserted into the query.
	        region: Region text inserted into the query.
	        threshold: Similarity the best row must strictly exceed.

	    Returns:
	        ``{"hpo_id": ..., "hpo_term": ...}`` for the highest-scoring row, or
	        ``None`` when the best score does not exceed ``threshold``.
	    """
	    query_embedding = embedder.encode(f"{finding} in {region}")
	    # The query norm is the same for every row, so compute it once.
	    query_norm = norm(query_embedding)

	    best_match, best_score = None, -1

	    conn = sqlite3.connect(db_path)
	    try:
	        cursor = conn.cursor()
	        cursor.execute("SELECT hpo_id, hpo_name, embedding FROM hpo_embeddings")

	        # Iterate the cursor directly instead of loading the whole table
	        # into memory with fetchall().
	        for hpo_id, hpo_name, embedding_str in cursor:
	            hpo_embedding = np.array(json.loads(embedding_str))
	            denominator = query_norm * norm(hpo_embedding)
	            if denominator == 0:
	                # A zero-length vector has no direction; dividing would give
	                # NaN (never the best score) plus a runtime warning.
	                continue

	            similarity = np.dot(query_embedding, hpo_embedding) / denominator
	            if similarity > best_score:
	                best_score = similarity
	                best_match = {"hpo_id": hpo_id, "hpo_term": hpo_name}
	    finally:
	        # Close the connection even if parsing or the maths above raises.
	        conn.close()

	    return best_match if best_score > threshold else None


	def get_genes_for_hpo(hpo_id):
	    """Return the genes stored for an HPO ID.

	    Looks up the ``genes`` column of the ``hpo_gene`` table in the database
	    at ``db_path`` and splits the stored text on ``", "``.

	    Args:
	        hpo_id: Value matched against the ``hpo_id`` column.

	    Returns:
	        A list of gene strings, or an empty list when there is no matching
	        row or the stored value is NULL or empty.
	    """
	    conn = sqlite3.connect(db_path)
	    try:
	        cursor = conn.cursor()
	        cursor.execute("SELECT genes FROM hpo_gene WHERE hpo_id = ?", (hpo_id,))
	        row = cursor.fetchone()
	    finally:
	        # Close the connection even if the query raises.
	        conn.close()

	    # A NULL column arrives as None and has no .split(); an empty string
	    # would split into [''] rather than an empty list.
	    if not row or not row[0]:
	        return []
	    return row[0].split(", ")


	def get_hpo_for_finding(finding, region, threshold):
	    """Look up the best HPO match and attach its associated genes.

	    Args:
	        finding: Passed through to ``find_best_hpo_match``.
	        region: Passed through to ``find_best_hpo_match``.
	        threshold: Passed through to ``find_best_hpo_match``.

	    Returns:
	        The match dict from ``find_best_hpo_match`` with a ``"genes"`` key
	        added, or a placeholder dict with ``"NA"`` values and an empty gene
	        list when no match was found.
	    """
	    match = find_best_hpo_match(finding, region, threshold)

	    # No usable match: hand back the placeholder record.
	    if not match:
	        return {"hpo_id": "NA", "hpo_term": "NA", "genes": []}

	    # The match dict is extended in place and returned.
	    match["genes"] = get_genes_for_hpo(match["hpo_id"])
	    return match


	def run_ontogpt(finding, region):
	    """Run the ``ontogpt complete`` command on the finding/region text.

	    Args:
	        finding: Finding text inserted into the prompt.
	        region: Region text inserted into the prompt.

	    Returns:
	        The command's standard output with surrounding whitespace removed.
	        The exit status and standard error are not inspected.
	    """
	    prompt = f"{finding} in {region}"
	    command = ["ontogpt", "complete", "-m", ONTOGPT_MODEL, "-i", prompt]
	    completed = subprocess.run(command, capture_output=True, text=True)
	    return completed.stdout.strip()


	def hpo_mapper_ui(finding, region, threshold):
	    """Gradio callback: map a finding/region to an HPO term and run OntoGPT.

	    Args:
	        finding: Text from the "Finding" input.
	        region: Text from the "Region" input.
	        threshold: Similarity threshold passed to the HPO lookup.

	    Returns:
	        A 4-tuple ``(hpo_id, hpo_term, genes, ontogpt_output)``, where
	        ``genes`` is the gene list joined with ``", "``. When either text
	        input is empty, the first element carries a prompt to fill in both
	        fields and the remaining three are empty strings.
	    """
	    if not finding or not region:
	        # Return four values here too, matching the normal path below.
	        return "Please enter both finding and region.", "", "", ""

	    hpo_result = get_hpo_for_finding(finding, region, threshold)
	    ontogpt_output = run_ontogpt(finding, region)

	    return (
	        hpo_result["hpo_id"],
	        hpo_result["hpo_term"],
	        ", ".join(hpo_result["genes"]),
	        ontogpt_output,
	    )

	# Input components, in the order hpo_mapper_ui receives its arguments.
	ui_inputs = [
	    gr.Textbox(label="Finding"),
	    gr.Textbox(label="Region"),
	    gr.Slider(minimum=0.5, maximum=1.0, step=0.01, value=0.74, label="Threshold"),
	]

	# Output components, in the order of the values hpo_mapper_ui returns.
	ui_outputs = [
	    gr.Textbox(label="HPO ID"),
	    gr.Textbox(label="HPO Term"),
	    gr.Textbox(label="Associated Genes"),
	    gr.Textbox(label="OntoGPT Output"),
	]

	# Description shown in the interface, including the reference for the work.
	ui_description = (
	    "Enter a clinical finding and anatomical region to get the best-matching HPO term and associated genes, "
	    "now enriched with OntoGPT-generated ontology-based descriptions.\n\n"
	    "### Reference:\n"
	    "Application of Generative Artificial Intelligence to Utilise Unstructured Clinical Data for Acceleration of Inflammatory Bowel Disease Research\n"
	    "Alex Z Kadhim, Zachary Green, Iman Nazari, Jonathan Baker, Michael George, Ashley Heinson, Matt Stammers, Christopher Kipps, R Mark Beattie, James J Ashton, Sarah Ennis\n"
	    "medRxiv 2025.03.07.25323569; [DOI: 10.1101/2025.03.07.25323569](https://doi.org/10.1101/2025.03.07.25323569)"
	)

	# Create Gradio UI
	demo = gr.Interface(
	    fn=hpo_mapper_ui,
	    inputs=ui_inputs,
	    outputs=ui_outputs,
	    title="HPO Mapper with OntoGPT",
	    description=ui_description,
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()