import os
import re
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
# Automatically detect GPU or use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# Default model path
model_tokenizer_path = "zehui127/Omni-DNA-Multitask"
# Load tokenizer and model with trusted remote code
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_tokenizer_path, trust_remote_code=True).to(device)
# List of available tasks (histone-mark prediction)
tasks = [
    'H3', 'H4', 'H3K9ac', 'H3K14ac', 'H4ac',
    'H3K4me1', 'H3K4me2', 'H3K4me3', 'H3K36me3', 'H3K79me3',
]
mapping = {
    '1': 'It is an',
    '0': 'It is not an',
    'No valid prediction': 'Cannot be determined whether or not it is an',
}
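# Example: mapping['1'] + ' H3' -> 'It is an H3'; the keys are the strings
# returned by extract_label below.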
def preprocess_response(response, mask_token="[MASK]"):
    """Extracts the response after the [MASK] token."""
    if mask_token in response:
        response = response.split(mask_token, 1)[1]
        # Strip any leading whitespace or A/T/G/C bases left before the label.
        response = re.sub(r'^[\sATGC]+', '', response)
    return response
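# Example (hypothetical string, for illustration only):
#   preprocess_response('ACGT[MASK] 1') -> '1'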
def generate(dna_sequence, task_type, sample_num=1):
    """
    Generates a response based on the DNA sequence and selected task.

    Args:
        dna_sequence (str): The input DNA sequence.
        task_type (str): The selected task type.
        sample_num (int): Maximum number of new tokens to generate.

    Returns:
        str: Predicted function label.
    """
    if task_type is None:
        task_type = 'H3'
    # Build the prompt: sequence, task name, then a [MASK] slot for the label.
    dna_sequence = dna_sequence + task_type + "[MASK]"
    tokenized_message = tokenizer(
        [dna_sequence], return_tensors='pt', return_token_type_ids=False, add_special_tokens=True
    ).to(device)
    response = model.generate(**tokenized_message, max_new_tokens=sample_num, do_sample=False)
    reply = tokenizer.batch_decode(response, skip_special_tokens=False)[0].replace(" ", "")
    pred = extract_label(reply, task_type)
    # Fall back to the "No valid prediction" phrasing if the model emits an unexpected digit.
    return f"{mapping.get(pred, mapping['No valid prediction'])} {task_type}"
def extract_label(message, task_type):
    """Extracts the prediction digit that follows the [MASK] token in the model's response."""
    # task_type is accepted for call-site symmetry but not needed here.
    if "[MASK]" not in message:
        return "No valid prediction"
    answer = message.split("[MASK]", 1)[1]
    match = re.search(r'\d+', answer)
    return match.group() if match else "No valid prediction"
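# Examples (hypothetical decoded replies, for illustration only):
#   extract_label('ACGTH3[MASK]1', 'H3') -> '1'
#   extract_label('ACGTH3', 'H3')        -> 'No valid prediction'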
# Gradio interface
interface = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Input DNA Sequence", placeholder="Enter a DNA sequence"),
        gr.Dropdown(choices=tasks, label="Select Task Type"),
    ],
    outputs=gr.Textbox(label="Predicted Type"),
    title="Omni-DNA Multitask Prediction",
    description="Select a DNA-related task and input a sequence to generate function predictions.",
)
if __name__ == "__main__":
    interface.launch()