Spaces:

huro-ai
/

GABI-Multimodal-Demo

Sleeping

App Files Files Community

GABI-Multimodal-Demo / app.py

tommytracx

Rename app (2).py to app.py

838dd37 verified 4 months ago

raw

history blame

5.24 kB

	import gradio as gr
	from PIL import Image
	import torch
	import soundfile as sf
	from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
	from urllib.request import urlopen
	import spaces

	# Define model path
	model_path = "microsoft/Phi-4-multimodal-instruct"

	# Load model and processor
	processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
	model = AutoModelForCausalLM.from_pretrained(
	model_path,
	device_map="auto",
	torch_dtype="auto",
	trust_remote_code=True,
	_attn_implementation="eager",
	)

	# Define prompt structure
	user_prompt = '<\|user\|>'
	assistant_prompt = '<\|assistant\|>'
	prompt_suffix = '<\|end\|>'

	# Define inference function
	@spaces.GPU
	def process_input(input_type, file, question):
	if not file or not question:
	return "Please upload a file and provide a question for Gabi."

	# Prepare the prompt
	if input_type == "Image":
	prompt = f'{user_prompt}<\|image_1\|>{question}{prompt_suffix}{assistant_prompt}'
	# Open image from uploaded file
	image = Image.open(file)
	inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
	elif input_type == "Audio":
	prompt = f'{user_prompt}<\|audio_1\|>{question}{prompt_suffix}{assistant_prompt}'
	# Read audio from uploaded file
	audio, samplerate = sf.read(file)
	inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
	else:
	return "Invalid input type selected."

	# Generate response
	with torch.no_grad():
	generate_ids = model.generate(
	**inputs,
	max_new_tokens=200,
	num_logits_to_keep=0,
	)
	generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
	response = processor.batch_decode(
	generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
	)[0]

	return response

	# Gradio interface
	with gr.Blocks(
	title="Demo of how GABI could use a Multimodal",
	theme=gr.themes.Soft(
	primary_hue="blue",
	secondary_hue="gray",
	radius_size="lg",
	),
	) as demo:

	# Insert Simli Widget
	gr.HTML(
	"""
	<simli-widget
	token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
	agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
	position="right"
	customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
	customtext="FaceTime GABI"
	></simli-widget>
	<script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
	"""
	)

	gr.Markdown(
	"""
	# This Space is using Phi-4 as the LLM for the Gabi Multimodal Demo
	Try uploading an image or audio file, ask Gabi a question, and get a response!
	We want to leverage this to allow GABI to have the ability to interact and understand various contents.
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	input_type = gr.Radio(
	choices=["Image", "Audio"],
	label="Select Input Type",
	value="Image",
	)
	file_input = gr.File(
	label="Upload Your File",
	file_types=["image", "audio"],
	)
	question_input = gr.Textbox(
	label="Your Question",
	placeholder="e.g., 'Gabi, what is shown in this image?' or 'Gabi, transcribe this audio.'",
	lines=2,
	)
	submit_btn = gr.Button("Submit", variant="primary")

	with gr.Column(scale=2):
	output_text = gr.Textbox(
	label="Gabi's Response",
	placeholder="Gabi's response will appear here...",
	lines=10,
	interactive=False,
	)

	# Example section
	with gr.Accordion("Examples", open=False):
	gr.Markdown("Try these examples:")
	gr.Examples(
	examples=[
	["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "Gabi, what is shown in this image?"],
	["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Gabi, transcribe the audio to text."],
	],
	inputs=[input_type, file_input, question_input],
	outputs=output_text,
	fn=process_input,
	cache_examples=False,
	)

	# Connect the submit button
	submit_btn.click(
	fn=process_input,
	inputs=[input_type, file_input, question_input],
	outputs=output_text,
	)

	# Launch the demo
	demo.launch()