import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
from urllib.request import urlopen
import spaces
import os
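
# `spaces` provides the @spaces.GPU decorator used on Hugging Face ZeroGPU
# Spaces to request a GPU only for the duration of each decorated call.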
# ==============================
# Model and Processor Loading
# ==============================
model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
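
# Loading notes: "eager" attention avoids a hard dependency on flash-attn;
# device_map="auto" and torch_dtype="auto" let transformers choose device
# placement and precision (e.g. bfloat16 on a GPU) from the model config.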
# ==============================
# Prompt Templates
# ==============================
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
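
# With these templates, an image request is rendered as
#   <|user|><|image_1|>{question}<|end|><|assistant|>
# and an audio request uses <|audio_1|> in place of <|image_1|>.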
# ==============================
# Inference Function
# ==============================
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."

    # Build the multimodal prompt and preprocess the input
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Accept either a URL or an uploaded file
        if isinstance(file, str) and file.startswith("http"):
            image = Image.open(urlopen(file))
        else:
            image = Image.open(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        if isinstance(file, str) and file.startswith("http"):
            audio_file = urlopen(file)
            audio, samplerate = sf.read(audio_file)
        else:
            audio, samplerate = sf.read(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    # Generate the response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated text is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
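
# Quick local sanity check (a sketch; assumes network access and uses the
# sample image from the Examples section below):
#   print(process_input(
#       "Image",
#       "https://www.ilankelman.org/stopsigns/australia.jpg",
#       "What is shown in this image?",
#   ))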
# ==============================
# Gradio UI Setup
# ==============================
with gr.Blocks(
    title="Demo of how GABI could use a multimodal model",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    # Insert Simli FaceTime Widget
    gr.HTML(
        """
        <simli-widget
            token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
            agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
            position="right"
            customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
            customtext="FaceTime GABI"
        ></simli-widget>
        <script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
        """
    )
# Header
    # Header
    gr.Markdown(
        """
        # Multimodal Demo - Powered by GABI using Phi-4
        Upload an **image** or **audio** file, ask a question, and GABI will respond intelligently!
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="GABI's Response",
                placeholder="GABI's answer will appear here...",
                lines=10,
                interactive=False,
            )
# Example Usage
    # Example Usage
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Fill the fields using an example, then click **Submit** manually:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=None,
            cache_examples=False,
        )
# Submit Button Binding
    # Submit Button Binding
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )
# ==============================
# Launch App
# ==============================
demo.launch()