import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
from urllib.request import urlopen
import spaces
import os
# ==============================
# Model and Processor Loading
# ==============================
model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)
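# Note: "eager" attention keeps the demo runnable without the optional
# flash-attn package, while device_map="auto" and torch_dtype="auto" let
# transformers pick device placement and precision for the available hardware.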

# ==============================
# Prompt Templates
# ==============================
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'
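# Phi-4-multimodal chat format: <|user|>...<|end|><|assistant|>, with
# <|image_1|> / <|audio_1|> placeholders marking where the media passed to
# the processor is injected into the prompt.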

# ==============================
# Inference Function
# ==============================
@spaces.GPU  # allocate a ZeroGPU device per call on Spaces hardware (uses the `spaces` import above)
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."

    # Prepare the multimodal prompt
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Handle an uploaded file or a URL
        if isinstance(file, str) and file.startswith("http"):
            image = Image.open(urlopen(file))
        else:
            image = Image.open(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        if isinstance(file, str) and file.startswith("http"):
            audio_file = urlopen(file)
            audio, samplerate = sf.read(audio_file)
        else:
            audio, samplerate = sf.read(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    # Generate the response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )
    # Drop the prompt tokens so only the newly generated answer is decoded
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
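# Example calls (hypothetical local files; the Gradio UI below normally
# supplies these arguments):
#   process_input("Image", "photo.jpg", "What is shown in this image?")
#   process_input("Audio", "clip.wav", "Transcribe the audio to text.")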

# ==============================
# Gradio UI Setup
# ==============================
with gr.Blocks(
    title="Demo of how GABI could use a multimodal model",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    # Insert Simli FaceTime Widget
    gr.HTML(
        """
        <simli-widget
            token="gAAAAABoEN7c6Z4ZuimkCDa7PmB5OgiOqepELAtSQYwUliuC1Zdw6LOPejI0g1XpnDWchiwNCDFDPMd80TVY2NXjnEx2zvnv3FUSXfT4C0dsJT8QTXAklaXyxtGSZD4sG53AFxo1jSzjQWXPnQHVfIU_ISxQqenWluJrCIL1jmEMZehyj3Hx4xpnJ3lOZs3LX4YPPxbUR_CEtIMcp7roc083OVvDJO1Ycxew9KJmiBLqFbiT6hBQUjLi3BLTcEZtl8HxV_YKaKCqZNP9dt73H4a5QTQ5UvypJK2JlQiCWeH6t8LfpON66Hr-aDuZOhTiKbzhNF27jlPHJh6uXyF_rUSRvaOArQJL0S9_x3PCTCi-HBOs9VcSBCe7ICCQFMdQrF1rk7EiGQhjrJeD57rrxZXw6SeOBQjK8-a8JEeS6Fzd7ORNiWXeSEtT46TbVq03X0e44E7hZY90sSwERr2DIeCA7CM5eeHXf_iU_NCl0OwCLgF2Yd6TFQgtT-bPmEnyye5oH-GvZ52U"
            agentid="ff60ad9c-1afd-4b76-86a0-f94bf6e7b3b2"
            position="right"
            customimage="https://i.postimg.cc/K8PPT4GD/temp-Imagerldp-BZ.avif"
            customtext="FaceTime GABI"
        ></simli-widget>
        <script src="https://app.simli.com/simli-widget/index.js" async type="text/javascript"></script>
        """
    )
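    # The widget above is injected as raw HTML; its <script> tag loads
    # Simli's embeddable client asynchronously alongside the Gradio UI.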
    # Header
    gr.Markdown(
        """
        # Multimodal Demo - Powered by GABI using Phi-4
        Upload an **image** or **audio** file, ask a question, and GABI will respond intelligently!
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="GABI's Response",
                placeholder="GABI's answer will appear here...",
                lines=10,
                interactive=False,
            )
    # Example Usage
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Fill the fields using an example, then click **Submit** manually:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=None,
            cache_examples=False,
        )
    # Submit Button Binding
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )

# ==============================
# Launch App
# ==============================
demo.launch()