import gradio as gr
from PIL import Image
import torch
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor
from urllib.request import urlopen
import spaces
import os

# ==============================
# Model and Processor Loading
# ==============================
model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="eager",
)

# ==============================
# Prompt Templates
# ==============================
user_prompt = '<|user|>'
assistant_prompt = '<|assistant|>'
prompt_suffix = '<|end|>'

# ==============================
# Inference Function
# ==============================
@spaces.GPU
def process_input(input_type, file, question):
    if not file or not question:
        return "Please upload a file and provide a question."

    # Build the multimodal prompt and preprocess the input
    if input_type == "Image":
        prompt = f'{user_prompt}<|image_1|>{question}{prompt_suffix}{assistant_prompt}'
        # Accept either a local upload or a URL
        if isinstance(file, str) and file.startswith("http"):
            image = Image.open(urlopen(file))
        else:
            image = Image.open(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, images=image, return_tensors='pt').to(model.device)
    elif input_type == "Audio":
        prompt = f'{user_prompt}<|audio_1|>{question}{prompt_suffix}{assistant_prompt}'
        if isinstance(file, str) and file.startswith("http"):
            audio_file = urlopen(file)
            audio, samplerate = sf.read(audio_file)
        else:
            audio, samplerate = sf.read(file.name if hasattr(file, "name") else file)
        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to(model.device)
    else:
        return "Invalid input type selected."

    # Generate the response
    with torch.no_grad():
        generate_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            num_logits_to_keep=0,
        )

    # Strip the prompt tokens and decode only the newly generated ones
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response

# ==============================
# Gradio UI Setup
# ==============================
with gr.Blocks(
    title="Demo of how GABI could use a Multimodal Model",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
        radius_size="lg",
    ),
) as demo:
    # Insert Simli FaceTime Widget
    gr.HTML(
        """
        """
    )

    # Header
    gr.Markdown(
        """
        # Multimodal Demo - Powered by GABI using Phi-4
        Upload an **image** or **audio** file, ask a question, and GABI will respond intelligently!
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(
                choices=["Image", "Audio"],
                label="Select Input Type",
                value="Image",
            )
            file_input = gr.File(
                label="Upload Your File",
                file_types=["image", "audio"],
            )
            question_input = gr.Textbox(
                label="Your Question",
                placeholder="e.g., 'What is shown in this image?' or 'Transcribe this audio.'",
                lines=2,
            )
            submit_btn = gr.Button("Submit", variant="primary")

        with gr.Column(scale=2):
            output_text = gr.Textbox(
                label="GABI's Response",
                placeholder="GABI's answer will appear here...",
                lines=10,
                interactive=False,
            )

    # Example Usage
    with gr.Accordion("Examples", open=False):
        gr.Markdown("Fill the fields using an example, then click **Submit** manually:")
        gr.Examples(
            examples=[
                ["Image", "https://www.ilankelman.org/stopsigns/australia.jpg", "What is shown in this image?"],
                ["Audio", "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac", "Transcribe the audio to text."],
            ],
            inputs=[input_type, file_input, question_input],
            outputs=None,
            cache_examples=False,
        )

    # Submit Button Binding
    submit_btn.click(
        fn=process_input,
        inputs=[input_type, file_input, question_input],
        outputs=output_text,
    )

# ==============================
# Launch App
# ==============================
demo.launch()