from __future__ import annotations

from typing import Iterable

import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
from PIL import Image

from inference.main import MultiModalPhi2


class Seafoam(Base):
    """Custom seafoam-colored Gradio theme built on the base theme."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.emerald,
        secondary_hue: colors.Color | str = colors.blue,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )


seafoam = Seafoam()

multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)


def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    """Echo the user's text/image/audio into the chat history before inference."""
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # Prefer microphone audio; fall back to an uploaded audio file.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Reject the submission if no text, image, or audio was provided.
        raise gr.Error("Enter a valid text, image, or audio input")
    return chatbot


def clear_data():
    # Reset all inputs and the chat history; order matches clear.click outputs.
    return None, None, None, None, []


def run(history, text, image, audio_upload, audio_mic):
    """Run the multimodal model on the collected inputs and append its reply."""
    if text in [None, ""]:
        text = None
    if audio_upload is not None:
        audio = audio_upload
    elif audio_mic is not None:
        audio = audio_mic
    else:
        audio = None
    if image is not None:
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)
    history.append((None, outputs))
    return history, None, None, None, None


with gr.Blocks(theme=seafoam) as demo:
    gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")
    with gr.Row():
        chatbot = gr.Chatbot(
            avatar_images=("🧑", "🤖"),
            height=450,
        )
    with gr.Row():
        prompt = gr.Textbox(
            placeholder="Enter text, or upload an image or audio",
            lines=2,
            label="Query",
            scale=4,
        )
        # Separate upload buttons so image and audio files are routed correctly
        # (run() opens `image` with PIL, so it must never receive an audio file).
        image = gr.UploadButton("📁 Image", file_types=["image"])
        audio_upload = gr.UploadButton("🎵 Audio", file_types=["audio"])
        audio_mic = gr.Audio(
            source="microphone",
            type="filepath",
            format="mp3",
        )
    with gr.Row():
        submit = gr.Button(value="Submit")
        clear = gr.Button(value="Clear")

    # Show the user's inputs in the chat first, then run inference on success.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

demo.launch()
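# ---------------------------------------------------------------------------
# Standalone usage sketch (not executed by the app). It assumes only what the
# code above already relies on: MultiModalPhi2 is constructed with
# modelname_or_path / temperature / max_new_tokens / device, and is callable
# as model(text, audio, image), where any of the three inputs may be None.
# "example.jpg" is a hypothetical local file.
#
#   from PIL import Image
#   from inference.main import MultiModalPhi2
#
#   model = MultiModalPhi2(
#       modelname_or_path="Navyabhat/Llava-Phi2",
#       temperature=0.2,
#       max_new_tokens=1024,
#       device="cpu",
#   )
#   reply = model("Describe this picture.", None, Image.open("example.jpg"))
#   print(reply)
# ---------------------------------------------------------------------------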