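"""Gradio demo for a multi-modal LLM (Llava-Phi2) that accepts text, image, and audio inputs."""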
import gradio as gr
from PIL import Image
from inference.main import MultiModalPhi2
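# Load the Llava-Phi2 multi-modal model once at startup (CPU inference).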
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)
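# Custom theme: blue primary hue, red loader, lighter button hover states.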
theme = gr.themes.Default(primary_hue="blue").set(
    loader_color="#FF0000",
    button_primary_background_fill="*primary_200",
    button_primary_background_fill_hover="*primary_300",
)
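# Append each provided input (text, image, and one audio clip, with the
# microphone recording taking precedence over an uploaded file) to the chat
# history as user turns. Raises gr.Error when nothing was entered.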
def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Raise an error if no text, image, or audio was provided
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot
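# Reset all inputs and the chat history; returning a dict keyed by component
# lets Gradio map each value to the matching output component.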
def clear_data():
    return {prompt: None, image: None, audio_upload: None, audio_mic: None, chatbot: []}
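# Run inference on the collected inputs, append the model's reply to the chat
# history, and clear every input component.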
def run(history, text, image, audio_upload, audio_mic):
    if text in [None, ""]:
        text = None
    # Prefer the microphone recording over an uploaded file, matching the
    # precedence used in add_content above.
    if audio_mic is not None:
        audio = audio_mic
    elif audio_upload is not None:
        audio = audio_upload
    else:
        audio = None
    # Debug logging of the resolved inputs
    print("text", text)
    print("image", image)
    print("audio", audio)
    if image is not None:
        image = Image.open(image)
    outputs = multimodal_phi2(text, audio, image)
    history.append((None, outputs.title()))  # title-case the reply for display
    return history, None, None, None, None
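# Build the UI: a chatbot pane on top, then image/audio inputs, a text query
# box, and Submit/Clear buttons wired to the handlers above.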
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("## 🤖 Multi-modal LLM")
    gr.Markdown("This is a multi-modal LLM that takes text, image and audio as inputs.")
    with gr.Row():
        chatbot = gr.Chatbot(
            avatar_images=("🧑", "🤖"),
            height=560,
        )
    with gr.Row():
        image = gr.Image(type="filepath", value=None)
        # Gradio 3.x API; Gradio 4.x takes sources=["upload"] / ["microphone"] instead
        audio_upload = gr.Audio(source="upload", type="filepath")
        audio_mic = gr.Audio(
            source="microphone", type="filepath", format="mp3"
        )
    with gr.Row():
        prompt = gr.Textbox(
            placeholder="Ask anything", lines=2, label="Query", value=None, scale=4
        )
    with gr.Row():
        submit = gr.Button(value="Submit", variant="primary")
        clear = gr.Button(value="Clear")

    # On Submit: first display the user's inputs, then run inference once
    # that step succeeds.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

demo.launch()
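# Tip: demo.launch(share=True) exposes a temporary public URL when running locally.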