import gradio as gr
from PIL import Image

from inference.main import MultiModalPhi2

messages = []

# Load the Llava-Phi2 wrapper once at startup; device="cpu" keeps the
# Space runnable without a GPU.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)
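

# A direct-call sketch for quick testing outside the UI (assumptions: the
# wrapper is callable with (text, audio, image) positionally, as `run`
# below does, and returns a string).
def describe_image(path: str, question: str = "Describe the image") -> str:
    # Example: describe_image("sample.jpg")  # "sample.jpg" is hypothetical
    return multimodal_phi2(question, None, Image.open(path))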
def add_content(chatbot, text, image, audio_upload, audio_mic) -> list:
    """Append the user's text, image, and audio inputs to the chat history."""
    textflag, imageflag, audioflag = False, False, False
    if text not in ["", None]:
        chatbot.append((text, None))
        textflag = True
    if image is not None:
        chatbot.append(((image,), None))
        imageflag = True
    # Prefer the microphone recording; fall back to an uploaded file.
    if audio_mic is not None:
        chatbot.append(((audio_mic,), None))
        audioflag = True
    elif audio_upload is not None:
        chatbot.append(((audio_upload,), None))
        audioflag = True
    if not any([textflag, imageflag, audioflag]):
        # Raise an error if neither text nor a file is provided
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot
def clear_data():
    """Reset every input and the chat history (order matches the outputs list)."""
    return None, None, None, None, []
def run(history, text, image, audio_upload, audio_mic):
    """Send the collected inputs to the model and append its reply."""
    if text in [None, ""]:
        text = None
    # Prefer the uploaded file; fall back to the microphone recording.
    if audio_upload is not None:
        audio = audio_upload
    elif audio_mic is not None:
        audio = audio_mic
    else:
        audio = None

    print("text", text)
    print("image", image)
    print("audio", audio)

    if image is not None:
        image = Image.open(image)

    outputs = multimodal_phi2(text, audio, image)
    # outputs = ""

    # Append the model reply (title-cased) and clear all inputs.
    history.append((None, outputs.title()))
    return history, None, None, None, None
# # Custom styling
# interface_style = {
#     "box": {
#         "backgroundColor": "#f9f9f9",
#         "padding": "20px",
#         "borderRadius": "10px",
#         "boxShadow": "0 0 10px rgba(0, 0, 0, 0.1)",
#     },
#     "button": {
#         "backgroundColor": "#4caf50",
#         "color": "#fff",
#         "padding": "10px",
#         "border": "none",
#         "borderRadius": "5px",
#         "cursor": "pointer",
#     },
#     "textbox": {
#         "width": "100%",
#         "padding": "10px",
#         "marginBottom": "10px",
#         "boxSizing": "border-box",
#     },
#     "image": {
#         "width": "100%",
#         "marginBottom": "10px",
#     },
#     "audio": {
#         "width": "100%",
#         "marginBottom": "10px",
#     },
#     "chatbox": {
#         "height": "550px",
#         "backgroundColor": "#f0f0f0",
#         "borderRadius": "5px",
#         "padding": "10px",
#         "overflowY": "auto",
#     },
# }

# with gr.Blocks() as demo:
#     gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")
#     with gr.Row():
#         with gr.Column(scale=4):
#             with gr.Box(style=interface_style["box"]):
#                 with gr.Row():
#                     prompt = gr.Textbox(
#                         placeholder="Enter Prompt",
#                         lines=2,
#                         label="Query",
#                         value=None,
#                         style=interface_style["textbox"],
#                     )
#                 with gr.Row():
#                     image = gr.Image(
#                         type="filepath", value=None, style=interface_style["image"]
#                     )
#                 with gr.Row():
#                     audio_upload = gr.Audio(
#                         source="upload", type="filepath", style=interface_style["audio"]
#                     )
#                     audio_mic = gr.Audio(
#                         source="microphone",
#                         type="filepath",
#                         format="mp3",
#                         style=interface_style["audio"],
#                     )
#         with gr.Column(scale=8):
#             with gr.Box(style=interface_style["box"]):
#                 with gr.Row():
#                     chatbot = gr.Chatbot(
#                         avatar_images=("🧑", "🤖"),
#                         height=550,
#                         style=interface_style["chatbox"],
#                     )
#                 with gr.Row():
#                     submit = gr.Button(style=interface_style["button"])
#                     clear = gr.Button(value="Clear", style=interface_style["button"])
#
#     submit.click(
#         add_content,
#         inputs=[chatbot, prompt, image, audio_upload, audio_mic],
#         outputs=[chatbot],
#     ).success(
#         run,
#         inputs=[chatbot, prompt, image, audio_upload, audio_mic],
#         outputs=[chatbot, prompt, image, audio_upload, audio_mic],
#     )
#     clear.click(
#         clear_data,
#         outputs=[prompt, image, audio_upload, audio_mic, chatbot],
#     )
#
# demo.launch()
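

# A sketch of the same Blocks layout on a newer Gradio API, wired to the
# handlers above but left uncalled so the HTML demo below stays active.
# Assumptions: Gradio 4.x, where gr.Box and per-component style= were
# removed (gr.Group is the closest replacement) and gr.Audio takes
# sources=[...] instead of source=.
def build_blocks_demo() -> gr.Blocks:
    with gr.Blocks() as blocks_demo:
        gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")
        with gr.Row():
            with gr.Column(scale=4):
                with gr.Group():
                    prompt = gr.Textbox(placeholder="Enter Prompt", lines=2, label="Query")
                    image = gr.Image(type="filepath")
                    audio_upload = gr.Audio(sources=["upload"], type="filepath")
                    audio_mic = gr.Audio(sources=["microphone"], type="filepath", format="mp3")
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(height=550)
                with gr.Row():
                    submit = gr.Button("Submit")
                    clear = gr.Button("Clear")
        # Validate/append the inputs first, then run the model on success.
        submit.click(
            add_content,
            inputs=[chatbot, prompt, image, audio_upload, audio_mic],
            outputs=[chatbot],
        ).success(
            run,
            inputs=[chatbot, prompt, image, audio_upload, audio_mic],
            outputs=[chatbot, prompt, image, audio_upload, audio_mic],
        )
        clear.click(
            clear_data,
            outputs=[prompt, image, audio_upload, audio_mic, chatbot],
        )
    return blocks_demo
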
custom_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
        body {
            font-family: 'Arial', sans-serif;
            background-color: #f4f4f4;
            margin: 0;
            padding: 0;
        }
        #container {
            max-width: 800px;
            margin: 20px auto;
            padding: 20px;
            background-color: #fff;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            border-radius: 10px;
            display: flex;
            flex-direction: column;
            align-items: center;
        }
        label {
            font-weight: bold;
            margin-bottom: 5px;
            display: block;
        }
        textarea, input[type="file"] {
            width: 100%;
            padding: 10px;
            margin-bottom: 10px;
            box-sizing: border-box;
        }
        button {
            background-color: #4caf50;
            color: #fff;
            padding: 10px;
            border: none;
            border-radius: 5px;
            cursor: pointer;
        }
        button:hover {
            background-color: #45a049;
        }
        #chat-container {
            max-height: 550px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 10px;
            border-radius: 5px;
            background-color: #f0f0f0;
            margin-top: 10px;
        }
    </style>
</head>
<body>
    <div id="container">
        <h2 style="text-align: center;">MultiModal Phi2 Model Pretraining and Finetuning from Scratch</h2>
        <div style="display: flex; width: 100%;">
            <div style="flex: 1; margin-right: 10px;">
                <label for="inputText">Text Input:</label>
                <textarea id="inputText" name="inputText" placeholder="Enter Prompt" rows="4"></textarea>
                <label for="inputImage">Image Input:</label>
                <input type="file" id="inputImage" name="inputImage" accept="image/*">
                <label for="inputAudio">Audio Input:</label>
                <input type="file" id="inputAudio" name="inputAudio" accept="audio/*">
            </div>
            <div style="flex: 2;">
                <div id="chat-container"></div>
            </div>
        </div>
        <button id="submitBtn">Submit</button>
        <button id="clearBtn" style="background-color: #f44336;">Clear</button>
    </div>
    <script>
        document.getElementById('submitBtn').addEventListener('click', function() {
            // Add your logic to process and update the chat
            // Example: update the chat with a placeholder response
            var inputText = document.getElementById('inputText').value;
            var chatContainer = document.getElementById('chat-container');
            var response = "<strong>User:</strong> " + inputText + "<br><strong>Model:</strong> This is a placeholder response.<br>";
            chatContainer.innerHTML += response;
        });
        document.getElementById('clearBtn').addEventListener('click', function() {
            // Clear input and chat
            document.getElementById('inputText').value = '';
            document.getElementById('inputImage').value = '';
            document.getElementById('inputAudio').value = '';
            document.getElementById('chat-container').innerHTML = '';
        });
    </script>
</body>
</html>
"""
with gr.Blocks() as demo:
    gr.HTML(custom_html)

demo.launch()