# Captured from the Hugging Face Space "Navyabhat/Capstone_Project"
# (Space status at capture: Sleeping; file size: 8,354 bytes).

import gradio as gr
from PIL import Image
from inference.main import MultiModalPhi2

# Module-level list. It is not read or written by any of the visible code in
# this file — presumably a leftover; confirm before removing.
messages = []

# Single model wrapper instance, created once when the module is imported and
# called by `run` for every request. The keyword options are passed straight
# to MultiModalPhi2; see inference.main for what each one means.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)


def add_content(chatbot, text, image, audio_upload, audio_mic) -> list:
    """Append the user's submitted inputs to the chat history.

    Every input that is present becomes its own ``(user, None)`` history
    entry. Text is stored as-is; image and audio values are wrapped in a
    1-tuple before being stored.

    Args:
        chatbot: Current chat history list. It is extended in place.
        text: Prompt text; ``None`` and ``""`` both count as "no text".
        image: Image input, or ``None`` when absent.
        audio_upload: Uploaded audio input, or ``None`` when absent.
        audio_mic: Microphone audio input, or ``None`` when absent.

    Returns:
        The same ``chatbot`` list with the new entries appended.

    Raises:
        gr.Error: If no text, image or audio was supplied.
    """
    new_entries = []

    if text not in ("", None):
        new_entries.append((text, None))

    if image is not None:
        new_entries.append(((image,), None))

    # When both audio inputs are set, show the uploaded clip: that is the one
    # `run` hands to the model, so the history must display the same file.
    audio = audio_upload if audio_upload is not None else audio_mic
    if audio is not None:
        new_entries.append(((audio,), None))

    if not new_entries:
        # Nothing usable was submitted at all.
        raise gr.Error("Enter a valid text, image or audio")

    chatbot.extend(new_entries)
    return chatbot


def clear_data():
    """Return the values that reset every input and empty the chat history.

    The values are returned positionally, in the order
    ``(prompt, image, audio_upload, audio_mic, chatbot)`` — the same order
    in which the Clear handler lists its outputs. Returning them by position
    rather than as a component-keyed dict keeps this function independent of
    module-level widget names, which the active UI code does not define.

    Returns:
        A 5-tuple: ``None`` for each of the four inputs, followed by a new
        empty list for the chat history.
    """
    return None, None, None, None, []


def run(history, text, image, audio_upload, audio_mic):
    """Run the model on the submitted inputs and append its reply to the chat.

    Args:
        history: Chat history list; the reply is appended to it in place.
        text: Prompt text. ``None`` and ``""`` are both passed on as ``None``.
        image: Value accepted by ``PIL.Image.open`` (e.g. a file path), or
            ``None`` when no image was supplied.
        audio_upload: Uploaded audio input, or ``None``.
        audio_mic: Microphone audio input, or ``None``.

    Returns:
        A 5-tuple ``(history, None, None, None, None)``: the updated history
        followed by four ``None`` values (presumably used to clear the four
        input widgets — confirm against the event wiring).
    """
    if text in (None, ""):
        text = None

    # An uploaded clip takes precedence over a microphone recording.
    audio = audio_upload if audio_upload is not None else audio_mic

    print("text", text)
    print("image", image)
    print("audio", audio)

    pil_image = Image.open(image) if image is not None else None
    try:
        outputs = multimodal_phi2(text, audio, pil_image)
    finally:
        # Image.open leaves the underlying file open; release it here so the
        # handle is not leaked, including when inference raises.
        if pil_image is not None:
            pil_image.close()

    # str.title() capitalises the first letter of every word in the reply.
    # NOTE(review): confirm this reformatting of the model output is intended.
    history.append((None, outputs.title()))
    return history, None, None, None, None


# # Custom styling
# interface_style = {
#     "box": {
#         "backgroundColor": "#f9f9f9",
#         "padding": "20px",
#         "borderRadius": "10px",
#         "boxShadow": "0 0 10px rgba(0, 0, 0, 0.1)",
#     },
#     "button": {
#         "backgroundColor": "#4caf50",
#         "color": "#fff",
#         "padding": "10px",
#         "border": "none",
#         "borderRadius": "5px",
#         "cursor": "pointer",
#     },
#     "textbox": {
#         "width": "100%",
#         "padding": "10px",
#         "marginBottom": "10px",
#         "boxSizing": "border-box",
#     },
#     "image": {
#         "width": "100%",
#         "marginBottom": "10px",
#     },
#     "audio": {
#         "width": "100%",
#         "marginBottom": "10px",
#     },
#     "chatbox": {
#         "height": "550px",
#         "backgroundColor": "#f0f0f0",
#         "borderRadius": "5px",
#         "padding": "10px",
#         "overflowY": "auto",
#     },
# }

# with gr.Blocks() as demo:
#     gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")

#     with gr.Row():
#         with gr.Column(scale=4):
#             with gr.Box(style=interface_style["box"]):
#                 with gr.Row():
#                     prompt = gr.Textbox(
#                         placeholder="Enter Prompt",
#                         lines=2,
#                         label="Query",
#                         value=None,
#                         style=interface_style["textbox"],
#                     )
#                 with gr.Row():
#                     image = gr.Image(
#                         type="filepath", value=None, style=interface_style["image"]
#                     )
#                 with gr.Row():
#                     audio_upload = gr.Audio(
#                         source="upload", type="filepath", style=interface_style["audio"]
#                     )
#                     audio_mic = gr.Audio(
#                         source="microphone",
#                         type="filepath",
#                         format="mp3",
#                         style=interface_style["audio"],
#                     )

#         with gr.Column(scale=8):
#             with gr.Box(style=interface_style["box"]):
#                 with gr.Row():
#                     chatbot = gr.Chatbot(
#                         avatar_images=("🧑", "🤖"),
#                         height=550,
#                         style=interface_style["chatbox"],
#                     )
#                 with gr.Row():
#                     submit = gr.Button(style=interface_style["button"])
#                     clear = gr.Button(value="Clear", style=interface_style["button"])

#     submit.click(
#         add_content,
#         inputs=[chatbot, prompt, image, audio_upload, audio_mic],
#         outputs=[chatbot],
#     ).success(
#         run,
#         inputs=[chatbot, prompt, image, audio_upload, audio_mic],
#         outputs=[chatbot, prompt, image, audio_upload, audio_mic],
#     )

#     clear.click(
#         clear_data,
#         outputs=[prompt, image, audio_upload, audio_mic, chatbot],
#     )

# demo.launch()

# Static HTML mock-up of the chat page (styles, markup and a small script).
# The script only echoes the typed prompt next to a placeholder reply; it does
# not call the model. The prompt is inserted through DOM text nodes so that
# anything the user types is shown literally and never parsed as HTML.
custom_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
        body {
            font-family: 'Arial', sans-serif;
            background-color: #f4f4f4;
            margin: 0;
            padding: 0;
        }
        #container {
            max-width: 800px;
            margin: 20px auto;
            padding: 20px;
            background-color: #fff;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            border-radius: 10px;
            display: flex;
            flex-direction: column;
            align-items: center;
        }
        label {
            font-weight: bold;
            margin-bottom: 5px;
            display: block;
        }
        textarea, input[type="file"] {
            width: 100%;
            padding: 10px;
            margin-bottom: 10px;
            box-sizing: border-box;
        }
        button {
            background-color: #4caf50;
            color: #fff;
            padding: 10px;
            border: none;
            border-radius: 5px;
            cursor: pointer;
        }
        button:hover {
            background-color: #45a049;
        }
        #chat-container {
            max-height: 550px;
            overflow-y: auto;
            border: 1px solid #ddd;
            padding: 10px;
            border-radius: 5px;
            background-color: #f0f0f0;
            margin-top: 10px;
        }
    </style>
</head>
<body>
    <div id="container">
        <h2 style="text-align: center;">MultiModal Phi2 Model Pretraining and Finetuning from Scratch</h2>
        <div style="display: flex; width: 100%;">
            <div style="flex: 1; margin-right: 10px;">
                <label for="inputText">Text Input:</label>
                <textarea id="inputText" name="inputText" placeholder="Enter Prompt" rows="4"></textarea>

                <label for="inputImage">Image Input:</label>
                <input type="file" id="inputImage" name="inputImage" accept="image/*">

                <label for="inputAudio">Audio Input:</label>
                <input type="file" id="inputAudio" name="inputAudio" accept="audio/*">
            </div>
            <div style="flex: 2;">
                <div id="chat-container"></div>
            </div>
        </div>
        <button id="submitBtn">Submit</button>
        <button id="clearBtn" style="background-color: #f44336;">Clear</button>
    </div>

    <script>
        document.getElementById('submitBtn').addEventListener('click', function() {
            // Add your logic to process and update the chat
            // Example: Update the chat with a placeholder response
            var inputText = document.getElementById('inputText').value;
            var chatContainer = document.getElementById('chat-container');

            // Build the entry from DOM nodes; the prompt goes in as plain text.
            var userLabel = document.createElement('strong');
            userLabel.textContent = 'User:';
            var modelLabel = document.createElement('strong');
            modelLabel.textContent = 'Model:';

            chatContainer.appendChild(userLabel);
            chatContainer.appendChild(document.createTextNode(' ' + inputText));
            chatContainer.appendChild(document.createElement('br'));
            chatContainer.appendChild(modelLabel);
            chatContainer.appendChild(document.createTextNode(' This is a placeholder response.'));
            chatContainer.appendChild(document.createElement('br'));
        });

        document.getElementById('clearBtn').addEventListener('click', function() {
            // Clear input and chat
            document.getElementById('inputText').value = '';
            document.getElementById('inputImage').value = '';
            document.getElementById('inputAudio').value = '';
            document.getElementById('chat-container').innerHTML = '';
        });
    </script>
</body>
</html>
"""

# Build the page. Only the static HTML mock-up is mounted here; the event
# wiring that connected add_content / run / clear_data to real widgets is
# commented out earlier in this file.
with gr.Blocks() as demo:
    gr.HTML(custom_html)

# Start the server only when this file is executed as a script, so importing
# the module (from tests or tooling) does not launch a web server.
if __name__ == "__main__":
    demo.launch()