# Source: Hugging Face Space "Navyabhat/Capstone_Project" (status: Sleeping).
# File size: 4,620 bytes.

import gradio as gr
from PIL import Image
from inference.main import MultiModalPhi2

# Module-level message list; nothing else in this file reads or writes it.
messages = []

# Single model wrapper instance created at import time and called by `run`
# for every request. Configured to run on CPU with a low sampling temperature
# and a cap of 1024 newly generated tokens.
multimodal_phi2 = MultiModalPhi2(
    modelname_or_path="Navyabhat/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)


def add_content(chatbot, text, image, audio_upload, audio_mic) -> list:
    """Append the user's submitted inputs to the chat history.

    Every modality that was supplied becomes its own user-side entry of the
    form ``(content, None)``. Text is stored as-is; image and audio values are
    wrapped in a 1-tuple before being stored.

    Args:
        chatbot: Current chat history (a list of ``(user, bot)`` pairs).
            It is extended in place.
        text: Prompt text; ``None`` and ``""`` are treated as "no text".
        image: Image value, or ``None`` when no image was given.
        audio_upload: Uploaded audio value, or ``None``.
        audio_mic: Microphone audio value, or ``None``.

    Returns:
        The same ``chatbot`` list, with the new entries appended.

    Raises:
        gr.Error: If no text, image, or audio was provided.
    """
    has_input = False

    if text not in ["", None]:
        chatbot.append((text, None))
        has_input = True

    if image is not None:
        chatbot.append(((image,), None))
        has_input = True

    # When both audio inputs are set, show the uploaded clip: `run` feeds the
    # upload (not the microphone recording) to the model in that case, and the
    # chat should display the audio that is actually processed.
    audio = audio_upload if audio_upload is not None else audio_mic
    if audio is not None:
        chatbot.append(((audio,), None))
        has_input = True

    if not has_input:
        # Nothing usable was submitted; surface the problem to the user.
        raise gr.Error("Enter a valid text, image or audio")
    return chatbot


def clear_data():
    """Return the reset value for every UI component.

    The result maps each input component (prompt, image, both audio widgets)
    to ``None`` and the chatbot component to a fresh empty history list.
    """
    reset_values = dict.fromkeys((prompt, image, audio_upload, audio_mic))
    reset_values[chatbot] = []
    return reset_values


def run(history, text, image, audio_upload, audio_mic):
    """Run the model on the submitted inputs and append its reply to the chat.

    Args:
        history: Chat history (a list of ``(user, bot)`` pairs). The reply is
            appended to it in place as ``(None, reply)``.
        text: Prompt text; ``None`` and ``""`` are both passed on as ``None``.
        image: Path of the image to open with PIL, or ``None``.
        audio_upload: Uploaded audio value, or ``None``. Takes precedence over
            ``audio_mic`` when both are set.
        audio_mic: Microphone audio value, or ``None``.

    Returns:
        A 5-tuple: the updated history followed by four ``None`` values, one
        for each of the prompt, image, and two audio outputs this handler is
        wired to.
    """
    if text in [None, ""]:
        text = None

    # Uploaded audio wins; otherwise fall back to the microphone value
    # (which may itself be None).
    audio = audio_upload if audio_upload is not None else audio_mic

    print("text", text)
    print("image", image)
    print("audio", audio)

    if image is None:
        outputs = multimodal_phi2(text, audio, None)
    else:
        # Keep the image file open only for the duration of the model call so
        # the underlying file handle is always released afterwards.
        with Image.open(image) as pil_image:
            outputs = multimodal_phi2(text, audio, pil_image)

    # NOTE(review): .title() re-capitalises every word of the reply; confirm
    # this is the intended presentation of the model output.
    history.append((None, outputs.title()))
    return history, None, None, None, None


# Custom styling: one CSS-like property mapping (camelCase name -> value) per
# kind of component, gathered below into `interface_style`.
_BOX_STYLE = {
    "backgroundColor": "#f9f9f9",
    "padding": "20px",
    "borderRadius": "10px",
    "boxShadow": "0 0 10px rgba(0, 0, 0, 0.1)",
}

_BUTTON_STYLE = {
    "backgroundColor": "#4caf50",
    "color": "#fff",
    "padding": "10px",
    "border": "none",
    "borderRadius": "5px",
    "cursor": "pointer",
}

_TEXTBOX_STYLE = {
    "width": "100%",
    "padding": "10px",
    "marginBottom": "10px",
    "boxSizing": "border-box",
}

# Image and audio share the same values but stay separate dict objects.
_IMAGE_STYLE = {"width": "100%", "marginBottom": "10px"}
_AUDIO_STYLE = {"width": "100%", "marginBottom": "10px"}

_CHATBOX_STYLE = {
    "height": "550px",
    "backgroundColor": "#f0f0f0",
    "borderRadius": "5px",
    "padding": "10px",
    "overflowY": "auto",
}

# Lookup table keyed by component kind.
interface_style = {
    "box": _BOX_STYLE,
    "button": _BUTTON_STYLE,
    "textbox": _TEXTBOX_STYLE,
    "image": _IMAGE_STYLE,
    "audio": _AUDIO_STYLE,
    "chatbox": _CHATBOX_STYLE,
}

# Build the UI: a narrow left column with the inputs (prompt, image, two audio
# widgets) and a wider right column with the chat display and action buttons.
# NOTE(review): every component below receives a `style=` constructor keyword
# taken from `interface_style`; confirm the installed Gradio version actually
# accepts and applies that keyword rather than ignoring or rejecting it.
with gr.Blocks() as demo:
    gr.Markdown("## MultiModal Phi2 Model Pretraining and Finetuning from Scratch")

    with gr.Row():
        # Left column: user inputs.
        with gr.Column(scale=4):
            with gr.Box(style=interface_style["box"]):
                with gr.Row():
                    prompt = gr.Textbox(
                        placeholder="Enter Prompt",
                        lines=2,
                        label="Query",
                        value=None,
                        style=interface_style["textbox"],
                    )
                with gr.Row():
                    # type="filepath": handlers receive the image as a path,
                    # which `run` opens with PIL.
                    image = gr.Image(
                        type="filepath", value=None, style=interface_style["image"]
                    )
                with gr.Row():
                    # Two separate audio widgets: one for uploads, one for
                    # microphone recordings. Both hand a file path to handlers.
                    audio_upload = gr.Audio(
                        source="upload", type="filepath", style=interface_style["audio"]
                    )
                    audio_mic = gr.Audio(
                        source="microphone",
                        type="filepath",
                        format="mp3",
                        style=interface_style["audio"],
                    )

        # Right column: conversation display plus Submit/Clear buttons.
        with gr.Column(scale=8):
            with gr.Box(style=interface_style["box"]):
                with gr.Row():
                    # NOTE(review): avatar_images is given two emoji strings;
                    # confirm the installed Gradio version accepts these
                    # rather than expecting image paths/URLs.
                    chatbot = gr.Chatbot(
                        avatar_images=("🧑", "🤖"),
                        height=550,
                        style=interface_style["chatbox"],
                    )
                with gr.Row():
                    submit = gr.Button(style=interface_style["button"])
                    clear = gr.Button(value="Clear", style=interface_style["button"])

    # Submit is a two-step chain: first `add_content` echoes the user's inputs
    # into the chat, then (chained through `.success`) `run` calls the model,
    # appends its reply, and resets the four input widgets via its None outputs.
    submit.click(
        add_content,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot],
    ).success(
        run,
        inputs=[chatbot, prompt, image, audio_upload, audio_mic],
        outputs=[chatbot, prompt, image, audio_upload, audio_mic],
    )

    # Clear resets every input widget and empties the chat history.
    clear.click(
        clear_data,
        outputs=[prompt, image, audio_upload, audio_mic, chatbot],
    )

# Start the Gradio server as soon as the module is executed.
demo.launch()