Chintan-Shah committed
Commit 963eec5 · verified · 1 Parent(s): 6e386c4

Update app.py

Files changed (1):
  1. app.py +18 -8
app.py CHANGED
@@ -6,6 +6,7 @@ from PIL import Image
 import gradio as gr
 import librosa
 import nltk
+import re
 
 from transformers import PreTrainedModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -18,7 +19,7 @@ model_name = "microsoft/Phi-3.5-mini-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 # Load the model and processor
-clipmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clipmodel = CLIPModel.frm_pretrained("openai/clip-vit-base-patch32")
 clipprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 nltk.download('punkt')
@@ -235,11 +236,16 @@ def getInputs(image_path, question, answer=""):
     return start_input_ids, end_input_ids, image_features, attention_mask
 
 model_location = "./MM_FT_C1"
-print("Model location:", model_location)
+# print("Model location:", model_location)
 
 model = MultimodalPhiModel.from_pretrained(model_location).to(device)
 
-import re
+model_name = "microsoft/Phi-3.5-mini-instruct"
+base_phi_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+).to(device)
 
 def getStringAfter(output, start_str):
     if start_str in output:
@@ -260,7 +266,7 @@ def getStringAfterAnswer(output):
     answer = preprocess_text(answer)
     return answer
 
-def generateOutput(image_path, audio_path, context_text, question, max_length=5):
+def generateOutput(image_path, audio_path, context_text, question, max_length=2):
     answerPart = ""
     speech_text = ""
     if image_path is not None:
@@ -294,7 +300,7 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=5)
     # base_phi_model.generate(start_tokens, max_length=2, do_sample=False, pad_token_id=tokenizer.pad_token_id)
 
     output_text = tokenizer.decode(
-        model.base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
+        base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
         skip_special_tokens=True
     )
 
@@ -326,14 +332,18 @@ def process_inputs(image, audio_source, audio_file, audio_mic, context_text, que
     return answer
 
 with demo:
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            image_input = gr.Image(type="filepath", label="Upload Image")
+        with gr.Column(scale=2, min_width=300):
+            question = gr.Textbox(label="Question")
+            output_text = gr.Textbox(label="Output")
     with gr.Row():
         audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
         audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
         audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
-        image_input = gr.Image(type="filepath", label="Upload Image")
+    with gr.Row():
         context_text = gr.Textbox(label="Context Text")
-        question = gr.Textbox(label="Question")
-        output_text = gr.Textbox(label="Output")
 
     def update_audio_input(source):
         if source == "Microphone":
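
The largest change in this commit is the Blocks layout: the image, question, and output widgets move out of the audio Row into their own Row with two nested Columns. Below is a minimal, self-contained sketch of that arrangement so it can be previewed outside the app. The echo() callback and the Submit button are placeholders standing in for app.py's process_inputs wiring, and the update_audio_input body is an assumed toggle, since neither appears in full in this diff.

# Standalone sketch of the layout introduced in this commit:
# one Row with an image Column and a question/output Column,
# a second Row for the audio inputs, and a third Row for context text.
import gradio as gr

def echo(image, audio_source, audio_file, audio_mic, context, question):
    # Placeholder: the real app runs the multimodal model here.
    return f"Question received: {question}"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column(scale=2, min_width=300):
            question = gr.Textbox(label="Question")
            output_text = gr.Textbox(label="Output")
    with gr.Row():
        audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
        audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
        audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
    with gr.Row():
        context_text = gr.Textbox(label="Context Text")

    def update_audio_input(source):
        # Assumed behaviour: show whichever audio widget matches the selected source.
        return (
            gr.update(visible=(source == "Audio File")),
            gr.update(visible=(source == "Microphone")),
        )

    audio_source.change(update_audio_input, inputs=audio_source, outputs=[audio_file, audio_mic])

    submit = gr.Button("Submit")  # hypothetical trigger; the app's actual wiring is not shown here
    submit.click(
        echo,
        inputs=[image_input, audio_source, audio_file, audio_mic, context_text, question],
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()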
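The commit also switches generation from model.base_phi_model to a separately loaded base_phi_model. A rough standalone sketch of that path follows; the prompt string and variable names are illustrative, not taken from app.py.

# Minimal sketch: load the base Phi-3.5 model and run greedy decoding
# with the same settings the commit uses (max_length=1024, do_sample=False,
# pad_token_id). The prompt below is a placeholder, not part of app.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to(device)

# Placeholder prompt standing in for the tokens built by getInputs().
start_tokens = tokenizer("Describe the image context briefly.", return_tensors="pt").input_ids.to(device)

output_ids = base_phi_model.generate(
    start_tokens,
    max_length=1024,                       # cap on total sequence length, as in the commit
    do_sample=False,                       # greedy decoding
    pad_token_id=tokenizer.pad_token_id,
)[0]
print(tokenizer.decode(output_ids, skip_special_tokens=True))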