Chintan-Shah committed
Commit 963eec5 · verified · 1 Parent(s): 6e386c4

Update app.py

Files changed (1):
  1. app.py +18 -8
app.py CHANGED
@@ -6,6 +6,7 @@ from PIL import Image
 import gradio as gr
 import librosa
 import nltk
+import re
 
 from transformers import PreTrainedModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -18,7 +19,7 @@ model_name = "microsoft/Phi-3.5-mini-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 # Load the model and processor
-clipmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clipmodel = CLIPModel.frm_pretrained("openai/clip-vit-base-patch32")
 clipprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 nltk.download('punkt')
@@ -235,11 +236,16 @@ def getInputs(image_path, question, answer=""):
     return start_input_ids, end_input_ids, image_features, attention_mask
 
 model_location = "./MM_FT_C1"
-print("Model location:", model_location)
+# print("Model location:", model_location)
 
 model = MultimodalPhiModel.from_pretrained(model_location).to(device)
 
-import re
+model_name = "microsoft/Phi-3.5-mini-instruct"
+base_phi_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+).to(device)
 
 def getStringAfter(output, start_str):
     if start_str in output:
@@ -260,7 +266,7 @@ def getStringAfterAnswer(output):
     answer = preprocess_text(answer)
     return answer
 
-def generateOutput(image_path, audio_path, context_text, question, max_length=5):
+def generateOutput(image_path, audio_path, context_text, question, max_length=2):
     answerPart = ""
     speech_text = ""
     if image_path is not None:
@@ -294,7 +300,7 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=5)
     # base_phi_model.generate(start_tokens, max_length=2, do_sample=False, pad_token_id=tokenizer.pad_token_id)
 
     output_text = tokenizer.decode(
-        model.base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
+        base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
         skip_special_tokens=True
     )
 
@@ -326,14 +332,18 @@ def process_inputs(image, audio_source, audio_file, audio_mic, context_text, que
     return answer
 
 with demo:
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            image_input = gr.Image(type="filepath", label="Upload Image")
+        with gr.Column(scale=2, min_width=300):
+            question = gr.Textbox(label="Question")
+            output_text = gr.Textbox(label="Output")
     with gr.Row():
         audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
         audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
         audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
-        image_input = gr.Image(type="filepath", label="Upload Image")
+    with gr.Row():
         context_text = gr.Textbox(label="Context Text")
-        question = gr.Textbox(label="Question")
-        output_text = gr.Textbox(label="Output")
 
     def update_audio_input(source):
         if source == "Microphone":
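
The largest change in this commit is the Blocks layout: the image, question, and output widgets move out of the audio Row into their own Row with two nested Columns. Below is a minimal, self-contained sketch of that arrangement so it can be previewed outside the app. The echo() callback and the Submit button are placeholders standing in for app.py's process_inputs wiring, and the update_audio_input body is an assumed toggle, since neither appears in full in this diff.

# Standalone sketch of the layout introduced in this commit:
# one Row with an image Column and a question/output Column,
# a second Row for the audio inputs, and a third Row for context text.
import gradio as gr

def echo(image, audio_source, audio_file, audio_mic, context, question):
    # Placeholder: the real app runs the multimodal model here.
    return f"Question received: {question}"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1, min_width=300):
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column(scale=2, min_width=300):
            question = gr.Textbox(label="Question")
            output_text = gr.Textbox(label="Output")
    with gr.Row():
        audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
        audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
        audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
    with gr.Row():
        context_text = gr.Textbox(label="Context Text")

    def update_audio_input(source):
        # Assumed behaviour: show whichever audio widget matches the selected source.
        return (
            gr.update(visible=(source == "Audio File")),
            gr.update(visible=(source == "Microphone")),
        )

    audio_source.change(update_audio_input, inputs=audio_source, outputs=[audio_file, audio_mic])

    submit = gr.Button("Submit")  # hypothetical trigger; the app's actual wiring is not shown here
    submit.click(
        echo,
        inputs=[image_input, audio_source, audio_file, audio_mic, context_text, question],
        outputs=output_text,
    )

if __name__ == "__main__":
    demo.launch()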
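The commit also switches generation from model.base_phi_model to a separately loaded base_phi_model. A rough standalone sketch of that path follows; the prompt string and variable names are illustrative, not taken from app.py.

# Minimal sketch: load the base Phi-3.5 model and run greedy decoding
# with the same settings the commit uses (max_length=1024, do_sample=False,
# pad_token_id). The prompt below is a placeholder, not part of app.py.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to(device)

# Placeholder prompt standing in for the tokens built by getInputs().
start_tokens = tokenizer("Describe the image context briefly.", return_tensors="pt").input_ids.to(device)

output_ids = base_phi_model.generate(
    start_tokens,
    max_length=1024,                       # cap on total sequence length, as in the commit
    do_sample=False,                       # greedy decoding
    pad_token_id=tokenizer.pad_token_id,
)[0]
print(tokenizer.decode(output_ids, skip_special_tokens=True))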