Chintan-Shah committed
Commit b50280a · verified · 1 Parent(s): 1640931

Update app.py

Files changed (1): app.py +40 -38
app.py CHANGED
@@ -190,10 +190,19 @@ def getAudioArray(audio_path):
     speech, rate = librosa.load(audio_path, sr=16000)
     return speech
 
+# Start text before putting image embedding
+start_text = "<|system|> \n You are an assistant good at understanding the context.<|end|> \n <|user|> \n"
+# Prepare text input for causal language modeling
+end_text = "\n Describe the objects and their relationship in the given context.<|end|> \n <|assistant|> \n"
+
+words = nltk.word_tokenize(start_text) + nltk.word_tokenize(end_text)
+input_words = list(set(words))
+# print("Input words:",input_words)
+
+
 def getInputs(image_path, question, answer=""):
 
     image_features = None
-    speech_text = ""
     num_image_tokens = 0
 
     if image_path is not None:
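getAudioArray resamples the clip to 16 kHz, which is the rate Whisper expects; the transcription step itself sits outside this hunk. As a rough, hedged sketch of how such an array is usually fed to the transformers Whisper classes (the checkpoint name and variable names below are assumptions, not taken from app.py):

    import librosa
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    # Assumed checkpoint; the app's actual Whisper variant is not shown in this diff.
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

    speech, _ = librosa.load(audio_path, sr=16000)  # Whisper expects 16 kHz mono audio
    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features.to(device)
    predicted_ids = whisper_model.generate(input_features)
    speech_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]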
@@ -210,10 +219,10 @@ def getInputs(image_path, question, answer=""):
         num_image_tokens = image_features.shape[1]
 
     # Start text before putting image embedding
-    start_text = f"<|system|>\nYou are an assistant good at understanding the objects and their relationship from the context.<|end|>\n<|user|>\n"
+    start_text = f"<|system|>\nYou are an assistant good at understanding the context.<|end|>\n<|user|>\n "
 
     # Prepare text input for causal language modeling
-    end_text = f"\nPlease describe the objects and their relationship from the context.<|end|>\n<|assistant|>\n{answer}"
+    end_text = f" .\n Describe the objects and their relationship from the context. <|end|>\n<|assistant|>\n {answer}"
 
     # Tokenize the full texts
     start_tokens = tokenizer(start_text, padding=True, truncation=True, max_length=512, return_tensors="pt")
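In getInputs, start_text and end_text bracket the slot where the projected image features go. A common way to realise that bracketing is to embed both token sequences and concatenate the image features between them. The sketch below only illustrates that pattern; the attribute and variable names (get_input_embeddings reached through base_phi_model, an end_tokens built the same way as start_tokens, image_features shaped [batch, num_image_tokens, hidden]) are assumptions, not something this diff shows.

    import torch

    # Hypothetical sketch: bracket the image features between the two text segments.
    embed = model.base_phi_model.get_input_embeddings()
    start_embeds = embed(start_tokens["input_ids"].to(device))   # <|system|> ... <|user|> head
    end_embeds = embed(end_tokens["input_ids"].to(device))       # instruction/answer tail
    inputs_embeds = torch.cat([start_embeds, image_features, end_embeds], dim=1)
    # inputs_embeds (plus a matching attention mask) is then fed to the language
    # model in place of plain input_ids.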
@@ -240,12 +249,12 @@ model_location = "./MM_FT_C1"
 
 model = MultimodalPhiModel.from_pretrained(model_location).to(device)
 
-model_name = "microsoft/Phi-3.5-mini-instruct"
-base_phi_model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True,
-).to(device)
+# model_name = "microsoft/Phi-3.5-mini-instruct"
+# base_phi_model = AutoModelForCausalLM.from_pretrained(
+#     model_name,
+#     torch_dtype=torch.bfloat16,
+#     trust_remote_code=True,
+# ).to(device)
 
 def getStringAfter(output, start_str):
     if start_str in output:
@@ -256,17 +265,12 @@ def getStringAfter(output, start_str):
     answer = preprocess_text(answer)
     return answer
 
+def getAnswerPart(output):
+    output_words = nltk.word_tokenize(output)
+    filtered_words = [word for word in output_words if word.lower() not in [w.lower() for w in input_words]]
+    return ' '.join(filtered_words)
 
-def getStringAfterAnswer(output):
-    if "<|assistant|>" in output:
-        answer = output.split("<|assistant|>")[1]
-    else:
-        answer = output
-
-    answer = preprocess_text(answer)
-    return answer
-
-def generateOutput(image_path, audio_path, context_text, question, max_length=2):
+def generateOutput(image_path, audio_path, context_text, question, max_length=3):
     answerPart = ""
     speech_text = ""
     if image_path is not None:
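getAnswerPart replaces the old split on "<|assistant|>": it tokenizes the generated text with nltk and drops every word that also occurs in the fixed prompt (the module-level input_words). A minimal, self-contained version of that filtering with stand-in prompt text (nltk needs its punkt data, punkt_tab on recent releases, downloaded once):

    import nltk

    nltk.download("punkt", quiet=True)      # tokenizer data, needed once
    nltk.download("punkt_tab", quiet=True)  # required by newer nltk releases

    # Stand-in prompt; app.py builds input_words from its real start_text/end_text.
    prompt_text = "<|system|> You are an assistant good at understanding the context. <|end|>"
    prompt_lower = {w.lower() for w in nltk.word_tokenize(prompt_text)}

    def get_answer_part(output: str) -> str:
        # Keep only the words that never appear in the prompt (case-insensitive).
        return " ".join(w for w in nltk.word_tokenize(output) if w.lower() not in prompt_lower)

    print(get_answer_part("the context shows a red car parked near a house"))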
@@ -279,7 +283,8 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=2):
             tokens[0],
             skip_special_tokens=True
         )
-        answerPart = getStringAfter(output, "<|assistant|>")
+        # answerPart = getStringAfter(output, "<|assistant|>")
+        answerPart = getAnswerPart(output)
         print("Answerpart:", answerPart)
 
     if audio_path is not None:
@@ -287,20 +292,21 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=2):
         print("Speech Text:", speech_text)
 
     if (question is None) or (question == ""):
-        question = "Provide only in 1 sentence to describe the objects and their relationships in it."
+        question = " Describe the objects and their relationships in 1 sentence."
 
     input_text = (
-        "<|system|>\nPlease understand the context "
-        "and answer the question based on the context in 1 or 2 summarized sentences.\n"
-        f"<|end|>\n<|user|>\n<|context|>{answerPart}\n{speech_text}\n{context_text}"
-        f"\n<|question|>: {question}\n<|end|>\n<|assistant|>\n"
+        "<|system|>\n Please understand the context "
+        "and answer the question in 1 or 2 summarized sentences.\n"
+        f"<|end|>\n<|user|>\n<|context|> {answerPart} \n {speech_text} \n {context_text} "
+        f"\n<|question|>: {question} \n<|end|>\n<|assistant|>\n"
     )
     print("input_text:", input_text)
     start_tokens = tokenizer(input_text, padding=True, truncation=True, max_length=1024, return_tensors="pt")['input_ids'].to(device)
+    attention_mask = tokens['attention_mask'].to(device)
     # base_phi_model.generate(start_tokens, max_length=2, do_sample=False, pad_token_id=tokenizer.pad_token_id)
 
     output_text = tokenizer.decode(
-        base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
+        model.base_phi_model.generate(start_tokens, attention_mask=attention_mask, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
         skip_special_tokens=True
     )
 
@@ -309,12 +315,6 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=2):
 
 title = "Created Fine Tuned MultiModal model"
 description = "Test the fine tuned multimodal model created using clip, phi3.5 mini instruct, whisper models"
-examples = [
-    ["./images/COCO_train2014_000000581181.jpg", None, None, None, None, "Describe what is happening in this image."],
-    [None, "Audio File", "./audio/03-01-01-01-01-01-01.wav", None, None, "Describe what is the person trying to tell in this audio."],
-]
-
-# [None, "Microphone", None, "example_audio_mic.wav", "Context without image.", "What is the result?"],
 
 demo = gr.Blocks()
 
@@ -332,18 +332,20 @@ def process_inputs(image, audio_source, audio_file, audio_mic, context_text, question):
     return answer
 
 with demo:
+    gr.Markdown(f"# {title}")
+    gr.Markdown(f" {description}")
+
     with gr.Row():
         with gr.Column(scale=1, min_width=300):
             image_input = gr.Image(type="filepath", label="Upload Image")
         with gr.Column(scale=2, min_width=300):
             question = gr.Textbox(label="Question")
+    with gr.Row():
+        audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
+        audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
+        audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
+        context_text = gr.Textbox(label="Context Text")
     output_text = gr.Textbox(label="Output")
-    with gr.Row():
-        audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
-        audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
-        audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
-    with gr.Row():
-        context_text = gr.Textbox(label="Context Text")
 
     def update_audio_input(source):
         if source == "Microphone":
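The hunk ends just as update_audio_input begins, so the wiring of the new audio_source radio is cut off here. The usual Blocks pattern is for the callback to return visibility updates and to be attached to the radio's change event; a sketch under that assumption (the actual app.py may wire it differently):

    def update_audio_input(source):
        # Show exactly one of the two audio widgets, depending on the selection.
        if source == "Microphone":
            return gr.update(visible=True), gr.update(visible=False)
        if source == "Audio File":
            return gr.update(visible=False), gr.update(visible=True)
        return gr.update(visible=False), gr.update(visible=False)

    audio_source.change(
        update_audio_input,
        inputs=audio_source,
        outputs=[audio_mic, audio_file],
    )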
 