Update app.py
app.py
CHANGED
@@ -6,6 +6,7 @@ from PIL import Image
 import gradio as gr
 import librosa
 import nltk
+import re
 
 from transformers import PreTrainedModel
 from transformers import AutoModelForCausalLM, AutoTokenizer
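Note: import re is presumably what the text cleanup further down (preprocess_text) relies on. A purely illustrative sketch of that kind of regex cleanup; the function name and rules below are assumptions, not the app's actual implementation:

# Illustrative only -- regex-based cleanup of the kind the new "import re" enables.
# The real preprocess_text() in app.py may differ.
def preprocess_text_example(text):
    text = re.sub(r"\s+", " ", text)        # collapse runs of whitespace
    text = re.sub(r"<\|.*?\|>", "", text)   # drop chat special tokens such as <|end|>
    return text.strip()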
@@ -18,7 +19,7 @@ model_name = "microsoft/Phi-3.5-mini-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
 # Load the model and processor
-clipmodel = CLIPModel.
+clipmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
 clipprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
 nltk.download('punkt')
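For reference, the CLIPModel / CLIPProcessor pair loaded above is typically used like this to turn an image into a feature vector. This is a sketch of standard transformers usage, not necessarily the exact code in getInputs(); it assumes PIL.Image and torch are imported, as they are at the top of app.py:

# Sketch: standard usage of the CLIP pair loaded above (clip-vit-base-patch32).
def encode_image_example(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clipprocessor(images=image, return_tensors="pt")
    with torch.no_grad():
        # returns a (1, 512) pooled image embedding for this checkpoint
        return clipmodel.get_image_features(**inputs)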
@@ -235,11 +236,16 @@ def getInputs(image_path, question, answer=""):
     return start_input_ids, end_input_ids, image_features, attention_mask
 
 model_location = "./MM_FT_C1"
-print("Model location:", model_location)
+# print("Model location:", model_location)
 
 model = MultimodalPhiModel.from_pretrained(model_location).to(device)
 
-
+model_name = "microsoft/Phi-3.5-mini-instruct"
+base_phi_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    trust_remote_code=True,
+).to(device)
 
 def getStringAfter(output, start_str):
     if start_str in output:
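This hunk adds a separate base Phi-3.5-mini-instruct model alongside the fine-tuned multimodal checkpoint. A hedged smoke test of the newly added model, reusing the names defined above (the prompt string is made up):

# Sketch: verify the freshly loaded base_phi_model generates text on its own.
prompt = "Summarize what a multimodal assistant does in one sentence."
inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    out = base_phi_model.generate(
        **inputs,
        max_new_tokens=40,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))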
@@ -260,7 +266,7 @@ def getStringAfterAnswer(output):
     answer = preprocess_text(answer)
     return answer
 
-def generateOutput(image_path, audio_path, context_text, question, max_length=5):
+def generateOutput(image_path, audio_path, context_text, question, max_length=2):
     answerPart = ""
     speech_text = ""
     if image_path is not None:
@@ -294,7 +300,7 @@ def generateOutput(image_path, audio_path, context_text, question, max_length=5)
     # base_phi_model.generate(start_tokens, max_length=2, do_sample=False, pad_token_id=tokenizer.pad_token_id)
 
     output_text = tokenizer.decode(
-
+        base_phi_model.generate(start_tokens, max_length=1024, do_sample=False, pad_token_id=tokenizer.pad_token_id)[0],
         skip_special_tokens=True
     )
 
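Worth noting on the generate call above: max_length=1024 counts the prompt tokens plus the newly generated ones, so a long prompt leaves fewer tokens for the answer. If the intent is to budget only the answer, max_new_tokens is the usual alternative; a sketch under that assumption (not what the commit does):

# Sketch: cap only the generated continuation, independent of prompt length.
output_ids = base_phi_model.generate(
    start_tokens,
    max_new_tokens=256,
    do_sample=False,
    pad_token_id=tokenizer.pad_token_id,
)[0]
output_text = tokenizer.decode(output_ids, skip_special_tokens=True)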
@@ -326,14 +332,18 @@ def process_inputs(image, audio_source, audio_file, audio_mic, context_text, que
     return answer
 
 with demo:
+    with gr.Row():
+        with gr.Column(scale=1, min_width=300):
+            image_input = gr.Image(type="filepath", label="Upload Image")
+        with gr.Column(scale=2, min_width=300):
+            question = gr.Textbox(label="Question")
+            output_text = gr.Textbox(label="Output")
     with gr.Row():
         audio_source = gr.Radio(choices=["Microphone", "Audio File"], label="Select Audio Source")
         audio_file = gr.Audio(sources="upload", type="filepath", visible=False)
         audio_mic = gr.Audio(sources="microphone", type="filepath", visible=False)
-
+    with gr.Row():
     context_text = gr.Textbox(label="Context Text")
-    question = gr.Textbox(label="Question")
-    output_text = gr.Textbox(label="Output")
 
     def update_audio_input(source):
         if source == "Microphone":
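The hunk ends partway through update_audio_input. The usual Gradio pattern for this kind of source toggle is to return gr.update(visible=...) for the two audio widgets and wire it to the radio's change event; a sketch under that assumption (the actual callback in app.py may differ):

# Sketch: show the microphone widget or the file widget based on the radio choice.
def update_audio_input(source):
    if source == "Microphone":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)

audio_source.change(update_audio_input, inputs=audio_source,
                    outputs=[audio_mic, audio_file])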