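# Assumed dependencies (not pinned here): gradio, torch, accelerate, and a
# transformers release with Qwen2.5-Omni support.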
import gradio as gr
from transformers import Qwen2_5OmniForConditionalGeneration, AutoProcessor
import torch

# Load model and processor
model_name = "Qwen/Qwen2.5-Omni-3B"
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_name)
device = model.device
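
# Optional (assumption): model.disable_talker() can drop the speech-synthesis weights
# to save GPU memory, since this demo only returns text.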

# Function to process inputs and generate a response
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    conversation = [
        {"role": "user", "content": [{"type": "text", "text": text_input}]}
    ]
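    # Attach any uploaded media as extra content items in the same user turn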
    if image_input:
        conversation[0]["content"].append({"type": "image", "image": image_input})
    if audio_input:
        conversation[0]["content"].append({"type": "audio", "audio": audio_input})
    if video_input:
        conversation[0]["content"].append({"type": "video", "video": video_input})

    # Tokenize the conversation (loading any attached media) with the chat template
    model_inputs = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(device)
    # Generate a text-only response; cap new tokens rather than total length
    outputs = model.generate(**model_inputs, max_new_tokens=200, return_audio=False)
    # Decode only the newly generated tokens, skipping the prompt
    prompt_len = model_inputs["input_ids"].shape[1]
    response_text = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True)
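
    # Note (assumption): generate(..., return_audio=True) on the full Omni model can
    # also return a speech waveform, but that path is not wired up in this demo.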
    # Audio output not implemented
    response_audio = None
    return response_text, response_audio

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        # gr.Video passes the upload as a file path by default, so no type argument is needed
        video_input = gr.Video(label="Upload Video")
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")
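
    # Route the four inputs to process_input and its two return values to the outputs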
    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input, video_input],
        outputs=[text_output, audio_output],
    )

# Launch the app
demo.launch()