Spaces:
Running
Running
File size: 1,945 Bytes
16588c5 01aaa01 556d852 507486b 556d852 707450c 507486b 556d852 507486b 556d852 507486b 556d852 507486b 556d852 507486b a72f0f9 556d852 507486b 556d852 a72f0f9 507486b 556d852 16588c5 556d852 507486b 556d852 507486b 556d852 707450c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import gradio as gr
from transformers import Qwen2_5OmniForCausalLM, AutoProcessor
import torch
# Load model and processor.
# BUG FIX: the original called AutoModelForCausalLM, which is never imported
# (the import at the top of the file is Qwen2_5OmniForCausalLM) — that was a
# guaranteed NameError at startup. Use the imported name consistently.
# NOTE(review): in current transformers the Omni model class is
# Qwen2_5OmniForConditionalGeneration — confirm the import against the
# installed transformers version.
model_name = "Qwen/Qwen2.5-Omni-3B"
model = Qwen2_5OmniForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # halves memory vs. fp32; fine on Ampere+ GPUs
    device_map="auto",           # let accelerate place the weights
)
processor = AutoProcessor.from_pretrained(model_name)
# Keep the model's device handy so inputs can be moved to it before generate().
device = model.device
# Function to process inputs and generate response
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    """Build a single-turn multimodal conversation and generate a reply.

    Args:
        text_input: The user's text prompt.
        image_input: Optional filepath to an image (from the Gradio widget).
        audio_input: Optional filepath to an audio clip.
        video_input: Optional filepath to a video.

    Returns:
        Tuple of (response_text, response_audio). Audio output is not
        implemented, so response_audio is always None.
    """
    # transformers multimodal chat templates require each content entry to
    # carry a "type" key; the original entries (e.g. {"text": ...}) lacked it.
    content = [{"type": "text", "text": text_input}]
    if image_input:
        content.append({"type": "image", "image": image_input})
    if audio_input:
        content.append({"type": "audio", "audio": audio_input})
    if video_input:
        content.append({"type": "video", "video": video_input})
    conversation = [{"role": "user", "content": content}]

    # return_dict=True is required so that **model_inputs unpacks into
    # generate(); without it apply_chat_template returns a bare tensor.
    # add_generation_prompt=True appends the assistant turn marker so the
    # model generates a reply instead of continuing the user's text.
    model_inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # max_new_tokens bounds only the reply; the original max_length=200
    # capped prompt + reply together, which can truncate to nothing for
    # long prompts.
    outputs = model.generate(**model_inputs, max_new_tokens=200)

    # Decode only the newly generated tokens — outputs[0] also contains the
    # echoed prompt.
    prompt_len = model_inputs["input_ids"].shape[1]
    response_text = processor.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Audio output not implemented
    response_audio = None
    return response_text, response_audio
# Gradio interface: one row of four multimodal inputs, a submit button,
# and a pair of output widgets wired to process_input.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")

    with gr.Row():
        prompt_box = gr.Textbox(label="Text Input")
        image_box = gr.Image(label="Upload Image", type="filepath")
        audio_box = gr.Audio(label="Upload Audio", type="filepath")
        video_box = gr.Video(label="Upload Video", type="filepath")

    run_button = gr.Button("Submit")
    reply_box = gr.Textbox(label="Text Response")
    reply_audio = gr.Audio(label="Audio Response")

    # Input order must match process_input's parameter order.
    run_button.click(
        fn=process_input,
        inputs=[prompt_box, image_box, audio_box, video_box],
        outputs=[reply_box, reply_audio],
    )

# Launch the app
demo.launch()
|