import gradio as gr
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, AutoProcessor
# qwen-omni-utils (pip install qwen-omni-utils) provides the helper that loads the
# image/audio/video files referenced in a Qwen-Omni conversation.
from qwen_omni_utils import process_mm_info

# Load model and processor (requires a transformers release with Qwen2.5-Omni support)
model_name = "Qwen/Qwen2.5-Omni-3B"
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_name)
device = model.device

# Function to process inputs and generate a response
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    # Build a multimodal conversation in the Qwen chat-message format
    conversation = [
        {"role": "user", "content": [{"type": "text", "text": text_input}]}
    ]
    if image_input:
        conversation[0]["content"].append({"type": "image", "image": image_input})
    if audio_input:
        conversation[0]["content"].append({"type": "audio", "audio": audio_input})
    if video_input:
        conversation[0]["content"].append({"type": "video", "video": video_input})

    # Render the chat template, load the referenced media, and build model inputs
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
    model_inputs = processor(
        text=text, audio=audios, images=images, videos=videos,
        return_tensors="pt", padding=True, use_audio_in_video=True,
    ).to(device).to(model.dtype)

    # Generate a text response (return_audio=False skips the talker's speech output)
    outputs = model.generate(**model_inputs, max_new_tokens=200, return_audio=False)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[:, model_inputs["input_ids"].shape[1]:]
    response_text = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

    # Audio output not implemented in this demo
    response_audio = None
    return response_text, response_audio

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        video_input = gr.Video(label="Upload Video")  # gr.Video returns a file path by default
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")

    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input, video_input],
        outputs=[text_output, audio_output],
    )

# Launch the app
demo.launch()
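
# --- Optional: speech output (a minimal sketch, not wired into the demo above) ---
# The Qwen2.5-Omni model card describes a "talker" that can return a speech waveform
# from generate() alongside the text. The sketch below assumes your transformers
# version supports return_audio=True and the spk= voice argument, and that the
# returned waveform is sampled at 24 kHz as documented; generate_speech() is a
# hypothetical helper name, not part of the demo above.
import soundfile as sf

SPEECH_SYSTEM_PROMPT = (
    "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
    "capable of perceiving auditory and visual inputs, as well as generating text and speech."
)  # per the model card, audio output requires this exact system prompt, prepended to the
   # conversation as {"role": "system", "content": [{"type": "text", "text": SPEECH_SYSTEM_PROMPT}]}

def generate_speech(model_inputs, wav_path="output.wav"):
    # Ask the talker for audio as well as text; "Chelsie" and "Ethan" are the documented voices
    text_ids, audio = model.generate(
        **model_inputs, max_new_tokens=200, return_audio=True, spk="Chelsie"
    )
    response_text = processor.batch_decode(text_ids, skip_special_tokens=True)[0]
    # Write the waveform tensor out as a 24 kHz WAV file, which gr.Audio can play back
    sf.write(wav_path, audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
    return response_text, wav_path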