import gradio as gr
from transformers import Qwen2_5OmniForConditionalGeneration, AutoProcessor
import torch

# Load model and processor
model_name = "Qwen/Qwen2.5-Omni-3B"
# Qwen2.5-Omni uses its own model class rather than AutoModelForCausalLM
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_name)
device = model.device

# Function to process inputs and generate response
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    # Build a single-turn conversation in the chat-template format: each content
    # entry needs a "type" key alongside its payload
    conversation = [
        {"role": "user", "content": [{"type": "text", "text": text_input}]}
    ]
    if image_input:
        conversation[0]["content"].append({"type": "image", "image": image_input})
    if audio_input:
        conversation[0]["content"].append({"type": "audio", "audio": audio_input})
    if video_input:
        conversation[0]["content"].append({"type": "video", "video": video_input})
    
    # Tokenize the conversation (text plus any attached media) into model inputs
    model_inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Generate a response and decode only the newly generated tokens
    outputs = model.generate(**model_inputs, max_new_tokens=200)
    new_tokens = outputs[0][model_inputs["input_ids"].shape[1]:]
    response_text = processor.decode(new_tokens, skip_special_tokens=True)
    
    # Audio output not implemented
    response_audio = None
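    # Untested sketch of how spoken output could be added, based on the
    # Qwen2.5-Omni model card; return_audio/speaker and the 24 kHz sample rate
    # are assumptions, not verified against this setup:
    #
    #   text_ids, audio = model.generate(
    #       **model_inputs, max_new_tokens=200, return_audio=True, speaker="Chelsie"
    #   )
    #   response_text = processor.decode(text_ids[0], skip_special_tokens=True)
    #   response_audio = (24000, audio.reshape(-1).detach().cpu().numpy())
    #
    # gr.Audio accepts a (sample_rate, numpy_array) tuple, so response_audio
    # could be returned directly once audio generation is enabled.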
    
    return response_text, response_audio

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        video_input = gr.Video(label="Upload Video", type="filepath")
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")
    
    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input, video_input],
        outputs=[text_output, audio_output]
    )

# Launch the app
demo.launch()
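
# launch() serves the demo locally on Gradio's default port (7860). To reach it
# from another machine, the standard launch() arguments can be used instead,
# e.g. demo.launch(share=True) or demo.launch(server_name="0.0.0.0", server_port=7860).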