File size: 3,809 Bytes
fef0a8d
294c109
 
99eb93c
fef0a8d
 
294c109
0c034e2
45099c6
542f90d
fef0a8d
294c109
fef0a8d
 
 
 
 
5268082
45099c6
5268082
e52a62d
0629ecb
4bd5128
294c109
 
4bd5128
96f2f76
 
 
 
 
 
 
 
45099c6
294c109
 
 
45099c6
 
294c109
 
 
 
 
 
 
5268082
c60c480
baefccb
 
e7322bf
c60c480
029aec2
294c109
c60c480
294c109
 
c60c480
294c109
ceea9f7
c60c480
5e7d101
294c109
c60c480
 
294c109
 
3ef9424
af21e2a
4bd5128
5268082
 
 
 
 
1b6a68d
5268082
294c109
 
 
8a86647
294c109
5268082
294c109
5268082
 
 
46010b5
 
 
5268082
32f0fe9
5e7d101
5268082
 
eb3d2f3
5268082
 
eb3d2f3
5268082
 
 
 
 
 
 
c60c480
5268082
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Imports
import os
import sys
import gradio as gr
import spaces
import torch
import librosa
from PIL import Image
from decord import VideoReader, cpu
from transformers import AutoModel, AutoTokenizer, AutoProcessor

# Variables
# Compute device selection. Leave as "auto" to pick CUDA when torch reports it
# as available and fall back to CPU otherwise; any other value is used as-is.
DEVICE = "auto"
if DEVICE == "auto":
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

# Default instruction text: used as generate()'s default argument and as the
# initial value of the Instruction textbox.
DEFAULT_INPUT = "Describe in one paragraph."
# Upper bound on the number of frames encode_video() keeps for one video.
MAX_FRAMES = 64

# Hugging Face model identifier shared by the model, tokenizer and processor.
model_name = "openbmb/MiniCPM-o-2_6"

# Everything below is loaded once, at import time. The model is requested in
# bfloat16 with the "sdpa" attention implementation and moved to DEVICE.
repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16).to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# NOTE(review): the processor is loaded but is not currently passed to
# repo.chat() by generate() - confirm whether it is still needed.
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

# Page styling handed to gr.Blocks: caps the container width at 560px,
# centers <h1> headings and hides the footer.
css = '''
.gradio-container{max-width: 560px !important}
h1{text-align:center}
footer {
    visibility: hidden
}
'''

def encode_video(video_path):
    """Decode a video into a list of PIL images sampled about once per second.

    Frame indices are taken every ``round(average fps)`` frames, starting at
    frame 0. If that yields more than ``MAX_FRAMES`` indices, they are thinned
    to exactly ``MAX_FRAMES`` evenly spread ones.

    Args:
        video_path: Path of the video file, handed to ``VideoReader``.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per sampled frame.
    """
    def uniform_sample(idxs, n):
        # Take the middle element of each of n equal-width slices of idxs.
        gap = len(idxs) / n
        return [idxs[int(i*gap + gap/2)] for i in range(n)]

    vr = VideoReader(video_path, ctx=cpu(0))
    # An average frame rate below 0.5 fps rounds to 0, and range() rejects a
    # zero step with ValueError; clamp so such videos are sampled every frame.
    step = max(1, round(vr.get_avg_fps()))
    idxs = list(range(0, len(vr), step))
    if len(idxs) > MAX_FRAMES:
        idxs = uniform_sample(idxs, MAX_FRAMES)
    frames = vr.get_batch(idxs).asnumpy()
    return [Image.fromarray(f.astype("uint8")) for f in frames]

@spaces.GPU(duration=60)
def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
    """Run one chat turn of the model on a single media input plus instruction.

    Exactly one media input is used, chosen in this priority order: image,
    then video, then audio. The selected media and the instruction form the
    content of a single user message sent to ``repo.chat``.

    Args:
        image: Image array (must support ``.astype``) or None.
        video: Video path forwarded to ``encode_video`` or None.
        audio: Audio path loaded with librosa at 16 kHz mono, or None.
        instruction: Text appended after the media in the user message.
        sampling, temperature, top_p, top_k, repetition_penalty: Forwarded
            unchanged to ``repo.chat``.
        max_tokens: Forwarded to ``repo.chat`` as ``max_new_tokens``.

    Returns:
        Whatever ``repo.chat`` returns, or the string "No input provided."
        when image, video and audio are all None.
    """
    # Log the raw request, one value per line.
    for received in (image, video, audio, instruction):
        print(received)

    # Build the user message content from the first media input supplied.
    if image is not None:
        content = [Image.fromarray(image.astype("uint8"), "RGB"), instruction]
    elif video is not None:
        content = encode_video(video) + [instruction]
    elif audio is not None:
        waveform, _ = librosa.load(audio, sr=16000, mono=True)
        content = [waveform, instruction]
    else:
        return "No input provided."

    # The module-level processor is deliberately not forwarded here.
    reply = repo.chat(
        msgs=[{"role": "user", "content": content}],
        tokenizer=tokenizer,
        sampling=sampling,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        max_new_tokens=max_tokens,
    )

    print(reply)
    return reply

def cloud():
    """Print a fixed keep-alive line to standard output and return None."""
    message = "[CLOUD] | Space maintained."
    print(message)

# Initialize
with gr.Blocks(css=css) as main:
    # Input column: the three media inputs, the instruction, the generation
    # settings, and the two action buttons, in display order.
    with gr.Column():
        # NOTE: `input` shadows the builtin of the same name; the existing
        # module-level names are kept as they are.
        input = gr.Image(label="Image")
        input_2 = gr.Video(label="Video")
        input_3 = gr.Audio(label="Audio", type="filepath")
        instruction = gr.Textbox(
            label="Instruction",
            value=DEFAULT_INPUT,
            lines=1,
        )
        sampling = gr.Checkbox(label="Sampling", value=False)
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.01,
            maximum=1.99,
            step=0.01,
            value=0.7,
        )
        top_p = gr.Slider(
            label="Top P",
            minimum=0,
            maximum=1,
            step=0.01,
            value=0.8,
        )
        top_k = gr.Slider(
            label="Top K",
            minimum=0,
            maximum=1000,
            step=1,
            value=100,
        )
        repetition_penalty = gr.Slider(
            label="Repetition Penalty",
            minimum=0.01,
            maximum=1.99,
            step=0.01,
            value=1.05,
        )
        max_tokens = gr.Slider(
            label="Max Tokens",
            minimum=1,
            maximum=4096,
            step=1,
            value=512,
        )
        submit = gr.Button("▶")
        maintain = gr.Button("☁️")

    # Output column: the model's reply.
    with gr.Column():
        output = gr.Textbox(label="Output", value="", lines=1)

    # The order of `inputs` must match generate()'s positional parameters.
    submit.click(
        fn=generate,
        inputs=[
            input,
            input_2,
            input_3,
            instruction,
            sampling,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
            max_tokens,
        ],
        outputs=[output],
        queue=False,
    )
    # The cloud button only triggers cloud(); it has no inputs or outputs.
    maintain.click(fn=cloud, inputs=[], outputs=[], queue=False)

main.launch(show_api=True)