Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,30 +1,35 @@
|
|
1 |
# Imports
|
|
|
|
|
2 |
import gradio as gr
|
3 |
import spaces
|
4 |
import torch
|
5 |
-
|
6 |
from PIL import Image
|
7 |
from decord import VideoReader, cpu
|
8 |
from transformers import AutoModel, AutoTokenizer, AutoProcessor
|
9 |
|
10 |
-
#
|
11 |
DEVICE = "auto"
|
12 |
if DEVICE == "auto":
|
13 |
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
14 |
print(f"[SYSTEM] | Using {DEVICE} type compute device.")
|
15 |
|
16 |
-
# Variables
|
17 |
DEFAULT_INPUT = "Describe in one paragraph."
|
18 |
MAX_FRAMES = 64
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
css = '''
|
29 |
.gradio-container{max-width: 560px !important}
|
30 |
h1{text-align:center}
|
@@ -33,75 +38,69 @@ footer {
|
|
33 |
}
|
34 |
'''
|
35 |
|
36 |
-
# Functions
|
37 |
def encode_video(video_path):
|
38 |
-
def uniform_sample(
|
39 |
-
gap = len(
|
40 |
-
|
41 |
-
return [l[i] for i in idxs]
|
42 |
|
43 |
vr = VideoReader(video_path, ctx=cpu(0))
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
frames = [Image.fromarray(v.astype('uint8')) for v in frames]
|
52 |
-
return frames
|
53 |
-
|
54 |
@spaces.GPU(duration=60)
|
55 |
def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
|
56 |
-
# repo.to(DEVICE)
|
57 |
|
58 |
print(image)
|
59 |
print(video)
|
60 |
print(audio)
|
61 |
print(instruction)
|
62 |
-
|
63 |
if image is not None:
|
64 |
-
|
65 |
-
inputs = [{"role": "user", "content": [
|
|
|
66 |
elif video is not None:
|
67 |
-
|
68 |
-
inputs = [{"role": "user", "content": [
|
|
|
69 |
elif audio is not None:
|
70 |
if isinstance(audio, str):
|
71 |
-
|
72 |
else:
|
73 |
-
|
74 |
-
inputs = [{"role": "user", "content": [
|
|
|
75 |
else:
|
76 |
return "No input provided."
|
77 |
-
|
78 |
-
|
|
|
79 |
"msgs": inputs,
|
80 |
"tokenizer": tokenizer,
|
81 |
"processor": processor,
|
82 |
"sampling": sampling,
|
|
|
83 |
"temperature": temperature,
|
84 |
"top_p": top_p,
|
85 |
"top_k": top_k,
|
86 |
"repetition_penalty": repetition_penalty,
|
87 |
"max_new_tokens": max_tokens,
|
88 |
}
|
89 |
-
|
90 |
-
output = repo.chat(**
|
91 |
-
|
92 |
print(output)
|
93 |
-
|
94 |
return output
|
95 |
-
|
96 |
def cloud():
|
97 |
print("[CLOUD] | Space maintained.")
|
98 |
|
99 |
-
|
100 |
-
with gr.Blocks(css=css) as main:
|
101 |
-
with gr.Column():
|
102 |
-
gr.Markdown("🪄 Analyze images and caption them using state-of-the-art openbmb/MiniCPM-V-2_6.")
|
103 |
-
|
104 |
-
with gr.Column():
|
105 |
input = gr.Image(label="Image")
|
106 |
input_2 = gr.Video(label="Video")
|
107 |
input_3 = gr.Audio(label="Audio")
|
|
|
1 |
# Imports
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
import gradio as gr
|
5 |
import spaces
|
6 |
import torch
|
7 |
+
import librosa
|
8 |
from PIL import Image
|
9 |
from decord import VideoReader, cpu
|
10 |
from transformers import AutoModel, AutoTokenizer, AutoProcessor
|
11 |
|
12 |
+
# Variables
|
13 |
# Compute device: the "auto" placeholder is resolved to CUDA when torch
# reports it as available, otherwise to the CPU.
DEVICE = "auto"
if DEVICE == "auto":
    if torch.cuda.is_available():
        DEVICE = "cuda"
    else:
        DEVICE = "cpu"
print(f"[SYSTEM] | Using {DEVICE} type compute device.")

# Prompt used when the caller does not supply an instruction.
DEFAULT_INPUT = "Describe in one paragraph."
# Upper bound on the number of frames sampled from a video.
MAX_FRAMES = 64
|
20 |
|
21 |
+
# Load the model, tokenizer and processor once at import time.
# NOTE(review): `model_name` is not assigned in the visible part of this
# file - confirm it is defined before this point.
repo = (
    AutoModel.from_pretrained(
        model_name,
        # Vision init is requested; audio and TTS init are switched off.
        init_vision=True,
        init_audio=False,
        init_tts=False,
        trust_remote_code=True,
        attn_implementation="sdpa",
        torch_dtype=torch.bfloat16,
    )
    .eval()
    .to(DEVICE)
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
33 |
css = '''
|
34 |
.gradio-container{max-width: 560px !important}
|
35 |
h1{text-align:center}
|
|
|
38 |
}
|
39 |
'''
|
40 |
|
|
|
41 |
def encode_video(video_path):
    """Decode a video into PIL frames sampled roughly once per second.

    Frame indices are taken every ``step`` frames, where ``step`` is the
    rounded average frame rate reported by the reader. If that produces more
    than ``MAX_FRAMES`` indices they are thinned to exactly ``MAX_FRAMES``
    evenly spaced ones.

    Args:
        video_path: Path of the video file, handed to ``VideoReader``.

    Returns:
        A list of ``PIL.Image.Image`` frames in playback order.
    """
    def uniform_sample(seq, n):
        # Take the element at the centre of each of n equal-width buckets.
        gap = len(seq) / n
        return [seq[int(i * gap + gap / 2)] for i in range(n)]

    vr = VideoReader(video_path, ctx=cpu(0))
    # An average frame rate below 0.5 rounds to 0, and range() rejects a
    # zero step, so never let the stride drop below one frame.
    step = max(1, round(vr.get_avg_fps()))
    idxs = list(range(0, len(vr), step))
    if len(idxs) > MAX_FRAMES:
        idxs = uniform_sample(idxs, MAX_FRAMES)
    frames = vr.get_batch(idxs).asnumpy()
    return [Image.fromarray(f.astype("uint8")) for f in frames]
|
53 |
+
|
|
|
|
|
|
|
54 |
@spaces.GPU(duration=60)
def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512, stream=False):
    """Answer ``instruction`` about the first media input that is provided.

    Inputs are tried in the order image, video, audio; the first one that is
    not ``None`` is used and the remaining ones are ignored.

    Args:
        image: Image as a NumPy array, or ``None``.
        video: Path to a video file, or ``None``.
        audio: Path to an audio file (loaded as 16 kHz mono) or an
            already-decoded value that is passed through as-is, or ``None``.
        instruction: Text prompt placed after the media in the user turn.
        sampling: Forwarded to ``repo.chat``.
        temperature: Forwarded to ``repo.chat``.
        top_p: Forwarded to ``repo.chat``.
        top_k: Forwarded to ``repo.chat``.
        repetition_penalty: Forwarded to ``repo.chat``.
        max_tokens: Forwarded to ``repo.chat`` as ``max_new_tokens``.
        stream: Forwarded to ``repo.chat``. The body previously read
            ``stream`` without it being bound; it is now an explicit
            parameter that defaults to ``False``.

    Returns:
        The value returned by ``repo.chat``, or the string
        ``"No input provided."`` when all three media inputs are ``None``.
    """
    # Log the raw inputs for debugging.
    print(image)
    print(video)
    print(audio)
    print(instruction)

    if image is not None:
        media = Image.fromarray(image.astype("uint8"), "RGB")
        content = [media, instruction]
    elif video is not None:
        media = encode_video(video)
        # Each frame must be its own content item; a nested list is neither
        # an image nor a string, so the frames would not reach the model.
        content = [*media, instruction]
    elif audio is not None:
        if isinstance(audio, str):
            media, _ = librosa.load(audio, sr=16000, mono=True)
        else:
            # NOTE(review): non-path audio is forwarded untouched - confirm
            # the UI component delivers a waveform the model accepts.
            media = audio
        content = [media, instruction]
    else:
        return "No input provided."

    inputs = [{"role": "user", "content": content}]
    params = {
        "image": media,
        "msgs": inputs,
        "tokenizer": tokenizer,
        "processor": processor,
        "sampling": sampling,
        "stream": stream,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_tokens,
    }

    output = repo.chat(**params)

    print(output)

    return output
|
99 |
+
|
100 |
def cloud():
    """Print a marker line showing that the keep-alive hook was called."""
    message = "[CLOUD] | Space maintained."
    print(message)
|
102 |
|
103 |
+
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
104 |
input = gr.Image(label="Image")
|
105 |
input_2 = gr.Video(label="Video")
|
106 |
input_3 = gr.Audio(label="Audio")
|