Staticaliza committed on
Commit
294c109
·
verified ·
1 Parent(s): 3ef9424

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -48
app.py CHANGED
@@ -1,30 +1,35 @@
1
  # Imports
 
 
2
  import gradio as gr
3
  import spaces
4
  import torch
5
-
6
  from PIL import Image
7
  from decord import VideoReader, cpu
8
  from transformers import AutoModel, AutoTokenizer, AutoProcessor
9
 
10
- # Pre-Initialize
11
  DEVICE = "auto"
12
  if DEVICE == "auto":
13
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
14
  print(f"[SYSTEM] | Using {DEVICE} type compute device.")
15
 
16
- # Variables
17
  DEFAULT_INPUT = "Describe in one paragraph."
18
  MAX_FRAMES = 64
19
 
20
- repo_name = "openbmb/MiniCPM-o-2_6" # "openbmb/MiniCPM-V-2_6-int4" # "openbmb/MiniCPM-V-2_6"
21
- repo = AutoModel.from_pretrained(repo_name,
22
- init_vision=True,
23
- init_audio=False,
24
- init_tts=False, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
25
- tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
26
- processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
27
-
 
 
 
 
28
  css = '''
29
  .gradio-container{max-width: 560px !important}
30
  h1{text-align:center}
@@ -33,75 +38,69 @@ footer {
33
  }
34
  '''
35
 
36
- # Functions
37
  def encode_video(video_path):
38
- def uniform_sample(l, n):
39
- gap = len(l) / n
40
- idxs = [int(i * gap + gap / 2) for i in range(n)]
41
- return [l[i] for i in idxs]
42
 
43
  vr = VideoReader(video_path, ctx=cpu(0))
44
- sample_fps = round(vr.get_avg_fps() / 1)
45
- frame_idx = [i for i in range(0, len(vr), sample_fps)]
46
-
47
- if len(frame_idx) > MAX_FRAMES:
48
- frame_idx = uniform_sample(frame_idx, MAX_FRAMES)
49
-
50
- frames = vr.get_batch(frame_idx).asnumpy()
51
- frames = [Image.fromarray(v.astype('uint8')) for v in frames]
52
- return frames
53
-
54
  @spaces.GPU(duration=60)
55
  def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
56
- # repo.to(DEVICE)
57
 
58
  print(image)
59
  print(video)
60
  print(audio)
61
  print(instruction)
62
-
63
  if image is not None:
64
- image_data = Image.fromarray(image.astype('uint8'), 'RGB')
65
- inputs = [{"role": "user", "content": [image_data, instruction]}]
 
66
  elif video is not None:
67
- video_data = encode_video(video)
68
- inputs = [{"role": "user", "content": [video_data, instruction]}]
 
69
  elif audio is not None:
70
  if isinstance(audio, str):
71
- audio_data, _ = librosa.load(audio, sr=16000, mono=True)
72
  else:
73
- audio_data = audio
74
- inputs = [{"role": "user", "content": [audio_data, instruction]}]
 
75
  else:
76
  return "No input provided."
77
-
78
- parameters = {
 
79
  "msgs": inputs,
80
  "tokenizer": tokenizer,
81
  "processor": processor,
82
  "sampling": sampling,
 
83
  "temperature": temperature,
84
  "top_p": top_p,
85
  "top_k": top_k,
86
  "repetition_penalty": repetition_penalty,
87
  "max_new_tokens": max_tokens,
88
  }
89
-
90
- output = repo.chat(**parameters)
91
-
92
  print(output)
93
-
94
  return output
95
-
96
  def cloud():
97
  print("[CLOUD] | Space maintained.")
98
 
99
- # Initialize
100
- with gr.Blocks(css=css) as main:
101
- with gr.Column():
102
- gr.Markdown("🪄 Analyze images and caption them using state-of-the-art openbmb/MiniCPM-V-2_6.")
103
-
104
- with gr.Column():
105
  input = gr.Image(label="Image")
106
  input_2 = gr.Video(label="Video")
107
  input_3 = gr.Audio(label="Audio")
 
1
  # Imports
2
+ import os
3
+ import sys
4
  import gradio as gr
5
  import spaces
6
  import torch
7
+ import librosa
8
  from PIL import Image
9
  from decord import VideoReader, cpu
10
  from transformers import AutoModel, AutoTokenizer, AutoProcessor
11
 
12
+ # Variables
13
  DEVICE = "auto"
14
  if DEVICE == "auto":
15
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
  print(f"[SYSTEM] | Using {DEVICE} type compute device.")
17
 
 
18
  DEFAULT_INPUT = "Describe in one paragraph."
19
  MAX_FRAMES = 64
20
 
21
+ repo = AutoModel.from_pretrained(
22
+ model_name,
23
+ init_vision=True,
24
+ init_audio=False,
25
+ init_tts=False,
26
+ trust_remote_code=True,
27
+ attn_implementation="sdpa",
28
+ torch_dtype=torch.bfloat16,
29
+ )
30
+ repo = repo.eval().to(DEVICE)
31
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
32
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
33
  css = '''
34
  .gradio-container{max-width: 560px !important}
35
  h1{text-align:center}
 
38
  }
39
  '''
40
 
 
41
  def encode_video(video_path):
42
+ def uniform_sample(idxs, n):
43
+ gap = len(idxs) / n
44
+ return [idxs[int(i*gap + gap/2)] for i in range(n)]
 
45
 
46
  vr = VideoReader(video_path, ctx=cpu(0))
47
+ fps = round(vr.get_avg_fps())
48
+ idxs = list(range(0, len(vr), fps))
49
+ if len(idxs) > MAX_FRAMES:
50
+ idxs = uniform_sample(idxs, MAX_FRAMES)
51
+ frames = vr.get_batch(idxs).asnumpy()
52
+ return [Image.fromarray(f.astype("uint8")) for f in frames]
53
+
 
 
 
54
  @spaces.GPU(duration=60)
55
  def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
 
56
 
57
  print(image)
58
  print(video)
59
  print(audio)
60
  print(instruction)
61
+
62
  if image is not None:
63
+ img = Image.fromarray(image.astype("uint8"), "RGB")
64
+ inputs = [{"role": "user", "content": [img, instruction]}]
65
+ media = img
66
  elif video is not None:
67
+ vid = encode_video(video)
68
+ inputs = [{"role": "user", "content": [vid, instruction]}]
69
+ media = vid
70
  elif audio is not None:
71
  if isinstance(audio, str):
72
+ aud, _ = librosa.load(audio, sr=16000, mono=True)
73
  else:
74
+ aud = audio
75
+ inputs = [{"role": "user", "content": [aud, instruction]}]
76
+ media = aud
77
  else:
78
  return "No input provided."
79
+
80
+ params = {
81
+ "image": media,
82
  "msgs": inputs,
83
  "tokenizer": tokenizer,
84
  "processor": processor,
85
  "sampling": sampling,
86
+ "stream": stream,
87
  "temperature": temperature,
88
  "top_p": top_p,
89
  "top_k": top_k,
90
  "repetition_penalty": repetition_penalty,
91
  "max_new_tokens": max_tokens,
92
  }
93
+
94
+ output = repo.chat(**params)
95
+
96
  print(output)
97
+
98
  return output
99
+
100
  def cloud():
101
  print("[CLOUD] | Space maintained.")
102
 
103
+ with gr.Column():
 
 
 
 
 
104
  input = gr.Image(label="Image")
105
  input_2 = gr.Video(label="Video")
106
  input_3 = gr.Audio(label="Audio")