Spaces:

Staticaliza
/

Sense

Running

App Files Files Community

Staticaliza committed on May 28

Commit

294c109

verified ·

1 Parent(s): 3ef9424

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -48

app.py CHANGED Viewed

@@ -1,30 +1,35 @@
 # Imports
 import gradio as gr
 import spaces
 import torch
 from PIL import Image
 from decord import VideoReader, cpu
 from transformers import AutoModel, AutoTokenizer, AutoProcessor
-# Pre-Initialize
 DEVICE = "auto"
 if DEVICE == "auto":
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[SYSTEM] | Using {DEVICE} type compute device.")
-# Variables
 DEFAULT_INPUT = "Describe in one paragraph."
 MAX_FRAMES = 64
-repo_name = "openbmb/MiniCPM-o-2_6" # "openbmb/MiniCPM-V-2_6-int4" # "openbmb/MiniCPM-V-2_6"
-repo = AutoModel.from_pretrained(repo_name,
-            init_vision=True,
-            init_audio=False,
-            init_tts=False, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16)
-tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
-processor = AutoProcessor.from_pretrained(repo_name, trust_remote_code=True)
 css = '''
 .gradio-container{max-width: 560px !important}
 h1{text-align:center}
@@ -33,75 +38,69 @@ footer {
 }
 '''
-# Functions
 def encode_video(video_path):
-    def uniform_sample(l, n):
-        gap = len(l) / n
-        idxs = [int(i * gap + gap / 2) for i in range(n)]
-        return [l[i] for i in idxs]
     vr = VideoReader(video_path, ctx=cpu(0))
-    sample_fps = round(vr.get_avg_fps() / 1)
-    frame_idx = [i for i in range(0, len(vr), sample_fps)]
-    if len(frame_idx) > MAX_FRAMES:
-        frame_idx = uniform_sample(frame_idx, MAX_FRAMES)
-    frames = vr.get_batch(frame_idx).asnumpy()
-    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
-    return frames
 @spaces.GPU(duration=60)
 def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
-    # repo.to(DEVICE)
     print(image)
     print(video)
     print(audio)
     print(instruction)
     if image is not None:
-        image_data = Image.fromarray(image.astype('uint8'), 'RGB')
-        inputs = [{"role": "user", "content": [image_data, instruction]}]
     elif video is not None:
-        video_data = encode_video(video)
-        inputs = [{"role": "user", "content": [video_data, instruction]}]
     elif audio is not None:
         if isinstance(audio, str):
-            audio_data, _ = librosa.load(audio, sr=16000, mono=True)
         else:
-            audio_data = audio
-        inputs = [{"role": "user", "content": [audio_data, instruction]}]
     else:
         return "No input provided."
-    parameters = {
         "msgs": inputs,
         "tokenizer": tokenizer,
         "processor": processor,
         "sampling": sampling,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
         "max_new_tokens": max_tokens,
     }
-    output = repo.chat(**parameters)
     print(output)
     return output
 def cloud():
     print("[CLOUD] | Space maintained.")
-# Initialize
-with gr.Blocks(css=css) as main:
-    with gr.Column():
-        gr.Markdown("🪄 Analyze images and caption them using state-of-the-art openbmb/MiniCPM-V-2_6.")
-    with gr.Column():
         input = gr.Image(label="Image")
         input_2 = gr.Video(label="Video")
         input_3 = gr.Audio(label="Audio")

 # Imports
+import os
+import sys
 import gradio as gr
 import spaces
 import torch
+import librosa
 from PIL import Image
 from decord import VideoReader, cpu
 from transformers import AutoModel, AutoTokenizer, AutoProcessor
+# Variables
 DEVICE = "auto"
 if DEVICE == "auto":
     # Resolve the placeholder to a concrete device once, at import time.
     DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[SYSTEM] | Using {DEVICE} type compute device.")
 # Default value of generate()'s `instruction` parameter.
 DEFAULT_INPUT = "Describe in one paragraph."
 # Upper bound on the number of frames encode_video() returns.
 MAX_FRAMES = 64
 # Hub checkpoint id shared by the model, tokenizer and processor loads below.
 # It must be assigned before those calls run, otherwise the module fails at
 # import with a NameError.
 model_name = "openbmb/MiniCPM-o-2_6"
 repo = AutoModel.from_pretrained(
     model_name,
     init_vision=True,
     init_audio=False,
     init_tts=False,
     trust_remote_code=True,
     attn_implementation="sdpa",
     torch_dtype=torch.bfloat16,
 )
 # NOTE(review): init_audio=False while generate() still accepts audio input;
 # confirm the audio branch is expected to work with this configuration.
 repo = repo.eval().to(DEVICE)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 css = '''
 .gradio-container{max-width: 560px !important}
 h1{text-align:center}
 }
 '''
 def encode_video(video_path):
     """Decode a video file into a list of at most MAX_FRAMES PIL images.

     Frame indices are taken with a stride equal to the rounded average frame
     rate, i.e. about one frame per second of footage. When that selection
     holds more than MAX_FRAMES indices it is thinned to exactly MAX_FRAMES
     evenly spaced picks.

     Args:
         video_path: Location of the video, handed to decord's VideoReader.

     Returns:
         A list of PIL images, one per selected frame, in playback order.
     """
     def uniform_sample(idxs, n):
         # Split idxs into n equal-width buckets and keep each bucket's middle.
         gap = len(idxs) / n
         return [idxs[int(i * gap + gap / 2)] for i in range(n)]

     vr = VideoReader(video_path, ctx=cpu(0))
     # An average rate below 0.5 fps rounds to 0 and range() rejects a zero
     # step, so never let the stride drop under one frame.
     step = max(1, round(vr.get_avg_fps()))
     idxs = list(range(0, len(vr), step))
     if len(idxs) > MAX_FRAMES:
         idxs = uniform_sample(idxs, MAX_FRAMES)
     frames = vr.get_batch(idxs).asnumpy()
     return [Image.fromarray(f.astype("uint8")) for f in frames]
 @spaces.GPU(duration=60)
 def generate(image, video, audio, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
     """Send one media input plus an instruction to the model and return its reply.

     Exactly one input is used, chosen in the order image, video, audio; the
     first one that is not None wins and the others are ignored.

     Args:
         image: Array-like image (must support ``.astype``), converted to an
             RGB PIL image.
         video: Video path, decoded to frames by encode_video().
         audio: Either a file path (loaded with librosa as 16 kHz mono) or an
             already-decoded value that is forwarded unchanged.
         instruction: Text prompt placed after the media in the user message.
         sampling, temperature, top_p, top_k, repetition_penalty: Forwarded
             verbatim to ``repo.chat``.
         max_tokens: Forwarded to ``repo.chat`` as ``max_new_tokens``.

     Returns:
         Whatever ``repo.chat`` returns, or the string "No input provided."
         when image, video and audio are all None.
     """
     print(image)
     print(video)
     print(audio)
     print(instruction)
     if image is not None:
         media = Image.fromarray(image.astype("uint8"), "RGB")
     elif video is not None:
         media = encode_video(video)
     elif audio is not None:
         if isinstance(audio, str):
             media, _ = librosa.load(audio, sr=16000, mono=True)
         else:
             # NOTE(review): non-path audio is passed through untouched; confirm
             # the UI component delivers a format the model accepts.
             media = audio
     else:
         return "No input provided."
     # Every branch builds the same single-turn message around its media.
     inputs = [{"role": "user", "content": [media, instruction]}]
     params = {
         # NOTE(review): media is sent both here and inside msgs; confirm that
         # chat() does not insert it a second time.
         "image": media,
         "msgs": inputs,
         "tokenizer": tokenizer,
         "processor": processor,
         "sampling": sampling,
         # The reply is printed and returned in one piece, so streaming is off.
         # The previous `stream` name was not defined anywhere in scope.
         "stream": False,
         "temperature": temperature,
         "top_p": top_p,
         "top_k": top_k,
         "repetition_penalty": repetition_penalty,
         "max_new_tokens": max_tokens,
     }
     output = repo.chat(**params)
     print(output)
     return output
 def cloud():
     """Print the fixed '[CLOUD] | Space maintained.' status line."""
     status_line = "[CLOUD] | Space maintained."
     print(status_line)
+with gr.Column():
         input = gr.Image(label="Image")
         input_2 = gr.Video(label="Video")
         input_3 = gr.Audio(label="Audio")