Spaces:

Staticaliza
/

Sense

Running

App Files Files Community

Staticaliza committed on May 28

Commit

17e2cc5

verified ·

1 Parent(s): ba45770

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -36

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ AUDIO_SR = 16000
 model_name = "openbmb/MiniCPM-o-2_6"
-repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16).to(DEVICE)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
@@ -60,43 +60,29 @@ filetypes = {
 }
 # Functions
-uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
-def build_video(filepath):
-    vr = VideoReader(filepath, ctx = cpu(0))
-    i = uniform_sample(range(len(vr)), MAX_FRAMES)
-    batch = vr.get_batch(i).asnumpy()
-    frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
-    audio = build_audio(filepath)
-    audio_length = math.ceil(len(audio) / AUDIO_SR)
-    total_length = max(1, min(len(frames), audio_length))
-    contents = []
-    for i in range(total_length):
-        frame = frames[i] if i < len(frames) else frames[-1]
-        start = i * AUDIO_SR
-        end = min((i + 1) * AUDIO_SR, len(audio))
-        chunk = audio[start:end]
-        if chunk.size == 0: break
-        contents.extend([frame, chunk])
-    return contents
-def build_image(filepath):
-    image = Image.open(filepath).convert("RGB")
-    return image
-def build_gif(filepath):
-    image = Image.open(filepath)
-    frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
-    frames = uniform_sample(frames, MAX_FRAMES)
-    return frames
-def build_audio(filepath):
-    audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
-    return audio
 @spaces.GPU(duration=30)
 def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):

 model_name = "openbmb/MiniCPM-o-2_6"
+repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16, init_vision=True, init_audio=True, init_tts=False).to(DEVICE)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 }
 # Functions
+def uniform_sample(sequence, n): return sequence[::max(len(sequence) // n,1)][:n]  # even-stride subsample of at most n items; stride floored at 1 so short inputs come back whole
+def build_image(path): return [Image.open(path).convert("RGB")]
+def build_gif(path):
+    frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(Image.open(path))]
+    return uniform_sample(frames, MAX_FRAMES)
+def build_video(path):
+    vr = VideoReader(path, ctx=cpu(0))
+    idx = uniform_sample(range(len(vr)), MAX_FRAMES)
+    frames = [Image.fromarray(f.astype("uint8")) for f in vr.get_batch(idx).asnumpy()]
+    audio = build_audio(path)[0]
+    units = []
+    for i, frame in enumerate(frames):
+        chunk = audio[i*AUDIO_SR:(i+1)*AUDIO_SR]
+        if not chunk.size: break
+        units.extend(["<unit>", frame, chunk])
+    return units
+def build_audio(path):
+    audio, _ = librosa.load(path, sr=AUDIO_SR, mono=True)
+    return [audio]
 @spaces.GPU(duration=30)
 def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):