Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -22,7 +22,7 @@ AUDIO_SR = 16000
|
|
22 |
|
23 |
model_name = "openbmb/MiniCPM-o-2_6"
|
24 |
|
25 |
-
repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16).to(DEVICE)
|
26 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
27 |
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
28 |
|
@@ -60,43 +60,29 @@ filetypes = {
|
|
60 |
}
|
61 |
|
62 |
# Functions
|
63 |
-
uniform_sample
|
64 |
-
|
65 |
-
def build_video(filepath):
|
66 |
-
vr = VideoReader(filepath, ctx = cpu(0))
|
67 |
-
i = uniform_sample(range(len(vr)), MAX_FRAMES)
|
68 |
-
batch = vr.get_batch(i).asnumpy()
|
69 |
-
frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
image = Image.open(filepath).convert("RGB")
|
89 |
-
return image
|
90 |
|
91 |
-
def
|
92 |
-
|
93 |
-
|
94 |
-
frames = uniform_sample(frames, MAX_FRAMES)
|
95 |
-
return frames
|
96 |
-
|
97 |
-
def build_audio(filepath):
|
98 |
-
audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
|
99 |
-
return audio
|
100 |
|
101 |
@spaces.GPU(duration=30)
|
102 |
def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
|
|
|
22 |
|
23 |
model_name = "openbmb/MiniCPM-o-2_6"
|
24 |
|
25 |
+
# Load the checkpoint named by model_name in bfloat16 with SDPA attention and
# move it to DEVICE. The init_vision / init_audio / init_tts keywords are
# forwarded to the model's remote code (trust_remote_code=True); presumably
# they select which sub-modules get built (vision and audio on, TTS off) --
# confirm against the model card.
repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16, init_vision=True, init_audio=True, init_tts=False).to(DEVICE)
|
26 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
27 |
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
|
28 |
|
|
|
60 |
}
|
61 |
|
62 |
# Functions
|
63 |
+
def uniform_sample(sequence, n):
    """Return at most ``n`` items of ``sequence`` taken at a fixed stride.

    The stride is ``len(sequence) // n`` (never less than 1), and the strided
    result is truncated to ``n`` items. The input is only sliced, so the
    result has the same type as ``sequence`` (a ``list`` stays a ``list``,
    a ``range`` stays a ``range``).

    Args:
        sequence: Any sliceable sequence that supports ``len()``.
        n: Maximum number of items to keep.

    Returns:
        A slice of ``sequence`` with at most ``n`` items; empty when
        ``n <= 0``.
    """
    if n <= 0:
        # Avoid dividing by zero below; nothing was requested, so return an
        # empty slice of the same type.
        return sequence[:0]
    step = max(len(sequence) // n, 1)
    return sequence[::step][:n]
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
+
def build_image(path):
    """Open the image at ``path``, convert it to RGB and return it in a one-item list."""
    rgb = Image.open(path).convert("RGB")
    return [rgb]
|
66 |
+
|
67 |
+
def build_gif(path):
    """Return the frames of the animated image at ``path`` as RGB images.

    Every frame is copied and converted to RGB, then the frame list is
    reduced with ``uniform_sample(frames, MAX_FRAMES)``.

    Args:
        path: Filesystem path of the image file to read.

    Returns:
        Whatever ``uniform_sample`` returns for the list of RGB frames.
    """
    # Each frame is copied before conversion, so none of the returned images
    # depends on the source file; the ``with`` block can therefore close it
    # as soon as iteration finishes instead of leaving the handle open.
    with Image.open(path) as gif:
        frames = [frame.copy().convert("RGB") for frame in ImageSequence.Iterator(gif)]
    return uniform_sample(frames, MAX_FRAMES)
|
70 |
+
|
71 |
+
def build_video(path):
    """Build a flat list of ``"<unit>", frame, audio_chunk`` triples from a video.

    Frames are read with ``VideoReader`` at the indices chosen by
    ``uniform_sample(range(len(reader)), MAX_FRAMES)`` and turned into PIL
    images. The audio comes from ``build_audio(path)[0]`` and is cut into
    consecutive slices of ``AUDIO_SR`` samples; slice ``k`` is paired with
    the ``k``-th sampled frame. Pairing stops at the first empty audio slice.

    Args:
        path: Filesystem path of the video file.

    Returns:
        A list of the form ``["<unit>", frame, chunk, "<unit>", frame, chunk, ...]``.
    """
    reader = VideoReader(path, ctx=cpu(0))
    picked = uniform_sample(range(len(reader)), MAX_FRAMES)
    decoded = reader.get_batch(picked).asnumpy()
    images = [Image.fromarray(arr.astype("uint8")) for arr in decoded]

    waveform = build_audio(path)[0]

    # NOTE(review): frames are picked by index stride, not by timestamp, while
    # the audio is cut in fixed AUDIO_SR-sample windows -- so the k-th frame is
    # not necessarily taken from the k-th window of the clip. Confirm intended.
    units = []
    for position, image in enumerate(images):
        start = position * AUDIO_SR
        segment = waveform[start:start + AUDIO_SR]
        if segment.size == 0:
            # Audio ran out: drop the remaining frames.
            break
        units.extend(["<unit>", image, segment])
    return units
|
|
|
|
|
82 |
|
83 |
+
def build_audio(path):
    """Load ``path`` with librosa as mono audio at ``AUDIO_SR`` and return it in a one-item list."""
    # librosa.load returns (samples, sample_rate); only the samples are kept.
    waveform = librosa.load(path, sr=AUDIO_SR, mono=True)[0]
    return [waveform]
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
@spaces.GPU(duration=30)
|
88 |
def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
|