Staticaliza committed on
Commit
17e2cc5
·
verified ·
1 Parent(s): ba45770

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -36
app.py CHANGED
@@ -22,7 +22,7 @@ AUDIO_SR = 16000
22
 
23
  model_name = "openbmb/MiniCPM-o-2_6"
24
 
25
- repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16).to(DEVICE)
26
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
27
  processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
28
 
@@ -60,43 +60,29 @@ filetypes = {
60
  }
61
 
62
  # Functions
63
- uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
64
-
65
- def build_video(filepath):
66
- vr = VideoReader(filepath, ctx = cpu(0))
67
- i = uniform_sample(range(len(vr)), MAX_FRAMES)
68
- batch = vr.get_batch(i).asnumpy()
69
- frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
70
 
71
- audio = build_audio(filepath)
72
-
73
- audio_length = math.ceil(len(audio) / AUDIO_SR)
74
- total_length = max(1, min(len(frames), audio_length))
75
-
76
- contents = []
77
- for i in range(total_length):
78
- frame = frames[i] if i < len(frames) else frames[-1]
79
- start = i * AUDIO_SR
80
- end = min((i + 1) * AUDIO_SR, len(audio))
81
- chunk = audio[start:end]
82
- if chunk.size == 0: break
83
- contents.extend([frame, chunk])
84
-
85
- return contents
86
-
87
- def build_image(filepath):
88
- image = Image.open(filepath).convert("RGB")
89
- return image
90
 
91
- def build_gif(filepath):
92
- image = Image.open(filepath)
93
- frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
94
- frames = uniform_sample(frames, MAX_FRAMES)
95
- return frames
96
-
97
- def build_audio(filepath):
98
- audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
99
- return audio
100
 
101
  @spaces.GPU(duration=30)
102
  def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
 
22
 
23
  model_name = "openbmb/MiniCPM-o-2_6"
24
 
25
+ repo = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation="sdpa", torch_dtype=torch.bfloat16, init_vision=True, init_audio=True, init_tts=False).to(DEVICE)
26
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
27
  processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
28
 
 
60
  }
61
 
62
  # Functions
63
def uniform_sample(sequence, n):
    """Return at most ``n`` items of ``sequence`` taken at a fixed stride.

    The stride is ``len(sequence) // n`` (never less than 1), starting at the
    first item, and the strided result is truncated to ``n`` items.

    Args:
        sequence: Any sliceable object supporting ``len()`` (list, range, ...).
        n: Maximum number of items to keep.

    Returns:
        A slice of ``sequence`` (same type slicing yields) with at most ``n``
        items; an empty slice when ``n`` is not positive.
    """
    if n <= 0:
        # Nothing requested; also avoids dividing by zero below.
        return sequence[:0]
    step = max(len(sequence) // n, 1)
    return sequence[::step][:n]
 
 
 
 
 
 
64
 
65
def build_image(path):
    """Load the image at ``path`` and return it as a one-element list.

    Args:
        path: Location of the image file to open.

    Returns:
        A list containing a single image converted to RGB mode.
    """
    # The context manager closes the underlying file once the converted
    # copy has been produced, instead of leaving the handle to the GC.
    with Image.open(path) as source:
        return [source.convert("RGB")]
66
+
67
def build_gif(path):
    """Extract RGB frames from the animated image at ``path``.

    Args:
        path: Location of the animated image file to open.

    Returns:
        The frames, each copied and converted to RGB, thinned by
        ``uniform_sample`` to at most ``MAX_FRAMES`` entries.
    """
    # Frames are copied while the file is still open; the context manager
    # then releases the file handle before sampling.
    with Image.open(path) as animation:
        frames = [
            frame.copy().convert("RGB")
            for frame in ImageSequence.Iterator(animation)
        ]
    return uniform_sample(frames, MAX_FRAMES)
70
+
71
def build_video(path):
    """Build an interleaved ``"<unit>"`` / frame / audio-chunk list for a video.

    Args:
        path: Location of the video file; it is read once for frames and
            once (through ``build_audio``) for its audio track.

    Returns:
        A flat list made of repeated ``"<unit>", frame, chunk`` triples, where
        ``chunk`` is the next ``AUDIO_SR``-sample slice of the audio. The list
        stops at the first frame for which no audio samples remain.
    """
    reader = VideoReader(path, ctx=cpu(0))
    picked = uniform_sample(range(len(reader)), MAX_FRAMES)
    pixel_batch = reader.get_batch(picked).asnumpy()
    frames = [Image.fromarray(pixels.astype("uint8")) for pixels in pixel_batch]
    waveform = build_audio(path)[0]

    # NOTE(review): frames are spread over the whole clip while audio chunks
    # are consecutive AUDIO_SR-sample windows from the start, so frame i and
    # chunk i may not cover the same moment in time — confirm this is intended.
    units = []
    offset = 0
    for frame in frames:
        chunk = waveform[offset:offset + AUDIO_SR]
        if chunk.size == 0:
            # Audio exhausted: remaining frames are dropped.
            break
        units += ["<unit>", frame, chunk]
        offset += AUDIO_SR
    return units
 
 
82
 
83
def build_audio(path):
    """Load the audio at ``path`` and return it wrapped in a one-element list.

    The file is decoded by ``librosa.load`` as mono at ``AUDIO_SR``; the
    sample rate that ``librosa.load`` also returns is discarded.
    """
    waveform = librosa.load(path, sr=AUDIO_SR, mono=True)[0]
    return [waveform]
 
 
 
 
 
 
86
 
87
  @spaces.GPU(duration=30)
88
  def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):