Spaces:

Staticaliza
/

Sense

Running

App Files Files Community

Staticaliza committed on May 28

Commit

957abbb

verified ·

1 Parent(s): d2ef006

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -58

app.py CHANGED Viewed

@@ -47,79 +47,54 @@ filetypes = {
     "Audio": [".wav", ".mp3", ".flac", ".aac"],
 }
 def infer_filetype(ext):
     """Return the first key of ``filetypes`` whose extension list contains *ext*.

     Returns None when no entry lists the extension.
     """
     for category, extensions in filetypes.items():
         if ext in extensions:
             return category
     return None
 def uniform_sample(seq, n):
     """Pick at most *n* items from *seq*, evenly spaced over its whole length.

     Args:
         seq: Any sized, indexable sequence (a list, a ``range``, ...).
         n: Maximum number of items to return.

     Returns:
         A list of the picked items in their original order. It is empty when
         *seq* is empty or *n* is not positive, and holds every item when
         *seq* has no more than *n* of them.
     """
     size = len(seq)
     if n <= 0 or size == 0:
         return []
     if size <= n:
         return list(seq)
     # A floored stride (size // n) followed by truncation only ever covers
     # the head of the sequence when size is not a multiple of n. Scaling
     # each index instead spreads the picks across the full span.
     return [seq[i * size // n] for i in range(n)]
 def frames_from_video(path):
     """Decode a sampled subset of the video at *path* into PIL images.

     Frame indices are chosen with ``uniform_sample`` and capped at
     ``MAX_FRAMES``; each decoded frame is cast to uint8 before conversion.
     """
     reader = VideoReader(path, ctx = cpu(0))
     picked = uniform_sample(range(len(reader)), MAX_FRAMES)
     pixels = reader.get_batch(picked).asnumpy()
     images = []
     for frame in pixels:
         images.append(Image.fromarray(frame.astype("uint8")))
     return images
 def audio_from_video(path):
     clip = VideoFileClip(path)
-    audio = clip.audio.to_soundarray(fps = AUDIO_SR)
     clip.close()
-    return librosa.to_mono(audio.T)
 def load_audio(path):
     """Load the audio file at *path* with librosa, mono, resampled to AUDIO_SR.

     Only the waveform is returned; the sample rate librosa reports is dropped.
     """
     loaded = librosa.load(path, sr = AUDIO_SR, mono = True)
     waveform = loaded[0]
     return waveform
 def build_video_omni(path, prefix, instruction):
     frames = frames_from_video(path)
     audio = audio_from_video(path)
-    return processor.build_omni_input(
-        frames       = frames,
-        audio        = audio,
-        prefix       = prefix,
-        instruction  = instruction,
-        max_frames   = MAX_FRAMES,
-        sr           = AUDIO_SR
-    )
 def build_image_omni(path, prefix, instruction):
     image = Image.open(path).convert("RGB")
-    return processor.build_omni_input(
-        frames      = [image],
-        audio       = None,
-        prefix      = prefix,
-        instruction = instruction
-    )
 def build_gif_omni(path, prefix, instruction):
-    img    = Image.open(path)
-    frames = [frame.copy().convert("RGB") for frame in ImageSequence.Iterator(img)]
     frames = uniform_sample(frames, MAX_FRAMES)
-    return processor.build_omni_input(
-        frames      = frames,
-        audio       = None,
-        prefix      = prefix,
-        instruction = instruction
-    )
 def build_audio_omni(path, prefix, instruction):
     audio = load_audio(path)
-    return processor.build_omni_input(
-        frames      = None,
-        audio       = audio,
-        prefix      = prefix,
-        instruction = instruction,
-        sr          = AUDIO_SR
-    )
 @spaces.GPU(duration = 60)
 def generate(input,
@@ -136,30 +111,32 @@ def generate(input,
     filetype  = infer_filetype(extension)
     if not filetype:
         return "unsupported file type."
-    filename      = os.path.basename(input)
-    prefix        = input_prefixes[filetype].replace("█", filename)
-    builder_map   = {
         "Image": build_image_omni,
         "GIF"  : build_gif_omni,
         "Video": build_video_omni,
         "Audio": build_audio_omni
     }
-    omni_content  = builder_map[filetype](input, prefix, instruction)
-    sys_msg       = repo.get_sys_prompt(mode = "omni", language = "en")
-    msgs          = [sys_msg, { "role": "user", "content": omni_content }]
-    output        = repo.chat(
-        msgs                = msgs,
-        tokenizer           = tokenizer,
-        sampling            = sampling,
-        temperature         = temperature,
-        top_p               = top_p,
-        top_k               = top_k,
-        repetition_penalty  = repetition_penalty,
-        max_new_tokens      = max_tokens,
-        omni_input          = True,
-        use_image_id        = False,
-        max_slice_nums      = 2
     )
     return output
 def cloud():

     "Audio": [".wav", ".mp3", ".flac", ".aac"],
 }
+# Functions
 def infer_filetype(ext):
     """Return the first key of ``filetypes`` whose extension list contains *ext*.

     Returns None when no entry lists the extension.
     """
     for category, extensions in filetypes.items():
         if ext in extensions:
             return category
     return None
 def uniform_sample(seq, n):
     """Pick at most *n* items from *seq*, evenly spaced over its whole length.

     Args:
         seq: Any sized, indexable sequence (a list, a ``range``, ...).
         n: Maximum number of items to return.

     Returns:
         A list of the picked items in their original order. It is empty when
         *seq* is empty or *n* is not positive, and holds every item when
         *seq* has no more than *n* of them.
     """
     size = len(seq)
     if n <= 0 or size == 0:
         return []
     if size <= n:
         return list(seq)
     # A floored stride (size // n) followed by truncation only ever covers
     # the head of the sequence when size is not a multiple of n. Scaling
     # each index instead spreads the picks across the full span.
     return [seq[i * size // n] for i in range(n)]
 def frames_from_video(path):
     """Decode a sampled subset of the video at *path* into PIL images.

     Frame indices are chosen with ``uniform_sample`` and capped at
     ``MAX_FRAMES``; each decoded frame is cast to uint8 before conversion.
     """
     reader = VideoReader(path, ctx = cpu(0))
     picked = uniform_sample(range(len(reader)), MAX_FRAMES)
     pixels = reader.get_batch(picked).asnumpy()
     images = []
     for frame in pixels:
         images.append(Image.fromarray(frame.astype("uint8")))
     return images
 def audio_from_video(path):
     """Extract the audio track of the video at *path* as a mono waveform.

     The track is rendered at ``AUDIO_SR`` and mixed down with
     ``librosa.to_mono``.
     """
     clip = VideoFileClip(path)
     try:
         # NOTE(review): clip.audio is presumably None for a video without an
         # audio track, which would raise AttributeError here - confirm and
         # decide how silent videos should be handled.
         wav = clip.audio.to_soundarray(fps = AUDIO_SR)
     finally:
         # Close the clip even when audio extraction fails, so the reader
         # behind it is not leaked.
         clip.close()
     return librosa.to_mono(wav.T)
 def load_audio(path):
     """Load the audio file at *path* with librosa, mono, resampled to AUDIO_SR.

     Only the waveform is returned; the sample rate librosa reports is dropped.
     """
     loaded = librosa.load(path, sr = AUDIO_SR, mono = True)
     waveform = loaded[0]
     return waveform
 def build_video_omni(path, prefix, instruction):
     """Build the interleaved content list for a video input.

     The list starts with ``prefix + instruction``. It is followed by one
     ``"<unit>", frame, audio_chunk`` triple per unit, where each chunk spans
     ``AUDIO_SR`` samples of the extracted audio. When the audio needs more
     units than there are frames, the last frame is repeated.
     """
     frames = frames_from_video(path)
     audio = audio_from_video(path)
     audio_units = math.ceil(len(audio) / AUDIO_SR)
     unit_count = max(len(frames), audio_units)
     last_frame = len(frames) - 1
     contents = [prefix + instruction]
     # NOTE(review): frames are picked by index spacing, not by timestamp, so
     # the frame in unit i is not necessarily taken from second i of the
     # video - confirm this pairing is intended.
     for i in range(unit_count):
         start = AUDIO_SR * i
         frame = frames[min(i, last_frame)]
         contents += ["<unit>", frame, audio[start : start + AUDIO_SR]]
     return contents
 def build_image_omni(path, prefix, instruction):
     """Build the content list for a still image.

     Returns ``[prefix + instruction, image]`` where *image* is the file at
     *path* converted to RGB.
     """
     # convert() yields a separate image, so the file opened here can be
     # closed as soon as the conversion is done rather than left open.
     with Image.open(path) as source:
         image = source.convert("RGB")
     return [prefix + instruction, image]
 def build_gif_omni(path, prefix, instruction):
     """Build the content list for an animated image.

     Returns ``[prefix + instruction, *frames]`` where *frames* are at most
     ``MAX_FRAMES`` RGB frames chosen by ``uniform_sample``.
     """
     with Image.open(path) as img:
         # Sample frame indices first, then convert only those frames.
         # Converting every frame up front keeps the whole animation in
         # memory just to discard most of it.
         frame_count = getattr(img, "n_frames", 1)
         wanted = set(uniform_sample(range(frame_count), MAX_FRAMES))
         frames = [
             f.convert("RGB")
             for i, f in enumerate(ImageSequence.Iterator(img))
             if i in wanted
         ]
     return [prefix + instruction, *frames]
 def build_audio_omni(path, prefix, instruction):
     """Build the content list for an audio file.

     Returns ``[prefix + instruction, waveform]`` with the waveform produced
     by ``load_audio``.
     """
     waveform = load_audio(path)
     prompt = prefix + instruction
     return [prompt, waveform]
 @spaces.GPU(duration = 60)
 def generate(input,
     filetype  = infer_filetype(extension)
     if not filetype:
         return "unsupported file type."
+    filename     = os.path.basename(input)
+    prefix       = input_prefixes[filetype].replace("█", filename)
+    builder_map  = {
         "Image": build_image_omni,
         "GIF"  : build_gif_omni,
         "Video": build_video_omni,
         "Audio": build_audio_omni
     }
+    omni_content = builder_map[filetype](input, prefix, instruction)
+    sys_msg      = repo.get_sys_prompt(mode = "omni", language = "en")
+    msgs         = [sys_msg, { "role": "user", "content": omni_content }]
+    output       = repo.chat(
+        msgs               = msgs,
+        tokenizer          = tokenizer,
+        sampling           = sampling,
+        temperature        = temperature,
+        top_p              = top_p,
+        top_k              = top_k,
+        repetition_penalty = repetition_penalty,
+        max_new_tokens     = max_tokens,
+        omni_input         = True,
+        use_image_id       = False,
+        max_slice_nums     = 2
     )
+    torch.cuda.empty_cache()
+    gc.collect()
     return output
 def cloud():