Spaces:

Staticaliza
/

Sense

Running on Zero

App Files Files Community

Staticaliza committed 13 days ago

Commit

c81c545

verified ·

1 Parent(s): 38e087a

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -20

app.py CHANGED Viewed

@@ -36,11 +36,13 @@ footer {
 }
 '''
 input_prefixes = {
     # Per-filetype prompt prefixes, keyed by the filetype name used in generate().
     # generate() substitutes the uploaded file's basename for "█"; the builders then
     # place the resulting prefix directly in front of the instruction text.
-    "Image": "(A image file called █ has been attached, describe the image content) ",
-    "GIF": "(A GIF file called █ has been attached, describe the GIF content) ",
-    "Video": "(A audio video file called █ has been attached, describe the video content and the audio content) ",
-    "Audio": "(A audio file called █ has been attached, describe the audio content) ",
 }
 filetypes = {
@@ -67,8 +69,7 @@ def frames_from_video(path):
 def audio_from_video(path):
     # Return the audio track of the video at `path`, loaded as mono at AUDIO_SR.
     clip = VideoFileClip(path)
     # The track is written to a temporary .wav file that is deleted when the
     # `with` block exits (delete = True), so it is read back inside the block.
     # NOTE(review): presumably clip.audio is None for videos without an audio
     # track, which would raise here — confirm against MoviePy.
     with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
-        clip.audio.write_audiofile(tmp.name, codec = "pcm_s16le",
-                                   fps = AUDIO_SR, verbose = False, logger = None)
         audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
     # NOTE(review): close() is not reached if anything above raises.
     clip.close()
     return audio_np
@@ -77,10 +78,10 @@ def load_audio(path):
     audio_np, _ = librosa.load(path, sr = AUDIO_SR, mono = True)
     return audio_np
-def build_video_omni(path, prefix, instruction):
     frames   = frames_from_video(path)
     audio    = audio_from_video(path)
-    contents = [prefix + instruction]
     total    = max(len(frames), math.ceil(len(audio) / AUDIO_SR))
     for i in range(total):
         frame = frames[i] if i < len(frames) else frames[-1]
@@ -88,21 +89,21 @@ def build_video_omni(path, prefix, instruction):
         contents.extend(["<unit>", frame, chunk])
     return contents
-def build_image_omni(path, prefix, instruction):
     # Build the content list for a still image: the prompt text
     # (prefix + instruction) followed by the image converted to RGB.
     # NOTE(review): the object returned by Image.open is never explicitly closed.
     image = Image.open(path).convert("RGB")
-    return [prefix + instruction, image]
-def build_gif_omni(path, prefix, instruction):
     # Build the content list for a GIF: the prompt text (prefix + instruction)
     # followed by the sampled frames, each copied and converted to RGB.
     # NOTE(review): `img` is never explicitly closed.
     img = Image.open(path)
     frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(img)]
     # presumably limits the frame list to at most MAX_FRAMES entries — confirm against uniform_sample
     frames = uniform_sample(frames, MAX_FRAMES)
-    return [prefix + instruction, *frames]
-def build_audio_omni(path, prefix, instruction):
     # Build the content list for an audio file: the prompt text
     # (prefix + instruction) followed by the audio returned by load_audio(path).
     audio = load_audio(path)
-    return [prefix + instruction, audio]
-@spaces.GPU(duration = 60)
 def generate(input,
              instruction        = DEFAULT_INPUT,
              sampling           = False,
@@ -111,12 +112,12 @@ def generate(input,
              top_k              = 100,
              repetition_penalty = 1.05,
              max_tokens         = 512):
-    if not input:
-        return "no input provided."
     extension = os.path.splitext(input)[1].lower()
     filetype  = infer_filetype(extension)
-    if not filetype:
-        return "unsupported file type."
     filename     = os.path.basename(input)
     prefix       = input_prefixes[filetype].replace("█", filename)
     builder_map  = {
@@ -125,9 +126,14 @@ def generate(input,
         "Video": build_video_omni,
         "Audio": build_audio_omni
     }
-    omni_content = builder_map[filetype](input, prefix, instruction)
     sys_msg      = repo.get_sys_prompt(mode = "omni", language = "en")
     msgs         = [sys_msg, { "role": "user", "content": omni_content }]
     output       = repo.chat(
         msgs               = msgs,
         tokenizer          = tokenizer,
@@ -141,8 +147,10 @@ def generate(input,
         use_image_id       = False,
         max_slice_nums     = 2
     )
     torch.cuda.empty_cache()
     gc.collect()
     return output
 def cloud():

 }
 '''
+global_instruction = "Describe the given content with as much keywords and always take a guess."
 # generate() joins global_instruction, the filetype prefix below, and the
 # caller's instruction with newlines to form the final prompt text.
 input_prefixes = {
     # Keyed by the filetype name used in generate(); "█" is replaced there
     # with the uploaded file's basename.
+    "Image": "A image file called █ has been attached, describe the image content.",
+    "GIF": "A GIF file called █ has been attached, describe the GIF content.",
+    "Video": "A audio video file called █ has been attached, describe the video content and the audio content.",
+    "Audio": "A audio file called █ has been attached, describe the audio content.",
 }
 filetypes = {
 def audio_from_video(path):
     # Return the audio track of the video at `path`, loaded as mono at AUDIO_SR.
     clip = VideoFileClip(path)
     # The track is written to a temporary .wav file that is deleted when the
     # `with` block exits (delete = True), so it is read back inside the block.
     # NOTE(review): presumably clip.audio is None for videos without an audio
     # track, which would raise here — confirm against MoviePy.
     with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
+        clip.audio.write_audiofile(tmp.name, codec = "pcm_s16le", fps = AUDIO_SR, verbose = False, logger = None)
         audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
     # NOTE(review): close() is not reached if anything above raises.
     clip.close()
     return audio_np
     audio_np, _ = librosa.load(path, sr = AUDIO_SR, mono = True)
     return audio_np
+def build_video_omni(path, instruction):
     frames   = frames_from_video(path)
     audio    = audio_from_video(path)
+    contents = [instruction]
     total    = max(len(frames), math.ceil(len(audio) / AUDIO_SR))
     for i in range(total):
         frame = frames[i] if i < len(frames) else frames[-1]
         contents.extend(["<unit>", frame, chunk])
     return contents
+def build_image_omni(path, instruction):
     # Build the content list for a still image: the instruction text
     # followed by the image converted to RGB.
     # NOTE(review): the object returned by Image.open is never explicitly closed.
     image = Image.open(path).convert("RGB")
+    return [instruction, image]
+def build_gif_omni(path, instruction):
     # Build the content list for a GIF: the instruction text followed by
     # the sampled frames, each copied and converted to RGB.
     # NOTE(review): `img` is never explicitly closed.
     img = Image.open(path)
     frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(img)]
     # presumably limits the frame list to at most MAX_FRAMES entries — confirm against uniform_sample
     frames = uniform_sample(frames, MAX_FRAMES)
+    return [instruction, *frames]
+def build_audio_omni(path, instruction):
     # Build the content list for an audio file: the instruction text
     # followed by the audio returned by load_audio(path).
     audio = load_audio(path)
+    return [instruction, audio]
+@spaces.GPU(duration=30)
 def generate(input,
              instruction        = DEFAULT_INPUT,
              sampling           = False,
              top_k              = 100,
              repetition_penalty = 1.05,
              max_tokens         = 512):
+    if not input: return "no input provided."
     extension = os.path.splitext(input)[1].lower()
     filetype  = infer_filetype(extension)
+    if not filetype: return "unsupported file type."
     filename     = os.path.basename(input)
     prefix       = input_prefixes[filetype].replace("█", filename)
     builder_map  = {
         "Video": build_video_omni,
         "Audio": build_audio_omni
     }
+    instruction = f"{global_instruction}\n{prefix}\n{instruction}"
+    omni_content = builder_map[filetype](input, instruction)
     sys_msg      = repo.get_sys_prompt(mode = "omni", language = "en")
     msgs         = [sys_msg, { "role": "user", "content": omni_content }]
+    print(msgs)
     output       = repo.chat(
         msgs               = msgs,
         tokenizer          = tokenizer,
         use_image_id       = False,
         max_slice_nums     = 2
     )
     torch.cuda.empty_cache()
     gc.collect()
     return output
 def cloud():