Spaces:

Staticaliza
/

Sense

Running

App Files Community

Staticaliza committed on May 28

Commit

ab966e8

verified ·

1 Parent(s): c74b254

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -38

app.py CHANGED Viewed

@@ -37,30 +37,39 @@ footer {
 global_instruction = "You will analyze image, GIF, video, and audio input, then use as much keywords to describe the given content and take as much guesses of what it could be."
-input_prefixes = {
-    "Image": "Analyze the '█' image.",
-    "GIF": "Analyze the '█' GIF.",
-    "Video": "Analyze the '█' video including the audio associated with the video.",
-    "Audio": "Analyze the '█' audio.",
-}
 filetypes = {
-    "Image": [".jpg", ".jpeg", ".png", ".bmp"],
-    "GIF": [".gif"],
-    "Video": [".mp4", ".mov", ".avi", ".mkv"],
-    "Audio": [".wav", ".mp3", ".flac", ".aac"],
 }
 # Functions
 uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
-def build_video(path):
-    vr = VideoReader(path, ctx = cpu(0))
     i = uniform_sample(range(len(vr)), MAX_FRAMES)
     batch = vr.get_batch(i).asnumpy()
     frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
-    audio = build_audio(path)
     audio_length = math.ceil(len(audio) / AUDIO_SR)
     total_length = max(1, min(len(frames), audio_length))
@@ -76,40 +85,36 @@ def build_video(path):
     return contents
-def build_image(path):
-    image = Image.open(path).convert("RGB")
     return image
-def build_gif(path):
-    image = Image.open(path)
     frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
     frames = uniform_sample(frames, MAX_FRAMES)
     return frames
-def build_audio(path):
-    audio, _ = librosa.load(path, sr=AUDIO_SR, mono=True)
     return audio
 @spaces.GPU(duration=30)
-def generate(input, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
     if not input: return "No input provided."
-    extension = os.path.splitext(input)[1].lower()
-    filetype = next((k for k, v in filetypes.items() if extension in v), None)
     if not filetype: return "Unsupported file type."
-    filename = os.path.basename(input)
-    prefix = input_prefixes[filetype].replace("█", filename)
-    builder_map = {
-        "Image": build_image,
-        "GIF" : build_gif,
-        "Video": build_video,
-        "Audio": build_audio,
-    }
-    instruction = f"{global_instruction}\n{prefix}\n{instruction}"
-    omni_content = builder_map[filetype](input)
-    msgs = [{ "role": "user", "content": [omni_content, instruction] }]
     print(msgs)
@@ -138,8 +143,8 @@ def cloud():
 # Initialize
 with gr.Blocks(css=css) as main:
     with gr.Column():
-        input = gr.File(label="Input", file_types=["image", "video", "audio"], type="filepath")
-        instruction = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Instruction")
         sampling = gr.Checkbox(value=False, label="Sampling")
         temperature = gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="Temperature")
         top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.95, label="Top P")
@@ -152,7 +157,7 @@ with gr.Blocks(css=css) as main:
     with gr.Column():
         output = gr.Textbox(lines=1, value="", label="Output")
-    submit.click(fn=generate, inputs=[input, instruction, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens], outputs=[output], queue=False)
     maintain.click(cloud, inputs=[], outputs=[], queue=False)
 main.launch(show_api=True)

 global_instruction = "You will analyze image, GIF, video, and audio input, then use as much keywords to describe the given content and take as much guesses of what it could be."
 filetypes = {
+    "Image": {
+        "extensions": [".jpg",".jpeg",".png",".bmp"],
+        "instruction": "Analyze the '█' image.",
+        "function": "build_image"
+    },
+    "GIF":{
+        "extensions": [".gif"],
+        "instruction": "Analyze the '█' GIF.",
+        "function": "build_gif"
+    },
+    "Video": {
+        "extensions": [".mp4",".mov",".avi",".mkv"],
+        "instruction": "Analyze the '█' video including the audio associated with the video.",
+        "function": "build_video"
+    },
+    "Audio": {
+        "extensions": [".wav",".mp3",".flac",".aac"],
+        "instruction": "Analyze the '█' audio.",
+        "function": "build_audio"
+    },
 }
 # Functions
 uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
+def build_video(filepath):
+    vr = VideoReader(filepath, ctx = cpu(0))
     i = uniform_sample(range(len(vr)), MAX_FRAMES)
     batch = vr.get_batch(i).asnumpy()
     frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
+    audio = build_audio(filepath)
     audio_length = math.ceil(len(audio) / AUDIO_SR)
     total_length = max(1, min(len(frames), audio_length))
     return contents
+def build_image(filepath):
+    image = Image.open(filepath).convert("RGB")
     return image
+def build_gif(filepath):
+    image = Image.open(filepath)
     frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
     frames = uniform_sample(frames, MAX_FRAMES)
     return frames
+def build_audio(filepath):
+    audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
     return audio
 @spaces.GPU(duration=30)
+def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
     if not input: return "No input provided."
+    extension = os.path.splitext(filepath)[1].lower()
+    filetype = next((k for k, v in filetypes.items() if extension in v["extensions"]), None)
     if not filetype: return "Unsupported file type."
+    filetype_data = filetypes[filetype]
+    input_prefix = filetype_data["instruction"].replace("█", os.path.basename(filepath))
+    file_content = globals()[filetype_data["function"]](filepath)
+    full_instruction=f"{global_instruction}\n{input_prefix}\n{instruction}"
+    content = (file_content if isinstance(file_content, list) else [file_content]) + [full_instruction]
+    msgs = [{ "role": "user", "content": content }]
     print(msgs)
 # Initialize
 with gr.Blocks(css=css) as main:
     with gr.Column():
+        file = gr.File(label="File", file_types=["image", "video", "audio"], type="filepath")
+        input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
         sampling = gr.Checkbox(value=False, label="Sampling")
         temperature = gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="Temperature")
         top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.95, label="Top P")
     with gr.Column():
         output = gr.Textbox(lines=1, value="", label="Output")
+    submit.click(fn=generate, inputs=[file, input, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens], outputs=[output], queue=False)
     maintain.click(cloud, inputs=[], outputs=[], queue=False)
 main.launch(show_api=True)