Staticaliza committed on
Commit d7a2675 · verified · 1 Parent(s): 0350ec7

Update app.py

Files changed (1): app.py +55 -84
app.py CHANGED
@@ -9,7 +9,6 @@ import librosa
 import tempfile
 from PIL import Image, ImageSequence
 from decord import VideoReader, cpu
-from moviepy.editor import VideoFileClip
 from transformers import AutoModel, AutoTokenizer, AutoProcessor

 # Variables
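Review note: this commit removes the moviepy import, so video audio is now decoded by librosa directly (see build_audio below). A minimal sketch of that path, assuming AUDIO_SR is the app's target sample rate and that an ffmpeg-backed audioread fallback is installed so librosa can open container formats such as .mp4:

import librosa

AUDIO_SR = 16000  # assumed value for illustration; the real constant is defined earlier in app.py

def load_audio_track(path):  # hypothetical helper mirroring the new build_audio
    # librosa resamples to sr and downmixes to mono in one call,
    # replacing moviepy's temporary .wav round trip
    audio, _ = librosa.load(path, sr=AUDIO_SR, mono=True)
    return audio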
@@ -39,10 +38,10 @@ footer {
 global_instruction = "You will analyze video, audio and text input and output your description of the given content with as much keywords and always take a guess."

 input_prefixes = {
-    "Image": "A image file called █ has been attached, describe the image content.",
-    "GIF": "A GIF file called █ has been attached, describe the GIF content.",
-    "Video": "A audio video file called █ has been attached, describe the video content and the audio content.",
-    "Audio": "A audio file called █ has been attached, describe the audio content.",
+    "Image": "Analyze the '█' image.",
+    "GIF": "Analyze the '█' GIF.",
+    "Video": "Analyze the '█' video including the audio associated with the video.",
+    "Audio": "Analyze the '█' audio.",
 }

 filetypes = {
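The █ character in each prefix is a placeholder that generate() later replaces with the uploaded file's basename, so the model is told the real filename. For example:

prefix = input_prefixes["Video"].replace("█", "clip.mp4")
# -> "Analyze the 'clip.mp4' video including the audio associated with the video."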
@@ -53,109 +52,81 @@ filetypes = {
 }

 # Functions
-def infer_filetype(ext):
-    return next((k for k, v in filetypes.items() if ext in v), None)
-
 def uniform_sample(seq, n):
     step = max(len(seq) // n, 1)
     return seq[::step][:n]
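uniform_sample is the shared downsampler for video frame indices and GIF frames: it keeps every step-th element, then truncates so at most n items survive. A quick worked example:

frames = list(range(10))           # stand-in for 10 decoded frames
picked = uniform_sample(frames, 4)
# step = max(10 // 4, 1) = 2 -> [0, 2, 4, 6, 8][:4] = [0, 2, 4, 6]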
 
-def frames_from_video(path):
+def build_video(path):
     vr = VideoReader(path, ctx = cpu(0))
     idx = uniform_sample(range(len(vr)), MAX_FRAMES)
     batch = vr.get_batch(idx).asnumpy()
-    return [Image.fromarray(frame.astype("uint8")) for frame in batch]
-
-def audio_from_video(path):
-    clip = VideoFileClip(path)
-    with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
-        clip.audio.write_audiofile(tmp.name,
-                                   codec = "pcm_s16le",
-                                   fps = AUDIO_SR,
-                                   verbose = False,
-                                   logger = None)
-        audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
-    clip.close()
-    return audio_np
-
-def load_audio(path):
-    audio_np, _ = librosa.load(path, sr = AUDIO_SR, mono = True)
-    return audio_np
-
-def build_video_omni(path, instruction):
-    frames = frames_from_video(path)
-    audio = audio_from_video(path)
-    contents = [instruction]
-
-    audio_secs = math.ceil(len(audio) / AUDIO_SR)
+    frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
+
+    audio = build_audio(path)
+
+    audio_secs = math.ceil(len(audio) / AUDIO_SR)
     total_units = max(1, min(len(frames), audio_secs))

+    contents = []
     for i in range(total_units):
         frame = frames[i] if i < len(frames) else frames[-1]
         start = i * AUDIO_SR
         end = min((i + 1) * AUDIO_SR, len(audio))
         chunk = audio[start:end]
         if chunk.size == 0: break
         contents.extend(["<unit>", frame, chunk])

     return contents
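The rebuilt build_video pairs each sampled frame with one second of audio, emitting "<unit>"-delimited frame/chunk pairs that the model reads as synchronized omni input. The arithmetic, assuming AUDIO_SR = 16000 and a 5-second clip from which 3 frames were sampled:

audio_secs  = math.ceil(80000 / 16000)  # 5 one-second chunks available
total_units = max(1, min(3, 5))         # 3, capped by the frame count
# unit 0: frames[0] + audio[0:16000]
# unit 1: frames[1] + audio[16000:32000]
# unit 2: frames[2] + audio[32000:48000]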

-def build_image_omni(path, instruction):
+def build_image(path):
     image = Image.open(path).convert("RGB")
-    return [instruction, image]
+    return image

-def build_gif_omni(path, instruction):
-    img = Image.open(path)
-    frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(img)]
+def build_gif(path):
+    image = Image.open(path)
+    frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
     frames = uniform_sample(frames, MAX_FRAMES)
-    return [instruction, *frames]
+    return frames

-def build_audio_omni(path, instruction):
-    audio = load_audio(path)
-    return [instruction, audio]
+def build_audio(path):
+    audio, _ = librosa.load(path, sr=AUDIO_SR, mono=True)
+    return audio

 @spaces.GPU(duration=30)
-def generate(input,
-             instruction = DEFAULT_INPUT,
-             sampling = False,
-             temperature = 0.7,
-             top_p = 0.8,
-             top_k = 100,
-             repetition_penalty = 1.05,
-             max_tokens = 512):
-    if not input: return "no input provided."
+def generate(input, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
+    if not input: return "No input provided."

     extension = os.path.splitext(input)[1].lower()
-    filetype = infer_filetype(extension)
-    if not filetype: return "unsupported file type."
+    filetype = next((k for k, v in filetypes.items() if extension in v), None)
+    if not filetype: return "Unsupported file type."

     filename = os.path.basename(input)
     prefix = input_prefixes[filetype].replace("█", filename)
     builder_map = {
-        "Image": build_image_omni,
-        "GIF" : build_gif_omni,
-        "Video": build_video_omni,
-        "Audio": build_audio_omni
+        "Image": build_image,
+        "GIF" : build_gif,
+        "Video": build_video,
+        "Audio": build_audio,
     }

-    instruction = f"{prefix}\n{instruction}"
-    omni_content = builder_map[filetype](input, instruction)
-    msgs = [{ "role": "user", "content": global_instruction }, { "role": "user", "content": omni_content }]
+    instruction = f"{global_instruction}\n{prefix}\n{instruction}"
+    omni_content = builder_map[filetype](input)
+    msgs = [{ "role": "user", "content": [omni_content, instruction] }]

     print(msgs)

-    output = repo.chat(
-        msgs = msgs,
-        tokenizer = tokenizer,
-        sampling = sampling,
-        temperature = temperature,
-        top_p = top_p,
-        top_k = top_k,
-        repetition_penalty = repetition_penalty,
-        max_new_tokens = max_tokens,
-        omni_input = True,
-        use_image_id = False,
-        max_slice_nums = 2
+    output = repo.chat(
+        msgs=msgs,
+        tokenizer=tokenizer,
+        sampling=sampling,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        max_new_tokens=max_tokens,
+        omni_input=True,
+        use_image_id=False,
+        max_slice_nums=9
     )

     torch.cuda.empty_cache()
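The rewritten generate() also collapses the former two user messages into a single turn whose content list pairs the media payload with the combined global-plus-prefix instruction. A hypothetical call, assuming ".mp4" is registered under "Video" in the filetypes map:

result = generate("clip.mp4", instruction="Describe what happens.")
# dispatch: ".mp4" -> "Video" -> build_video("clip.mp4") -> ["<unit>", frame, chunk, ...]
# msgs: [{ "role": "user", "content": [omni_content, instruction] }]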
@@ -171,11 +142,11 @@ with gr.Blocks(css=css) as main:
         with gr.Column():
             input = gr.File(label="Input", file_types=["image", "video", "audio"], type="filepath")
             instruction = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Instruction")
-            sampling = gr.Checkbox(value=False, label="Sampling")
-            temperature = gr.Slider(minimum=0.01, maximum=1.99, step=0.01, value=0.7, label="Temperature")
-            top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.8, label="Top P")
-            top_k = gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="Top K")
-            repetition_penalty = gr.Slider(minimum=0.01, maximum=1.99, step=0.01, value=1.05, label="Repetition Penalty")
+            sampling = gr.Checkbox(value=True, label="Sampling")
+            temperature = gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="Temperature")
+            top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.95, label="Top P")
+            top_k = gr.Slider(minimum=0, maximum=1000, step=1, value=50, label="Top K")
+            repetition_penalty = gr.Slider(minimum=0, maximum=2, step=0.01, value=1.05, label="Repetition Penalty")
             max_tokens = gr.Slider(minimum=1, maximum=4096, step=1, value=512, label="Max Tokens")
             submit = gr.Button("▶")
             maintain = gr.Button("☁️")
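The click wiring sits outside this hunk, but the retuned controls (sampling on by default, wider temperature and repetition-penalty ranges) still map one-to-one onto generate()'s keyword arguments. A minimal sketch of how such a hookup typically looks in Gradio (hypothetical; the repository's actual wiring may differ):

output = gr.Textbox(label="Output")  # assumed output component
submit.click(
    fn=generate,
    inputs=[input, instruction, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens],
    outputs=output,
)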
 