Staticaliza committed on
Commit
ab966e8
·
verified ·
1 Parent(s): c74b254

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -38
app.py CHANGED
@@ -37,30 +37,39 @@ footer {
37
 
38
  global_instruction = "You will analyze image, GIF, video, and audio input, then use as much keywords to describe the given content and take as much guesses of what it could be."
39
 
40
- input_prefixes = {
41
- "Image": "Analyze the '█' image.",
42
- "GIF": "Analyze the '█' GIF.",
43
- "Video": "Analyze the '█' video including the audio associated with the video.",
44
- "Audio": "Analyze the '█' audio.",
45
- }
46
-
47
  filetypes = {
48
- "Image": [".jpg", ".jpeg", ".png", ".bmp"],
49
- "GIF": [".gif"],
50
- "Video": [".mp4", ".mov", ".avi", ".mkv"],
51
- "Audio": [".wav", ".mp3", ".flac", ".aac"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  }
53
 
54
  # Functions
55
  uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
56
 
57
- def build_video(path):
58
- vr = VideoReader(path, ctx = cpu(0))
59
  i = uniform_sample(range(len(vr)), MAX_FRAMES)
60
  batch = vr.get_batch(i).asnumpy()
61
  frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
62
 
63
- audio = build_audio(path)
64
 
65
  audio_length = math.ceil(len(audio) / AUDIO_SR)
66
  total_length = max(1, min(len(frames), audio_length))
@@ -76,40 +85,36 @@ def build_video(path):
76
 
77
  return contents
78
 
79
- def build_image(path):
80
- image = Image.open(path).convert("RGB")
81
  return image
82
 
83
- def build_gif(path):
84
- image = Image.open(path)
85
  frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
86
  frames = uniform_sample(frames, MAX_FRAMES)
87
  return frames
88
 
89
- def build_audio(path):
90
- audio, _ = librosa.load(path, sr=AUDIO_SR, mono=True)
91
  return audio
92
 
93
  @spaces.GPU(duration=30)
94
- def generate(input, instruction=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
95
  if not input: return "No input provided."
96
 
97
- extension = os.path.splitext(input)[1].lower()
98
- filetype = next((k for k, v in filetypes.items() if extension in v), None)
99
  if not filetype: return "Unsupported file type."
100
 
101
- filename = os.path.basename(input)
102
- prefix = input_prefixes[filetype].replace("█", filename)
103
- builder_map = {
104
- "Image": build_image,
105
- "GIF" : build_gif,
106
- "Video": build_video,
107
- "Audio": build_audio,
108
- }
109
-
110
- instruction = f"{global_instruction}\n{prefix}\n{instruction}"
111
- omni_content = builder_map[filetype](input)
112
- msgs = [{ "role": "user", "content": [omni_content, instruction] }]
113
 
114
  print(msgs)
115
 
@@ -138,8 +143,8 @@ def cloud():
138
  # Initialize
139
  with gr.Blocks(css=css) as main:
140
  with gr.Column():
141
- input = gr.File(label="Input", file_types=["image", "video", "audio"], type="filepath")
142
- instruction = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Instruction")
143
  sampling = gr.Checkbox(value=False, label="Sampling")
144
  temperature = gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="Temperature")
145
  top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.95, label="Top P")
@@ -152,7 +157,7 @@ with gr.Blocks(css=css) as main:
152
  with gr.Column():
153
  output = gr.Textbox(lines=1, value="", label="Output")
154
 
155
- submit.click(fn=generate, inputs=[input, instruction, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens], outputs=[output], queue=False)
156
  maintain.click(cloud, inputs=[], outputs=[], queue=False)
157
 
158
  main.launch(show_api=True)
 
37
 
38
  global_instruction = "You will analyze image, GIF, video, and audio input, then use as much keywords to describe the given content and take as much guesses of what it could be."
39
 
 
 
 
 
 
 
 
40
  filetypes = {
41
+ "Image": {
42
+ "extensions": [".jpg",".jpeg",".png",".bmp"],
43
+ "instruction": "Analyze the '█' image.",
44
+ "function": "build_image"
45
+ },
46
+ "GIF":{
47
+ "extensions": [".gif"],
48
+ "instruction": "Analyze the '█' GIF.",
49
+ "function": "build_gif"
50
+ },
51
+ "Video": {
52
+ "extensions": [".mp4",".mov",".avi",".mkv"],
53
+ "instruction": "Analyze the '█' video including the audio associated with the video.",
54
+ "function": "build_video"
55
+ },
56
+ "Audio": {
57
+ "extensions": [".wav",".mp3",".flac",".aac"],
58
+ "instruction": "Analyze the '█' audio.",
59
+ "function": "build_audio"
60
+ },
61
  }
62
 
63
  # Functions
64
  uniform_sample=lambda seq, n: seq[::max(len(seq) // n,1)][:n]
65
 
66
+ def build_video(filepath):
67
+ vr = VideoReader(filepath, ctx = cpu(0))
68
  i = uniform_sample(range(len(vr)), MAX_FRAMES)
69
  batch = vr.get_batch(i).asnumpy()
70
  frames = [Image.fromarray(frame.astype("uint8")) for frame in batch]
71
 
72
+ audio = build_audio(filepath)
73
 
74
  audio_length = math.ceil(len(audio) / AUDIO_SR)
75
  total_length = max(1, min(len(frames), audio_length))
 
85
 
86
  return contents
87
 
88
+ def build_image(filepath):
89
+ image = Image.open(filepath).convert("RGB")
90
  return image
91
 
92
+ def build_gif(filepath):
93
+ image = Image.open(filepath)
94
  frames = [f.copy().convert("RGB") for f in ImageSequence.Iterator(image)]
95
  frames = uniform_sample(frames, MAX_FRAMES)
96
  return frames
97
 
98
+ def build_audio(filepath):
99
+ audio, _ = librosa.load(filepath, sr=AUDIO_SR, mono=True)
100
  return audio
101
 
102
  @spaces.GPU(duration=30)
103
+ def generate(filepath, input=DEFAULT_INPUT, sampling=False, temperature=0.7, top_p=0.8, top_k=100, repetition_penalty=1.05, max_tokens=512):
104
  if not input: return "No input provided."
105
 
106
+ extension = os.path.splitext(filepath)[1].lower()
107
+ filetype = next((k for k, v in filetypes.items() if extension in v["extensions"]), None)
108
  if not filetype: return "Unsupported file type."
109
 
110
+ filetype_data = filetypes[filetype]
111
+ input_prefix = filetype_data["instruction"].replace("█", os.path.basename(filepath))
112
+
113
+ file_content = globals()[filetype_data["function"]](filepath)
114
+ full_instruction=f"{global_instruction}\n{input_prefix}\n{instruction}"
115
+ content = (file_content if isinstance(file_content, list) else [file_content]) + [full_instruction]
116
+
117
+ msgs = [{ "role": "user", "content": content }]
 
 
 
 
118
 
119
  print(msgs)
120
 
 
143
  # Initialize
144
  with gr.Blocks(css=css) as main:
145
  with gr.Column():
146
+ file = gr.File(label="File", file_types=["image", "video", "audio"], type="filepath")
147
+ input = gr.Textbox(lines=1, value=DEFAULT_INPUT, label="Input")
148
  sampling = gr.Checkbox(value=False, label="Sampling")
149
  temperature = gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="Temperature")
150
  top_p = gr.Slider(minimum=0, maximum=1, step=0.01, value=0.95, label="Top P")
 
157
  with gr.Column():
158
  output = gr.Textbox(lines=1, value="", label="Output")
159
 
160
+ submit.click(fn=generate, inputs=[file, input, sampling, temperature, top_p, top_k, repetition_penalty, max_tokens], outputs=[output], queue=False)
161
  maintain.click(cloud, inputs=[], outputs=[], queue=False)
162
 
163
  main.launch(show_api=True)