Staticaliza committed on
Commit
3e7bef2
·
verified ·
1 Parent(s): c81c545

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -12
app.py CHANGED
@@ -36,7 +36,7 @@ footer {
36
  }
37
  '''
38
 
39
- global_instruction = "Describe the given content with as much keywords and always take a guess."
40
 
41
  input_prefixes = {
42
  "Image": "A image file called █ has been attached, describe the image content.",
@@ -69,7 +69,11 @@ def frames_from_video(path):
69
  def audio_from_video(path):
70
  clip = VideoFileClip(path)
71
  with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
72
- clip.audio.write_audiofile(tmp.name, codec = "pcm_s16le", fps = AUDIO_SR, verbose = False, logger = None)
 
 
 
 
73
  audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
74
  clip.close()
75
  return audio_np
@@ -79,16 +83,23 @@ def load_audio(path):
79
  return audio_np
80
 
81
  def build_video_omni(path, instruction):
82
- frames = frames_from_video(path)
83
- audio = audio_from_video(path)
84
  contents = [instruction]
85
- total = max(len(frames), math.ceil(len(audio) / AUDIO_SR))
86
- for i in range(total):
 
 
 
87
  frame = frames[i] if i < len(frames) else frames[-1]
88
- chunk = audio[AUDIO_SR * i : AUDIO_SR * (i + 1)]
 
 
 
89
  contents.extend(["<unit>", frame, chunk])
90
- return contents
91
 
 
 
92
  def build_image_omni(path, instruction):
93
  image = Image.open(path).convert("RGB")
94
  return [instruction, image]
@@ -127,10 +138,8 @@ def generate(input,
127
  "Audio": build_audio_omni
128
  }
129
 
130
- instruction = f"{global_instruction}\n{prefix}\n{instruction}"
131
- omni_content = builder_map[filetype](input, instruction)
132
- sys_msg = repo.get_sys_prompt(mode = "omni", language = "en")
133
- msgs = [sys_msg, { "role": "user", "content": omni_content }]
134
 
135
  print(msgs)
136
 
 
36
  }
37
  '''
38
 
39
+ global_instruction = "You will analyze video, audio and text input and output your description of the given content with as much keywords and always take a guess."
40
 
41
  input_prefixes = {
42
  "Image": "A image file called █ has been attached, describe the image content.",
 
69
  def audio_from_video(path):
70
  clip = VideoFileClip(path)
71
  with tempfile.NamedTemporaryFile(suffix = ".wav", delete = True) as tmp:
72
+ clip.audio.write_audiofile(tmp.name,
73
+ codec = "pcm_s16le",
74
+ fps = AUDIO_SR,
75
+ verbose = False,
76
+ logger = None)
77
  audio_np, _ = librosa.load(tmp.name, sr = AUDIO_SR, mono = True)
78
  clip.close()
79
  return audio_np
 
83
  return audio_np
84
 
85
  def build_video_omni(path, instruction):
86
+ frames = frames_from_video(path)
87
+ audio = audio_from_video(path)
88
  contents = [instruction]
89
+
90
+ audio_secs = math.ceil(len(audio) / AUDIO_SR)
91
+ total_units = max(1, min(len(frames), audio_secs))
92
+
93
+ for i in range(total_units):
94
  frame = frames[i] if i < len(frames) else frames[-1]
95
+ start = i * AUDIO_SR
96
+ end = min((i + 1) * AUDIO_SR, len(audio))
97
+ chunk = audio[start:end]
98
+ if chunk.size == 0: break
99
  contents.extend(["<unit>", frame, chunk])
 
100
 
101
+ return contents
102
+
103
  def build_image_omni(path, instruction):
104
  image = Image.open(path).convert("RGB")
105
  return [instruction, image]
 
138
  "Audio": build_audio_omni
139
  }
140
 
141
+ instruction = f"{prefix}\n{instruction}"
142
+ msgs = [{ "role": "user", "content": global_instruction }, { "role": "user", "content": omni_content }]
 
 
143
 
144
  print(msgs)
145