from pathlib import Path
import os
from os import path as osp
from urllib.request import urlretrieve

import whisper
import ollama
from moviepy import VideoFileClip
from PIL import Image
from IPython.display import display

from utility import download_video, extract_meta_data, get_transcript_vtt, getSubs


def demo_video_input_that_has_transcript():
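    """Download a YouTube video that already has a transcript and return its extracted metadata."""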
    vid_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"
    vid_dir = "./shared_data/videos/video1"

    # Download the video and its existing .vtt transcript into vid_dir.
    vid_filepath = download_video(vid_url, vid_dir)
    vid_transcript_filepath = get_transcript_vtt(vid_url, vid_dir)

    return extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath)


def demo_video_input_that_has_no_transcript():
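    """Download a video that has no transcript, generate one with Whisper, and return its extracted metadata."""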
    vid_url = (
        "https://multimedia-commons.s3-us-west-2.amazonaws.com/"
        "data/videos/mp4/010/a07/010a074acb1975c4d6d6e43c1faeb8.mp4"
    )
    vid_dir = "./shared_data/videos/video2"
    vid_name = "toddler_in_playground.mp4"
    Path(vid_dir).mkdir(parents=True, exist_ok=True)
    vid_filepath = urlretrieve(vid_url, osp.join(vid_dir, vid_name))[0]

    path_to_video_no_transcript = vid_filepath
    path_to_extracted_audio_file = os.path.join(vid_dir, "audio.mp3")
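
    # Extract the audio track so Whisper can transcribe it.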
    clip = VideoFileClip(path_to_video_no_transcript)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
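
    # Transcribe the audio with Whisper's "small" model, translating to English.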
    model = whisper.load_model("small")
    options = dict(task="translate", best_of=1, language="en")
    results = model.transcribe(path_to_extracted_audio_file, **options)
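
    # Convert the Whisper segments to WebVTT and save the generated transcript.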
    vtt = getSubs(results["segments"], "vtt")
    path_to_generated_trans = osp.join(vid_dir, "generated_video1.vtt")
    with open(path_to_generated_trans, "w") as f:
        f.write(vtt)

    return extract_meta_data(vid_dir, vid_filepath, path_to_generated_trans)


def ask_lvlm(instruction, file_path):
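    """Ask the LLaVA model (via Ollama) about an image, display the image, and print the response."""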
    result = ollama.generate(
        model="llava",
        prompt=instruction,
        images=[file_path],
        stream=False,
    )["response"]
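
    # Show a slightly downscaled copy of the image alongside the answer.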
    img = Image.open(file_path, mode="r")
    img = img.resize([int(i / 1.2) for i in img.size])
    display(img)
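
    # Print the response sentence by sentence, keeping the period separators.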
    for sentence in result.split("."):
        if sentence:
            print(sentence + ".", end="", flush=True)
    print()


if __name__ == "__main__":
    meta_data = demo_video_input_that_has_transcript()
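
    # Demo 2: no transcript exists, so one is generated with Whisper.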
    meta_data1 = demo_video_input_that_has_no_transcript()
    data = meta_data1[1]
    caption = data["transcript"]
    print(f'Generated caption is: "{caption}"')
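
    # Show the extracted frame and ask the vision-language model to describe it.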
    frame = Image.open(data["extracted_frame_path"])
    display(frame)

    instruction = "Can you describe the image?"
    ask_lvlm(instruction, data["extracted_frame_path"])