Spaces:

88hours
/

multimodel-rag-chat-with-videos

Sleeping

File size: 3,170 Bytes

from pathlib import Path
import os
from os import path as osp
import whisper
from moviepy import VideoFileClip
from PIL import Image
from utility import download_video, extract_meta_data, get_transcript_vtt, getSubs
from urllib.request import urlretrieve
from IPython.display import display
import ollama

def demp_video_input_that_has_transcript():
    """Fetch the first demo video together with its subtitles and extract metadata.

    The video and its transcript are both downloaded from the same YouTube
    URL into ``./shared_data/videos/video1``.

    Returns:
        The result of ``extract_meta_data`` called with the target folder,
        the downloaded video path and the downloaded transcript path.
    """
    youtube_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"
    target_dir = "./shared_data/videos/video1"

    # Video first, then its subtitle track; both land in the same folder.
    video_path = download_video(youtube_url, target_dir)
    transcript_path = get_transcript_vtt(youtube_url, target_dir)

    return extract_meta_data(target_dir, video_path, transcript_path)

def demp_video_input_that_has_no_transcript():
    """Download the second demo video, generate a transcript, extract metadata.

    Steps performed, all inside ``./shared_data/videos/video2``:

    1. Download the mp4 as ``toddler_in_playground.mp4``.
    2. Extract its audio track to ``audio.mp3``.
    3. Run the Whisper ``small`` model on the audio (``task="translate"``,
       ``language='en'``).
    4. Convert the resulting segments to VTT with ``getSubs`` and write them
       to ``generated_video1.vtt``.

    Returns:
        The result of ``extract_meta_data`` called with the folder, the
        downloaded video path and the generated transcript path.

    Raises:
        ValueError: If the downloaded video has no audio track to transcribe.
    """
    # second video's url
    vid_url = (
        "https://multimedia-commons.s3-us-west-2.amazonaws.com/"
        "data/videos/mp4/010/a07/010a074acb1975c4d6d6e43c1faeb8.mp4"
    )
    vid_dir = "./shared_data/videos/video2"
    vid_name = "toddler_in_playground.mp4"

    # create folder to which video2 will be downloaded
    Path(vid_dir).mkdir(parents=True, exist_ok=True)
    # urlretrieve returns (local_filename, headers); only the path is needed.
    vid_filepath, _ = urlretrieve(vid_url, osp.join(vid_dir, vid_name))

    # declare where to save .mp3 audio
    path_to_extracted_audio_file = osp.join(vid_dir, 'audio.mp3')

    # extract mp3 audio file from the mp4 video file; the context manager
    # closes the clip (and the reader behind it) even if writing fails.
    with VideoFileClip(vid_filepath) as clip:
        if clip.audio is None:
            raise ValueError(
                f"Video {vid_filepath!r} has no audio track to transcribe"
            )
        clip.audio.write_audiofile(path_to_extracted_audio_file)

    model = whisper.load_model("small")
    options = dict(task="translate", best_of=1, language='en')
    results = model.transcribe(path_to_extracted_audio_file, **options)

    vtt = getSubs(results["segments"], "vtt")

    # path to save the generated transcript of video2 (the historical file
    # name 'generated_video1.vtt' is kept so existing paths stay valid)
    path_to_generated_trans = osp.join(vid_dir, 'generated_video1.vtt')
    # write transcription to file; explicit UTF-8 so the result does not
    # depend on the platform's default encoding
    with open(path_to_generated_trans, 'w', encoding='utf-8') as f:
        f.write(vtt)

    return extract_meta_data(vid_dir, vid_filepath, path_to_generated_trans)



def ask_llvm(instruction, file_path):
    """Query the ``llava`` model about an image, show the image, print the reply.

    Args:
        instruction: Prompt text sent to the model.
        file_path: Path of the image; it is passed to the model and also
            opened for display.

    Returns:
        None. The image is displayed at a reduced size and the model's reply
        is printed with every '.' removed and no trailing newline.
    """
    reply = ollama.generate(
        model='llava',
        prompt=instruction,
        images=[file_path],
        stream=False,
    )
    answer = reply['response']

    # Show the picture with width and height each divided by 1.2.
    picture = Image.open(file_path, mode='r')
    shrunk_size = [int(side / 1.2) for side in picture.size]
    display(picture.resize(shrunk_size))

    # Splitting on '.' and emitting the pieces back-to-back drops the periods.
    print(*answer.split('.'), sep='', end='', flush=True)
if __name__ == "__main__":
    # Demo 1: video that already has a transcript available for download.
    meta_data = demp_video_input_that_has_transcript()

    # Demo 2: video whose transcript is generated locally from its audio.
    generated_meta = demp_video_input_that_has_no_transcript()

    # Inspect the second metadata entry: caption first, then its frame.
    sample = generated_meta[1]
    transcript_text = sample['transcript']
    print(f'Generated caption is: "{transcript_text}"')

    frame_path = sample['extracted_frame_path']
    display(Image.open(frame_path))

    # Ask the vision model to describe that same frame.
    ask_llvm("Can you describe the image?", frame_path)