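"""s6_prepare_video_input.py -- prepare video inputs for multimodal RAG chat with videos.

Two cases are covered:
  * a YouTube video that already has a transcript, downloaded as a .vtt file, and
  * a raw .mp4 without a transcript, whose audio is extracted with moviepy and
    transcribed/translated to English with Whisper.

The resulting per-video metadata (extracted frames plus transcript text) is then
used to query the LLaVA model through Ollama.
"""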
import os
from os import path as osp
from pathlib import Path
from urllib.request import urlretrieve

import ollama
import whisper
from IPython.display import display
from moviepy import VideoFileClip
from PIL import Image

from utility import download_video, extract_meta_data, get_transcript_vtt, getSubs

def demo_video_input_that_has_transcript():
    # first video's URL
    vid_url = "https://www.youtube.com/watch?v=7Hcg-rLYwdM"
    # download the YouTube video to ./shared_data/videos/video1
    vid_dir = "./shared_data/videos/video1"
    vid_filepath = download_video(vid_url, vid_dir)
    # download the YouTube video's subtitles (.vtt) to ./shared_data/videos/video1
    vid_transcript_filepath = get_transcript_vtt(vid_url, vid_dir)
    # pair extracted frames with transcript text (implemented in utility.extract_meta_data)
    return extract_meta_data(vid_dir, vid_filepath, vid_transcript_filepath)

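# The helpers used above live in the local `utility` module, which is not part of this
# file. As a rough sketch of what `download_video` might look like if it is built on
# yt-dlp (an assumption -- the real implementation may differ), one could write:
def _download_video_sketch(url, output_dir):
    """Hypothetical example only; not called anywhere in this script."""
    from yt_dlp import YoutubeDL  # local import so the module works without yt-dlp
    ydl_opts = {
        "format": "mp4",
        "outtmpl": osp.join(output_dir, "%(title)s.%(ext)s"),
    }
    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        return ydl.prepare_filename(info)
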
def demo_video_input_that_has_no_transcript():
    # second video's URL
    vid_url = (
        "https://multimedia-commons.s3-us-west-2.amazonaws.com/"
        "data/videos/mp4/010/a07/010a074acb1975c4d6d6e43c1faeb8.mp4"
    )
    vid_dir = "./shared_data/videos/video2"
    vid_name = "toddler_in_playground.mp4"
    # create the folder into which video2 will be downloaded
    Path(vid_dir).mkdir(parents=True, exist_ok=True)
    vid_filepath = urlretrieve(
        vid_url,
        osp.join(vid_dir, vid_name)
    )[0]
    path_to_video_no_transcript = vid_filepath
    # declare where to save the extracted .mp3 audio
    path_to_extracted_audio_file = osp.join(vid_dir, 'audio.mp3')
    # extract an .mp3 audio track from the .mp4 video
    clip = VideoFileClip(path_to_video_no_transcript)
    clip.audio.write_audiofile(path_to_extracted_audio_file)
    # transcribe (and translate to English) the audio with Whisper
    model = whisper.load_model("small")
    options = dict(task="translate", best_of=1, language='en')
    results = model.transcribe(path_to_extracted_audio_file, **options)
    # convert Whisper's segments into WebVTT subtitles
    vtt = getSubs(results["segments"], "vtt")
    # path to save the generated transcript of video2
    path_to_generated_trans = osp.join(vid_dir, 'generated_video1.vtt')
    # write the transcription to file
    with open(path_to_generated_trans, 'w') as f:
        f.write(vtt)
    return extract_meta_data(vid_dir, vid_filepath, path_to_generated_trans)

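# `getSubs` also comes from the unshown `utility` module. A minimal sketch of what it
# might do for the "vtt" case -- purely illustrative, the real helper may differ -- is
# to turn Whisper's segments (each with 'start', 'end', 'text' in seconds) into WebVTT cues:
def _segments_to_vtt_sketch(segments):
    """Hypothetical example only; not called anywhere in this script."""
    def fmt(seconds):
        hours, rem = divmod(seconds, 3600)
        minutes, secs = divmod(rem, 60)
        return f"{int(hours):02d}:{int(minutes):02d}:{secs:06.3f}"

    lines = ["WEBVTT", ""]
    for seg in segments:
        lines.append(f"{fmt(seg['start'])} --> {fmt(seg['end'])}")
        lines.append(seg['text'].strip())
        lines.append("")
    return "\n".join(lines)
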
def ask_lvlm(instruction, file_path):
    # query the LLaVA vision-language model through Ollama with an image attached
    result = ollama.generate(
        model='llava',
        prompt=instruction,
        images=[file_path],
        stream=False
    )['response']
    # show a slightly downscaled copy of the image alongside the answer
    img = Image.open(file_path, mode='r')
    img = img.resize([int(i / 1.2) for i in img.size])
    display(img)
    # print the answer one sentence per line, keeping the sentence-ending periods
    for sentence in result.split('.'):
        if sentence.strip():
            print(sentence.strip() + '.', flush=True)

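# Note: `ollama.generate` also supports token streaming. A hedged sketch (assuming the
# same 'llava' model is available locally) of how the call above could stream its
# answer instead of returning it in one piece:
def _ask_lvlm_streaming_sketch(instruction, file_path):
    """Hypothetical example only; not called anywhere in this script."""
    for chunk in ollama.generate(
        model='llava',
        prompt=instruction,
        images=[file_path],
        stream=True,
    ):
        print(chunk['response'], end='', flush=True)
    print()
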
if __name__ == "__main__":
    # video 1: already has a transcript on YouTube
    meta_data = demo_video_input_that_has_transcript()
    # video 2: no transcript, so one is generated with Whisper
    meta_data1 = demo_video_input_that_has_no_transcript()
    data = meta_data1[1]
    caption = data['transcript']
    print(f'Generated caption is: "{caption}"')
    frame = Image.open(data['extracted_frame_path'])
    display(frame)
    instruction = "Can you describe the image?"
    ask_lvlm(instruction, data['extracted_frame_path'])
    # print(meta_data)