# multimodel-rag-chat-with-videos / s7_store_in_rag.py
from mm_rag.embeddings.bridgetower_embeddings import (
BridgeTowerEmbeddings
)
from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
import lancedb
from utility import load_json_file, display_retrieved_results
# declare host file (path to the local LanceDB database)
LANCEDB_HOST_FILE = "./shared_data/.lancedb"
# declare table name
TBL_NAME = "test_tbl"
# initialize vectorstore
db = lancedb.connect(LANCEDB_HOST_FILE)
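# (sketch) list any tables already present in this database:
# print(db.table_names())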
# initialize a BridgeTower embedder
embedder = BridgeTowerEmbeddings()
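# BridgeTowerEmbeddings is assumed to follow LangChain's Embeddings
# interface, so a quick sanity check might look like this
# (a sketch; uncomment to run, it loads the model):
# vec = embedder.embed_query("an astronaut on a spacewalk")
# print(len(vec))  # dimensionality of the joint embedding space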
def return_top_k_most_similar_docs(max_docs=3):
    # retrieve the top `max_docs` most similar documents (3 by default)
    # by creating a LanceDB vector store over the existing table
    vectorstore = MultimodalLanceDB(
        uri=LANCEDB_HOST_FILE,
        embedding=embedder,
        table_name=TBL_NAME)
    # create a retriever for the vector store;
    # search_type="similarity" declares that the type of search
    # the retriever should perform is similarity search, and
    # search_kwargs={"k": max_docs} returns the top `max_docs`
    # most similar documents
    retriever = vectorstore.as_retriever(
        search_type='similarity',
        search_kwargs={"k": max_docs})
    query2 = (
        "an astronaut's spacewalk "
        "with an amazing view of the earth from space behind"
    )
    results2 = retriever.invoke(query2)
    display_retrieved_results(results2)

    query3 = "a group of astronauts"
    results3 = retriever.invoke(query3)
    display_retrieved_results(results3)
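    # each result is a LangChain Document; a sketch for inspecting the
    # top hit (assumes the frame path is kept in the metadata, as it is
    # when the table is built in store_in_rag() below):
    # top = results3[0]
    # print(top.page_content)                      # augmented transcript
    # print(top.metadata['extracted_frame_path'])  # matching video frame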
def open_table(table_name):
    # open a connection to table `table_name`
    tbl = db.open_table(table_name)
    print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
    # display the first 3 rows of the table
    print(tbl.to_pandas()[['text', 'image_path']].head(3))
def store_in_rag():
    # load metadata files
    vid1_metadata_path = './shared_data/videos/video1/metadatas.json'
    vid2_metadata_path = './shared_data/videos/video2/metadatas.json'
    vid1_metadata = load_json_file(vid1_metadata_path)
    vid2_metadata = load_json_file(vid2_metadata_path)
    # collect transcripts and image paths
    vid1_trans = [vid['transcript'] for vid in vid1_metadata]
    vid1_img_path = [vid['extracted_frame_path'] for vid in vid1_metadata]
    vid2_trans = [vid['transcript'] for vid in vid2_metadata]
    vid2_img_path = [vid['extracted_frame_path'] for vid in vid2_metadata]
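    # each metadata entry is assumed to look roughly like this
    # (inferred from the keys used above; the values are hypothetical):
    # {"transcript": "this is the view from the cupola ...",
    #  "extracted_frame_path": "./shared_data/videos/video1/frame_0.jpg"}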
    # for video1, augment each transcript with its neighbouring
    # segments, using a window of n = 7
    n = 7
    updated_vid1_trans = [
        ' '.join(vid1_trans[i-int(n/2) : i+int(n/2)]) if i-int(n/2) >= 0
        else ' '.join(vid1_trans[0 : i+int(n/2)])
        for i in range(len(vid1_trans))
    ]
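    # e.g. with n = 7 the half-window is int(n/2) = 3, so:
    #   i = 0 -> ' '.join(vid1_trans[0:3])  (no left context at the start)
    #   i = 5 -> ' '.join(vid1_trans[2:8])  (3 left, itself, 2 right)
    # slices exclude their right edge, so a full window covers 6
    # consecutive segments rather than 7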
    # write the augmented transcripts back into the metadata
    for i in range(len(updated_vid1_trans)):
        vid1_metadata[i]['transcript'] = updated_vid1_trans[i]
    # you can pass in mode="append"
    # to add more entries to the vector store;
    # in case you want to start with a fresh vector store,
    # you can pass in mode="overwrite" instead
    _ = MultimodalLanceDB.from_text_image_pairs(
        texts=updated_vid1_trans + vid2_trans,
        image_paths=vid1_img_path + vid2_img_path,
        embedding=embedder,
        metadatas=vid1_metadata + vid2_metadata,
        connection=db,
        table_name=TBL_NAME,
        mode="overwrite",
    )
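    # a sketch of adding a third video later without rebuilding the
    # table (the vid3_* names are hypothetical, prepared the same way
    # as the lists above):
    # _ = MultimodalLanceDB.from_text_image_pairs(
    #     texts=vid3_trans,
    #     image_paths=vid3_img_path,
    #     embedding=embedder,
    #     metadatas=vid3_metadata,
    #     connection=db,
    #     table_name=TBL_NAME,
    #     mode="append",
    # )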
if __name__ == "__main__":
    # build (or rebuild) the table, show its row count and first 3 rows,
    # then run the example similarity searches
    store_in_rag()
    open_table(TBL_NAME)
    return_top_k_most_similar_docs()