File size: 3,595 Bytes
ad022d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a829ab9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from mm_rag.embeddings.bridgetower_embeddings import (
    BridgeTowerEmbeddings
)
from mm_rag.vectorstores.multimodal_lancedb import MultimodalLanceDB
import lancedb
import json
import os
from PIL import Image
from utility import load_json_file, display_retrieved_results
import pyarrow as pa

# Path of the on-disk directory LanceDB uses as its database root.
LANCEDB_HOST_FILE = "./shared_data/.lancedb"
# Name of the table that holds the embedded text/image pairs.
TBL_NAME = "test_tbl"
# NOTE: module-level side effect — the LanceDB connection is opened at
# import time, so importing this module requires the path to be usable.
db = lancedb.connect(LANCEDB_HOST_FILE)
# BridgeTower embedder shared by ingestion (store_in_rag) and retrieval.
embedder = BridgeTowerEmbeddings()


def return_top_k_most_similar_docs(max_docs=3):
    """Run two example similarity queries and display the retrieved docs.

    Wraps the existing LanceDB table in a multimodal vector store, builds
    a similarity-search retriever, and prints the results for two sample
    text queries.

    Args:
        max_docs: number of most-similar documents to return per query
            (default 3).
    """
    # Reconnect the vector store to the table created by store_in_rag(),
    # using the same BridgeTower embedder for query embedding.
    vectorstore = MultimodalLanceDB(
        uri=LANCEDB_HOST_FILE,
        embedding=embedder,
        table_name=TBL_NAME,
    )

    # search_type="similarity" selects plain similarity search;
    # search_kwargs["k"] caps how many hits each query returns.
    retriever = vectorstore.as_retriever(
        search_type='similarity',
        search_kwargs={"k": max_docs},
    )

    # Example query 1: a spacewalk scene.
    spacewalk_query = (
            "an astronaut's spacewalk "
            "with an amazing view of the earth from space behind"
    )
    display_retrieved_results(retriever.invoke(spacewalk_query))

    # Example query 2: a crew shot.
    crew_query = "a group of astronauts"
    display_retrieved_results(retriever.invoke(crew_query))


def open_table(TBL_NAME):
    """Open the LanceDB table *TBL_NAME* and print a short summary.

    Prints the total row count and the first 3 rows (``text`` and
    ``image_path`` columns) of the table.

    Args:
        TBL_NAME: name of the table to open. (The original, non-PEP8
            parameter name is kept so keyword callers are unaffected.)

    Returns:
        The opened LanceDB table handle (previously ``None``; returning
        it is backward-compatible and lets callers reuse the handle).
    """
    # BUG FIX: the table name was never forwarded — db.open_table() was
    # called with no argument, raising a TypeError.
    tbl = db.open_table(TBL_NAME)

    print(f"There are {tbl.to_pandas().shape[0]} rows in the table")
    # BUG FIX: the head(3) preview was computed but discarded — the bare
    # expression only auto-displays in a notebook, not in a script.
    print(tbl.to_pandas()[['text', 'image_path']].head(3))
    return tbl

def store_in_rag():
    """Embed video transcripts/frames and write them into the vector store.

    Loads the metadata for two videos, widens each video-1 transcript by
    joining it with neighbouring segments (video-2 transcripts are kept
    as-is), then (re)builds the LanceDB table from all text/image pairs.
    """
    # Per-video metadata produced by the preprocessing step.
    vid1_metadata_path = './shared_data/videos/video1/metadatas.json'
    vid2_metadata_path = './shared_data/videos/video2/metadatas.json'
    vid1_metadata = load_json_file(vid1_metadata_path)
    vid2_metadata = load_json_file(vid2_metadata_path)

    # Pull out the raw transcripts and the extracted-frame image paths.
    vid1_trans = [entry['transcript'] for entry in vid1_metadata]
    vid1_img_path = [entry['extracted_frame_path'] for entry in vid1_metadata]
    vid2_trans = [entry['transcript'] for entry in vid2_metadata]
    vid2_img_path = [entry['extracted_frame_path'] for entry in vid2_metadata]

    # Widen each video-1 transcript with a window of n = 7 neighbours:
    # segment i becomes the join of segments [i-3, i+3), clipped to
    # start at 0 near the beginning of the video.
    n = 7
    half = int(n / 2)
    updated_vid1_trans = []
    for i in range(len(vid1_trans)):
        start = i - half if i - half >= 0 else 0
        updated_vid1_trans.append(' '.join(vid1_trans[start:i + half]))

    # Keep the metadata entries in sync with the widened transcripts.
    for i, widened in enumerate(updated_vid1_trans):
        vid1_metadata[i]['transcript'] = widened

    # mode="overwrite" rebuilds the table from scratch; pass
    # mode="append" instead to add entries to an existing table.
    _ = MultimodalLanceDB.from_text_image_pairs(
        texts=updated_vid1_trans + vid2_trans,
        image_paths=vid1_img_path + vid2_img_path,
        embedding=embedder,
        metadatas=vid1_metadata + vid2_metadata,
        connection=db,
        table_name=TBL_NAME,
        mode="overwrite",
    )

if __name__ == "__main__":
    # Summarise the existing table, then run the example queries.
    table = db.open_table(TBL_NAME)
    row_count = table.to_pandas().shape[0]
    print(f"There are {row_count} rows in the table")
    return_top_k_most_similar_docs()