""" Load and parse files (pdf) in the data/documents and save cached pkl files. """ import os import pickle from dotenv import load_dotenv from huggingface_hub import login from documents import load_pdf_as_docs, get_doc_chunks from embeddings import get_jinaai_embeddings # Load and set env variables load_dotenv() # Set huggingface api for downloading embedding model HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"] login(HUGGINGFACEHUB_API_TOKEN) def save_to_pickle(obj, filename): with open(filename, "wb") as file: pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # database_root = "./data/db" document_path = "./data/documents" # Parse pdf as "Documents" instances and save as "docs.pkl" docs = load_pdf_as_docs(document_path) save_to_pickle(docs, os.path.join(database_root, "docs.pkl")) # Get text chunks and save as "docs_chunks.pkl" document_chunks = get_doc_chunks(docs) save_to_pickle(docs, os.path.join(database_root, "docs_chunks.pkl")) embeddings = get_jinaai_embeddings(device="auto") # Create and save vectorstore from vectorestores import get_faiss_vectorestore vectorstore = get_faiss_vectorestore(embeddings) # Create retrievers from retrievers import get_parent_doc_retriever # Get parent doc (small-to-big) retriever and save as "docstore.pkl" parent_doc_retriever = get_parent_doc_retriever( docs, vectorstore, save_path_root=database_root, save_vectorstore=True, save_docstore=True, )