|
""" |
|
Load and parse the PDF files in data/documents and save the results as cached pickle (.pkl) files.
|
""" |
|
|
|
import os |
|
import pickle |
|
|
|
from dotenv import load_dotenv |
|
|
|
|
|
from huggingface_hub import login |
|
|
|
from documents import load_pdf_as_docs, get_doc_chunks |
|
from embeddings import get_jinaai_embeddings |
|
|
|
|
|
|
|
# Load environment variables through python-dotenv before reading the token.
load_dotenv()

# Indexing os.environ (rather than .get) raises KeyError when the token is
# not set, so the script stops here instead of attempting a login without it.
HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]
login(token=HUGGINGFACEHUB_API_TOKEN)
|
|
|
|
|
def save_to_pickle(obj, filename):
    """Serialize ``obj`` to ``filename`` using the highest pickle protocol.

    Missing parent directories of ``filename`` are created first, so writing
    into a cache folder that does not exist yet does not fail with
    ``FileNotFoundError``.

    Args:
        obj: The object to pickle.
        filename: Destination path of the pickle file. An existing file at
            that path is overwritten.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare file name has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "wb") as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
# Folder that receives the cached artefacts, and folder holding the source
# PDF files.
database_root = "./data/db"
document_path = "./data/documents"

# Load the PDFs as documents and cache them.
docs = load_pdf_as_docs(document_path)
save_to_pickle(docs, os.path.join(database_root, "docs.pkl"))

# Derive the chunks from the documents and cache them. The chunks themselves
# (not the unsplit ``docs``) must be written to docs_chunks.pkl; otherwise
# the chunk cache is just a second copy of docs.pkl.
document_chunks = get_doc_chunks(docs)
save_to_pickle(document_chunks, os.path.join(database_root, "docs_chunks.pkl"))
|
|
|
# Create the embedding model; device selection is delegated via "auto".
embeddings = get_jinaai_embeddings(device="auto")

# Build the vectorstore on top of the embedding model.
from vectorestores import get_faiss_vectorestore

vectorstore = get_faiss_vectorestore(embeddings)

# Build the parent-document retriever from the unsplit documents and ask the
# helper to save both the vectorstore and the docstore under database_root.
from retrievers import get_parent_doc_retriever

parent_doc_retriever = get_parent_doc_retriever(
    docs,
    vectorstore,
    save_path_root=database_root,
    save_vectorstore=True,
    save_docstore=True,
)
|
|