# chatbot_ohw_projects / create_retriever.py
# Author: boryasbora — "Create create_retriever.py", commit 9514ca1 (verified)
from langchain.schema import Document
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import os
from typing import Iterable
import json
from tqdm import tqdm
from langchain_huggingface import HuggingFaceEmbeddings
# Sentence-transformer model used to embed the small "child" chunks.
# NOTE(review): if a Chroma directory already exists at the persist path, it
# must have been built with this same embedding model — confirm before reuse.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
def parent_retriever(chroma_path, embeddings):
    """Build a ParentDocumentRetriever over a persistent Chroma store.

    Parameters
    ----------
    chroma_path : directory used by Chroma to persist the vector index.
    embeddings : embedding function handed to the Chroma vector store.

    Returns
    -------
    A ParentDocumentRetriever that indexes small (300/50) chunks for
    similarity search but returns the larger (2000/500) parent chunks,
    fetching the top 5 matches per query.
    """
    # Large chunks: kept in the docstore and returned to the caller.
    big_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=500,
    )
    # Small chunks: embedded and indexed for similarity search.
    small_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
    )
    # Parent chunks live in memory; only child vectors are persisted by Chroma.
    parent_store = InMemoryStore()
    child_index = Chroma(
        collection_name="full_documents",
        embedding_function=embeddings,
        persist_directory=chroma_path,
    )
    return ParentDocumentRetriever(
        vectorstore=child_index,
        docstore=parent_store,
        child_splitter=small_splitter,
        parent_splitter=big_splitter,
        search_kwargs={"k": 5},
    )
def save_to_pickle(obj, filename):
    """Serialize *obj* to *filename* with the highest available pickle protocol.

    Parameters
    ----------
    obj : any picklable object (here: the retriever's docstore mapping).
    filename : destination path; the file is created or overwritten.
    """
    with open(filename, "wb") as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)
# Build the retriever over the local Chroma directory.
# NOTE(review): 'ohw_proj_chorma_db' ("chorma") looks like a typo for
# "chroma", but it names an on-disk path, so it is deliberately kept as-is.
retriever_repos = parent_retriever('ohw_proj_chorma_db',embeddings=embedding)
def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Read a JSON-Lines file and rebuild one ``Document`` per non-empty line.

    Parameters
    ----------
    file_path : path to a JSONL file where each line is a JSON object whose
        keys are ``Document`` constructor fields (e.g. page_content, metadata).

    Returns
    -------
    A list of ``Document`` objects (a list satisfies the declared
    ``Iterable[Document]`` return type).
    """
    docs = []
    # Explicit encoding so decoding does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Skip blank/trailing lines instead of raising JSONDecodeError.
            line = line.strip()
            if not line:
                continue
            docs.append(Document(**json.loads(line)))
    return docs
# NOTE(review): despite the .json extension this file is parsed line-by-line
# as JSON Lines — confirm 'project_readmes.json' really is JSONL.
documents = load_docs_from_jsonl('project_readmes.json')
# Index one document per call so tqdm can report per-document progress;
# each call presumably splits the doc with the splitters configured in
# parent_retriever before embedding — see that function.
for i in tqdm(range(0,len(documents))):
    retriever_repos.add_documents([documents[i]])
# Persist the retriever's in-memory parent docstore mapping next to the
# Chroma directory so both halves can be reloaded together later.
save_to_pickle(retriever_repos.docstore.store, 'ohw_proj_chorma_db.pcl')