import json
import os
import pickle
from typing import Iterable, List

from langchain.retrievers import ParentDocumentRetriever
from langchain.schema import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from tqdm import tqdm


def parent_retriever(chroma_path, embeddings):
    """Build a ParentDocumentRetriever backed by Chroma and an in-memory docstore.

    Args:
        chroma_path: Directory passed to Chroma as ``persist_directory``.
        embeddings: Embedding function handed to the Chroma vector store.

    Returns:
        A ``ParentDocumentRetriever`` configured with 2000/500 parent chunks,
        300/50 child chunks and ``k=5`` search results.
    """
    # Larger "parent" chunks.
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
    # Small "child" chunks.
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    # Storage layer for the parent chunks; it lives in memory only, which is
    # why the script pickles its contents after ingestion.
    store = InMemoryStore()
    vectorstore = Chroma(
        collection_name="full_documents",
        embedding_function=embeddings,
        persist_directory=chroma_path,
    )
    return ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={"k": 5},
    )


def save_to_pickle(obj, filename):
    """Serialize ``obj`` to ``filename`` using the highest pickle protocol.

    Args:
        obj: Any picklable object (the script passes the docstore contents).
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "wb") as file:
        pickle.dump(obj, file, pickle.HIGHEST_PROTOCOL)


def load_docs_from_jsonl(file_path) -> List[Document]:
    """Load ``Document`` objects from a JSON Lines file.

    Each non-blank line must be a JSON object whose keys are valid keyword
    arguments for ``Document``.

    Args:
        file_path: Path to the JSON Lines file.

    Returns:
        The documents in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    docs = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank lines (e.g. a trailing empty line), which
            # json.loads would otherwise reject.
            if not line.strip():
                continue
            docs.append(Document(**json.loads(line)))
    return docs


# --- Script: index the project READMEs and persist the parent docstore ---

embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

retriever_repos = parent_retriever('ohw_proj_chorma_db', embeddings=embedding)

documents = load_docs_from_jsonl('project_readmes.json')

# Add documents one at a time so tqdm reports per-document progress.
for document in tqdm(documents):
    retriever_repos.add_documents([document])

# The parent chunks are held in an InMemoryStore, so dump them to disk.
save_to_pickle(retriever_repos.docstore.store, 'ohw_proj_chorma_db.pcl')