# NOTE(review): the original file began with the lines "Spaces:" / "Running" /
# "Running" — a Hugging Face Spaces status banner accidentally pasted in.
# Preserved here as a comment because bare, they are not valid Python.
import json
import os
import pickle
from typing import Iterable

from langchain.retrievers import ParentDocumentRetriever
from langchain.schema import Document
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings
from tqdm import tqdm
# Embedding model used for all Chroma indexing below.
# NOTE(review): OpenAIEmbeddings is imported above but never used — this
# HuggingFace sentence-transformer is what the retriever is built with.
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")
def parent_retriever(chroma_path, embeddings, *,
                     parent_chunk_size=2000, parent_chunk_overlap=500,
                     child_chunk_size=300, child_chunk_overlap=50,
                     k=5):
    """Build a ParentDocumentRetriever backed by a persistent Chroma store.

    Large "parent" chunks are kept in an in-memory docstore while small
    "child" chunks are embedded into Chroma; retrieval matches on the small
    chunks but returns the enclosing parent chunk for fuller context.

    Args:
        chroma_path: Directory used by Chroma to persist the vector store.
        embeddings: Embedding function handed to Chroma (e.g. a
            HuggingFaceEmbeddings instance).
        parent_chunk_size / parent_chunk_overlap: Character-based splitting
            parameters for the large parent chunks (defaults: 2000 / 500).
        child_chunk_size / child_chunk_overlap: Splitting parameters for the
            small embedded child chunks (defaults: 300 / 50).
        k: Number of results requested per similarity search (default 5).

    Returns:
        A configured ParentDocumentRetriever. Note the docstore is purely
        in-memory — persist it separately (see save_to_pickle) if needed.
    """
    # Splitter that produces the large chunks actually returned to callers.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap)
    # Splitter that produces the small chunks embedded for similarity search.
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap)
    # The storage layer for the parent chunks.
    store = InMemoryStore()
    vectorstore = Chroma(collection_name="full_documents",
                         embedding_function=embeddings,
                         persist_directory=chroma_path)
    return ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={"k": k})
def save_to_pickle(obj, filename):
    """Serialize *obj* to *filename* using pickle's highest protocol."""
    with open(filename, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Retriever over the persisted project-README Chroma DB directory.
# NOTE(review): 'chorma' looks like a typo for 'chroma', but it matches the
# pickle filename written at the bottom of this script — confirm both sides
# before renaming the directory.
retriever_repos = parent_retriever('ohw_proj_chorma_db',embeddings=embedding)
def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Load LangChain Documents from a JSON-Lines file.

    Each non-blank line must be a JSON object whose keys match the
    ``Document`` constructor (e.g. ``page_content``, ``metadata``).

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        A list of ``Document`` objects, in file order.
    """
    documents = []
    # Explicit UTF-8: README text is frequently non-ASCII, and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                continue  # tolerate blank/trailing lines in the JSONL file
            documents.append(Document(**json.loads(line)))
    return documents
# Index every project README into the retriever, one document at a time so
# tqdm reports per-document progress, then persist the in-memory parent
# docstore so it can be reloaded alongside the persisted Chroma collection.
documents = load_docs_from_jsonl('project_readmes.json')
for doc in tqdm(documents):
    retriever_repos.add_documents([doc])
save_to_pickle(retriever_repos.docstore.store, 'ohw_proj_chorma_db.pcl')