"""Ingest posts.csv into a Pinecone vector index for semantic search.

Pipeline: load one document per CSV row (sourced by "Post Title"),
split each into <=4000-character chunks, embed with OpenAI, and upsert
the chunks into the existing Pinecone index "twimbit-answer".

Required environment variables:
    OPENAI_API_KEY      read implicitly by OpenAIEmbeddings
    PINECONE_API_KEY    Pinecone credential (find at app.pinecone.io)
    PINECONE_API_ENV    Pinecone environment (defaults to asia-southeast1-gcp)
"""
import os

from langchain.document_loaders import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

import pinecone

# Load data: one Document per CSV row, with the post title as its source.
loader = CSVLoader(
    file_path="./posts.csv",
    source_column="Post Title",
    encoding="utf-8",
)
raw_documents = loader.load()

# Split into chunks small enough to embed; no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

embeddings = OpenAIEmbeddings()

# SECURITY: credentials come from the environment, never from source code.
# (A previous revision hardcoded a live Pinecone key here — rotate that key.)
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "asia-southeast1-gcp")

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_API_ENV,
)

index_name = "twimbit-answer"

# Embed every chunk and upsert into the (pre-existing) Pinecone index.
Pinecone.from_texts(
    [doc.page_content for doc in documents],
    embeddings,
    index_name=index_name,
)

# To query later:
#   docsearch = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)
#   docs = docsearch.similarity_search("How many neo banks are in india ?", include_metadata=True)