"""Build a FAISS vector index from a folder of PDF files.

Running this script loads every PDF matching ``*.pdf`` in ``PDF_DATA_PATH``,
splits the loaded documents into overlapping text chunks, embeds the chunks
with OpenAI embeddings, and saves the resulting FAISS index to
``FAISS_INDEX_DIR``.  A commented-out example at the bottom shows how to
reload the index and query it through a RetrievalQA chain.
"""

import os

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from dotenv import load_dotenv

# Folder scanned for PDFs; update this to your PDF folder.
PDF_DATA_PATH = "/kaggle/input/rag-test"
# Directory the FAISS index is written to (and later reloaded from).
FAISS_INDEX_DIR = "faiss_index_sysml"
# Default chunking parameters handed to RecursiveCharacterTextSplitter.
DEFAULT_CHUNK_SIZE = 500
DEFAULT_CHUNK_OVERLAP = 20

# Load environment variable for OpenAI key
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("Missing OPENAI_API_KEY in environment variables.")


# Extract Data from the PDFs
def load_pdf_file(data_path: str, glob_pattern: str = "*.pdf") -> list:
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory to scan.
        glob_pattern: Glob used to select files inside ``data_path``.
            Defaults to ``"*.pdf"``, the pattern the script has always used.

    Returns:
        The documents returned by ``DirectoryLoader.load()`` with
        ``PyPDFLoader`` as the per-file loader.
    """
    loader = DirectoryLoader(data_path, glob=glob_pattern, loader_cls=PyPDFLoader)
    return loader.load()


# Split the data into chunks
def text_split(
    docs: list,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
) -> list:
    """Split documents into smaller, overlapping chunks.

    Args:
        docs: Documents to split (e.g. the output of ``load_pdf_file``).
        chunk_size: ``chunk_size`` passed to the splitter.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        The chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)


# Set up LLM and Embedding
# NOTE(review): "gpt-4o-mini" looks like a chat model while `OpenAI` is the
# completion-style wrapper -- confirm this pairing works with the installed
# langchain version before enabling the QA chain below (a chat-model wrapper
# may be required).
llm = OpenAI(model_name="gpt-4o-mini", temperature=0.5, openai_api_key=OPENAI_API_KEY)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Load PDF, chunk it, embed it, and store in FAISS
pdf_docs = load_pdf_file(PDF_DATA_PATH)
if not pdf_docs:
    # Fail early with an actionable message instead of handing an empty
    # corpus to the embedding / index-building step.
    raise ValueError(f"No PDF documents were loaded from {PDF_DATA_PATH!r}.")

chunks = text_split(pdf_docs)
if not chunks:
    # Documents were found but produced no text to index.
    raise ValueError(
        f"PDF documents in {PDF_DATA_PATH!r} produced no text chunks to index."
    )

vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local(FAISS_INDEX_DIR)

# Load FAISS and create retriever QA chain
# new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True)
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=new_vectorstore.as_retriever()
# )

# # Run a sample query
# query = "What is SysML used for?"
# print("User:", query)
# print("Bot:", qa.run(query))