# SysModeler-Chatbot / vdb_script / faiss_vdb_script.py
# Uploaded by SysModeler ("Upload 6 files", commit f979d1d, verified, 1.81 kB)
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment, then
# fail fast if the OpenAI key is absent so API calls don't error cryptically.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("Missing OPENAI_API_KEY in environment variables.")
# Extract data from the PDFs
def load_pdf_file(data_path, glob_pattern="*.pdf"):
    """Load documents from a directory of PDF files.

    Args:
        data_path: Directory to scan for PDF files.
        glob_pattern: Filename pattern to match; defaults to top-level
            ``*.pdf`` (pass e.g. ``"**/*.pdf"`` to recurse).

    Returns:
        list: LangChain ``Document`` objects, one per loaded PDF page.
    """
    loader = DirectoryLoader(data_path, glob=glob_pattern, loader_cls=PyPDFLoader)
    return loader.load()
# Split the data into chunks
def text_split(docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping character chunks for embedding.

    Args:
        docs: LangChain ``Document`` objects to split.
        chunk_size: Maximum characters per chunk (default 500, as before).
        chunk_overlap: Characters shared between adjacent chunks (default 20).

    Returns:
        list: The chunked ``Document`` objects.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)
# Set up LLM and embedding model.
# NOTE(review): "gpt-4o-mini" is a chat-completions model; the legacy
# `OpenAI` (text-completion) wrapper may reject it — confirm, and consider
# `ChatOpenAI` instead. `llm` is only consumed by the commented-out
# RetrievalQA example below, so the index build works regardless.
llm = OpenAI(model_name="gpt-4o-mini", temperature=0.5, openai_api_key=OPENAI_API_KEY)
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Load PDFs, chunk them, embed the chunks, and persist the FAISS index.
# The source folder can be overridden via the DATA_PATH environment
# variable; the default preserves the original Kaggle input path.
pdf_docs = load_pdf_file(os.getenv("DATA_PATH", "/kaggle/input/rag-test"))
chunks = text_split(pdf_docs)
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("faiss_index_sysml")
# Load FAISS and create retriever QA chain
# new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True)
# qa = RetrievalQA.from_chain_type(
# llm=llm,
# chain_type="stuff",
# retriever=new_vectorstore.as_retriever()
# )
# # Run a sample query
# query = "What is SysML used for?"
# print("User:", query)
# print("Bot:", qa.run(query))