# Spaces: Sleeping  (appears to be page-status text captured along with the script; not code)
import os | |
from langchain.document_loaders import PyPDFLoader, DirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings.openai import OpenAIEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.chains import RetrievalQA | |
from langchain.llms import OpenAI | |
from dotenv import load_dotenv | |
# Read the OpenAI key from the environment; load_dotenv() runs first so any
# values it loads are visible to the lookup below.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    # Stop at module load: everything below is constructed with this key.
    raise ValueError("Missing OPENAI_API_KEY in environment variables.")
# Extract data from the PDFs
def load_pdf_file(data_path):
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory handed to ``DirectoryLoader``; files are
            selected with the glob pattern ``*.pdf`` and read with
            ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    return DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()
# Split the data into chunks
def text_split(docs, *, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents passed to ``split_documents``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously
            hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
# Set up LLM and Embedding
# NOTE(review): "gpt-4o-mini" looks like a chat model while `OpenAI` is the
# completion-style wrapper -- confirm this pairing behaves as intended with
# the installed LangChain version.
llm = OpenAI(
    model_name="gpt-4o-mini",
    temperature=0.5,
    openai_api_key=OPENAI_API_KEY,
)
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)
# Load PDF, chunk it, embed it, and store in FAISS
pdf_docs = load_pdf_file("/kaggle/input/rag-test")  # Update this to your PDF folder
chunks = text_split(pdf_docs)
if not chunks:
    # An empty chunk list gives the index nothing to embed; fail here with an
    # actionable message instead of an opaque error during index construction.
    raise ValueError(
        "No text chunks were produced from the PDF folder; "
        "check the path and that it contains readable PDF files."
    )
vectorstore = FAISS.from_documents(chunks, embeddings)
# Persist the index to the local "faiss_index_sysml" location for later reuse.
vectorstore.save_local("faiss_index_sysml")
# Load FAISS and create retriever QA chain
# new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True)
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=new_vectorstore.as_retriever()
# )
# # Run a sample query
# query = "What is SysML used for?"
# print("User:", query)
# print("Bot:", qa.run(query))