"""Build a FAISS vector index from a folder of PDF files using Azure OpenAI.

Running this module:

1. reads the Azure OpenAI settings from the environment (optionally via a
   ``.env`` file),
2. loads every ``*.pdf`` found directly inside ``PDF_DATA_DIR``,
3. splits the loaded documents into overlapping text chunks,
4. embeds the chunks with the configured Azure OpenAI embedding deployment,
5. saves the resulting FAISS index to ``FAISS_INDEX_DIR``.
"""

import os
import warnings

from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.agents import Tool, AgentExecutor
from langchain.tools.retriever import create_retriever_tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import AzureOpenAIEmbeddings
from langchain_community.chat_models import AzureChatOpenAI
from openai import AzureOpenAI

# Load environment variables
load_dotenv()

AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast, and say *which* settings are missing, before any client is built.
_required_settings = {
    "AZURE_OPENAI_API_KEY": AZURE_OPENAI_API_KEY,
    "AZURE_OPENAI_ENDPOINT": AZURE_OPENAI_ENDPOINT,
    "AZURE_OPENAI_LLM_DEPLOYMENT": AZURE_OPENAI_LLM_DEPLOYMENT,
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT": AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise ValueError(
        "Missing one or more Azure OpenAI environment variables: "
        + ", ".join(_missing_settings)
    )

# NOTE: this silences *every* warning for the rest of the process, including
# deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

# API version sent with both the chat and the embedding requests
# (change to a version your Azure resource supports).
AZURE_OPENAI_API_VERSION = "2023-12-01-preview"

# Text-splitter settings: target chunk length and the overlap between
# neighbouring chunks.
chunk_size = 500
CHUNK_OVERLAP = 20

# Value passed as the embeddings client's ``chunk_size``. In LangChain's
# OpenAI embedding classes this is the number of texts sent per request
# batch, not a text length, so it is kept separate from the splitter's
# ``chunk_size`` above (or another value up to 2048).
EMBEDDING_BATCH_SIZE = 500

PDF_DATA_DIR = "Dataset/"  # Update this to your PDF folder
FAISS_INDEX_DIR = "faiss_index_sysml"


# Extract Data from the PDFs
def load_pdf_file(data_path: str) -> list:
    """Load every PDF found directly inside ``data_path``.

    Args:
        data_path: Directory that is scanned with the glob ``*.pdf``.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()`` using
        ``PyPDFLoader`` for each matching file.
    """
    loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return loader.load()


# Split the data into chunks
def text_split(docs) -> list:
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents as returned by ``load_pdf_file``.

    Returns:
        The chunked documents produced by ``RecursiveCharacterTextSplitter``
        configured with the module-level ``chunk_size`` and ``CHUNK_OVERLAP``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return splitter.split_documents(docs)


# Set up LLM and Embedding
llm = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    # temperature=0.5  # Only if supported by your deployment
)

embeddings = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    chunk_size=EMBEDDING_BATCH_SIZE,
)

# Load PDF, chunk it, embed it, and store in FAISS
pdf_docs = load_pdf_file(PDF_DATA_DIR)
chunks = text_split(pdf_docs)
if not chunks:
    # Stop here with a clear message rather than handing FAISS an empty
    # corpus, for which there is nothing to index.
    raise ValueError(
        f"No text chunks were produced from the PDFs in {PDF_DATA_DIR!r}; "
        "check that the folder exists and contains readable PDF files."
    )
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local(FAISS_INDEX_DIR)

# Example (kept disabled): load FAISS and create a retriever QA chain.
# NOTE(review): `RetrievalQA` is not imported in this file; it would need to
# be imported (presumably from `langchain.chains`) before enabling this.
#
# new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True)
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=new_vectorstore.as_retriever()
# )
#
# # Run a sample query
# query = "What is SysML used for?"
# print("User:", query)
# print("Bot:", qa.run(query))