File size: 3,012 Bytes
f979d1d
2db996e
f979d1d
 
2db996e
 
 
f979d1d
2db996e
 
 
 
 
 
f979d1d
2db996e
 
 
 
 
 
 
 
 
 
 
 
 
f979d1d
2db996e
f979d1d
 
 
 
 
 
 
 
 
2db996e
f979d1d
 
 
2db996e
 
 
 
 
 
 
 
 
 
 
 
 
 
f979d1d
 
2db996e
f979d1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.agents import Tool, AgentExecutor
from langchain.tools.retriever import create_retriever_tool
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import AzureOpenAIEmbeddings
from langchain_community.chat_models import AzureChatOpenAI
from openai import AzureOpenAI
import warnings
 
# Load environment variables required for Azure OpenAI access.
load_dotenv()
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

# Fail fast with a single clear error if any required setting is absent.
# (The previous version re-fetched and re-checked AZURE_OPENAI_API_KEY a
# second time below; that duplicate check is covered by this one.)
if not all([AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_LLM_DEPLOYMENT, AZURE_OPENAI_EMBEDDING_DEPLOYMENT]):
    raise ValueError("Missing one or more Azure OpenAI environment variables.")

# NOTE(review): this suppresses ALL warnings, including deprecation notices
# from langchain — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Shared chunk size: used both for text splitting and as the embedding
# client's per-request batch cap.
chunk_size = 500

# Extract Data from the PDFs
def load_pdf_file(data_path):
    """Load every ``*.pdf`` file found under *data_path*.

    Returns the list of documents produced by ``PyPDFLoader`` for each file.
    """
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Split the data into chunks
def text_split(docs):
    """Split *docs* into overlapping chunks.

    Chunk length comes from the module-level ``chunk_size``; consecutive
    chunks overlap by 20 characters.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=20,
    )
    return chunker.split_documents(docs)

# Set up LLM and Embedding
# Chat-model client bound to the configured Azure OpenAI LLM deployment.
llm = AzureChatOpenAI(
    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version="2023-12-01-preview"  # or your supported version
    # temperature=0.5  # Only if supported by your deployment
)
# Embedding client for the same Azure endpoint; chunk_size here is the
# client's per-request batch size, reusing the module-level constant.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    openai_api_version="2023-12-01-preview",
    chunk_size=chunk_size  # or another value up to 2048
)

# Load PDF, chunk it, embed it, and store in FAISS
pdf_docs = load_pdf_file("Dataset/")  # Update this to your PDF folder
chunks = text_split(pdf_docs)

# Embed every chunk and persist the FAISS index to disk so later runs can
# reload it instead of re-embedding the PDFs.
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("faiss_index_sysml")

# Load FAISS and create retriever QA chain (disabled; uncommenting also
# requires `from langchain.chains import RetrievalQA`, which this file
# does not currently import)
# new_vectorstore = FAISS.load_local("faiss_index_sysml", embeddings, allow_dangerous_deserialization=True)
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=new_vectorstore.as_retriever()
# )

# # Run a sample query
# query = "What is SysML used for?"
# print("User:", query)
# print("Bot:", qa.run(query))