SysModeler committed
Commit 2db996e · verified · 1 Parent(s): 58d26b3

Upload faiss_vdb_script.py

Files changed (1):
vdb_script/faiss_vdb_script.py +40 −13
vdb_script/faiss_vdb_script.py CHANGED
@@ -1,18 +1,33 @@
 import os
+from dotenv import load_dotenv
 from langchain.document_loaders import PyPDFLoader, DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.agents import Tool, AgentExecutor
+from langchain.tools.retriever import create_retriever_tool
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_community.vectorstores import FAISS
-from langchain.chains import RetrievalQA
-from langchain.llms import OpenAI
-from dotenv import load_dotenv
-
-# Load environment variable for OpenAI key
+from langchain_community.embeddings import AzureOpenAIEmbeddings
+from langchain_community.chat_models import AzureChatOpenAI
+from openai import AzureOpenAI
+import warnings
+
+# Load environment variables
 load_dotenv()
+AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
+AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
+
+if not all([AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_LLM_DEPLOYMENT, AZURE_OPENAI_EMBEDDING_DEPLOYMENT]):
+    raise ValueError("Missing one or more Azure OpenAI environment variables.")
+
+warnings.filterwarnings("ignore")
+
+AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+if not AZURE_OPENAI_API_KEY:
+    raise ValueError("Missing AZURE_OPENAI_API_KEY in environment variables.")
 
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-if not OPENAI_API_KEY:
-    raise ValueError("Missing OPENAI_API_KEY in environment variables.")
+chunk_size = 500
 
 # Extract Data from the PDFs
 def load_pdf_file(data_path):
@@ -22,15 +37,27 @@ def load_pdf_file(data_path):
 
 # Split the data into chunks
 def text_split(docs):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
+    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
     return splitter.split_documents(docs)
 
 # Set up LLM and Embedding
-llm = OpenAI(model_name="gpt-4o-mini", temperature=0.5, openai_api_key=OPENAI_API_KEY)
-embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
+llm = AzureChatOpenAI(
+    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+    openai_api_key=AZURE_OPENAI_API_KEY,
+    openai_api_version="2023-12-01-preview"  # or your supported version
+    # temperature=0.5  # Only if supported by your deployment
+)
+embeddings = AzureOpenAIEmbeddings(
+    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+    openai_api_key=AZURE_OPENAI_API_KEY,
+    openai_api_version="2023-12-01-preview",
+    chunk_size=chunk_size  # or another value up to 2048
+)
 
 # Load PDF, chunk it, embed it, and store in FAISS
-pdf_docs = load_pdf_file("/kaggle/input/rag-test")  # Update this to your PDF folder
+pdf_docs = load_pdf_file("Dataset/")  # Update this to your PDF folder
 chunks = text_split(pdf_docs)
 
 vectorstore = FAISS.from_documents(chunks, embeddings)
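
Note: the script now reads four Azure OpenAI variables through python-dotenv. For reference, a minimal .env sketch matching the os.getenv names above; every value is a placeholder, not from the commit:

    AZURE_OPENAI_API_KEY=<your-api-key>
    AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
    AZURE_OPENAI_LLM_DEPLOYMENT=<your-chat-deployment-name>
    AZURE_OPENAI_EMBEDDING_DEPLOYMENT=<your-embedding-deployment-name>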
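The script ends with the index only in memory. A minimal sketch of the usual next steps, using FAISS methods from langchain_community; the folder name "faiss_index" and the query string are hypothetical:

    # Persist the index to disk so it can be reused without re-embedding.
    vectorstore.save_local("faiss_index")

    # Reload it later with the same embedding model; the flag acknowledges that
    # FAISS indexes are pickled and should only be loaded from trusted sources.
    vectorstore = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )

    # Retrieve the chunks most similar to a query.
    for doc in vectorstore.similarity_search("What is this dataset about?", k=3):
        print(doc.page_content[:200])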
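The commit imports Tool, AgentExecutor, create_retriever_tool, ChatPromptTemplate, and MessagesPlaceholder but never uses them, so the agent wiring is presumably planned for a later commit. One plausible sketch, assuming create_openai_tools_agent is available in this langchain version; the tool name, description, and prompt text are hypothetical:

    from langchain.agents import create_openai_tools_agent  # assumption: not imported by the commit

    # Expose the FAISS index as a retriever tool the agent can call.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    pdf_tool = create_retriever_tool(
        retriever,
        "pdf_search",                          # hypothetical tool name
        "Search the indexed PDF documents.",   # hypothetical description
    )

    # The prompt must include an agent_scratchpad slot for tool-call bookkeeping.
    prompt = ChatPromptTemplate.from_messages([
        ("system", "Use pdf_search to answer questions about the documents."),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])

    agent = create_openai_tools_agent(llm, [pdf_tool], prompt)
    executor = AgentExecutor(agent=agent, tools=[pdf_tool])
    print(executor.invoke({"input": "Summarize the PDFs."})["output"])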