Upload faiss_vdb_script.py
vdb_script/faiss_vdb_script.py  CHANGED  (+40 -13)
@@ -1,18 +1,33 @@
 import os
+from dotenv import load_dotenv
 from langchain.document_loaders import PyPDFLoader, DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.
+from langchain.agents import Tool, AgentExecutor
+from langchain.tools.retriever import create_retriever_tool
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_community.vectorstores import FAISS
-from
-from
-from
-
-
+from langchain_community.embeddings import AzureOpenAIEmbeddings
+from langchain_community.chat_models import AzureChatOpenAI
+from openai import AzureOpenAI
+import warnings
+
+# Load environment variables
 load_dotenv()
+AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+AZURE_OPENAI_LLM_DEPLOYMENT = os.getenv("AZURE_OPENAI_LLM_DEPLOYMENT")
+AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
+
+if not all([AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_LLM_DEPLOYMENT, AZURE_OPENAI_EMBEDDING_DEPLOYMENT]):
+    raise ValueError("Missing one or more Azure OpenAI environment variables.")
+
+warnings.filterwarnings("ignore")
+
+AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+if not AZURE_OPENAI_API_KEY:
+    raise ValueError("Missing AZURE_OPENAI_API_KEY in environment variables.")
 
-
-if not OPENAI_API_KEY:
-    raise ValueError("Missing OPENAI_API_KEY in environment variables.")
+chunk_size = 500
 
 # Extract Data from the PDFs
 def load_pdf_file(data_path):
@@ -22,15 +37,27 @@ def load_pdf_file(data_path):
 
 # Split the data into chunks
 def text_split(docs):
-    splitter = RecursiveCharacterTextSplitter(chunk_size=
+    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=20)
     return splitter.split_documents(docs)
 
 # Set up LLM and Embedding
-llm =
-
+llm = AzureChatOpenAI(
+    deployment_name=AZURE_OPENAI_LLM_DEPLOYMENT,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+    openai_api_key=AZURE_OPENAI_API_KEY,
+    openai_api_version="2023-12-01-preview"  # or your supported version
+    # temperature=0.5  # Only if supported by your deployment
+)
+embeddings = AzureOpenAIEmbeddings(
+    azure_deployment=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
+    azure_endpoint=AZURE_OPENAI_ENDPOINT,
+    openai_api_key=AZURE_OPENAI_API_KEY,
+    openai_api_version="2023-12-01-preview",
+    chunk_size=chunk_size  # or another value up to 2048
+)
 
 # Load PDF, chunk it, embed it, and store in FAISS
-pdf_docs = load_pdf_file("/
+pdf_docs = load_pdf_file("Dataset/")  # Update this to your PDF folder
 chunks = text_split(pdf_docs)
 
 vectorstore = FAISS.from_documents(chunks, embeddings)
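
The updated script reads four Azure OpenAI variables from the environment via load_dotenv(). A minimal .env sketch for reference; every value below is a placeholder, and only the variable names come from the script:

# .env (placeholders only; never commit real keys)
AZURE_OPENAI_API_KEY=<your-api-key>
AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com/
AZURE_OPENAI_LLM_DEPLOYMENT=<your-chat-deployment-name>
AZURE_OPENAI_EMBEDDING_DEPLOYMENT=<your-embedding-deployment-name>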
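
As committed, the script builds the FAISS index in memory but never persists or queries it. A minimal follow-up sketch using the same langchain_community FAISS wrapper; the directory name "faiss_index", the k value, and the query string are arbitrary choices, and allow_dangerous_deserialization is only needed (and only accepted) on newer langchain_community releases:

# Persist the index so another process (e.g. the Space's app) can reuse it.
vectorstore.save_local("faiss_index")  # directory name is arbitrary

# Reload with the same embedding model. Recent langchain_community versions
# require allow_dangerous_deserialization=True because loading unpickles data.
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Query: retrieve the top-3 most similar chunks for a question.
retriever = db.as_retriever(search_kwargs={"k": 3})
docs = retriever.get_relevant_documents("What topics do the PDFs cover?")
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:100])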
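
Several of the newly added imports (Tool, AgentExecutor, create_retriever_tool, ChatPromptTemplate, MessagesPlaceholder) are not used anywhere in this file yet. One common way they get wired together, sketched under the assumption of langchain >= 0.1, which provides create_openai_tools_agent (not imported in this diff; the tool name and prompt text are illustrative):

from langchain.agents import AgentExecutor, create_openai_tools_agent

# Expose the FAISS retriever as a tool the agent may call; the name and
# description are our own, not from the original script.
pdf_tool = create_retriever_tool(
    retriever,
    "pdf_search",
    "Search the indexed PDF documents for passages relevant to a query.",
)

prompt = ChatPromptTemplate.from_messages([
    ("system", "Use the pdf_search tool to answer questions about the documents."),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),  # tool-call history
])

agent = create_openai_tools_agent(llm, [pdf_tool], prompt)
executor = AgentExecutor(agent=agent, tools=[pdf_tool])
result = executor.invoke({"input": "Summarize the key points in the PDFs."})
print(result["output"])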