# DocuSage / chain.py
# Author: nav13n — first commit (c78c360)
# Langchain imports
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from dotenv import load_dotenv

# Load API credentials (notably OPENAI_API_KEY) from a local .env file
# before any OpenAI-backed component is constructed below.
load_dotenv()
######################## Build RAG Chain #############################
######################################################################
#### Load Documents
# Meta's 10-K filing (PDF), fetched from the SEC filing CDN at import time.
# NOTE(review): this is a network fetch on every run — consider downloading
# the PDF once and loading it from ./data/ instead.
loader = PyMuPDFLoader(
    "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf"
)
documents = loader.load()
#### Split Documents
# Chunk each page so every piece fits comfortably in the embedding window;
# the 100-character overlap keeps context continuous across chunk edges.
_split_config = {"chunk_size": 800, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**_split_config)
documents = text_splitter.split_documents(documents)

# Embedding model used to vectorize every chunk for retrieval.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
### Create Vector Store
# Embed every chunk and index it in an in-memory Qdrant collection.
# ":memory:" means the index lives only for this process and is rebuilt
# (re-embedding all chunks, which calls the OpenAI API) on every run.
vector_store = Qdrant.from_documents(
    documents,
    embeddings,
    location=":memory:",
    collection_name="Meta 10k Filings",
)
### Create Prompt Template
# Grounded-answer prompt: the model must answer only from the retrieved
# context and explicitly refuse ("I don't know") when the context is
# insufficient — this curbs hallucination on out-of-scope questions.
template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':
Context:
{context}
Question:
{question}
"""
prompt = ChatPromptTemplate.from_template(template)
def format_docs(docs):
    """Concatenate retrieved documents into one prompt-ready context string.

    Each document's ``page_content`` is joined with a blank line between
    chunks so distinct passages stay visually separated for the LLM.
    """
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)
### Setup RAG Chain
# Retriever: similarity search with a relevance-score floor — chunks whose
# relevance score falls below 0.6 are dropped, and at most 8 chunks are
# returned per query.
retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
search_kwargs={"score_threshold": 0.6, "k":8})
# temperature=0 for deterministic, extraction-style answers.
primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
# LCEL pipeline: the incoming question is passed through unchanged as
# {question}, while the retriever fetches and format_docs flattens the
# matching chunks into {context}; the filled prompt goes to the LLM and
# StrOutputParser extracts the plain-text answer.
rag_chain = (
{"context": retriever | format_docs, "question": RunnablePassthrough()}
| prompt
| primary_qa_llm
| StrOutputParser()
)