"""
Question Answering with Retrieval QA and LangChain Language Models featuring FAISS vector stores.
This script uses the LangChain Language Model API to answer questions using Retrieval QA
and FAISS vector stores. It also uses the Mistral huggingface inference endpoint to
generate responses.
"""
import os

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI  # only needed if the OpenAI path below is re-enabled
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import BaseOutputParser, OutputParserException
from langchain.llms import HuggingFaceHub

from htmlTemplates import css, bot_template, user_template


class ReferenceOutputParser(BaseOutputParser):
    """Split an LLM answer of the form '<answer> References: <refs>' into parts."""

    def parse(self, text: str) -> dict:
        try:
            result, references = text.split("References:", 1)
            return {
                "result": result.strip(),
                "references": [ref.strip() for ref in references.split("\n") if ref.strip()],
            }
        except ValueError:
            raise OutputParserException(f"Could not parse output: {text}")
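
# Illustrative only: the parser assumes the model ends its answer with a
# literal "References:" section, e.g.
#
#   parser = ReferenceOutputParser()
#   parsed = parser.parse("Paris is the capital.\nReferences:\n- page 3")
#   # parsed == {"result": "Paris is the capital.", "references": ["- page 3"]}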


def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every uploaded PDF."""
    text = ""
    for pdf in pdf_docs:
        try:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""
        except Exception as e:
            st.error(f"Error extracting text from PDF: {e}")
    return text


def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
    )
    try:
        chunks = text_splitter.split_text(text)
    except Exception as e:
        st.error(f"Error splitting text into chunks: {e}")
        chunks = []
    return chunks
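
# Illustrative: with chunk_size=1500 and chunk_overlap=300, consecutive chunks
# share roughly 300 characters of context, so an answer that spans a chunk
# boundary is still retrievable from at least one chunk, e.g.
#
#   chunks = get_text_chunks(raw_text)
#   # each chunk is at most ~1500 characters, overlapping its neighbor by ~300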


def get_vectorstore(text_chunks):
    model = "BAAI/bge-base-en-v1.5"
    encode_kwargs = {
        # normalized vectors make inner-product scores behave like cosine similarity
        "normalize_embeddings": True
    }
    try:
        embeddings = HuggingFaceBgeEmbeddings(
            model_name=model, encode_kwargs=encode_kwargs, model_kwargs={"device": "cpu"}
        )
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    except Exception as e:
        st.error(f"Error creating vector store: {e}")
        vectorstore = None
    return vectorstore
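
# A quick way to sanity-check the index outside the chat chain (illustrative,
# not called anywhere in this app):
#
#   vs = get_vectorstore(["chunk one", "chunk two"])
#   docs = vs.similarity_search("some query", k=2)  # top-2 most similar chunks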


def get_conversation_chain(vectorstore):
    if vectorstore is None:
        return None
    try:
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-v0.3",
            model_kwargs={"temperature": 0.5, "max_length": 4000},
        )
        memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
        # Note: ConversationalRetrievalChain.from_llm() does not accept an
        # output_parser argument; the answer is parsed in handle_userinput instead.
        conversation_chain = ConversationalRetrievalChain.from_llm(
            llm=llm, retriever=vectorstore.as_retriever(), memory=memory
        )
    except Exception as e:
        st.error(f"Error creating conversation chain: {e}")
        conversation_chain = None
    return conversation_chain
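
# Sketch of how the chain is consumed (assuming the standard
# ConversationalRetrievalChain output keys "answer" and "chat_history"):
#
#   chain = get_conversation_chain(vectorstore)
#   response = chain({"question": "What is this document about?"})
#   print(response["answer"])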


def handle_userinput(user_question):
    if st.session_state.conversation is None:
        st.error("Please process the PDF files before asking a question.")
        return
    try:
        response = st.session_state.conversation({"question": user_question})
        st.session_state.chat_history = response["chat_history"]
        # The chain returns a dict; parse the "answer" text into result/references.
        parsed = ReferenceOutputParser().parse(response["answer"])
        st.write("//_^ User: " + user_question)
        st.write("πŸ€– ChatBot: " + parsed["result"])
        st.write("References:")
        for ref in parsed["references"]:
            st.write("- " + ref)
    except OutputParserException:
        # The model did not include a "References:" section; show the raw answer.
        st.write("//_^ User: " + user_question)
        st.write("πŸ€– ChatBot: " + response["answer"])
    except Exception as e:
        st.error(f"Error handling user input: {e}")


def main():
    load_dotenv()  # pick up HUGGINGFACEHUB_API_TOKEN from a .env file if present
    st.set_page_config(
        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
        page_icon=":books:",
    )
    st.markdown("# Chat with a Bot")
    st.markdown(
        "This bot tries to answer questions about multiple PDFs. "
        "Let the processing of the PDFs finish before asking your question. πŸ™πŸΎ"
    )
    st.write(css, unsafe_allow_html=True)

    huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
    # openai_api_key = st.text_input("Enter your OpenAI API key", type="password")
    if huggingface_token:
        os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
    # if openai_api_key:
    #     os.environ["OPENAI_API_KEY"] = openai_api_key

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Chat with a Bot πŸ€–πŸ¦Ύ that tries to answer questions about multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
        )
        if st.button("Process"):
            with st.spinner("Processing"):
                try:
                    raw_text = get_pdf_text(pdf_docs)           # extract raw text
                    text_chunks = get_text_chunks(raw_text)     # split into chunks
                    vectorstore = get_vectorstore(text_chunks)  # embed and index
                    st.session_state.conversation = get_conversation_chain(vectorstore)
                except Exception as e:
                    st.error(f"Error processing PDF files: {e}")


if __name__ == "__main__":
    main()