Spaces:

Skier8402
/

mistral-PDF-chat

Running

App Files Files Community

mistral-PDF-chat / app.py

Skier8402

Update app.py

69e77a2 verified about 1 year ago

raw

history blame

5.93 kB

	"""
	Question Answering with Retrieval QA and LangChain Language Models featuring FAISS vector stores.
	This script uses the LangChain Language Model API to answer questions using Retrieval QA
	and FAISS vector stores. It also uses the Mistral huggingface inference endpoint to
	generate responses.
	"""

	import os

	import streamlit as st
	from dotenv import load_dotenv
	from langchain.chains import ConversationalRetrievalChain
	from langchain.chat_models import ChatOpenAI
	from langchain.embeddings import HuggingFaceBgeEmbeddings
	from langchain.llms import HuggingFaceHub
	from langchain.memory import ConversationBufferMemory
	from langchain.schema import AIMessage, ChatGeneration
	from langchain.schema import BaseOutputParser, OutputParserException
	from langchain.text_splitter import CharacterTextSplitter
	from langchain.vectorstores import FAISS
	from PyPDF2 import PdfReader

	from htmlTemplates import css, bot_template, user_template

	class ReferenceOutputParser(BaseOutputParser[ChatGeneration]):
	    """Split an LLM completion into an answer and a list of references.

	    The completion is expected to contain the literal marker ``References:``.
	    Everything before the first marker is the answer; every non-blank line
	    after it is one reference.
	    """

	    def parse(self, text: str) -> ChatGeneration:
	        """Parse ``text`` into a ``ChatGeneration``.

	        Args:
	            text: Raw completion text produced by the language model.

	        Returns:
	            A ``ChatGeneration`` whose ``message`` holds the stripped answer
	            and whose ``generation_info["references"]`` holds the stripped,
	            non-empty reference lines.

	        Raises:
	            OutputParserException: If ``text`` has no ``References:`` marker.
	        """
	        # partition() splits on the first marker only, so a marker repeated
	        # later in the text cannot break the parse the way tuple-unpacking
	        # the result of an unbounded split() does.
	        answer, marker, tail = text.partition("References:")
	        if not marker:
	            raise OutputParserException(f"Could not parse output: {text}")

	        references = [ref.strip() for ref in tail.split("\n") if ref.strip()]
	        # ChatGeneration requires a ``message``; extra data belongs in
	        # ``generation_info``.
	        return ChatGeneration(
	            message=AIMessage(content=answer.strip()),
	            generation_info={"references": references},
	        )


	def get_pdf_text(pdf_docs):
	    """Concatenate the extracted text of every page of every given PDF.

	    Args:
	        pdf_docs: Iterable of PDF sources, each passed to ``PdfReader``.

	    Returns:
	        One string with all page texts joined in order, with no separator.
	        A PDF that fails to read is reported through ``st.error`` and
	        skipped; text already extracted from its earlier pages is kept.
	    """
	    page_texts = []
	    for pdf in pdf_docs:
	        try:
	            for page in PdfReader(pdf).pages:
	                # Guard against a page yielding no text so that a single
	                # textless page cannot abort the rest of the file.
	                page_texts.append(page.extract_text() or "")
	        except Exception as e:
	            st.error(f"Error extracting text from PDF: {e}")
	    # Join once instead of growing a string with += on every page.
	    return "".join(page_texts)

	def get_text_chunks(text):
	    """Split ``text`` into overlapping chunks on newline boundaries.

	    Args:
	        text: The full text to split.

	    Returns:
	        The list produced by the splitter (chunk size 1500, overlap 300,
	        measured with ``len``), or an empty list if splitting raises; the
	        failure is reported through ``st.error``.
	    """
	    splitter = CharacterTextSplitter(
	        separator="\n",
	        chunk_size=1500,
	        chunk_overlap=300,
	        length_function=len,
	    )
	    try:
	        return splitter.split_text(text)
	    except Exception as e:
	        st.error(f"Error splitting text into chunks: {e}")
	        return []

	def get_vectorstore(text_chunks):
	    """Embed ``text_chunks`` and index them in a FAISS vector store.

	    Args:
	        text_chunks: List of text chunks to embed.

	    Returns:
	        The FAISS vector store, or ``None`` when there is nothing to index
	        or when embedding/indexing fails. Either case is reported through
	        ``st.error``.
	    """
	    # Fail fast with a clear message: indexing an empty list would first
	    # load the embedding model and then fail with an unhelpful error.
	    if not text_chunks:
	        st.error(
	            "Error creating vector store: no text could be extracted "
	            "from the uploaded PDFs."
	        )
	        return None

	    model = "BAAI/bge-base-en-v1.5"
	    encode_kwargs = {"normalize_embeddings": True}
	    try:
	        embeddings = HuggingFaceBgeEmbeddings(
	            model_name=model,
	            encode_kwargs=encode_kwargs,
	            model_kwargs={"device": "cpu"},
	        )
	        return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
	    except Exception as e:
	        st.error(f"Error creating vector store: {e}")
	        return None

	def get_conversation_chain(vectorstore):
	    """Build a conversational retrieval chain over ``vectorstore``.

	    Args:
	        vectorstore: Vector store exposing ``as_retriever()``, or ``None``.

	    Returns:
	        A ``ConversationalRetrievalChain`` backed by the Mistral model on
	        the Hugging Face Hub with buffer memory under ``chat_history``, or
	        ``None`` if ``vectorstore`` is ``None`` or construction fails (the
	        failure is reported through ``st.error``).
	    """
	    if vectorstore is None:
	        return None

	    try:
	        llm = HuggingFaceHub(
	            repo_id="mistralai/Mistral-7B-v0.3",
	            model_kwargs={"temperature": 0.5, "max_length": 4000},
	        )
	        memory = ConversationBufferMemory(
	            memory_key="chat_history", return_messages=True
	        )
	        # No ``output_parser`` is passed: from_llm forwards unknown keyword
	        # arguments to the chain constructor, which rejects extra fields,
	        # so passing one makes chain creation fail every time. Any
	        # post-processing of the answer must be done on the chain's output.
	        return ConversationalRetrievalChain.from_llm(
	            llm=llm,
	            retriever=vectorstore.as_retriever(),
	            memory=memory,
	        )
	    except Exception as e:
	        st.error(f"Error creating conversation chain: {e}")
	        return None

	def handle_userinput(user_question):
	    """Send ``user_question`` to the stored chain and render the reply.

	    Reads the chain from ``st.session_state.conversation``; if it is
	    ``None`` an error is shown and nothing else happens. Otherwise the
	    chain is called, ``st.session_state.chat_history`` is updated from the
	    response, and the question, the answer and any references are written
	    to the page. Any exception is reported through ``st.error``.

	    Args:
	        user_question: The question text entered by the user.
	    """
	    conversation = st.session_state.conversation
	    if conversation is None:
	        st.error("Please process the PDF files before asking a question.")
	        return

	    try:
	        response = conversation({"question": user_question})
	        st.session_state.chat_history = response["chat_history"]

	        # The chain returns a mapping, so the answer is read by key rather
	        # than by attribute. An optional trailing "References:" section is
	        # split off; without the marker the reference list is empty.
	        answer, _, tail = response["answer"].partition("References:")
	        references = [ref.strip() for ref in tail.split("\n") if ref.strip()]

	        st.write("//_^ User: " + user_question)
	        st.write("🤖 ChatBot: " + answer.strip())
	        # Only show the heading when there is something to list under it.
	        if references:
	            st.write("References:")
	            for ref in references:
	                st.write("- " + ref)
	    except Exception as e:
	        st.error(f"Error handling user input: {e}")

	def main():
	    """Render the Streamlit page and wire the PDF question-answering flow.

	    Sets up the page, collects the Hugging Face token, initialises the
	    ``conversation`` and ``chat_history`` session keys, answers the current
	    question (if any), and rebuilds the conversation chain from the
	    uploaded PDFs when "Process" is clicked in the sidebar.
	    """
	    st.set_page_config(
	        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
	        page_icon=":books:",
	    )

	    st.markdown("# Chat with a Bot")
	    st.markdown("This bot tries to answer questions about multiple PDFs. Let the processing of the PDF finish before adding your question. 🙏🏾")

	    st.write(css, unsafe_allow_html=True)

	    huggingface_token = st.text_input("Enter your HuggingFace Hub token", type="password")
	    #openai_api_key = st.text_input("Enter your OpenAI API key", type="password")

	    if huggingface_token:
	        # NOTE(review): os.environ is shared by the whole server process, so
	        # on a multi-user deployment the most recently entered token is
	        # presumably used for every session. Confirm, and consider passing
	        # the token to the LLM client directly instead.
	        os.environ["HUGGINGFACEHUB_API_TOKEN"] = huggingface_token
	    #if openai_api_key:
	    # os.environ["OPENAI_API_KEY"] = openai_api_key

	    # Make sure both session keys exist before anything reads them.
	    for key in ("conversation", "chat_history"):
	        if key not in st.session_state:
	            st.session_state[key] = None

	    st.header("Chat with a Bot 🤖🦾 that tries to answer questions about multiple PDFs :books:")
	    user_question = st.text_input("Ask a question about your documents:")
	    if user_question:
	        handle_userinput(user_question)

	    with st.sidebar:
	        st.subheader("Your documents")
	        pdf_docs = st.file_uploader(
	            "Upload your PDFs here and click on 'Process'",
	            type=["pdf"],  # reject non-PDF uploads at the widget
	            accept_multiple_files=True,
	        )
	        if st.button("Process"):
	            if not pdf_docs:
	                # Nothing to process: say so instead of running the
	                # pipeline on empty input and surfacing downstream errors.
	                st.warning("Please upload at least one PDF before clicking 'Process'.")
	            else:
	                with st.spinner("Processing"):
	                    try:
	                        # get pdf text
	                        raw_text = get_pdf_text(pdf_docs)

	                        # get the text chunks
	                        text_chunks = get_text_chunks(raw_text)

	                        # create vector store
	                        vectorstore = get_vectorstore(text_chunks)

	                        # create conversation chain
	                        st.session_state.conversation = get_conversation_chain(vectorstore)
	                    except Exception as e:
	                        st.error(f"Error processing PDF files: {e}")

	# Run the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()