Prachidwi committed on
Commit
2afcc95
·
verified ·
1 Parent(s): 33ca07a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -37
app.py CHANGED
@@ -1,24 +1,25 @@
 
1
  import streamlit as st
2
  from dotenv import load_dotenv
3
  from PyPDF2 import PdfReader
4
  from langchain.text_splitter import CharacterTextSplitter
5
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
  from langchain.vectorstores import FAISS
7
- from langchain.chat_models import ChatOpenAI
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.chains import ConversationalRetrievalChain
10
- from htmlTemplates import css, bot_template, user_template
11
  from langchain.llms import HuggingFaceHub
 
12
 
13
  def get_pdf_text(pdf_docs):
14
  text = ""
15
  for pdf in pdf_docs:
16
- pdf_reader = PdfReader(pdf)
17
- for page in pdf_reader.pages:
18
- text += page.extract_text()
 
 
 
19
  return text
20
 
21
-
22
  def get_text_chunks(text):
23
  text_splitter = CharacterTextSplitter(
24
  separator="\n",
@@ -29,17 +30,23 @@ def get_text_chunks(text):
29
  chunks = text_splitter.split_text(text)
30
  return chunks
31
 
32
-
33
  def get_vectorstore(text_chunks):
34
- embeddings = OpenAIEmbeddings()
35
- # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
36
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
37
- return vectorstore
38
-
 
 
39
 
40
  def get_conversation_chain(vectorstore):
41
- llm = ChatOpenAI()
42
- # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
 
 
 
 
 
43
 
44
  memory = ConversationBufferMemory(
45
  memory_key='chat_history', return_messages=True)
@@ -50,25 +57,13 @@ def get_conversation_chain(vectorstore):
50
  )
51
  return conversation_chain
52
 
53
-
54
  def handle_userinput(user_question):
55
  response = st.session_state.conversation({'question': user_question})
56
  st.session_state.chat_history = response['chat_history']
57
 
58
- for i, message in enumerate(st.session_state.chat_history):
59
- if i % 2 == 0:
60
- st.write(user_template.replace(
61
- "{{MSG}}", message.content), unsafe_allow_html=True)
62
- else:
63
- st.write(bot_template.replace(
64
- "{{MSG}}", message.content), unsafe_allow_html=True)
65
-
66
-
67
  def main():
68
  load_dotenv()
69
- st.set_page_config(page_title="Chat with multiple PDFs",
70
- page_icon=":books:")
71
- st.write(css, unsafe_allow_html=True)
72
 
73
  if "conversation" not in st.session_state:
74
  st.session_state.conversation = None
@@ -89,16 +84,16 @@ def main():
89
  # get pdf text
90
  raw_text = get_pdf_text(pdf_docs)
91
 
92
- # get the text chunks
93
- text_chunks = get_text_chunks(raw_text)
94
-
95
- # create vector store
96
- vectorstore = get_vectorstore(text_chunks)
97
 
98
- # create conversation chain
99
- st.session_state.conversation = get_conversation_chain(
100
- vectorstore)
101
 
 
 
 
102
 
103
  if __name__ == '__main__':
104
- main()
 
1
+ import os
2
  import streamlit as st
3
  from dotenv import load_dotenv
4
  from PyPDF2 import PdfReader
5
  from langchain.text_splitter import CharacterTextSplitter
 
6
  from langchain.vectorstores import FAISS
 
7
  from langchain.memory import ConversationBufferMemory
8
  from langchain.chains import ConversationalRetrievalChain
 
9
  from langchain.llms import HuggingFaceHub
10
+ from langchain.embeddings import HuggingFaceEmbeddings
11
 
12
  def get_pdf_text(pdf_docs):
13
  text = ""
14
  for pdf in pdf_docs:
15
+ try:
16
+ pdf_reader = PdfReader(pdf)
17
+ for page in pdf_reader.pages:
18
+ text += page.extract_text()
19
+ except Exception as e:
20
+ st.error(f"Error reading {pdf.name}: {e}. Skipping this file.")
21
  return text
22
 
 
23
  def get_text_chunks(text):
24
  text_splitter = CharacterTextSplitter(
25
  separator="\n",
 
30
  chunks = text_splitter.split_text(text)
31
  return chunks
32
 
 
33
  def get_vectorstore(text_chunks):
34
+ try:
35
+ embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
36
+ vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding)
37
+ return vectorstore
38
+ except Exception as e:
39
+ st.error(f"Error creating vector store: {e}")
40
+ return None
41
 
42
  def get_conversation_chain(vectorstore):
43
+ # Fetch the HuggingFace API token from environment variable
44
+ api_token = os.getenv("HUGGINGFACE_API_TOKEN")
45
+ if not api_token:
46
+ st.error("HuggingFace API token not found. Please ensure it is set in the environment variables.")
47
+ return None
48
+
49
+ llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.5, "max_length": 512}, huggingfacehub_api_token=api_token)
50
 
51
  memory = ConversationBufferMemory(
52
  memory_key='chat_history', return_messages=True)
 
57
  )
58
  return conversation_chain
59
 
 
60
  def handle_userinput(user_question):
61
  response = st.session_state.conversation({'question': user_question})
62
  st.session_state.chat_history = response['chat_history']
63
 
 
 
 
 
 
 
 
 
 
64
  def main():
65
  load_dotenv()
66
+ st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
 
 
67
 
68
  if "conversation" not in st.session_state:
69
  st.session_state.conversation = None
 
84
  # get pdf text
85
  raw_text = get_pdf_text(pdf_docs)
86
 
87
+ if raw_text: # Proceed only if there is valid text
88
+ # get the text chunks
89
+ text_chunks = get_text_chunks(raw_text)
 
 
90
 
91
+ # create vector store
92
+ vectorstore = get_vectorstore(text_chunks)
 
93
 
94
+ if vectorstore: # Check if vectorstore is valid
95
+ # create conversation chain
96
+ st.session_state.conversation = get_conversation_chain(vectorstore)
97
 
98
  if __name__ == '__main__':
99
+ main()