samim2024 committed on
Commit
20dd456
·
verified ·
1 Parent(s): 46abc5f

Update app.py: accept XLS/XLSX, DOC/DOCX, and TXT uploads in addition to PDF

Browse files
Files changed (1) hide show
  1. app.py +42 -14
app.py CHANGED
@@ -15,6 +15,8 @@ import faiss
15
  import uuid
16
  from dotenv import load_dotenv
17
  import requests
 
 
18
 
19
  # Load environment variables
20
  load_dotenv()
@@ -32,36 +34,60 @@ if "history" not in st.session_state:
32
  if "authenticated" not in st.session_state:
33
  st.session_state.authenticated = False
34
 
35
- # PDF processing logic
36
  def process_input(input_data):
37
  # Initialize progress bar and status
38
  progress_bar = st.progress(0)
39
  status = st.empty()
40
 
41
- # Step 1: Read PDF file in memory
42
- status.text("Reading PDF file...")
43
- progress_bar.progress(0.25)
44
 
45
- pdf_reader = PdfReader(BytesIO(input_data.read()))
46
- documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
47
 
48
- # Step 2: Split text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  status.text("Splitting text into chunks...")
50
- progress_bar.progress(0.50)
51
 
52
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
53
  texts = text_splitter.split_text(documents)
54
 
55
- # Step 3: Create embeddings
56
  status.text("Creating embeddings...")
57
- progress_bar.progress(0.75)
58
 
59
  hf_embeddings = HuggingFaceEmbeddings(
60
  model_name="sentence-transformers/all-mpnet-base-v2",
61
  model_kwargs={'device': 'cpu'}
62
  )
63
 
64
- # Step 4: Initialize FAISS vector store
65
  status.text("Building vector store...")
66
  progress_bar.progress(1.0)
67
 
@@ -158,7 +184,7 @@ with st.sidebar:
158
  st.markdown('</div>', unsafe_allow_html=True)
159
 
160
  if st.session_state.authenticated:
161
- input_data = st.file_uploader("Upload a PDF file", type=["pdf"])
162
 
163
  if st.button("Process File") and input_data is not None:
164
  try:
@@ -169,6 +195,8 @@ with st.sidebar:
169
  st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
170
  except OSError as e:
171
  st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
 
 
172
  except Exception as e:
173
  st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
174
 
@@ -218,14 +246,14 @@ def main():
218
  """, unsafe_allow_html=True)
219
 
220
  st.title("RAG Q&A App with Mistral AI")
221
- st.markdown("Welcome to the BSNL RAG App! Upload a PDF file and ask questions.", unsafe_allow_html=True)
222
 
223
  if not st.session_state.authenticated:
224
  st.warning("Please authenticate using the sidebar.")
225
  return
226
 
227
  if st.session_state.vectorstore is None:
228
- st.info("Please upload and process a PDF file.")
229
  return
230
 
231
  query = st.text_input("Enter your question:")
 
15
  import uuid
16
  from dotenv import load_dotenv
17
  import requests
18
+ import pandas as pd
19
+ from docx import Document
20
 
21
  # Load environment variables
22
  load_dotenv()
 
34
  if "authenticated" not in st.session_state:
35
  st.session_state.authenticated = False
36
 
37
+ # File processing logic
38
  def process_input(input_data):
39
  # Initialize progress bar and status
40
  progress_bar = st.progress(0)
41
  status = st.empty()
42
 
43
+ # Step 1: Read file in memory
44
+ status.text("Reading file...")
45
+ progress_bar.progress(0.20)
46
 
47
+ file_extension = input_data.name.lower().split('.')[-1]
48
+ documents = ""
49
 
50
+ # Step 2: Extract text based on file type
51
+ status.text("Extracting text...")
52
+ progress_bar.progress(0.40)
53
+
54
+ try:
55
+ if file_extension == 'pdf':
56
+ pdf_reader = PdfReader(BytesIO(input_data.read()))
57
+ documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
58
+ elif file_extension in ['xls', 'xlsx']:
59
+ df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
60
+ documents = df.to_string(index=False)
61
+ elif file_extension in ['doc', 'docx']:
62
+ doc = Document(BytesIO(input_data.read()))
63
+ documents = "\n".join([para.text for para in doc.paragraphs if para.text])
64
+ elif file_extension == 'txt':
65
+ try:
66
+ documents = input_data.read().decode('utf-8')
67
+ except UnicodeDecodeError:
68
+ documents = input_data.read().decode('latin-1')
69
+ else:
70
+ raise ValueError(f"Unsupported file type: {file_extension}")
71
+ except Exception as e:
72
+ raise RuntimeError(f"Failed to process file: {str(e)}")
73
+
74
+ # Step 3: Split text
75
  status.text("Splitting text into chunks...")
76
+ progress_bar.progress(0.60)
77
 
78
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
79
  texts = text_splitter.split_text(documents)
80
 
81
+ # Step 4: Create embeddings
82
  status.text("Creating embeddings...")
83
+ progress_bar.progress(0.80)
84
 
85
  hf_embeddings = HuggingFaceEmbeddings(
86
  model_name="sentence-transformers/all-mpnet-base-v2",
87
  model_kwargs={'device': 'cpu'}
88
  )
89
 
90
+ # Step 5: Initialize FAISS vector store
91
  status.text("Building vector store...")
92
  progress_bar.progress(1.0)
93
 
 
184
  st.markdown('</div>', unsafe_allow_html=True)
185
 
186
  if st.session_state.authenticated:
187
+ input_data = st.file_uploader("Upload a file (PDF, XLS/XLSX, DOC/DOCX, TXT)", type=["pdf", "xls", "xlsx", "doc", "docx", "txt"])
188
 
189
  if st.button("Process File") and input_data is not None:
190
  try:
 
195
  st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
196
  except OSError as e:
197
  st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
198
+ except ValueError as e:
199
+ st.error(f"File upload failed: {str(e)} (Invalid file format).")
200
  except Exception as e:
201
  st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
202
 
 
246
  """, unsafe_allow_html=True)
247
 
248
  st.title("RAG Q&A App with Mistral AI")
249
+ st.markdown("Welcome to the BSNL RAG App! Upload a PDF, XLS/XLSX, DOC/DOCX, or TXT file and ask questions.", unsafe_allow_html=True)
250
 
251
  if not st.session_state.authenticated:
252
  st.warning("Please authenticate using the sidebar.")
253
  return
254
 
255
  if st.session_state.vectorstore is None:
256
+ st.info("Please upload and process a file.")
257
  return
258
 
259
  query = st.text_input("Enter your question:")