samim2024 committed on
Commit
6c5d119
·
verified ·
1 Parent(s): c5d0599

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -35
app.py CHANGED
@@ -1,10 +1,11 @@
1
  # app.py
2
  import streamlit as st
3
  import os
4
- import zipfile
5
  import shutil
6
  from io import BytesIO
7
  from PyPDF2 import PdfReader
 
 
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.embeddings import HuggingFaceEmbeddings
10
  from langchain_community.vectorstores import FAISS
@@ -52,18 +53,13 @@ with st.sidebar:
52
 
53
  # File uploader
54
  if st.session_state.authenticated:
55
- input_type = st.selectbox("Select Input Type", ["Single PDF", "Folder/Zip of PDFs"])
56
- input_data = None
57
- if input_type == "Single PDF":
58
- input_data = st.file_uploader("Upload a PDF file", type=["pdf"])
59
- else:
60
- input_data = st.file_uploader("Upload a folder or zip of PDFs", type=["zip"])
61
 
62
- if st.button("Process Files") and input_data is not None:
63
- with st.spinner("Processing files..."):
64
- vector_store = process_input(input_type, input_data)
65
  st.session_state.vectorstore = vector_store
66
- st.success("Files processed successfully. You can now ask questions.")
67
 
68
  # Display chat history
69
  st.subheader("Chat History")
@@ -119,14 +115,14 @@ def main():
119
  """, unsafe_allow_html=True)
120
 
121
  st.title("RAG Q&A App with Mistral AI")
122
- st.markdown("Welcome to the BSNL RAG App! Upload your PDFs and ask questions with ease.", unsafe_allow_html=True)
123
 
124
  if not st.session_state.authenticated:
125
  st.warning("Please authenticate with your API key in the sidebar.")
126
  return
127
 
128
  if st.session_state.vectorstore is None:
129
- st.info("Please upload and process a PDF or folder/zip of PDFs in the sidebar.")
130
  return
131
 
132
  query = st.text_input("Enter your question:")
@@ -136,35 +132,27 @@ def main():
136
  st.session_state.history.append((query, answer))
137
  st.write("**Answer:**", answer)
138
 
139
- def process_input(input_type, input_data):
140
  # Create uploads directory
141
  os.makedirs("uploads", exist_ok=True)
142
 
143
  documents = ""
144
- if input_type == "Single PDF":
 
 
145
  pdf_reader = PdfReader(input_data)
146
  for page in pdf_reader.pages:
147
  documents += page.extract_text() or ""
148
- else:
149
- # Handle zip file
150
- zip_path = "uploads/uploaded.zip"
151
- with open(zip_path, "wb") as f:
152
- f.write(input_data.getvalue())
153
- with zipfile.ZipFile(zip_path, "r") as zip_ref:
154
- zip_ref.extractall("uploads/extracted")
155
-
156
- # Process all PDFs in extracted folder
157
- for root, _, files in os.walk("uploads/extracted"):
158
- for file in files:
159
- if file.endswith(".pdf"):
160
- pdf_path = os.path.join(root, file)
161
- pdf_reader = PdfReader(pdf_path)
162
- for page in pdf_reader.pages:
163
- documents += page.extract_text() or ""
164
-
165
- # Clean up extracted files
166
- shutil.rmtree("uploads/extracted", ignore_errors=True)
167
- os.remove(zip_path)
168
 
169
  # Split text
170
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
 
1
  # app.py
2
  import streamlit as st
3
  import os
 
4
  import shutil
5
  from io import BytesIO
6
  from PyPDF2 import PdfReader
7
+ import pandas as pd
8
+ from docx import Document
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
  from langchain_community.embeddings import HuggingFaceEmbeddings
11
  from langchain_community.vectorstores import FAISS
 
53
 
54
  # File uploader
55
  if st.session_state.authenticated:
56
+ input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
 
 
 
 
 
57
 
58
+ if st.button("Process File") and input_data is not None:
59
+ with st.spinner("Processing file..."):
60
+ vector_store = process_input(input_data)
61
  st.session_state.vectorstore = vector_store
62
+ st.success("File processed successfully. You can now ask questions.")
63
 
64
  # Display chat history
65
  st.subheader("Chat History")
 
115
  """, unsafe_allow_html=True)
116
 
117
  st.title("RAG Q&A App with Mistral AI")
118
+ st.markdown("Welcome to the BSNL RAG App! Upload your PDFs, TXTs, XLS/XLSX, or DOC/DOCX files and ask questions with ease.", unsafe_allow_html=True)
119
 
120
  if not st.session_state.authenticated:
121
  st.warning("Please authenticate with your API key in the sidebar.")
122
  return
123
 
124
  if st.session_state.vectorstore is None:
125
+ st.info("Please upload and process a PDF, TXT, XLS/XLSX, or DOC/DOCX file in the sidebar.")
126
  return
127
 
128
  query = st.text_input("Enter your question:")
 
132
  st.session_state.history.append((query, answer))
133
  st.write("**Answer:**", answer)
134
 
135
+ def process_input(input_data):
136
  # Create uploads directory
137
  os.makedirs("uploads", exist_ok=True)
138
 
139
  documents = ""
140
+ file_name = input_data.name.lower()
141
+
142
+ if file_name.endswith(".pdf"):
143
  pdf_reader = PdfReader(input_data)
144
  for page in pdf_reader.pages:
145
  documents += page.extract_text() or ""
146
+ elif file_name.endswith(".txt"):
147
+ documents = input_data.read().decode("utf-8")
148
+ elif file_name.endswith((".xls", ".xlsx")):
149
+ df = pd.read_excel(input_data)
150
+ # Convert all cells to strings and join
151
+ documents = " ".join(df.astype(str).values.flatten())
152
+ elif file_name.endswith((".doc", ".docx")):
153
+ doc = Document(input_data)
154
+ for para in doc.paragraphs:
155
+ documents += para.text + "\n"
 
 
 
 
 
 
 
 
 
 
156
 
157
  # Split text
158
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)