Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,8 @@ import faiss
|
|
15 |
import uuid
|
16 |
from dotenv import load_dotenv
|
17 |
import requests
|
|
|
|
|
18 |
|
19 |
# Load environment variables
|
20 |
load_dotenv()
|
@@ -32,36 +34,60 @@ if "history" not in st.session_state:
|
|
32 |
if "authenticated" not in st.session_state:
|
33 |
st.session_state.authenticated = False
|
34 |
|
35 |
-
#
|
36 |
def process_input(input_data):
|
37 |
# Initialize progress bar and status
|
38 |
progress_bar = st.progress(0)
|
39 |
status = st.empty()
|
40 |
|
41 |
-
# Step 1: Read
|
42 |
-
status.text("Reading
|
43 |
-
progress_bar.progress(0.
|
44 |
|
45 |
-
|
46 |
-
documents = ""
|
47 |
|
48 |
-
# Step 2:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
status.text("Splitting text into chunks...")
|
50 |
-
progress_bar.progress(0.
|
51 |
|
52 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
53 |
texts = text_splitter.split_text(documents)
|
54 |
|
55 |
-
# Step
|
56 |
status.text("Creating embeddings...")
|
57 |
-
progress_bar.progress(0.
|
58 |
|
59 |
hf_embeddings = HuggingFaceEmbeddings(
|
60 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
61 |
model_kwargs={'device': 'cpu'}
|
62 |
)
|
63 |
|
64 |
-
# Step
|
65 |
status.text("Building vector store...")
|
66 |
progress_bar.progress(1.0)
|
67 |
|
@@ -158,7 +184,7 @@ with st.sidebar:
|
|
158 |
st.markdown('</div>', unsafe_allow_html=True)
|
159 |
|
160 |
if st.session_state.authenticated:
|
161 |
-
input_data = st.file_uploader("Upload a PDF
|
162 |
|
163 |
if st.button("Process File") and input_data is not None:
|
164 |
try:
|
@@ -169,6 +195,8 @@ with st.sidebar:
|
|
169 |
st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
|
170 |
except OSError as e:
|
171 |
st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
|
|
|
|
|
172 |
except Exception as e:
|
173 |
st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
|
174 |
|
@@ -218,14 +246,14 @@ def main():
|
|
218 |
""", unsafe_allow_html=True)
|
219 |
|
220 |
st.title("RAG Q&A App with Mistral AI")
|
221 |
-
st.markdown("Welcome to the BSNL RAG App! Upload a PDF file and ask questions.", unsafe_allow_html=True)
|
222 |
|
223 |
if not st.session_state.authenticated:
|
224 |
st.warning("Please authenticate using the sidebar.")
|
225 |
return
|
226 |
|
227 |
if st.session_state.vectorstore is None:
|
228 |
-
st.info("Please upload and process a
|
229 |
return
|
230 |
|
231 |
query = st.text_input("Enter your question:")
|
|
|
15 |
import uuid
|
16 |
from dotenv import load_dotenv
|
17 |
import requests
|
18 |
+
import pandas as pd
|
19 |
+
from docx import Document
|
20 |
|
21 |
# Load environment variables
|
22 |
load_dotenv()
|
|
|
34 |
if "authenticated" not in st.session_state:
|
35 |
st.session_state.authenticated = False
|
36 |
|
37 |
+
# File processing logic
|
38 |
def process_input(input_data):
|
39 |
# Initialize progress bar and status
|
40 |
progress_bar = st.progress(0)
|
41 |
status = st.empty()
|
42 |
|
43 |
+
# Step 1: Read file in memory
|
44 |
+
status.text("Reading file...")
|
45 |
+
progress_bar.progress(0.20)
|
46 |
|
47 |
+
file_extension = input_data.name.lower().split('.')[-1]
|
48 |
+
documents = ""
|
49 |
|
50 |
+
# Step 2: Extract text based on file type
|
51 |
+
status.text("Extracting text...")
|
52 |
+
progress_bar.progress(0.40)
|
53 |
+
|
54 |
+
try:
|
55 |
+
if file_extension == 'pdf':
|
56 |
+
pdf_reader = PdfReader(BytesIO(input_data.read()))
|
57 |
+
documents = "".join([page.extract_text() or "" for page in pdf_reader.pages])
|
58 |
+
elif file_extension in ['xls', 'xlsx']:
|
59 |
+
df = pd.read_excel(BytesIO(input_data.read()), engine='openpyxl')
|
60 |
+
documents = df.to_string(index=False)
|
61 |
+
elif file_extension in ['doc', 'docx']:
|
62 |
+
doc = Document(BytesIO(input_data.read()))
|
63 |
+
documents = "\n".join([para.text for para in doc.paragraphs if para.text])
|
64 |
+
elif file_extension == 'txt':
|
65 |
+
try:
|
66 |
+
documents = input_data.read().decode('utf-8')
|
67 |
+
except UnicodeDecodeError:
|
68 |
+
documents = input_data.read().decode('latin-1')
|
69 |
+
else:
|
70 |
+
raise ValueError(f"Unsupported file type: {file_extension}")
|
71 |
+
except Exception as e:
|
72 |
+
raise RuntimeError(f"Failed to process file: {str(e)}")
|
73 |
+
|
74 |
+
# Step 3: Split text
|
75 |
status.text("Splitting text into chunks...")
|
76 |
+
progress_bar.progress(0.60)
|
77 |
|
78 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
79 |
texts = text_splitter.split_text(documents)
|
80 |
|
81 |
+
# Step 4: Create embeddings
|
82 |
status.text("Creating embeddings...")
|
83 |
+
progress_bar.progress(0.80)
|
84 |
|
85 |
hf_embeddings = HuggingFaceEmbeddings(
|
86 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
87 |
model_kwargs={'device': 'cpu'}
|
88 |
)
|
89 |
|
90 |
+
# Step 5: Initialize FAISS vector store
|
91 |
status.text("Building vector store...")
|
92 |
progress_bar.progress(1.0)
|
93 |
|
|
|
184 |
st.markdown('</div>', unsafe_allow_html=True)
|
185 |
|
186 |
if st.session_state.authenticated:
|
187 |
+
input_data = st.file_uploader("Upload a file (PDF, XLS/XLSX, DOC/DOCX, TXT)", type=["pdf", "xls", "xlsx", "doc", "docx", "txt"])
|
188 |
|
189 |
if st.button("Process File") and input_data is not None:
|
190 |
try:
|
|
|
195 |
st.error(f"File upload failed: Permission error - {str(e)}. Check file system access.")
|
196 |
except OSError as e:
|
197 |
st.error(f"File upload failed: OS error - {str(e)}. Check server configuration.")
|
198 |
+
except ValueError as e:
|
199 |
+
st.error(f"File upload failed: {str(e)} (Invalid file format).")
|
200 |
except Exception as e:
|
201 |
st.error(f"File upload failed: {str(e)} (Exception type: {type(e).__name__}). Please try again or check server logs.")
|
202 |
|
|
|
246 |
""", unsafe_allow_html=True)
|
247 |
|
248 |
st.title("RAG Q&A App with Mistral AI")
|
249 |
+
st.markdown("Welcome to the BSNL RAG App! Upload a PDF, XLS/XLSX, DOC/DOCX, or TXT file and ask questions.", unsafe_allow_html=True)
|
250 |
|
251 |
if not st.session_state.authenticated:
|
252 |
st.warning("Please authenticate using the sidebar.")
|
253 |
return
|
254 |
|
255 |
if st.session_state.vectorstore is None:
|
256 |
+
st.info("Please upload and process a file.")
|
257 |
return
|
258 |
|
259 |
query = st.text_input("Enter your question:")
|