Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,5 @@
|
|
1 |
-
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
4 |
-
import tempfile
|
5 |
from io import BytesIO
|
6 |
from PyPDF2 import PdfReader
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
@@ -59,7 +57,9 @@ with st.sidebar:
|
|
59 |
st.session_state.vectorstore = vector_store
|
60 |
st.success("File processed successfully. You can now ask questions.")
|
61 |
except (PermissionError, OSError) as e:
|
62 |
-
st.error(f"
|
|
|
|
|
63 |
|
64 |
# Display chat history
|
65 |
st.subheader("Chat History")
|
@@ -133,57 +133,46 @@ def main():
|
|
133 |
st.write("**Answer:**", answer)
|
134 |
|
135 |
def process_input(input_data):
|
136 |
-
# Create
|
137 |
try:
|
138 |
-
os.makedirs("
|
139 |
-
os.chmod("
|
140 |
except PermissionError as e:
|
141 |
-
st.error(f"Failed to create
|
142 |
raise
|
143 |
|
144 |
# Initialize progress bar and status
|
145 |
progress_bar = st.progress(0)
|
146 |
status = st.status("Processing PDF file...", expanded=True)
|
147 |
|
148 |
-
# Step 1:
|
149 |
-
status.update(label="
|
150 |
progress_bar.progress(0.20)
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
# Step 2: Read PDF file
|
157 |
-
status.update(label="Reading PDF file...")
|
158 |
-
progress_bar.progress(0.40)
|
159 |
|
160 |
-
|
161 |
-
pdf_reader = PdfReader(tmp_file_path)
|
162 |
-
documents = ""
|
163 |
-
for page in pdf_reader.pages:
|
164 |
-
documents += page.extract_text() or ""
|
165 |
-
finally:
|
166 |
-
os.remove(tmp_file_path) # Clean up temporary file
|
167 |
-
|
168 |
-
# Step 3: Split text
|
169 |
status.update(label="Splitting text into chunks...")
|
170 |
-
progress_bar.progress(0.
|
171 |
|
172 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
173 |
texts = text_splitter.split_text(documents)
|
174 |
|
175 |
-
# Step
|
176 |
status.update(label="Creating embeddings...")
|
177 |
-
progress_bar.progress(0.
|
178 |
|
179 |
hf_embeddings = HuggingFaceEmbeddings(
|
180 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
181 |
model_kwargs={'device': 'cpu'}
|
182 |
)
|
183 |
|
184 |
-
# Step
|
185 |
status.update(label="Building vector store...")
|
186 |
-
progress_bar.progress(0.
|
187 |
|
188 |
dimension = len(hf_embeddings.embed_query("sample text"))
|
189 |
index = faiss.IndexFlatL2(dimension)
|
@@ -199,6 +188,9 @@ def process_input(input_data):
|
|
199 |
vector_store.add_texts(texts, ids=uuids)
|
200 |
|
201 |
# Save vector store locally
|
|
|
|
|
|
|
202 |
vector_store.save_local("vectorstore/faiss_index")
|
203 |
|
204 |
# Complete processing
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
|
|
3 |
from io import BytesIO
|
4 |
from PyPDF2 import PdfReader
|
5 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
57 |
st.session_state.vectorstore = vector_store
|
58 |
st.success("File processed successfully. You can now ask questions.")
|
59 |
except (PermissionError, OSError) as e:
|
60 |
+
st.error(f"File upload failed (Permission or OS error): {str(e)}. Check server permissions or file system access.")
|
61 |
+
except Exception as e:
|
62 |
+
st.error(f"File upload failed (Unexpected error): {str(e)}. Please try again or check server logs.")
|
63 |
|
64 |
# Display chat history
|
65 |
st.subheader("Chat History")
|
|
|
133 |
st.write("**Answer:**", answer)
|
134 |
|
135 |
def process_input(input_data):
|
136 |
+
# Create vectorstore directory for FAISS index
|
137 |
try:
|
138 |
+
os.makedirs("vectorstore", exist_ok=True)
|
139 |
+
os.chmod("vectorstore", 0o777) # Ensure write permissions
|
140 |
except PermissionError as e:
|
141 |
+
st.error(f"Failed to create vectorstore directory: {str(e)}")
|
142 |
raise
|
143 |
|
144 |
# Initialize progress bar and status
|
145 |
progress_bar = st.progress(0)
|
146 |
status = st.status("Processing PDF file...", expanded=True)
|
147 |
|
148 |
+
# Step 1: Read PDF file in memory
|
149 |
+
status.update(label="Reading PDF file...")
|
150 |
progress_bar.progress(0.20)
|
151 |
|
152 |
+
pdf_reader = PdfReader(BytesIO(input_data.read()))
|
153 |
+
documents = ""
|
154 |
+
for page in pdf_reader.pages:
|
155 |
+
documents += page.extract_text() or ""
|
|
|
|
|
|
|
156 |
|
157 |
+
# Step 2: Split text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
status.update(label="Splitting text into chunks...")
|
159 |
+
progress_bar.progress(0.40)
|
160 |
|
161 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
162 |
texts = text_splitter.split_text(documents)
|
163 |
|
164 |
+
# Step 3: Create embeddings
|
165 |
status.update(label="Creating embeddings...")
|
166 |
+
progress_bar.progress(0.60)
|
167 |
|
168 |
hf_embeddings = HuggingFaceEmbeddings(
|
169 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
170 |
model_kwargs={'device': 'cpu'}
|
171 |
)
|
172 |
|
173 |
+
# Step 4: Initialize FAISS vector store
|
174 |
status.update(label="Building vector store...")
|
175 |
+
progress_bar.progress(0.80)
|
176 |
|
177 |
dimension = len(hf_embeddings.embed_query("sample text"))
|
178 |
index = faiss.IndexFlatL2(dimension)
|
|
|
188 |
vector_store.add_texts(texts, ids=uuids)
|
189 |
|
190 |
# Save vector store locally
|
191 |
+
status.update(label="Saving vector store...")
|
192 |
+
progress_bar.progress(0.90)
|
193 |
+
|
194 |
vector_store.save_local("vectorstore/faiss_index")
|
195 |
|
196 |
# Complete processing
|