Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
4 |
-
import zipfile
|
5 |
import shutil
|
6 |
from io import BytesIO
|
7 |
from PyPDF2 import PdfReader
|
|
|
|
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
10 |
from langchain_community.vectorstores import FAISS
|
@@ -52,18 +53,13 @@ with st.sidebar:
|
|
52 |
|
53 |
# File uploader
|
54 |
if st.session_state.authenticated:
|
55 |
-
|
56 |
-
input_data = None
|
57 |
-
if input_type == "Single PDF":
|
58 |
-
input_data = st.file_uploader("Upload a PDF file", type=["pdf"])
|
59 |
-
else:
|
60 |
-
input_data = st.file_uploader("Upload a folder or zip of PDFs", type=["zip"])
|
61 |
|
62 |
-
if st.button("Process
|
63 |
-
with st.spinner("Processing
|
64 |
-
vector_store = process_input(
|
65 |
st.session_state.vectorstore = vector_store
|
66 |
-
st.success("
|
67 |
|
68 |
# Display chat history
|
69 |
st.subheader("Chat History")
|
@@ -119,14 +115,14 @@ def main():
|
|
119 |
""", unsafe_allow_html=True)
|
120 |
|
121 |
st.title("RAG Q&A App with Mistral AI")
|
122 |
-
st.markdown("Welcome to the BSNL RAG App! Upload your PDFs and ask questions with ease.", unsafe_allow_html=True)
|
123 |
|
124 |
if not st.session_state.authenticated:
|
125 |
st.warning("Please authenticate with your API key in the sidebar.")
|
126 |
return
|
127 |
|
128 |
if st.session_state.vectorstore is None:
|
129 |
-
st.info("Please upload and process a PDF or
|
130 |
return
|
131 |
|
132 |
query = st.text_input("Enter your question:")
|
@@ -136,35 +132,27 @@ def main():
|
|
136 |
st.session_state.history.append((query, answer))
|
137 |
st.write("**Answer:**", answer)
|
138 |
|
139 |
-
def process_input(
|
140 |
# Create uploads directory
|
141 |
os.makedirs("uploads", exist_ok=True)
|
142 |
|
143 |
documents = ""
|
144 |
-
|
|
|
|
|
145 |
pdf_reader = PdfReader(input_data)
|
146 |
for page in pdf_reader.pages:
|
147 |
documents += page.extract_text() or ""
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
for file in files:
|
159 |
-
if file.endswith(".pdf"):
|
160 |
-
pdf_path = os.path.join(root, file)
|
161 |
-
pdf_reader = PdfReader(pdf_path)
|
162 |
-
for page in pdf_reader.pages:
|
163 |
-
documents += page.extract_text() or ""
|
164 |
-
|
165 |
-
# Clean up extracted files
|
166 |
-
shutil.rmtree("uploads/extracted", ignore_errors=True)
|
167 |
-
os.remove(zip_path)
|
168 |
|
169 |
# Split text
|
170 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
|
|
1 |
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
|
|
4 |
import shutil
|
5 |
from io import BytesIO
|
6 |
from PyPDF2 import PdfReader
|
7 |
+
import pandas as pd
|
8 |
+
from docx import Document
|
9 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
10 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
11 |
from langchain_community.vectorstores import FAISS
|
|
|
53 |
|
54 |
# File uploader
|
55 |
if st.session_state.authenticated:
|
56 |
+
input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
+
if st.button("Process File") and input_data is not None:
|
59 |
+
with st.spinner("Processing file..."):
|
60 |
+
vector_store = process_input(input_data)
|
61 |
st.session_state.vectorstore = vector_store
|
62 |
+
st.success("File processed successfully. You can now ask questions.")
|
63 |
|
64 |
# Display chat history
|
65 |
st.subheader("Chat History")
|
|
|
115 |
""", unsafe_allow_html=True)
|
116 |
|
117 |
st.title("RAG Q&A App with Mistral AI")
|
118 |
+
st.markdown("Welcome to the BSNL RAG App! Upload your PDFs, TXTs, XLS/XLSX, or DOC/DOCX files and ask questions with ease.", unsafe_allow_html=True)
|
119 |
|
120 |
if not st.session_state.authenticated:
|
121 |
st.warning("Please authenticate with your API key in the sidebar.")
|
122 |
return
|
123 |
|
124 |
if st.session_state.vectorstore is None:
|
125 |
+
st.info("Please upload and process a PDF, TXT, XLS/XLSX, or DOC/DOCX file in the sidebar.")
|
126 |
return
|
127 |
|
128 |
query = st.text_input("Enter your question:")
|
|
|
132 |
st.session_state.history.append((query, answer))
|
133 |
st.write("**Answer:**", answer)
|
134 |
|
135 |
+
def process_input(input_data):
|
136 |
# Create uploads directory
|
137 |
os.makedirs("uploads", exist_ok=True)
|
138 |
|
139 |
documents = ""
|
140 |
+
file_name = input_data.name.lower()
|
141 |
+
|
142 |
+
if file_name.endswith(".pdf"):
|
143 |
pdf_reader = PdfReader(input_data)
|
144 |
for page in pdf_reader.pages:
|
145 |
documents += page.extract_text() or ""
|
146 |
+
elif file_name.endswith(".txt"):
|
147 |
+
documents = input_data.read().decode("utf-8")
|
148 |
+
elif file_name.endswith((".xls", ".xlsx")):
|
149 |
+
df = pd.read_excel(input_data)
|
150 |
+
# Convert all cells to strings and join
|
151 |
+
documents = " ".join(df.astype(str).values.flatten())
|
152 |
+
elif file_name.endswith((".doc", ".docx")):
|
153 |
+
doc = Document(input_data)
|
154 |
+
for para in doc.paragraphs:
|
155 |
+
documents += para.text + "\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
|
157 |
# Split text
|
158 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|