Spaces:

samim2024
/

bsnl-chatboot

Sleeping

App Files Files Community

samim2024 committed on May 16

Commit

6c5d119

verified ·

1 Parent(s): c5d0599

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -35

app.py CHANGED Viewed

@@ -1,10 +1,11 @@
 # app.py
 import streamlit as st
 import os
-import zipfile
 import shutil
 from io import BytesIO
 from PyPDF2 import PdfReader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
@@ -52,18 +53,13 @@ with st.sidebar:
     # File uploader
     if st.session_state.authenticated:
-        input_type = st.selectbox("Select Input Type", ["Single PDF", "Folder/Zip of PDFs"])
-        input_data = None
-        if input_type == "Single PDF":
-            input_data = st.file_uploader("Upload a PDF file", type=["pdf"])
-        else:
-            input_data = st.file_uploader("Upload a folder or zip of PDFs", type=["zip"])
-        if st.button("Process Files") and input_data is not None:
-            with st.spinner("Processing files..."):
-                vector_store = process_input(input_type, input_data)
                 st.session_state.vectorstore = vector_store
-                st.success("Files processed successfully. You can now ask questions.")
     # Display chat history
     st.subheader("Chat History")
@@ -119,14 +115,14 @@ def main():
     """, unsafe_allow_html=True)
     st.title("RAG Q&A App with Mistral AI")
-    st.markdown("Welcome to the BSNL RAG App! Upload your PDFs and ask questions with ease.", unsafe_allow_html=True)
     if not st.session_state.authenticated:
         st.warning("Please authenticate with your API key in the sidebar.")
         return
     if st.session_state.vectorstore is None:
-        st.info("Please upload and process a PDF or folder/zip of PDFs in the sidebar.")
         return
     query = st.text_input("Enter your question:")
@@ -136,35 +132,27 @@ def main():
             st.session_state.history.append((query, answer))
             st.write("**Answer:**", answer)
-def process_input(input_type, input_data):
     # Create uploads directory
     os.makedirs("uploads", exist_ok=True)
     documents = ""
-    if input_type == "Single PDF":
         pdf_reader = PdfReader(input_data)
         for page in pdf_reader.pages:
             documents += page.extract_text() or ""
-    else:
-        # Handle zip file
-        zip_path = "uploads/uploaded.zip"
-        with open(zip_path, "wb") as f:
-            f.write(input_data.getvalue())
-        with zipfile.ZipFile(zip_path, "r") as zip_ref:
-            zip_ref.extractall("uploads/extracted")
-        # Process all PDFs in extracted folder
-        for root, _, files in os.walk("uploads/extracted"):
-            for file in files:
-                if file.endswith(".pdf"):
-                    pdf_path = os.path.join(root, file)
-                    pdf_reader = PdfReader(pdf_path)
-                    for page in pdf_reader.pages:
-                        documents += page.extract_text() or ""
-        # Clean up extracted files
-        shutil.rmtree("uploads/extracted", ignore_errors=True)
-        os.remove(zip_path)
     # Split text
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

 # app.py
 import streamlit as st
 import os
 import shutil
 from io import BytesIO
 from PyPDF2 import PdfReader
+import pandas as pd
+from docx import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
     # File uploader
     if st.session_state.authenticated:
+        input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
+        if st.button("Process File") and input_data is not None:
+            with st.spinner("Processing file..."):
+                vector_store = process_input(input_data)
                 st.session_state.vectorstore = vector_store
+                st.success("File processed successfully. You can now ask questions.")
     # Display chat history
     st.subheader("Chat History")
     """, unsafe_allow_html=True)
     st.title("RAG Q&A App with Mistral AI")
+    st.markdown("Welcome to the BSNL RAG App! Upload your PDFs, TXTs, XLS/XLSX, or DOC/DOCX files and ask questions with ease.", unsafe_allow_html=True)
     if not st.session_state.authenticated:
         st.warning("Please authenticate with your API key in the sidebar.")
         return
     if st.session_state.vectorstore is None:
+        st.info("Please upload and process a PDF, TXT, XLS/XLSX, or DOC/DOCX file in the sidebar.")
         return
     query = st.text_input("Enter your question:")
             st.session_state.history.append((query, answer))
             st.write("**Answer:**", answer)
+def process_input(input_data):
     # Create uploads directory
     os.makedirs("uploads", exist_ok=True)
     documents = ""
+    file_name = input_data.name.lower()
+    if file_name.endswith(".pdf"):
         pdf_reader = PdfReader(input_data)
         for page in pdf_reader.pages:
             documents += page.extract_text() or ""
+    elif file_name.endswith(".txt"):
+        documents = input_data.read().decode("utf-8")
+    elif file_name.endswith((".xls", ".xlsx")):
+        df = pd.read_excel(input_data)
+        # Convert all cells to strings and join
+        documents = " ".join(df.astype(str).values.flatten())
+    elif file_name.endswith((".doc", ".docx")):
+        doc = Document(input_data)
+        for para in doc.paragraphs:
+            documents += para.text + "\n"
     # Split text
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)