Spaces:

Inferno-721
/

Sutra_AI

Sleeping

App Files Files Community

Inferno-721 committed on Jan 7

Commit

0753d2e

1 Parent(s): 83a3714

Initial

Browse files

Files changed (10) hide show

.env +1 -0
.gitignore +3 -0
README.md +58 -4
app.py +170 -0
extracted_text.txt +551 -0
requirements.txt +8 -0
textScript.py +50 -0
utils/embeddings_utils.py +48 -0
utils/pdf_utils.py +35 -0
utils/qa_utils.py +27 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ OPENAI_API_KEY=your_openai_api_key_here

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+Chat_with_PDF_Application
+venv
+__pycache__

README.md CHANGED Viewed

@@ -1,12 +1,66 @@
 ---
-title: Sutra AI
-emoji: 👀
 colorFrom: red
-colorTo: gray
 sdk: streamlit
 sdk_version: 1.41.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Chat With PDF Application
+emoji: 😻
 colorFrom: red
+colorTo: yellow
 sdk: streamlit
 sdk_version: 1.41.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Chat with PDF Application
+**Chat with PDF** is an interactive Streamlit app that lets you upload PDFs, converts their content into embeddings using OpenAI, and enables question-answering via GPT-4.
+## Features
+- **PDF Upload:** Upload one or multiple PDFs.
+- **Text Extraction & Chunking:** Extracts text from PDFs and splits it into manageable chunks.
+- **Embedding Generation:** Converts text chunks into embeddings using OpenAI's `text-embedding-ada-002`.
+- **Question Answering:** Ask questions about your documents and get context-aware answers generated by GPT-4.
+- **Context Display:** View relevant sections from the PDF that support the generated answers.
+## Installation
+## Setup
+1. Create and activate a virtual environment:
+    ```bash
+    python3 -m venv venv
+    source venv/bin/activate
+    ```
+    On Windows, activate the environment with `.\venv\Scripts\activate` instead.
+2. Install requirements:
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Run the application:
+   ```bash
+   streamlit run app.py
+   ```
+4. **Configure API Key:**
+   - Create a `.env` file in the root directory.
+   - Add your OpenAI API key:
+     ```
+     OPENAI_API_KEY=your_openai_api_key_here
+     ```
+## Usage
+1. **Run the application:**
+   ```bash
+   streamlit run app.py
+   ```
+2. **Interact:**
+   - Upload PDF files.
+   - Wait for processing and embedding generation.
+   - Enter a question to get answers with relevant context excerpts from your PDFs.
+## Notes
+- The app meets core requirements: PDF uploading, text processing, embedding conversion, and Q&A.
+- While context is shown, highlighting directly on the PDF is not implemented yet.
+- Supports multiple PDF uploads and cross-document querying.

app.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import streamlit as st
+import os
+from utils.pdf_utils import PDFProcessor
+from utils.embeddings_utils import EmbeddingsManager
+from utils.qa_utils import QASystem
+from dotenv import load_dotenv
+import openai
+def initialize_session_state():
+    if 'pdf_processor' not in st.session_state:
+        st.session_state['pdf_processor'] = None
+    if 'embeddings_manager' not in st.session_state:
+        st.session_state['embeddings_manager'] = None
+    if 'qa_system' not in st.session_state:
+        st.session_state['qa_system'] = None
+    if 'processed_pdfs' not in st.session_state:
+        st.session_state['processed_pdfs'] = set()
+    if 'all_text_chunks' not in st.session_state:
+        st.session_state['all_text_chunks'] = []
+def main():
+    load_dotenv()
+    st.set_page_config(page_title="AI-Powered PDF Assistant", layout="wide")
+    initialize_session_state()
+    # Header Section
+    st.markdown(
+        """
+        <style>
+        .main-header {
+            font-size: 2.5rem;
+            color: #1F77B4;
+            text-align: center;
+            margin-bottom: 1rem;
+        }
+        .sub-header {
+            font-size: 1.25rem;
+            color: #555;
+            text-align: center;
+            margin-bottom: 2rem;
+        }
+        </style>
+        <div class="main-header">📘 AI-Powered PDF Assistant</div>
+        <div class="sub-header">Upload, Analyze, and Interact with Your Documents</div>
+        """,
+        unsafe_allow_html=True
+    )
+    # Navigation Menu
+    selected_page = st.sidebar.radio(
+        "Navigate", ["Upload PDFs", "Ask Questions", "About"]
+    )
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        st.sidebar.error("OpenAI API key not found in .env file!")
+        return
+    openai.api_key = api_key
+    if not st.session_state['pdf_processor']:
+        st.session_state['pdf_processor'] = PDFProcessor()
+    if not st.session_state['embeddings_manager']:
+        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+    if not st.session_state['qa_system']:
+        st.session_state['qa_system'] = QASystem(api_key)
+    if selected_page == "Upload PDFs":
+        st.header("📤 Upload PDFs")
+        st.markdown(
+            """<p style='font-size: 1.1rem;'>Drag and drop your PDF files below to extract and process content for analysis.</p>""",
+            unsafe_allow_html=True
+        )
+        uploaded_files = st.file_uploader(
+            "Upload PDF files", type=['pdf'], accept_multiple_files=True
+        )
+        if uploaded_files:
+            new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+            if new_files:
+                with st.spinner("Processing PDFs..."):
+                    for pdf_file in new_files:
+                        try:
+                            pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                            for page_text in pages.values():
+                                chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                                st.session_state['all_text_chunks'].extend(chunks)
+                            st.session_state['processed_pdfs'].add(pdf_file.name)
+                        except Exception as e:
+                            st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                            continue
+                    with st.spinner("Generating embeddings..."):
+                        try:
+                            st.session_state['embeddings_manager'].generate_embeddings(
+                                st.session_state['all_text_chunks']
+                            )
+                            st.success("✅ Documents processed successfully!")
+                        except Exception as e:
+                            st.error(f"Error generating embeddings: {str(e)}")
+    elif selected_page == "Ask Questions":
+        st.header("❓ Ask Questions")
+        st.markdown(
+            """<p style='font-size: 1.1rem;'>Query your uploaded documents and get precise answers backed by AI-powered analysis.</p>""",
+            unsafe_allow_html=True
+        )
+        if st.session_state['all_text_chunks']:
+            question = st.text_input("Enter your question:")
+            if question:
+                try:
+                    with st.spinner("Finding relevant information..."):
+                        relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                            question, k=3
+                        )
+                        answer = st.session_state['qa_system'].generate_answer(
+                            question, relevant_chunks
+                        )
+                        st.markdown("### 🤖 Answer")
+                        st.write(answer)
+                        with st.expander("🔍 View Source Context"):
+                            for i, chunk in enumerate(relevant_chunks, 1):
+                                st.markdown(f"**Context {i}:**")
+                                st.write(chunk)
+                                st.markdown("---")
+                except openai.error.RateLimitError:
+                    st.error("Rate limit exceeded. Please try again later.")
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+        else:
+            st.warning("Please upload and process documents in the 'Upload PDFs' section first.")
+    elif selected_page == "About":
+        st.header("ℹ️ About This App")
+        st.markdown(
+            """
+            <p style='font-size: 1.1rem;'>
+            <b>AI-Powered PDF Assistant</b> is a smart solution for extracting and querying information from PDF files. With powerful AI integrations,
+            this tool allows seamless document analysis and interaction.
+            </p>
+            <h3>🔑 Key Features</h3>
+            <ul>
+                <li>Upload and process multiple PDF files</li>
+                <li>Generate embeddings for precise content retrieval</li>
+                <li>Query documents and receive context-aware answers</li>
+            </ul>
+            <h3>🛠️ Technologies Used</h3>
+            <ul>
+                <li>Streamlit for interactive UI</li>
+                <li>OpenAI GPT API for Q&A</li>
+                <li>Custom PDF processing and embedding tools</li>
+            </ul>
+            <p style='text-align: center;'>
+            Built with ❤️ by [Your Name]
+            </p>
+            """,
+            unsafe_allow_html=True
+        )
+# Script entry point: build the UI only when this file is executed directly.
+if __name__ == "__main__":
+    main()

extracted_text.txt ADDED Viewed

	@@ -0,0 +1,551 @@

+--- File: /home/sk/Desktop/chat-with-pdf/app.py ---
+import streamlit as st
+import os
+from utils.pdf_utils import PDFProcessor
+from utils.embeddings_utils import EmbeddingsManager
+from utils.qa_utils import QASystem
+from dotenv import load_dotenv
+import openai
+import time
+def initialize_session_state():
+    if 'pdf_processor' not in st.session_state:
+        st.session_state['pdf_processor'] = None
+    if 'embeddings_manager' not in st.session_state:
+        st.session_state['embeddings_manager'] = None
+    if 'qa_system' not in st.session_state:
+        st.session_state['qa_system'] = None
+    if 'processed_pdfs' not in st.session_state:
+        st.session_state['processed_pdfs'] = set()
+    if 'all_text_chunks' not in st.session_state:
+        st.session_state['all_text_chunks'] = []
+def main():
+    load_dotenv()
+    st.set_page_config(page_title="Chat with PDF", layout="wide")
+    st.title("📄💬 Chat with PDF")
+    initialize_session_state()
+    with st.sidebar:
+        st.header("🔍 How to Use")
+        st.markdown("""
+        1. Upload PDF document(s)
+        2. Ask questions about the content
+        3. View answers and relevant context
+        """)
+        if 'total_tokens_used' in st.session_state:
+            st.markdown("---")
+            st.markdown("### 📊 Usage Statistics")
+            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        st.error("OpenAI API key not found in .env file!")
+        return
+    openai.api_key = api_key
+    if not st.session_state['pdf_processor']:
+        st.session_state['pdf_processor'] = PDFProcessor()
+    if not st.session_state['embeddings_manager']:
+        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+    if not st.session_state['qa_system']:
+        st.session_state['qa_system'] = QASystem(api_key)
+    st.subheader("📤 Upload PDFs")
+    uploaded_files = st.file_uploader(
+        "Upload PDF documents",
+        type=['pdf'],
+        accept_multiple_files=True
+    )
+    if uploaded_files:
+        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+        if new_files:
+            with st.spinner("Processing PDFs..."):
+                for pdf_file in new_files:
+                    try:
+                        pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                        for page_text in pages.values():
+                            chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                            st.session_state['all_text_chunks'].extend(chunks)
+                        st.session_state['processed_pdfs'].add(pdf_file.name)
+                    except Exception as e:
+                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                        continue
+                with st.spinner("Generating embeddings..."):
+                    try:
+                        st.session_state['embeddings_manager'].generate_embeddings(
+                            st.session_state['all_text_chunks']
+                        )
+                        st.success("✅ Documents processed!")
+                    except Exception as e:
+                        st.error(f"Error generating embeddings: {str(e)}")
+                        return
+        if st.session_state['all_text_chunks']:
+            st.write("---")
+            st.subheader("❓ Ask Questions About Your Documents")
+            question = st.text_input("Enter your question:")
+            if question:
+                try:
+                    with st.spinner("Searching for relevant information..."):
+                        relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                            question,
+                            k=3
+                        )
+                        answer = st.session_state['qa_system'].generate_answer(
+                            question,
+                            relevant_chunks
+                        )
+                        st.markdown("### 🤖 Answer:")
+                        st.write(answer)
+                        with st.expander("🔍 View Source Context"):
+                            for i, chunk in enumerate(relevant_chunks, 1):
+                                st.markdown(f"**Context {i}:**")
+                                st.write(chunk)
+                                st.markdown("---")
+                except openai.error.RateLimitError:
+                    st.error("Rate limit exceeded. Please try again later.")
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+if __name__ == "__main__":
+    main()
+--- File: /home/sk/Desktop/chat-with-pdf/requirements.txt ---
+streamlit
+PyPDF2
+openai
+python-dotenv
+faiss-cpu
+numpy
+pdf2image
+Pillow
+--- File: /home/sk/Desktop/chat-with-pdf/.env ---
+OPENAI_API_KEY=your_openai_api_key_here
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py ---
+import streamlit as st
+import os
+from utils.pdf_utils import PDFProcessor
+from utils.embeddings_utils import EmbeddingsManager
+from utils.qa_utils import QASystem
+from dotenv import load_dotenv
+import openai
+import time
+def initialize_session_state():
+    if 'pdf_processor' not in st.session_state:
+        st.session_state['pdf_processor'] = None
+    if 'embeddings_manager' not in st.session_state:
+        st.session_state['embeddings_manager'] = None
+    if 'qa_system' not in st.session_state:
+        st.session_state['qa_system'] = None
+    if 'processed_pdfs' not in st.session_state:
+        st.session_state['processed_pdfs'] = set()
+    if 'all_text_chunks' not in st.session_state:
+        st.session_state['all_text_chunks'] = []
+def main():
+    load_dotenv()
+    st.set_page_config(page_title="Chat with PDF", layout="wide")
+    st.title("📄💬 Chat with PDF")
+    initialize_session_state()
+    with st.sidebar:
+        st.header("🔍 How to Use")
+        st.markdown("""
+        1. Upload PDF document(s)
+        2. Ask questions about the content
+        3. View answers and relevant context
+        """)
+        if 'total_tokens_used' in st.session_state:
+            st.markdown("---")
+            st.markdown("### 📊 Usage Statistics")
+            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")
+    api_key = os.getenv("OPENAI_API_KEY")
+    if not api_key:
+        st.error("OpenAI API key not found in .env file!")
+        return
+    openai.api_key = api_key
+    if not st.session_state['pdf_processor']:
+        st.session_state['pdf_processor'] = PDFProcessor()
+    if not st.session_state['embeddings_manager']:
+        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+    if not st.session_state['qa_system']:
+        st.session_state['qa_system'] = QASystem(api_key)
+    st.subheader("📤 Upload PDFs")
+    uploaded_files = st.file_uploader(
+        "Upload PDF documents",
+        type=['pdf'],
+        accept_multiple_files=True
+    )
+    if uploaded_files:
+        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+        if new_files:
+            with st.spinner("Processing PDFs..."):
+                for pdf_file in new_files:
+                    try:
+                        pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                        for page_text in pages.values():
+                            chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                            st.session_state['all_text_chunks'].extend(chunks)
+                        st.session_state['processed_pdfs'].add(pdf_file.name)
+                    except Exception as e:
+                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                        continue
+                with st.spinner("Generating embeddings..."):
+                    try:
+                        st.session_state['embeddings_manager'].generate_embeddings(
+                            st.session_state['all_text_chunks']
+                        )
+                        st.success("✅ Documents processed!")
+                    except Exception as e:
+                        st.error(f"Error generating embeddings: {str(e)}")
+                        return
+        if st.session_state['all_text_chunks']:
+            st.write("---")
+            st.subheader("❓ Ask Questions About Your Documents")
+            question = st.text_input("Enter your question:")
+            if question:
+                try:
+                    with st.spinner("Searching for relevant information..."):
+                        relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                            question,
+                            k=3
+                        )
+                        answer = st.session_state['qa_system'].generate_answer(
+                            question,
+                            relevant_chunks
+                        )
+                        st.markdown("### 🤖 Answer:")
+                        st.write(answer)
+                        with st.expander("🔍 View Source Context"):
+                            for i, chunk in enumerate(relevant_chunks, 1):
+                                st.markdown(f"**Context {i}:**")
+                                st.write(chunk)
+                                st.markdown("---")
+                except openai.error.RateLimitError:
+                    st.error("Rate limit exceeded. Please try again later.")
+                except Exception as e:
+                    st.error(f"Error: {str(e)}")
+if __name__ == "__main__":
+    main()
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt ---
+streamlit
+PyPDF2
+openai
+python-dotenv
+faiss-cpu
+numpy
+pdf2image
+Pillow
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes ---
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env ---
+OPENAI_API_KEY=your_openai_api_key_here
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py ---
+import openai
+from typing import List
+class QASystem:
+    def __init__(self, api_key: str):
+        openai.api_key = api_key
+    def generate_answer(self, question: str, context: List[str]) -> str:
+        prompt = f"""Based on the context provided below, answer the question.
+        If the answer is not in the context, respond with "The answer is not in the provided context."
+        Context:
+        {' '.join(context)}
+        Question: {question}
+        """
+        response = openai.chat.completions.create(  # Updated line
+            model="gpt-4",
+            messages=[
+                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0,
+            max_tokens=500
+        )
+        return response.choices[0].message.content
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py ---
+import openai
+import numpy as np
+import faiss
+from typing import List
+class EmbeddingsManager:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.index = None
+        self.chunks = []
+    def generate_embeddings(self, text_chunks: List[str]):
+        """Generate embeddings for text chunks using OpenAI API."""
+        batch_size = 10
+        embeddings = []
+        for i in range(0, len(text_chunks), batch_size):
+            batch = text_chunks[i:i + batch_size]
+            response = openai.embeddings.create(
+                input=batch,
+                model="text-embedding-ada-002"
+            )
+            # Access the embeddings using attributes
+            batch_embeddings = [item.embedding for item in response.data]
+            embeddings.extend(batch_embeddings)
+        # Create FAISS index
+        dimension = len(embeddings[0])
+        self.index = faiss.IndexFlatL2(dimension)
+        embeddings_array = np.array(embeddings).astype('float32')
+        self.index.add(embeddings_array)
+        self.chunks = text_chunks
+    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+        """Find most relevant text chunks for a given query."""
+        response = openai.embeddings.create(
+            input=[query],
+            model="text-embedding-ada-002"
+        )
+        # Access the query embedding using attributes
+        query_embedding = response.data[0].embedding
+        D, I = self.index.search(
+            np.array([query_embedding]).astype('float32'),
+            k
+        )
+        return [self.chunks[i] for i in I[0] if i != -1]
+--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py ---
+import PyPDF2
+from typing import List, Dict
+class PDFProcessor:
+    def __init__(self):
+        self.pages = {}
+    def extract_text(self, pdf_file) -> Dict[int, str]:
+        """Extract text from PDF and return a dictionary of page numbers and text."""
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        for page_num in range(len(pdf_reader.pages)):
+            text = pdf_reader.pages[page_num].extract_text()
+            self.pages[page_num] = text
+        return self.pages
+    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+        """Split text into chunks of specified size."""
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        for word in words:
+            current_size += len(word) + 1  # +1 for space
+            if current_size > chunk_size:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_size = len(word)
+            else:
+                current_chunk.append(word)
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks
+--- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py ---
+import openai
+from typing import List
+class QASystem:
+    def __init__(self, api_key: str):
+        openai.api_key = api_key
+    def generate_answer(self, question: str, context: List[str]) -> str:
+        prompt = f"""Based on the context provided below, answer the question.
+        If the answer is not in the context, respond with "The answer is not in the provided context."
+        Context:
+        {' '.join(context)}
+        Question: {question}
+        """
+        response = openai.chat.completions.create(  # Updated line
+            model="gpt-4",
+            messages=[
+                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0,
+            max_tokens=500
+        )
+        return response.choices[0].message.content
+--- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py ---
+import openai
+import numpy as np
+import faiss
+from typing import List
+class EmbeddingsManager:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.index = None
+        self.chunks = []
+    def generate_embeddings(self, text_chunks: List[str]):
+        """Generate embeddings for text chunks using OpenAI API."""
+        batch_size = 10
+        embeddings = []
+        for i in range(0, len(text_chunks), batch_size):
+            batch = text_chunks[i:i + batch_size]
+            response = openai.embeddings.create(
+                input=batch,
+                model="text-embedding-ada-002"
+            )
+            # Access the embeddings using attributes
+            batch_embeddings = [item.embedding for item in response.data]
+            embeddings.extend(batch_embeddings)
+        # Create FAISS index
+        dimension = len(embeddings[0])
+        self.index = faiss.IndexFlatL2(dimension)
+        embeddings_array = np.array(embeddings).astype('float32')
+        self.index.add(embeddings_array)
+        self.chunks = text_chunks
+    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+        """Find most relevant text chunks for a given query."""
+        response = openai.embeddings.create(
+            input=[query],
+            model="text-embedding-ada-002"
+        )
+        # Access the query embedding using attributes
+        query_embedding = response.data[0].embedding
+        D, I = self.index.search(
+            np.array([query_embedding]).astype('float32'),
+            k
+        )
+        return [self.chunks[i] for i in I[0] if i != -1]
+--- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py ---
+import PyPDF2
+from typing import List, Dict
+class PDFProcessor:
+    def __init__(self):
+        self.pages = {}
+    def extract_text(self, pdf_file) -> Dict[int, str]:
+        """Extract text from PDF and return a dictionary of page numbers and text."""
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        for page_num in range(len(pdf_reader.pages)):
+            text = pdf_reader.pages[page_num].extract_text()
+            self.pages[page_num] = text
+        return self.pages
+    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+        """Split text into chunks of specified size."""
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        for word in words:
+            current_size += len(word) + 1  # +1 for space
+            if current_size > chunk_size:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_size = len(word)
+            else:
+                current_chunk.append(word)
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+streamlit
+PyPDF2
+openai
+python-dotenv
+faiss-cpu
+numpy
+pdf2image
+Pillow

textScript.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import os
+def extract_text_from_folder(folder_path, output_file, files_to_skip=None, folders_to_skip=None):
+    """
+    Extracts text from all files within a folder and its subfolders.
+    """
+    if files_to_skip is None:
+        files_to_skip = []
+    if folders_to_skip is None:
+        folders_to_skip = []
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    output_file_path = os.path.join(script_dir, output_file)
+    with open(output_file_path, 'w', encoding='utf-8') as outfile:
+        for foldername, subfolders, filenames in os.walk(folder_path):
+            # Check if folder to skip is in the current folder path
+            should_skip_folder = any(folder in foldername for folder in folders_to_skip)
+            if should_skip_folder:
+                print(f"Skipping specified folder: {foldername}")
+                continue
+            for filename in filenames:
+                if filename in files_to_skip:
+                    print(f"Skipping specified file: {filename}")
+                    continue
+                file_path = os.path.join(foldername, filename)
+                try:
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        text = f.read()
+                    outfile.write(f"--- File: {file_path} ---\n\n")
+                    outfile.write(text)
+                    outfile.write("\n\n")
+                except UnicodeDecodeError:
+                    print(f"Skipping binary file: {file_path}")
+                except Exception as e:
+                    print(f"Error processing {file_path}: {e}")
+if __name__ == "__main__":
+    folder_to_extract = "/home/sk/Desktop/chat-with-pdf"
+    output_text_file = "extracted_text.txt"
+    files_to_skip = ["extracted_text.txt", "next.config.ts", "next.config.mjs", "tailwind.config.ts", "tsconfig.json","postcss.config.mjs","next-env.d.ts","components.json",".eslintrc.json","EDA.ipynb","evaluate.ipynb","textScript.py","stock_price.csv","README.md","globals.css","auto_complete.json", "another_file.css", "LogoBadge.svelte","README.md",".gitignore","package-lock.json","package.json"]
+    folders_to_skip = ["__pycache__", "venv", ".next","results","models","notebooks","data","env","__pycache__","redux","resetpassword","login","register","assets","icon", "asset", "node_modules",".git"]
+    extract_text_from_folder(folder_to_extract, output_text_file, files_to_skip, folders_to_skip)
+    print(f"Text extraction complete. Output saved to: {output_text_file}")

utils/embeddings_utils.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import openai
+import numpy as np
+import faiss
+from typing import List
+class EmbeddingsManager:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.index = None
+        self.chunks = []
+    def generate_embeddings(self, text_chunks: List[str]):
+        """Generate embeddings for text chunks using OpenAI API."""
+        batch_size = 10
+        embeddings = []
+        for i in range(0, len(text_chunks), batch_size):
+            batch = text_chunks[i:i + batch_size]
+            response = openai.embeddings.create(
+                input=batch,
+                model="text-embedding-ada-002"
+            )
+            # Access the embeddings using attributes
+            batch_embeddings = [item.embedding for item in response.data]
+            embeddings.extend(batch_embeddings)
+        # Create FAISS index
+        dimension = len(embeddings[0])
+        self.index = faiss.IndexFlatL2(dimension)
+        embeddings_array = np.array(embeddings).astype('float32')
+        self.index.add(embeddings_array)
+        self.chunks = text_chunks
+    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+        """Find most relevant text chunks for a given query."""
+        response = openai.embeddings.create(
+            input=[query],
+            model="text-embedding-ada-002"
+        )
+        # Access the query embedding using attributes
+        query_embedding = response.data[0].embedding
+        D, I = self.index.search(
+            np.array([query_embedding]).astype('float32'),
+            k
+        )
+        return [self.chunks[i] for i in I[0] if i != -1]

utils/pdf_utils.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import PyPDF2
+from typing import List, Dict
+class PDFProcessor:
+    def __init__(self):
+        self.pages = {}
+    def extract_text(self, pdf_file) -> Dict[int, str]:
+        """Extract text from PDF and return a dictionary of page numbers and text."""
+        pdf_reader = PyPDF2.PdfReader(pdf_file)
+        for page_num in range(len(pdf_reader.pages)):
+            text = pdf_reader.pages[page_num].extract_text()
+            self.pages[page_num] = text
+        return self.pages
+    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+        """Split text into chunks of specified size."""
+        words = text.split()
+        chunks = []
+        current_chunk = []
+        current_size = 0
+        for word in words:
+            current_size += len(word) + 1  # +1 for space
+            if current_size > chunk_size:
+                chunks.append(' '.join(current_chunk))
+                current_chunk = [word]
+                current_size = len(word)
+            else:
+                current_chunk.append(word)
+        if current_chunk:
+            chunks.append(' '.join(current_chunk))
+        return chunks

utils/qa_utils.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import openai
+from typing import List
+class QASystem:
+    def __init__(self, api_key: str):
+        openai.api_key = api_key
+    def generate_answer(self, question: str, context: List[str]) -> str:
+        prompt = f"""Based on the context provided below, answer the question.
+        If the answer is not in the context, respond with "The answer is not in the provided context."
+        Context:
+        {' '.join(context)}
+        Question: {question}
+        """
+        response = openai.chat.completions.create(  # Updated line
+            model="gpt-4",
+            messages=[
+                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=0,
+            max_tokens=500
+        )
+        return response.choices[0].message.content