Inferno-721 committed on
Commit 0753d2e · 1 Parent(s): 83a3714
Files changed (10)
  1. .env +1 -0
  2. .gitignore +3 -0
  3. README.md +58 -4
  4. app.py +170 -0
  5. extracted_text.txt +551 -0
  6. requirements.txt +8 -0
  7. textScript.py +50 -0
  8. utils/embeddings_utils.py +48 -0
  9. utils/pdf_utils.py +35 -0
  10. utils/qa_utils.py +27 -0
.env ADDED
@@ -0,0 +1 @@
+ OPENAI_API_KEY=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
.gitignore ADDED
@@ -0,0 +1,3 @@
+ Chat_with_PDF_Application
+ venv
+ __pycache__
README.md CHANGED
@@ -1,12 +1,66 @@
  ---
- title: Sutra AI
+ title: Chat With PDF Application
- emoji: 👀
+ emoji: 😻
  colorFrom: red
- colorTo: gray
+ colorTo: yellow
  sdk: streamlit
  sdk_version: 1.41.1
  app_file: app.py
  pinned: false
  ---
-
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # Chat with PDF Application
+
+ **Chat with PDF** is an interactive Streamlit app that lets you upload PDFs, converts their content into OpenAI embeddings, and answers questions about your documents with GPT-4.
+
+ ## Features
+ - **PDF Upload:** Upload one or multiple PDFs.
+ - **Text Extraction & Chunking:** Extracts text from PDFs and splits it into manageable chunks.
+ - **Embedding Generation:** Converts text chunks into embeddings using OpenAI's `text-embedding-ada-002`.
+ - **Question Answering:** Ask questions about your documents and get context-aware answers generated by GPT-4.
+ - **Context Display:** View relevant sections from the PDF that support the generated answers.
+
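These features are implemented by three small helper classes (`PDFProcessor`, `EmbeddingsManager`, `QASystem`, whose sources appear later in this commit). A minimal sketch of the same pipeline driven outside Streamlit, assuming a hypothetical `example.pdf` and an `OPENAI_API_KEY` in `.env`:

```python
import os
import openai
from dotenv import load_dotenv
from utils.pdf_utils import PDFProcessor
from utils.embeddings_utils import EmbeddingsManager
from utils.qa_utils import QASystem

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")  # the utils rely on the module-level key

processor = PDFProcessor()
embedder = EmbeddingsManager(openai.api_key)
qa = QASystem(openai.api_key)

# Extract per-page text and split it into ~1000-character chunks
with open("example.pdf", "rb") as f:  # hypothetical input file
    pages = processor.extract_text(f)
chunks = []
for page_text in pages.values():
    chunks.extend(processor.chunk_text(page_text))

# Embed the chunks into a FAISS index, retrieve the top 3 for a question, and answer
embedder.generate_embeddings(chunks)
question = "What is this document about?"
relevant = embedder.find_relevant_chunks(question, k=3)
print(qa.generate_answer(question, relevant))
```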
+ ## Installation
+
+ 1. Create and activate a virtual environment:
+ ```bash
+ python3 -m venv venv
+ source venv/bin/activate  # On Windows: .\venv\Scripts\activate
+ ```
+
+ 2. Install the requirements:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. **Configure the API key:**
+    - Create a `.env` file in the root directory.
+    - Add your OpenAI API key:
+    ```
+    OPENAI_API_KEY=your_openai_api_key_here
+    ```
+
+ 4. Run the application:
+ ```bash
+ streamlit run app.py
+ ```
+
+ ## Usage
+
+ 1. **Run the application:**
+ ```bash
+ streamlit run app.py
+ ```
+
+ 2. **Interact:**
+    - Upload PDF files.
+    - Wait for processing and embedding generation.
+    - Enter a question to get answers with relevant context excerpts from your PDFs.
+
+ ## Notes
+ - The app meets core requirements: PDF uploading, text processing, embedding conversion, and Q&A.
+ - While context is shown, highlighting directly on the PDF is not implemented yet.
+ - Supports multiple PDF uploads and cross-document querying.
app.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+ import os
+ from utils.pdf_utils import PDFProcessor
+ from utils.embeddings_utils import EmbeddingsManager
+ from utils.qa_utils import QASystem
+ from dotenv import load_dotenv
+ import openai
+
+ def initialize_session_state():
+     if 'pdf_processor' not in st.session_state:
+         st.session_state['pdf_processor'] = None
+     if 'embeddings_manager' not in st.session_state:
+         st.session_state['embeddings_manager'] = None
+     if 'qa_system' not in st.session_state:
+         st.session_state['qa_system'] = None
+     if 'processed_pdfs' not in st.session_state:
+         st.session_state['processed_pdfs'] = set()
+     if 'all_text_chunks' not in st.session_state:
+         st.session_state['all_text_chunks'] = []
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="AI-Powered PDF Assistant", layout="wide")
+
+     initialize_session_state()
+
+     # Header Section
+     st.markdown(
+         """
+         <style>
+         .main-header {
+             font-size: 2.5rem;
+             color: #1F77B4;
+             text-align: center;
+             margin-bottom: 1rem;
+         }
+         .sub-header {
+             font-size: 1.25rem;
+             color: #555;
+             text-align: center;
+             margin-bottom: 2rem;
+         }
+         </style>
+         <div class="main-header">📘 AI-Powered PDF Assistant</div>
+         <div class="sub-header">Upload, Analyze, and Interact with Your Documents</div>
+         """,
+         unsafe_allow_html=True
+     )
+
+     # Navigation Menu
+     selected_page = st.sidebar.radio(
+         "Navigate", ["Upload PDFs", "Ask Questions", "About"]
+     )
+
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         st.sidebar.error("OpenAI API key not found in .env file!")
+         return
+
+     openai.api_key = api_key
+
+     if not st.session_state['pdf_processor']:
+         st.session_state['pdf_processor'] = PDFProcessor()
+     if not st.session_state['embeddings_manager']:
+         st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+     if not st.session_state['qa_system']:
+         st.session_state['qa_system'] = QASystem(api_key)
+
+     if selected_page == "Upload PDFs":
+         st.header("📤 Upload PDFs")
+         st.markdown(
+             """<p style='font-size: 1.1rem;'>Drag and drop your PDF files below to extract and process content for analysis.</p>""",
+             unsafe_allow_html=True
+         )
+
+         uploaded_files = st.file_uploader(
+             "Upload PDF files", type=['pdf'], accept_multiple_files=True
+         )
+
+         if uploaded_files:
+             new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+             if new_files:
+                 with st.spinner("Processing PDFs..."):
+                     for pdf_file in new_files:
+                         try:
+                             pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                             for page_text in pages.values():
+                                 chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                                 st.session_state['all_text_chunks'].extend(chunks)
+                             st.session_state['processed_pdfs'].add(pdf_file.name)
+                         except Exception as e:
+                             st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                             continue
+
+                 with st.spinner("Generating embeddings..."):
+                     try:
+                         st.session_state['embeddings_manager'].generate_embeddings(
+                             st.session_state['all_text_chunks']
+                         )
+                         st.success("✅ Documents processed successfully!")
+                     except Exception as e:
+                         st.error(f"Error generating embeddings: {str(e)}")
+
+     elif selected_page == "Ask Questions":
+         st.header("❓ Ask Questions")
+         st.markdown(
+             """<p style='font-size: 1.1rem;'>Query your uploaded documents and get precise answers backed by AI-powered analysis.</p>""",
+             unsafe_allow_html=True
+         )
+
+         if st.session_state['all_text_chunks']:
+             question = st.text_input("Enter your question:")
+
+             if question:
+                 try:
+                     with st.spinner("Finding relevant information..."):
+                         relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                             question, k=3
+                         )
+                         answer = st.session_state['qa_system'].generate_answer(
+                             question, relevant_chunks
+                         )
+
+                     st.markdown("### 🤖 Answer")
+                     st.write(answer)
+
+                     with st.expander("🔍 View Source Context"):
+                         for i, chunk in enumerate(relevant_chunks, 1):
+                             st.markdown(f"**Context {i}:**")
+                             st.write(chunk)
+                             st.markdown("---")
+                 except openai.RateLimitError:  # openai>=1.0 exposes this at the top level, not under openai.error
+                     st.error("Rate limit exceeded. Please try again later.")
+                 except Exception as e:
+                     st.error(f"Error: {str(e)}")
+         else:
+             st.warning("Please upload and process documents in the 'Upload PDFs' section first.")
+
+     elif selected_page == "About":
+         st.header("ℹ️ About This App")
+         st.markdown(
+             """
+             <p style='font-size: 1.1rem;'>
+             <b>AI-Powered PDF Assistant</b> is a smart solution for extracting and querying information from PDF files. With powerful AI integrations,
+             this tool allows seamless document analysis and interaction.
+             </p>
+
+             <h3>🔑 Key Features</h3>
+             <ul>
+                 <li>Upload and process multiple PDF files</li>
+                 <li>Generate embeddings for precise content retrieval</li>
+                 <li>Query documents and receive context-aware answers</li>
+             </ul>
+
+             <h3>🛠️ Technologies Used</h3>
+             <ul>
+                 <li>Streamlit for interactive UI</li>
+                 <li>OpenAI GPT API for Q&A</li>
+                 <li>Custom PDF processing and embedding tools</li>
+             </ul>
+
+             <p style='text-align: center;'>
+                 Built with ❤️ by [Your Name]
+             </p>
+             """,
+             unsafe_allow_html=True
+         )
+
+ if __name__ == "__main__":
+     main()
extracted_text.txt ADDED
@@ -0,0 +1,551 @@
+ --- File: /home/sk/Desktop/chat-with-pdf/app.py ---
+
+ import streamlit as st
+ import os
+ from utils.pdf_utils import PDFProcessor
+ from utils.embeddings_utils import EmbeddingsManager
+ from utils.qa_utils import QASystem
+ from dotenv import load_dotenv
+ import openai
+ import time
+
+ def initialize_session_state():
+     if 'pdf_processor' not in st.session_state:
+         st.session_state['pdf_processor'] = None
+     if 'embeddings_manager' not in st.session_state:
+         st.session_state['embeddings_manager'] = None
+     if 'qa_system' not in st.session_state:
+         st.session_state['qa_system'] = None
+     if 'processed_pdfs' not in st.session_state:
+         st.session_state['processed_pdfs'] = set()
+     if 'all_text_chunks' not in st.session_state:
+         st.session_state['all_text_chunks'] = []
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Chat with PDF", layout="wide")
+     st.title("📄💬 Chat with PDF")
+
+     initialize_session_state()
+
+     with st.sidebar:
+         st.header("🔍 How to Use")
+         st.markdown("""
+         1. Upload PDF document(s)
+         2. Ask questions about the content
+         3. View answers and relevant context
+         """)
+         if 'total_tokens_used' in st.session_state:
+             st.markdown("---")
+             st.markdown("### 📊 Usage Statistics")
+             st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")
+
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         st.error("OpenAI API key not found in .env file!")
+         return
+
+     openai.api_key = api_key
+
+     if not st.session_state['pdf_processor']:
+         st.session_state['pdf_processor'] = PDFProcessor()
+     if not st.session_state['embeddings_manager']:
+         st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+     if not st.session_state['qa_system']:
+         st.session_state['qa_system'] = QASystem(api_key)
+
+     st.subheader("📤 Upload PDFs")
+     uploaded_files = st.file_uploader(
+         "Upload PDF documents",
+         type=['pdf'],
+         accept_multiple_files=True
+     )
+
+     if uploaded_files:
+         new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+         if new_files:
+             with st.spinner("Processing PDFs..."):
+                 for pdf_file in new_files:
+                     try:
+                         pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                         for page_text in pages.values():
+                             chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                             st.session_state['all_text_chunks'].extend(chunks)
+                         st.session_state['processed_pdfs'].add(pdf_file.name)
+                     except Exception as e:
+                         st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                         continue
+
+             with st.spinner("Generating embeddings..."):
+                 try:
+                     st.session_state['embeddings_manager'].generate_embeddings(
+                         st.session_state['all_text_chunks']
+                     )
+                     st.success("✅ Documents processed!")
+                 except Exception as e:
+                     st.error(f"Error generating embeddings: {str(e)}")
+                     return
+
+     if st.session_state['all_text_chunks']:
+         st.write("---")
+         st.subheader("❓ Ask Questions About Your Documents")
+         question = st.text_input("Enter your question:")
+         if question:
+             try:
+                 with st.spinner("Searching for relevant information..."):
+                     relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                         question,
+                         k=3
+                     )
+                     answer = st.session_state['qa_system'].generate_answer(
+                         question,
+                         relevant_chunks
+                     )
+                 st.markdown("### 🤖 Answer:")
+                 st.write(answer)
+                 with st.expander("🔍 View Source Context"):
+                     for i, chunk in enumerate(relevant_chunks, 1):
+                         st.markdown(f"**Context {i}:**")
+                         st.write(chunk)
+                         st.markdown("---")
+             except openai.error.RateLimitError:
+                 st.error("Rate limit exceeded. Please try again later.")
+             except Exception as e:
+                 st.error(f"Error: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/requirements.txt ---
+
+ streamlit
+ PyPDF2
+ openai
+ python-dotenv
+ faiss-cpu
+ numpy
+ pdf2image
+ Pillow
+
+ --- File: /home/sk/Desktop/chat-with-pdf/.env ---
+
+ OPENAI_API_KEY=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
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py ---
+
+ import streamlit as st
+ import os
+ from utils.pdf_utils import PDFProcessor
+ from utils.embeddings_utils import EmbeddingsManager
+ from utils.qa_utils import QASystem
+ from dotenv import load_dotenv
+ import openai
+ import time
+
+ def initialize_session_state():
+     if 'pdf_processor' not in st.session_state:
+         st.session_state['pdf_processor'] = None
+     if 'embeddings_manager' not in st.session_state:
+         st.session_state['embeddings_manager'] = None
+     if 'qa_system' not in st.session_state:
+         st.session_state['qa_system'] = None
+     if 'processed_pdfs' not in st.session_state:
+         st.session_state['processed_pdfs'] = set()
+     if 'all_text_chunks' not in st.session_state:
+         st.session_state['all_text_chunks'] = []
+
+ def main():
+     load_dotenv()
+     st.set_page_config(page_title="Chat with PDF", layout="wide")
+     st.title("📄💬 Chat with PDF")
+
+     initialize_session_state()
+
+     with st.sidebar:
+         st.header("🔍 How to Use")
+         st.markdown("""
+         1. Upload PDF document(s)
+         2. Ask questions about the content
+         3. View answers and relevant context
+         """)
+         if 'total_tokens_used' in st.session_state:
+             st.markdown("---")
+             st.markdown("### 📊 Usage Statistics")
+             st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")
+
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         st.error("OpenAI API key not found in .env file!")
+         return
+
+     openai.api_key = api_key
+
+     if not st.session_state['pdf_processor']:
+         st.session_state['pdf_processor'] = PDFProcessor()
+     if not st.session_state['embeddings_manager']:
+         st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
+     if not st.session_state['qa_system']:
+         st.session_state['qa_system'] = QASystem(api_key)
+
+     st.subheader("📤 Upload PDFs")
+     uploaded_files = st.file_uploader(
+         "Upload PDF documents",
+         type=['pdf'],
+         accept_multiple_files=True
+     )
+
+     if uploaded_files:
+         new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
+         if new_files:
+             with st.spinner("Processing PDFs..."):
+                 for pdf_file in new_files:
+                     try:
+                         pages = st.session_state['pdf_processor'].extract_text(pdf_file)
+                         for page_text in pages.values():
+                             chunks = st.session_state['pdf_processor'].chunk_text(page_text)
+                             st.session_state['all_text_chunks'].extend(chunks)
+                         st.session_state['processed_pdfs'].add(pdf_file.name)
+                     except Exception as e:
+                         st.error(f"Error processing {pdf_file.name}: {str(e)}")
+                         continue
+
+             with st.spinner("Generating embeddings..."):
+                 try:
+                     st.session_state['embeddings_manager'].generate_embeddings(
+                         st.session_state['all_text_chunks']
+                     )
+                     st.success("✅ Documents processed!")
+                 except Exception as e:
+                     st.error(f"Error generating embeddings: {str(e)}")
+                     return
+
+     if st.session_state['all_text_chunks']:
+         st.write("---")
+         st.subheader("❓ Ask Questions About Your Documents")
+         question = st.text_input("Enter your question:")
+         if question:
+             try:
+                 with st.spinner("Searching for relevant information..."):
+                     relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
+                         question,
+                         k=3
+                     )
+                     answer = st.session_state['qa_system'].generate_answer(
+                         question,
+                         relevant_chunks
+                     )
+                 st.markdown("### 🤖 Answer:")
+                 st.write(answer)
+                 with st.expander("🔍 View Source Context"):
+                     for i, chunk in enumerate(relevant_chunks, 1):
+                         st.markdown(f"**Context {i}:**")
+                         st.write(chunk)
+                         st.markdown("---")
+             except openai.error.RateLimitError:
+                 st.error("Rate limit exceeded. Please try again later.")
+             except Exception as e:
+                 st.error(f"Error: {str(e)}")
+
+ if __name__ == "__main__":
+     main()
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt ---
+
+ streamlit
+ PyPDF2
+ openai
+ python-dotenv
+ faiss-cpu
+ numpy
+ pdf2image
+ Pillow
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes ---
+
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env ---
+
+ OPENAI_API_KEY=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
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py ---
+
+ import openai
+ from typing import List
+
+ class QASystem:
+     def __init__(self, api_key: str):
+         openai.api_key = api_key
+
+     def generate_answer(self, question: str, context: List[str]) -> str:
+         prompt = f"""Based on the context provided below, answer the question.
+ If the answer is not in the context, respond with "The answer is not in the provided context."
+
+ Context:
+ {' '.join(context)}
+
+ Question: {question}
+ """
+
+         response = openai.chat.completions.create(  # Updated line
+             model="gpt-4",
+             messages=[
+                 {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0,
+             max_tokens=500
+         )
+         return response.choices[0].message.content
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py ---
+
+ import openai
+ import numpy as np
+ import faiss
+ from typing import List
+
+ class EmbeddingsManager:
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         self.index = None
+         self.chunks = []
+
+     def generate_embeddings(self, text_chunks: List[str]):
+         """Generate embeddings for text chunks using OpenAI API."""
+         batch_size = 10
+         embeddings = []
+
+         for i in range(0, len(text_chunks), batch_size):
+             batch = text_chunks[i:i + batch_size]
+             response = openai.embeddings.create(
+                 input=batch,
+                 model="text-embedding-ada-002"
+             )
+             # Access the embeddings using attributes
+             batch_embeddings = [item.embedding for item in response.data]
+             embeddings.extend(batch_embeddings)
+
+         # Create FAISS index
+         dimension = len(embeddings[0])
+         self.index = faiss.IndexFlatL2(dimension)
+         embeddings_array = np.array(embeddings).astype('float32')
+         self.index.add(embeddings_array)
+         self.chunks = text_chunks
+
+     def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+         """Find most relevant text chunks for a given query."""
+         response = openai.embeddings.create(
+             input=[query],
+             model="text-embedding-ada-002"
+         )
+         # Access the query embedding using attributes
+         query_embedding = response.data[0].embedding
+
+         D, I = self.index.search(
+             np.array([query_embedding]).astype('float32'),
+             k
+         )
+
+         return [self.chunks[i] for i in I[0] if i != -1]
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py ---
+
+ import PyPDF2
+ from typing import List, Dict
+
+ class PDFProcessor:
+     def __init__(self):
+         self.pages = {}
+
+     def extract_text(self, pdf_file) -> Dict[int, str]:
+         """Extract text from PDF and return a dictionary of page numbers and text."""
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+         for page_num in range(len(pdf_reader.pages)):
+             text = pdf_reader.pages[page_num].extract_text()
+             self.pages[page_num] = text
+         return self.pages
+
+     def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+         """Split text into chunks of specified size."""
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for word in words:
+             current_size += len(word) + 1  # +1 for space
+             if current_size > chunk_size:
+                 chunks.append(' '.join(current_chunk))
+                 current_chunk = [word]
+                 current_size = len(word)
+             else:
+                 current_chunk.append(word)
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py ---
+
+ import openai
+ from typing import List
+
+ class QASystem:
+     def __init__(self, api_key: str):
+         openai.api_key = api_key
+
+     def generate_answer(self, question: str, context: List[str]) -> str:
+         prompt = f"""Based on the context provided below, answer the question.
+ If the answer is not in the context, respond with "The answer is not in the provided context."
+
+ Context:
+ {' '.join(context)}
+
+ Question: {question}
+ """
+
+         response = openai.chat.completions.create(  # Updated line
+             model="gpt-4",
+             messages=[
+                 {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0,
+             max_tokens=500
+         )
+         return response.choices[0].message.content
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py ---
+
+ import openai
+ import numpy as np
+ import faiss
+ from typing import List
+
+ class EmbeddingsManager:
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         self.index = None
+         self.chunks = []
+
+     def generate_embeddings(self, text_chunks: List[str]):
+         """Generate embeddings for text chunks using OpenAI API."""
+         batch_size = 10
+         embeddings = []
+
+         for i in range(0, len(text_chunks), batch_size):
+             batch = text_chunks[i:i + batch_size]
+             response = openai.embeddings.create(
+                 input=batch,
+                 model="text-embedding-ada-002"
+             )
+             # Access the embeddings using attributes
+             batch_embeddings = [item.embedding for item in response.data]
+             embeddings.extend(batch_embeddings)
+
+         # Create FAISS index
+         dimension = len(embeddings[0])
+         self.index = faiss.IndexFlatL2(dimension)
+         embeddings_array = np.array(embeddings).astype('float32')
+         self.index.add(embeddings_array)
+         self.chunks = text_chunks
+
+     def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+         """Find most relevant text chunks for a given query."""
+         response = openai.embeddings.create(
+             input=[query],
+             model="text-embedding-ada-002"
+         )
+         # Access the query embedding using attributes
+         query_embedding = response.data[0].embedding
+
+         D, I = self.index.search(
+             np.array([query_embedding]).astype('float32'),
+             k
+         )
+
+         return [self.chunks[i] for i in I[0] if i != -1]
+
+
+ --- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py ---
+
+ import PyPDF2
+ from typing import List, Dict
+
+ class PDFProcessor:
+     def __init__(self):
+         self.pages = {}
+
+     def extract_text(self, pdf_file) -> Dict[int, str]:
+         """Extract text from PDF and return a dictionary of page numbers and text."""
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+         for page_num in range(len(pdf_reader.pages)):
+             text = pdf_reader.pages[page_num].extract_text()
+             self.pages[page_num] = text
+         return self.pages
+
+     def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+         """Split text into chunks of specified size."""
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for word in words:
+             current_size += len(word) + 1  # +1 for space
+             if current_size > chunk_size:
+                 chunks.append(' '.join(current_chunk))
+                 current_chunk = [word]
+                 current_size = len(word)
+             else:
+                 current_chunk.append(word)
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
+
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ streamlit
+ PyPDF2
+ openai
+ python-dotenv
+ faiss-cpu
+ numpy
+ pdf2image
+ Pillow
textScript.py ADDED
@@ -0,0 +1,50 @@
+ import os
+
+ def extract_text_from_folder(folder_path, output_file, files_to_skip=None, folders_to_skip=None):
+     """
+     Extracts text from all files within a folder and its subfolders.
+     """
+
+     if files_to_skip is None:
+         files_to_skip = []
+     if folders_to_skip is None:
+         folders_to_skip = []
+
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+     output_file_path = os.path.join(script_dir, output_file)
+
+     with open(output_file_path, 'w', encoding='utf-8') as outfile:
+         for foldername, subfolders, filenames in os.walk(folder_path):
+             # Check if folder to skip is in the current folder path
+             should_skip_folder = any(folder in foldername for folder in folders_to_skip)
+
+             if should_skip_folder:
+                 print(f"Skipping specified folder: {foldername}")
+                 continue
+
+             for filename in filenames:
+                 if filename in files_to_skip:
+                     print(f"Skipping specified file: {filename}")
+                     continue
+
+                 file_path = os.path.join(foldername, filename)
+
+                 try:
+                     with open(file_path, 'r', encoding='utf-8') as f:
+                         text = f.read()
+                         outfile.write(f"--- File: {file_path} ---\n\n")
+                         outfile.write(text)
+                         outfile.write("\n\n")
+                 except UnicodeDecodeError:
+                     print(f"Skipping binary file: {file_path}")
+                 except Exception as e:
+                     print(f"Error processing {file_path}: {e}")
+
+ if __name__ == "__main__":
+     folder_to_extract = "/home/sk/Desktop/chat-with-pdf"
+     output_text_file = "extracted_text.txt"
+     files_to_skip = ["extracted_text.txt", "next.config.ts", "next.config.mjs", "tailwind.config.ts", "tsconfig.json", "postcss.config.mjs", "next-env.d.ts", "components.json", ".eslintrc.json", "EDA.ipynb", "evaluate.ipynb", "textScript.py", "stock_price.csv", "README.md", "globals.css", "auto_complete.json", "another_file.css", "LogoBadge.svelte", "README.md", ".gitignore", "package-lock.json", "package.json"]
+     folders_to_skip = ["__pycache__", "venv", ".next", "results", "models", "notebooks", "data", "env", "__pycache__", "redux", "resetpassword", "login", "register", "assets", "icon", "asset", "node_modules", ".git"]
+
+     extract_text_from_folder(folder_to_extract, output_text_file, files_to_skip, folders_to_skip)
+     print(f"Text extraction complete. Output saved to: {output_text_file}")
utils/embeddings_utils.py ADDED
@@ -0,0 +1,48 @@
+ import openai
+ import numpy as np
+ import faiss
+ from typing import List
+
+ class EmbeddingsManager:
+     def __init__(self, api_key: str):
+         self.api_key = api_key
+         self.index = None
+         self.chunks = []
+
+     def generate_embeddings(self, text_chunks: List[str]):
+         """Generate embeddings for text chunks using OpenAI API."""
+         batch_size = 10
+         embeddings = []
+
+         for i in range(0, len(text_chunks), batch_size):
+             batch = text_chunks[i:i + batch_size]
+             response = openai.embeddings.create(
+                 input=batch,
+                 model="text-embedding-ada-002"
+             )
+             # Access the embeddings using attributes
+             batch_embeddings = [item.embedding for item in response.data]
+             embeddings.extend(batch_embeddings)
+
+         # Create FAISS index
+         dimension = len(embeddings[0])
+         self.index = faiss.IndexFlatL2(dimension)
+         embeddings_array = np.array(embeddings).astype('float32')
+         self.index.add(embeddings_array)
+         self.chunks = text_chunks
+
+     def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
+         """Find most relevant text chunks for a given query."""
+         response = openai.embeddings.create(
+             input=[query],
+             model="text-embedding-ada-002"
+         )
+         # Access the query embedding using attributes
+         query_embedding = response.data[0].embedding
+
+         D, I = self.index.search(
+             np.array([query_embedding]).astype('float32'),
+             k
+         )
+
+         return [self.chunks[i] for i in I[0] if i != -1]
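The FAISS index above lives only in memory, so embeddings are recomputed every session. A minimal sketch of persisting and reloading it between runs, assuming hypothetical `index.faiss` / `chunks.json` file names (not part of this commit):

```python
import json
import faiss

def save_manager(manager, index_path="index.faiss", chunks_path="chunks.json"):
    # Persist the FAISS index and the chunk texts its vectors refer to
    faiss.write_index(manager.index, index_path)
    with open(chunks_path, "w", encoding="utf-8") as f:
        json.dump(manager.chunks, f)

def load_manager(manager, index_path="index.faiss", chunks_path="chunks.json"):
    # Restore both; chunk order must match the index's vector order
    manager.index = faiss.read_index(index_path)
    with open(chunks_path, "r", encoding="utf-8") as f:
        manager.chunks = json.load(f)
```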
utils/pdf_utils.py ADDED
@@ -0,0 +1,35 @@
+ import PyPDF2
+ from typing import List, Dict
+
+ class PDFProcessor:
+     def __init__(self):
+         self.pages = {}
+
+     def extract_text(self, pdf_file) -> Dict[int, str]:
+         """Extract text from PDF and return a dictionary of page numbers and text."""
+         pdf_reader = PyPDF2.PdfReader(pdf_file)
+         for page_num in range(len(pdf_reader.pages)):
+             text = pdf_reader.pages[page_num].extract_text()
+             self.pages[page_num] = text
+         return self.pages
+
+     def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
+         """Split text into chunks of roughly chunk_size characters."""
+         words = text.split()
+         chunks = []
+         current_chunk = []
+         current_size = 0
+
+         for word in words:
+             current_size += len(word) + 1  # +1 for space
+             if current_size > chunk_size:
+                 chunks.append(' '.join(current_chunk))
+                 current_chunk = [word]
+                 current_size = len(word)
+             else:
+                 current_chunk.append(word)
+
+         if current_chunk:
+             chunks.append(' '.join(current_chunk))
+
+         return chunks
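Note that `chunk_size` counts characters, not words: the running total adds `len(word) + 1` per word. A quick illustration of that behaviour, using a made-up string rather than a real PDF:

```python
from utils.pdf_utils import PDFProcessor

processor = PDFProcessor()

# 200 five-letter words ≈ 1200 characters, so the default size yields two chunks
text = " ".join(["lorem"] * 200)
chunks = processor.chunk_text(text)
print(len(chunks))                           # 2
print(all(len(c) <= 1000 for c in chunks))   # True: each chunk stays under chunk_size
```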
utils/qa_utils.py ADDED
@@ -0,0 +1,27 @@
+ import openai
+ from typing import List
+
+ class QASystem:
+     def __init__(self, api_key: str):
+         openai.api_key = api_key
+
+     def generate_answer(self, question: str, context: List[str]) -> str:
+         prompt = f"""Based on the context provided below, answer the question.
+ If the answer is not in the context, respond with "The answer is not in the provided context."
+
+ Context:
+ {' '.join(context)}
+
+ Question: {question}
+ """
+
+         response = openai.chat.completions.create(  # chat completions endpoint (openai>=1.0)
+             model="gpt-4",
+             messages=[
+                 {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
+                 {"role": "user", "content": prompt}
+             ],
+             temperature=0,
+             max_tokens=500
+         )
+         return response.choices[0].message.content
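For completeness, a minimal sketch of calling the QA layer directly, assuming an `OPENAI_API_KEY` in the environment and two made-up context strings (normally these come from `EmbeddingsManager.find_relevant_chunks`):

```python
import os
from dotenv import load_dotenv
from utils.qa_utils import QASystem

load_dotenv()
qa = QASystem(os.getenv("OPENAI_API_KEY"))

# Hypothetical context chunks standing in for retrieved PDF text
context = [
    "The invoice total is $1,250, due on March 1.",
    "Payments received after the due date incur a 2% fee.",
]
print(qa.generate_answer("When is the invoice due?", context))
# Expected: an answer grounded in the first chunk, e.g. "March 1"
```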