Files Uploaded
- Report.txt +23 -0
- app.py +123 -0
- requirements.txt +9 -0
Report.txt
ADDED
Title: RAG-Based Chatbot for Multi-PDF Question Answering with Word Cloud Visualization and Confidence Scoring

Overview:
This project implements a Retrieval-Augmented Generation (RAG) chatbot that allows users to upload multiple PDF documents, processes their content, and answers natural language questions based on the combined knowledge of the documents. The application uses a Gradio interface for interaction, PyMuPDF for extracting text from PDFs, SentenceTransformers for creating semantic embeddings, scikit-learn for similarity-based retrieval, and Hugging Face Transformers for answering the questions.
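
The core retrieve-then-answer step, condensed from app.py, looks roughly like this (question is the user's query string; all_chunks and chunk_embeddings are the pre-computed chunk texts and embeddings):

    from sentence_transformers import SentenceTransformer
    from transformers import pipeline
    from sklearn.metrics.pairwise import cosine_similarity

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")
    qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

    # Embed the question, rank chunks by cosine similarity, and answer over the top 3
    q_emb = embed_model.encode([question], convert_to_numpy=True)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]
    top_k_idx = sims.argsort()[::-1][:3]
    context = "\n\n".join(all_chunks[i] for i in top_k_idx)
    answer = qa_pipeline(question=question, context=context)["answer"]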

Unique Enhancements:

1. Word Cloud Preview (Visual Summary):
Before asking questions, users are shown a word cloud generated from the content of all uploaded PDFs. This allows them to quickly understand the key topics present in the documents and helps guide meaningful queries.
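
The preview is built from simple word frequencies over the combined text of all PDFs (condensed from generate_wordcloud in app.py; combined_text holds the concatenated PDF text):

    import re
    from collections import Counter
    from wordcloud import WordCloud

    # Keep letters and whitespace only, then count word frequencies
    cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    word_freq = Counter(cleaned.split())
    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)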

2. Confidence Score for Answers:
Each generated answer includes a confidence score based on the cosine similarity between the question and the retrieved chunks. This provides transparency about how well the chatbot understands and matches the question with relevant content from the PDFs.
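
The score is the mean cosine similarity of the top-k retrieved chunks, scaled to a percentage (as computed in answer_question in app.py):

    import numpy as np

    # sims: cosine similarities between the question and all chunks
    # top_k_idx: indices of the retrieved (most similar) chunks
    avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100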

Challenges Faced:
- FAISS library compatibility issues on Windows were resolved by switching to scikit-learn for vector similarity search.
- Ensuring that the word cloud was both readable and helpful required tuning of the text preprocessing and visualization.
- Accurately interpreting similarity scores to compute confidence required normalization and tuning based on real test cases.

Submitted Files:
- app.py (main chatbot implementation)
- requirements.txt (all Python dependencies, including scikit-learn, matplotlib, etc.)
- Report.txt (this report)
app.py
ADDED
import gradio as gr
import fitz  # PyMuPDF
import os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from io import BytesIO
from collections import Counter
import re
from PIL import Image  # needed to hand the rendered word cloud to Gradio as a PIL image

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Load models
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# Globals
all_chunks = []
chunk_sources = []
chunk_embeddings = None
combined_text = ""

def extract_text_from_pdfs(pdf_files):
    """Extract text and page info from uploaded PDFs"""
    global all_chunks, chunk_sources, combined_text
    texts = []
    chunk_sources = []
    combined_text = ""

    for file in pdf_files:
        # gr.File may hand back tempfile-like objects or plain path strings depending on the Gradio version
        path = file.name if hasattr(file, "name") else file
        doc = fitz.open(path)
        for page_num, page in enumerate(doc):
            text = page.get_text()
            if text.strip():
                texts.append((text, f"{os.path.basename(path)} - Page {page_num + 1}"))
                combined_text += " " + text

    return texts

def split_and_embed(texts_with_sources):
    """Split text into chunks and compute embeddings"""
    global all_chunks, chunk_sources, chunk_embeddings
    all_chunks = []
    chunk_sources = []

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    for text, source in texts_with_sources:
        docs = splitter.create_documents([text])
        for doc in docs:
            all_chunks.append(doc.page_content)
            chunk_sources.append(source)

    chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)

def generate_wordcloud():
    """Generate a word cloud from combined PDF text"""
    global combined_text
    if not combined_text.strip():
        return None

    cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    word_freq = Counter(cleaned.split())

    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)

    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")

    # Render the figure to an in-memory PNG and return it as a PIL image,
    # which gr.Image can display directly (a raw BytesIO buffer cannot be shown)
    buffer = BytesIO()
    fig.savefig(buffer, format="png", bbox_inches="tight")
    plt.close(fig)  # free the figure so repeated uploads don't leak memory
    buffer.seek(0)
    return Image.open(buffer)

def answer_question(question):
    """Retrieve top chunks, answer question, and show confidence"""
    global all_chunks, chunk_sources, chunk_embeddings
    if not all_chunks or chunk_embeddings is None:
        return "Please upload and process some PDFs first."

    q_emb = embed_model.encode([question], convert_to_numpy=True)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]
    top_k_idx = sims.argsort()[::-1][:3]  # indices of the three most similar chunks

    selected_chunks = [all_chunks[i] for i in top_k_idx]
    selected_sources = [chunk_sources[i] for i in top_k_idx]
    context = "\n\n".join(selected_chunks)

    answer = qa_pipeline(question=question, context=context)["answer"]
    avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100  # mean top-k similarity as a percentage

    source_info = "\n".join([f"- {src}" for src in selected_sources])
    result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Confidence Score**: {avg_conf:.2f}%"
    return result

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Enhanced RAG PDF Chatbot (Windows Compatible)")
    gr.Markdown("Upload PDFs → Preview Keywords → Ask Questions → Get Answers with Confidence & Sources")

    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
        load_button = gr.Button("Extract & Index")
    cloud_output = gr.Image(label="Keyword Preview (Word Cloud)")

    with gr.Row():
        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...", label="Question")
        ask_button = gr.Button("Get Answer")
    answer_output = gr.Markdown()

    def load_and_index(files):
        texts = extract_text_from_pdfs(files)
        split_and_embed(texts)
        return generate_wordcloud()

    load_button.click(fn=load_and_index, inputs=[pdf_input], outputs=[cloud_output])
    # Only update the answer box here; the word cloud preview is left untouched
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch()
requirements.txt
ADDED
gradio
PyMuPDF
langchain
sentence-transformers
scikit-learn
transformers
torch
wordcloud
matplotlib