Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,11 @@ import re
|
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
from sentence_transformers import SentenceTransformer
|
13 |
from transformers import pipeline
|
14 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
15 |
|
16 |
# Load models
|
17 |
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
|
18 |
-
qa_pipeline = pipeline("question-answering", model="
|
19 |
|
20 |
# Globals
|
21 |
all_chunks = []
|
@@ -54,7 +54,10 @@ def split_and_embed(texts_with_sources):
|
|
54 |
all_chunks.append(doc.page_content)
|
55 |
chunk_sources.append(source)
|
56 |
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
def generate_wordcloud():
|
60 |
"""Generate a word cloud from combined PDF text"""
|
@@ -79,7 +82,10 @@ def generate_wordcloud():
|
|
79 |
def answer_question(question):
|
80 |
"""Retrieve top chunks, answer question, and show confidence"""
|
81 |
global all_chunks, chunk_sources, chunk_embeddings
|
|
|
|
|
82 |
if not all_chunks or chunk_embeddings is None:
|
|
|
83 |
return "Please upload and process some PDFs first.", None
|
84 |
|
85 |
q_emb = embed_model.encode([question], convert_to_numpy=True)
|
@@ -90,16 +96,26 @@ def answer_question(question):
|
|
90 |
selected_sources = [chunk_sources[i] for i in top_k_idx]
|
91 |
context = "\n\n".join(selected_chunks)
|
92 |
|
93 |
-
|
94 |
-
|
|
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
source_info = "\n".join([f"- {src}" for src in selected_sources])
|
97 |
result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Confidence Score**: {avg_conf:.2f}%"
|
|
|
98 |
return result, None
|
99 |
|
100 |
# Gradio UI
|
101 |
with gr.Blocks() as demo:
|
102 |
-
gr.Markdown("# 📚 Enhanced RAG PDF Chatbot
|
103 |
gr.Markdown("Upload PDFs → Preview Keywords → Ask Questions → Get Answers with Confidence & Sources")
|
104 |
|
105 |
with gr.Row():
|
|
|
11 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
from sentence_transformers import SentenceTransformer
|
13 |
from transformers import pipeline
|
14 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
15 |
|
16 |
# Load models
|
17 |
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
|
18 |
+
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")  # ✅ lightweight model
|
19 |
|
20 |
# Globals
|
21 |
all_chunks = []
|
|
|
54 |
all_chunks.append(doc.page_content)
|
55 |
chunk_sources.append(source)
|
56 |
|
57 |
+
if all_chunks:
|
58 |
+
chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
|
59 |
+
else:
|
60 |
+
chunk_embeddings = None
|
61 |
|
62 |
def generate_wordcloud():
|
63 |
"""Generate a word cloud from combined PDF text"""
|
|
|
82 |
def answer_question(question):
|
83 |
"""Retrieve top chunks, answer question, and show confidence"""
|
84 |
global all_chunks, chunk_sources, chunk_embeddings
|
85 |
+
print("🔥 Question received:", question)
|
86 |
+
|
87 |
if not all_chunks or chunk_embeddings is None:
|
88 |
+
print("⚠️ PDF not processed or empty.")
|
89 |
return "Please upload and process some PDFs first.", None
|
90 |
|
91 |
q_emb = embed_model.encode([question], convert_to_numpy=True)
|
|
|
96 |
selected_sources = [chunk_sources[i] for i in top_k_idx]
|
97 |
context = "\n\n".join(selected_chunks)
|
98 |
|
99 |
+
if not context.strip():
|
100 |
+
print("⚠️ Empty context from chunks.")
|
101 |
+
return "Could not extract relevant content from the PDFs.", None
|
102 |
|
103 |
+
try:
|
104 |
+
answer_dict = qa_pipeline(question=question, context=context)
|
105 |
+
answer = answer_dict.get("answer", "No answer found.")
|
106 |
+
except Exception as e:
|
107 |
+
print("❌ Error from QA model:", e)
|
108 |
+
return "Model failed to generate an answer.", None
|
109 |
+
|
110 |
+
avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100
|
111 |
source_info = "\n".join([f"- {src}" for src in selected_sources])
|
112 |
result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Confidence Score**: {avg_conf:.2f}%"
|
113 |
+
print("✅ Answer generated.")
|
114 |
return result, None
|
115 |
|
116 |
# Gradio UI
|
117 |
with gr.Blocks() as demo:
|
118 |
+
gr.Markdown("# 📚 Enhanced RAG PDF Chatbot")
|
119 |
gr.Markdown("Upload PDFs → Preview Keywords → Ask Questions → Get Answers with Confidence & Sources")
|
120 |
|
121 |
with gr.Row():
|