Update app.py
app.py CHANGED
@@ -1,53 +1,50 @@
 import gradio as gr
 import fitz  # PyMuPDF
 import os
+import re
+from io import BytesIO
+from collections import Counter
+
 import numpy as np
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
-from io import BytesIO
-from collections import Counter
-import re
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 from sklearn.metrics.pairwise import cosine_similarity
 
-# Load models
+# Load models once
 embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")
 
-# Globals
+# Globals to hold data
 all_chunks = []
 chunk_sources = []
 chunk_embeddings = None
 combined_text = ""
 
 def extract_text_from_pdfs(pdf_files):
+    global combined_text, chunk_sources
+    combined_text = ""
     texts = []
     chunk_sources = []
-    combined_text = ""
 
     for file in pdf_files:
         doc = fitz.open(file.name)
-        for page in doc:
+        for i, page in enumerate(doc):
             text = page.get_text()
             if text.strip():
-                texts.append((text, f"{os.path.basename(file.name)} - Page {
+                texts.append((text, f"{os.path.basename(file.name)} - Page {i+1}"))
                 combined_text += " " + text
     return texts
 
 def split_and_embed(texts_with_sources):
-    """Split text into chunks and compute embeddings"""
     global all_chunks, chunk_sources, chunk_embeddings
     all_chunks = []
     chunk_sources = []
 
     splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     for text, source in texts_with_sources:
         docs = splitter.create_documents([text])
         for doc in docs:
@@ -60,80 +57,67 @@ def split_and_embed(texts_with_sources):
         chunk_embeddings = None
 
 def generate_wordcloud():
-    """Generate a word cloud from combined PDF text"""
     global combined_text
     if not combined_text.strip():
         return None
 
     cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
     word_freq = Counter(cleaned.split())
     wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)
 
     fig, ax = plt.subplots()
     ax.imshow(wc, interpolation='bilinear')
     ax.axis("off")
 
-    plt.savefig(
+    buf = BytesIO()
+    plt.savefig(buf, format="png")
+    plt.close(fig)
+    buf.seek(0)
+    return buf
 
 def answer_question(question):
-    """Retrieve top chunks, answer question, and show confidence"""
     global all_chunks, chunk_sources, chunk_embeddings
-    print("📥 Question received:", question)
     if not all_chunks or chunk_embeddings is None:
-        return "Please upload and process some PDFs first.", None
+        return "Please upload and index PDFs first."
 
     q_emb = embed_model.encode([question], convert_to_numpy=True)
     sims = cosine_similarity(q_emb, chunk_embeddings)[0]
     top_k_idx = sims.argsort()[::-1][:3]
 
-    selected_sources = [chunk_sources[i] for i in top_k_idx]
-    context = "\n\n".join(selected_chunks)
+    context = "\n\n".join([all_chunks[i] for i in top_k_idx])
     if not context.strip():
-        return "Could not extract relevant content from the PDFs.", None
+        return "No relevant content found in PDFs."
 
     try:
-        answer =
-    except Exception
-    print("✅ Answer generated.")
-    return result, None
+        result = qa_pipeline(question=question, context=context)
+        answer = result.get("answer", "No answer found.")
+    except Exception:
+        return "Error generating answer from the model."
+
+    sources = "\n".join(set(chunk_sources[i] for i in top_k_idx))
+    confidence = np.mean([sims[i] for i in top_k_idx]) * 100
+    return f"**Answer:** {answer}\n\n**Sources:**\n{sources}\n\n**Confidence:** {confidence:.2f}%"
 
-# Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("#
-    gr.Markdown("Upload PDFs
+    gr.Markdown("# PDF Chatbot")
+    gr.Markdown("Upload PDFs, extract text, then ask questions.")
 
     with gr.Row():
-        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple"
+        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
+        extract_btn = gr.Button("Extract & Index")
+        wc_img = gr.Image(label="Word Cloud")
 
     with gr.Row():
-        question_input = gr.Textbox(lines=2, placeholder="Ask your question here..."
+        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...")
+        ask_btn = gr.Button("Get Answer")
+        answer_out = gr.Markdown()
 
-    def
+    def extract_and_show_wordcloud(files):
         texts = extract_text_from_pdfs(files)
         split_and_embed(texts)
         return generate_wordcloud()
 
+    extract_btn.click(extract_and_show_wordcloud, inputs=[pdf_input], outputs=[wc_img])
+    ask_btn.click(answer_question, inputs=[question_input], outputs=[answer_out])
 
 demo.launch()
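One caveat on the new word-cloud path: `generate_wordcloud()` now returns a raw `BytesIO` buffer, but `gr.Image` generally expects a filepath, numpy array, or PIL image, so the buffer may need a conversion step before display. A minimal sketch, assuming Pillow is installed; `buffer_to_pil` is an illustrative helper, not part of this commit:

```python
from io import BytesIO
from PIL import Image

def buffer_to_pil(buf: BytesIO) -> Image.Image:
    # Decode the PNG bytes written by plt.savefig() into a PIL image,
    # a type gr.Image accepts directly. (Illustrative helper.)
    buf.seek(0)
    return Image.open(buf)
```

`extract_and_show_wordcloud` could then return `buffer_to_pil(generate_wordcloud())` whenever the buffer is not `None`.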
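For reference, the retrieve-then-answer flow that `answer_question` implements can be exercised in isolation. A minimal sketch using the same model names as the commit; the toy chunks and question are illustrative only:

```python
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa = pipeline("question-answering", model="deepset/tinyroberta-squad2")

# Toy corpus standing in for the PDF chunks.
chunks = [
    "The invoice total is 120 euros.",
    "Payment is due within 30 days.",
    "The contract was signed in Berlin.",
]
question = "Where was the contract signed?"

# Embed chunks and question, rank chunks by cosine similarity, keep the top 2.
chunk_emb = embed_model.encode(chunks, convert_to_numpy=True)
q_emb = embed_model.encode([question], convert_to_numpy=True)
sims = cosine_similarity(q_emb, chunk_emb)[0]
top_k = sims.argsort()[::-1][:2]

# Feed the retrieved chunks to the extractive QA model as context.
context = "\n\n".join(chunks[i] for i in top_k)
result = qa(question=question, context=context)
print(result["answer"], f"(score: {result['score']:.2f})")
```

The pipeline returns a dict with `answer`, `score`, `start`, and `end`, which is why the new code can read `result.get("answer", ...)`; note the commit reports confidence from the cosine similarities rather than from the model's own `score`.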