luckygill committed on
Commit
d365d0d
Β·
verified Β·
1 Parent(s): 90b9419

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -55
app.py CHANGED
@@ -1,53 +1,50 @@
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
  import os
 
 
 
 
4
  import numpy as np
5
  import matplotlib.pyplot as plt
6
  from wordcloud import WordCloud
7
- from io import BytesIO
8
- from collections import Counter
9
- import re
10
 
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
  from sentence_transformers import SentenceTransformer
13
  from transformers import pipeline
14
  from sklearn.metrics.pairwise import cosine_similarity
15
 
16
- # Load models
17
  embed_model = SentenceTransformer('all-MiniLM-L6-v2')
18
- qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2") # βœ… lightweight model
19
 
20
- # Globals
21
  all_chunks = []
22
  chunk_sources = []
23
  chunk_embeddings = None
24
  combined_text = ""
25
 
26
  def extract_text_from_pdfs(pdf_files):
27
- """Extract text and page info from uploaded PDFs"""
28
- global all_chunks, chunk_sources, combined_text
29
  texts = []
30
  chunk_sources = []
31
- combined_text = ""
32
 
33
  for file in pdf_files:
34
  doc = fitz.open(file.name)
35
- for page_num, page in enumerate(doc):
36
  text = page.get_text()
37
  if text.strip():
38
- texts.append((text, f"{os.path.basename(file.name)} - Page {page_num + 1}"))
39
  combined_text += " " + text
40
-
41
  return texts
42
 
43
  def split_and_embed(texts_with_sources):
44
- """Split text into chunks and compute embeddings"""
45
  global all_chunks, chunk_sources, chunk_embeddings
46
  all_chunks = []
47
  chunk_sources = []
48
 
49
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
50
-
51
  for text, source in texts_with_sources:
52
  docs = splitter.create_documents([text])
53
  for doc in docs:
@@ -60,80 +57,67 @@ def split_and_embed(texts_with_sources):
60
  chunk_embeddings = None
61
 
62
  def generate_wordcloud():
63
- """Generate a word cloud from combined PDF text"""
64
  global combined_text
65
  if not combined_text.strip():
66
  return None
67
 
68
  cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
69
  word_freq = Counter(cleaned.split())
70
-
71
  wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)
72
 
73
  fig, ax = plt.subplots()
74
  ax.imshow(wc, interpolation='bilinear')
75
  ax.axis("off")
76
 
77
- buffer = BytesIO()
78
- plt.savefig(buffer, format="png")
79
- buffer.seek(0)
80
- return buffer
 
81
 
82
  def answer_question(question):
83
- """Retrieve top chunks, answer question, and show confidence"""
84
  global all_chunks, chunk_sources, chunk_embeddings
85
- print("πŸ“₯ Question received:", question)
86
-
87
  if not all_chunks or chunk_embeddings is None:
88
- print("⚠️ PDF not processed or empty.")
89
- return "Please upload and process some PDFs first.", None
90
 
91
  q_emb = embed_model.encode([question], convert_to_numpy=True)
92
  sims = cosine_similarity(q_emb, chunk_embeddings)[0]
93
  top_k_idx = sims.argsort()[::-1][:3]
94
 
95
- selected_chunks = [all_chunks[i] for i in top_k_idx]
96
- selected_sources = [chunk_sources[i] for i in top_k_idx]
97
- context = "\n\n".join(selected_chunks)
98
-
99
  if not context.strip():
100
- print("⚠️ Empty context from chunks.")
101
- return "Could not extract relevant content from the PDFs.", None
102
 
103
  try:
104
- answer_dict = qa_pipeline(question=question, context=context)
105
- answer = answer_dict.get("answer", "No answer found.")
106
- except Exception as e:
107
- print("❌ Error from QA model:", e)
108
- return "Model failed to generate an answer.", None
109
-
110
- avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100
111
- source_info = "\n".join([f"- {src}" for src in selected_sources])
112
- result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Confidence Score**: {avg_conf:.2f}%"
113
- print("βœ… Answer generated.")
114
- return result, None
115
-
116
- # Gradio UI
117
  with gr.Blocks() as demo:
118
- gr.Markdown("# πŸ“š Enhanced RAG PDF Chatbot")
119
- gr.Markdown("Upload PDFs β†’ Preview Keywords β†’ Ask Questions β†’ Get Answers with Confidence & Sources")
120
 
121
  with gr.Row():
122
- pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
123
- load_button = gr.Button("Extract & Index")
124
- cloud_output = gr.Image(label="Keyword Preview (Word Cloud)")
125
 
126
  with gr.Row():
127
- question_input = gr.Textbox(lines=2, placeholder="Ask your question here...", label="Question")
128
- ask_button = gr.Button("Get Answer")
129
- answer_output = gr.Markdown()
130
 
131
- def load_and_index(files):
132
  texts = extract_text_from_pdfs(files)
133
  split_and_embed(texts)
134
  return generate_wordcloud()
135
 
136
- load_button.click(fn=load_and_index, inputs=[pdf_input], outputs=[cloud_output])
137
- ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output, cloud_output])
138
 
139
  demo.launch()
 
1
  import gradio as gr
2
  import fitz # PyMuPDF
3
  import os
4
+ import re
5
+ from io import BytesIO
6
+ from collections import Counter
7
+
8
  import numpy as np
9
  import matplotlib.pyplot as plt
10
  from wordcloud import WordCloud
 
 
 
11
 
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from sentence_transformers import SentenceTransformer
14
  from transformers import pipeline
15
  from sklearn.metrics.pairwise import cosine_similarity
16
 
17
# Load models once at module import so every request reuses them
# instead of paying the model-load cost per call.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")

# Globals to hold data shared between the indexing step and question answering.
all_chunks = []          # text chunks collected from all uploaded PDFs
chunk_sources = []       # "<filename> - Page <n>" label for each chunk
chunk_embeddings = None  # embeddings matrix for all_chunks; None until indexing runs
combined_text = ""       # all extracted page text concatenated (feeds the word cloud)
26
 
27
def extract_text_from_pdfs(pdf_files):
    """Extract per-page text from the uploaded PDFs.

    Resets and refills the module-level ``combined_text`` (consumed by
    the word cloud) and clears ``chunk_sources``. Pages with no
    extractable text are skipped.

    Args:
        pdf_files: iterable of uploaded file objects exposing ``.name``
            (as Gradio's File component provides).

    Returns:
        list[tuple[str, str]]: ``(page_text, source_label)`` pairs,
        where the label is ``"<basename> - Page <n>"`` (1-based).
    """
    global combined_text, chunk_sources
    combined_text = ""
    texts = []
    chunk_sources = []

    for file in pdf_files:
        doc = fitz.open(file.name)
        try:
            for i, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    # Label carries filename + 1-based page number for citations.
                    texts.append((text, f"{os.path.basename(file.name)} - Page {i+1}"))
                    combined_text += " " + text
        finally:
            # Fix: the original never closed the document handle, leaking
            # a file descriptor per uploaded PDF.
            doc.close()

    return texts
41
 
42
  def split_and_embed(texts_with_sources):
 
43
  global all_chunks, chunk_sources, chunk_embeddings
44
  all_chunks = []
45
  chunk_sources = []
46
 
47
  splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
 
48
  for text, source in texts_with_sources:
49
  docs = splitter.create_documents([text])
50
  for doc in docs:
 
57
  chunk_embeddings = None
58
 
59
def generate_wordcloud():
    """Render a word cloud of all extracted PDF text.

    Returns:
        io.BytesIO | None: a PNG image buffer (rewound to position 0),
        or ``None`` when no text has been extracted yet.
    """
    global combined_text
    if not combined_text.strip():
        return None

    # Keep letters and whitespace only, then tally word frequencies.
    letters_only = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    frequencies = Counter(letters_only.split())

    cloud = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(frequencies)

    figure, axis = plt.subplots()
    axis.imshow(cloud, interpolation='bilinear')
    axis.axis("off")

    # Serialize the figure into an in-memory PNG and release it so
    # repeated calls don't accumulate open matplotlib figures.
    png_buffer = BytesIO()
    plt.savefig(png_buffer, format="png")
    plt.close(figure)
    png_buffer.seek(0)
    return png_buffer
77
 
78
def answer_question(question):
    """Answer *question* from the indexed PDF chunks.

    Retrieves the 3 chunks most similar to the question, runs the
    extractive QA model over their concatenation, and returns a
    Markdown string with the answer, its source pages, and a
    retrieval-similarity "confidence" percentage. Returns a plain
    instructional message when indexing hasn't happened yet, when no
    relevant context is found, or when the model errors out.
    """
    global all_chunks, chunk_sources, chunk_embeddings

    # Guard: extract/index must run before any question is asked.
    if not all_chunks or chunk_embeddings is None:
        return "Please upload and index PDFs first."

    # Rank all chunks by cosine similarity to the question embedding.
    q_emb = embed_model.encode([question], convert_to_numpy=True)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]
    top_k_idx = sims.argsort()[::-1][:3]

    context = "\n\n".join([all_chunks[i] for i in top_k_idx])

    if not context.strip():
        return "No relevant content found in PDFs."

    try:
        result = qa_pipeline(question=question, context=context)
        answer = result.get("answer", "No answer found.")
    except Exception:
        return "Error generating answer from the model."

    # Fix: the original used set(), whose iteration order varies across
    # runs (hash randomization), so the Sources list was nondeterministic.
    # dict.fromkeys de-duplicates while preserving retrieval-rank order.
    sources = "\n".join(dict.fromkeys(chunk_sources[i] for i in top_k_idx))

    # "Confidence" is the mean retrieval similarity of the selected
    # chunks, not a calibrated model probability.
    confidence = np.mean([sims[i] for i in top_k_idx]) * 100
    return f"**Answer:** {answer}\n\n**Sources:**\n{sources}\n\n**Confidence:** {confidence:.2f}%"
100
+
 
 
 
 
101
# ---- Gradio UI wiring (script entry point) ----
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")
    gr.Markdown("Upload PDFs, extract text, then ask questions.")

    with gr.Row():
        # Upload + indexing controls, plus the word-cloud preview image.
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
        extract_btn = gr.Button("Extract & Index")
        wc_img = gr.Image(label="Word Cloud")

    with gr.Row():
        # Question entry and the Markdown-rendered answer.
        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...")
        ask_btn = gr.Button("Get Answer")
        answer_out = gr.Markdown()

    def extract_and_show_wordcloud(files):
        """Extract & Index handler: parse PDFs -> chunk/embed -> word cloud."""
        texts = extract_text_from_pdfs(files)
        split_and_embed(texts)
        return generate_wordcloud()

    # NOTE(review): generate_wordcloud returns a BytesIO buffer; confirm
    # this gradio version's Image component accepts an in-memory buffer
    # as an output value (it commonly expects a path/array/PIL image).
    extract_btn.click(extract_and_show_wordcloud, inputs=[pdf_input], outputs=[wc_img])
    ask_btn.click(answer_question, inputs=[question_input], outputs=[answer_out])

demo.launch()