luckygill committed on
Commit 2c4ecd7 · verified · 1 Parent(s): fe92e58

Update app.py

Files changed (1)
  1. app.py +22 -6
app.py CHANGED
@@ -11,11 +11,11 @@ import re
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
-from sklearn.metrics.pairwise import cosine_similarity # ✅ Correct import
+from sklearn.metrics.pairwise import cosine_similarity
 
 # Load models
 embed_model = SentenceTransformer('all-MiniLM-L6-v2')
-qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
+qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2") # ✅ lightweight model
 
 # Globals
 all_chunks = []
@@ -54,7 +54,10 @@ def split_and_embed(texts_with_sources):
         all_chunks.append(doc.page_content)
         chunk_sources.append(source)
 
-    chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
+    if all_chunks:
+        chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
+    else:
+        chunk_embeddings = None
 
 def generate_wordcloud():
     """Generate a word cloud from combined PDF text"""
@@ -79,7 +82,10 @@ def generate_wordcloud():
 def answer_question(question):
     """Retrieve top chunks, answer question, and show confidence"""
     global all_chunks, chunk_sources, chunk_embeddings
+    print("📥 Question received:", question)
+
     if not all_chunks or chunk_embeddings is None:
+        print("⚠️ PDF not processed or empty.")
         return "Please upload and process some PDFs first.", None
 
     q_emb = embed_model.encode([question], convert_to_numpy=True)
@@ -90,16 +96,26 @@ def answer_question(question):
     selected_sources = [chunk_sources[i] for i in top_k_idx]
     context = "\n\n".join(selected_chunks)
 
-    answer = qa_pipeline(question=question, context=context)["answer"]
-    avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100
+    if not context.strip():
+        print("⚠️ Empty context from chunks.")
+        return "Could not extract relevant content from the PDFs.", None
 
+    try:
+        answer_dict = qa_pipeline(question=question, context=context)
+        answer = answer_dict.get("answer", "No answer found.")
+    except Exception as e:
+        print("❌ Error from QA model:", e)
+        return "Model failed to generate an answer.", None
+
+    avg_conf = np.mean([sims[i] for i in top_k_idx]) * 100
     source_info = "\n".join([f"- {src}" for src in selected_sources])
     result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Confidence Score**: {avg_conf:.2f}%"
+    print("✅ Answer generated.")
     return result, None
 
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# 📚 Enhanced RAG PDF Chatbot (Hugging Face Compatible)")
+    gr.Markdown("# 📚 Enhanced RAG PDF Chatbot")
    gr.Markdown("Upload PDFs → Preview Keywords → Ask Questions → Get Answers with Confidence & Sources")
 
     with gr.Row():
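The sims and top_k_idx values used in the last hunk are computed in unchanged code between the hunks, so they do not appear in this diff. A minimal sketch of what that retrieval step presumably looks like, based on the cosine_similarity import and the variable names visible above; the helper name and the top_k default are illustrative assumptions, not part of the commit:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical helper; the real app.py likely computes sims and
# top_k_idx inline inside answer_question().
def retrieve_top_k(q_emb, chunk_embeddings, top_k=3):
    # q_emb has shape (1, dim); chunk_embeddings has shape (n_chunks, dim)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]  # shape (n_chunks,)
    top_k_idx = np.argsort(sims)[::-1][:top_k]  # indices, most similar first
    return sims, top_k_idx

Note that the "Confidence Score" in the result string averages these retrieval similarities over the selected chunks (np.mean([sims[i] for i in top_k_idx]) * 100), so it reflects retrieval quality rather than the QA model's own answer score.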
 
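The model swap from distilbert-base-cased-distilled-squad to deepset/tinyroberta-squad2 keeps the same extractive question-answering pipeline interface, so the call site in answer_question() is unchanged. A standalone sanity check of the new model, with a made-up question and context:

from transformers import pipeline

qa = pipeline("question-answering", model="deepset/tinyroberta-squad2")
out = qa(question="What does the app embed?",
         context="The app splits uploaded PDFs into chunks and embeds each chunk.")
print(out["answer"], out["score"])  # extracted span plus the model's own confidence

The .get("answer", "No answer found.") guard in the new code protects against an unexpected payload shape; actual model failures are caught by the surrounding try/except.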