import gradio as gr
import fitz  # PyMuPDF
import os
import re
from io import BytesIO
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image  # needed to hand the rendered word cloud to gr.Image
from wordcloud import WordCloud
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Load models once
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")
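# Both models are fetched from the Hugging Face Hub on first use and cached
# locally, so the first launch is noticeably slower than subsequent ones.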
# Globals to hold data
all_chunks = []
chunk_sources = []
chunk_embeddings = None
combined_text = ""
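# Note: module-level globals are shared across all Gradio sessions, so this
# demo is effectively single-user; concurrent uploads would overwrite each
# other's index. Per-session state (gr.State) would be needed for multi-user use.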
def extract_text_from_pdfs(pdf_files):
    """Extract per-page text from the uploaded PDFs, tagging each page with its source."""
    global combined_text, chunk_sources
    combined_text = ""
    texts = []
    chunk_sources = []
    for file in pdf_files:
        # Gradio may pass plain file paths (4.x) or tempfile wrappers (3.x).
        path = file if isinstance(file, str) else file.name
        doc = fitz.open(path)
        for i, page in enumerate(doc):
            text = page.get_text()
            if text.strip():
                texts.append((text, f"{os.path.basename(path)} - Page {i+1}"))
                combined_text += " " + text
        doc.close()
    return texts
def split_and_embed(texts_with_sources):
    """Split page texts into overlapping chunks and embed them for retrieval."""
    global all_chunks, chunk_sources, chunk_embeddings
    all_chunks = []
    chunk_sources = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    for text, source in texts_with_sources:
        docs = splitter.create_documents([text])
        for doc in docs:
            all_chunks.append(doc.page_content)
            chunk_sources.append(source)
    if all_chunks:
        chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
    else:
        chunk_embeddings = None
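# Design note: 500-character chunks with a 50-character overlap are small enough
# that the top-3 concatenated chunks fit comfortably within the QA model's
# 512-token limit, while the overlap preserves sentences that straddle chunk
# boundaries. all-MiniLM-L6-v2 maps each chunk to a 384-dimensional vector.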
def generate_wordcloud():
    global combined_text
    if not combined_text.strip():
        return None
    # Keep only letters and whitespace before counting word frequencies.
    cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    word_freq = Counter(cleaned.split())
    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)
    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    buf = BytesIO()
    fig.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    # gr.Image accepts a numpy array, PIL image, or file path, not a raw buffer.
    return Image.open(buf)
def answer_question(question):
    global all_chunks, chunk_sources, chunk_embeddings
    if not all_chunks or chunk_embeddings is None:
        return "Please upload and index PDFs first."
    # Embed the question and retrieve the 3 most similar chunks.
    q_emb = embed_model.encode([question], convert_to_numpy=True)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]
    top_k_idx = sims.argsort()[::-1][:3]
    context = "\n\n".join(all_chunks[i] for i in top_k_idx)
    if not context.strip():
        return "No relevant content found in PDFs."
    try:
        result = qa_pipeline(question=question, context=context)
        answer = result.get("answer", "No answer found.")
    except Exception:
        return "Error generating answer from the model."
    sources = "\n".join(sorted(set(chunk_sources[i] for i in top_k_idx)))
    # Mean cosine similarity of the retrieved chunks: a retrieval-quality
    # signal, not a guarantee the answer itself is correct.
    confidence = np.mean([sims[i] for i in top_k_idx]) * 100
    return f"**Answer:** {answer}\n\n**Sources:**\n{sources}\n\n**Retrieval confidence:** {confidence:.2f}%"
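# A minimal headless sketch of the same pipeline (hypothetical file name, no UI):
#
#   texts = extract_text_from_pdfs(["report.pdf"])  # plain paths work, per the
#   split_and_embed(texts)                          # isinstance check above
#   print(answer_question("What were the key findings?"))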
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")
    gr.Markdown("Upload PDFs, extract text, then ask questions.")
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
        extract_btn = gr.Button("Extract & Index")
    wc_img = gr.Image(label="Word Cloud")
    with gr.Row():
        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...")
        ask_btn = gr.Button("Get Answer")
    answer_out = gr.Markdown()

    def extract_and_show_wordcloud(files):
        if not files:  # guard against clicking before any upload
            return None
        texts = extract_text_from_pdfs(files)
        split_and_embed(texts)
        return generate_wordcloud()

    extract_btn.click(extract_and_show_wordcloud, inputs=[pdf_input], outputs=[wc_img])
    ask_btn.click(answer_question, inputs=[question_input], outputs=[answer_out])

demo.launch()
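# demo.launch(share=True) would additionally expose a temporary public URL.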