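"""Enhanced RAG PDF chatbot (Gradio app).

Pipeline: extract text from uploaded PDFs with PyMuPDF, split it into
overlapping chunks, embed the chunks with a sentence-transformers model,
retrieve the chunks most similar to a question by cosine similarity, and
answer extractively with a SQuAD2-tuned model, reporting sources and a
retrieval-confidence score. A word cloud of the combined text serves as a
keyword preview.
"""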
import gradio as gr
import fitz  # PyMuPDF
import os
import numpy as np
import matplotlib
matplotlib.use("Agg")  # headless-safe backend: figures are rendered to buffers, never shown
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
from io import BytesIO
from collections import Counter
import re

from langchain.text_splitter import RecursiveCharacterTextSplitter  # moved to langchain_text_splitters in newer LangChain releases
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Load models
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")  # ✅ lightweight model
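# tinyroberta-squad2 is an extractive QA model with a 512-token window; the
# transformers QA pipeline slides over longer contexts automatically (doc_stride),
# so the three concatenated chunks passed to it below may safely exceed that limit.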

# Globals (module-level state: fine for a single-user demo, but note that
# concurrent sessions share and overwrite it)
all_chunks = []
chunk_sources = []
chunk_embeddings = None
combined_text = ""

def extract_text_from_pdfs(pdf_files):
    """Extract per-page text from the uploaded PDFs"""
    global combined_text
    texts = []
    combined_text = ""

    for file in pdf_files or []:
        # Gradio passes tempfile-like objects or plain paths depending on version
        path = file if isinstance(file, str) else file.name
        doc = fitz.open(path)
        for page_num, page in enumerate(doc):
            text = page.get_text()
            if text.strip():
                texts.append((text, f"{os.path.basename(path)} - Page {page_num + 1}"))
                combined_text += " " + text
        doc.close()

    return texts

def split_and_embed(texts_with_sources):
    """Split text into chunks and compute embeddings"""
    global all_chunks, chunk_sources, chunk_embeddings
    all_chunks = []
    chunk_sources = []

    # 500-character chunks keep passages compact for retrieval; the 50-character
    # overlap preserves context that straddles a chunk boundary
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

    for text, source in texts_with_sources:
        docs = splitter.create_documents([text])
        for doc in docs:
            all_chunks.append(doc.page_content)
            chunk_sources.append(source)

    if all_chunks:
        chunk_embeddings = embed_model.encode(all_chunks, convert_to_numpy=True)
    else:
        chunk_embeddings = None

def generate_wordcloud():
    """Generate a word cloud from the combined PDF text"""
    if not combined_text.strip():
        return None

    cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    word_freq = Counter(cleaned.split())
    if not word_freq:
        return None  # WordCloud raises on an empty frequency table

    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)

    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")

    buffer = BytesIO()
    fig.savefig(buffer, format="png", bbox_inches="tight")
    plt.close(fig)  # release the figure so repeated uploads don't leak memory
    buffer.seek(0)
    return Image.open(buffer).convert("RGB")  # gr.Image accepts PIL images, not raw BytesIO buffers

def answer_question(question):
    """Retrieve top chunks, answer the question, and report retrieval confidence"""
    print("📥 Question received:", question)

    if not question or not question.strip():
        return "Please enter a question."

    if not all_chunks or chunk_embeddings is None:
        print("⚠️ PDF not processed or empty.")
        return "Please upload and process some PDFs first."

    q_emb = embed_model.encode([question], convert_to_numpy=True)
    sims = cosine_similarity(q_emb, chunk_embeddings)[0]
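    # Rank every chunk by cosine similarity to the question and keep the three best as context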
    top_k_idx = sims.argsort()[::-1][:3]

    selected_chunks = [all_chunks[i] for i in top_k_idx]
    selected_sources = [chunk_sources[i] for i in top_k_idx]
    context = "\n\n".join(selected_chunks)

    if not context.strip():
        print("⚠️ Empty context from chunks.")
        return "Could not extract relevant content from the PDFs."

    try:
        answer_dict = qa_pipeline(question=question, context=context)
        answer = answer_dict.get("answer", "No answer found.")
    except Exception as e:
        print("❌ Error from QA model:", e)
        return "Model failed to generate an answer."

    # Mean cosine similarity of the retrieved chunks: a retrieval-quality signal,
    # not the QA model's own answer probability
    avg_conf = float(np.mean(sims[top_k_idx])) * 100
    source_info = "\n".join(f"- {src}" for src in dict.fromkeys(selected_sources))  # dedupe, keep rank order
    result = f"**Answer**: {answer}\n\n**Sources**:\n{source_info}\n\n**Retrieval Confidence**: {avg_conf:.2f}%"
    print("✅ Answer generated.")
    return result

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# πŸ“š Enhanced RAG PDF Chatbot")
    gr.Markdown("Upload PDFs β†’ Preview Keywords β†’ Ask Questions β†’ Get Answers with Confidence & Sources")

    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
        load_button = gr.Button("Extract & Index")
        cloud_output = gr.Image(label="Keyword Preview (Word Cloud)")

    with gr.Row():
        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...", label="Question")
        ask_button = gr.Button("Get Answer")
        answer_output = gr.Markdown()

    def load_and_index(files):
        if not files:
            return None
        texts = extract_text_from_pdfs(files)
        split_and_embed(texts)
        return generate_wordcloud()

    load_button.click(fn=load_and_index, inputs=[pdf_input], outputs=[cloud_output])
    # Only update the answer panel here; wiring cloud_output as a second
    # output would blank the word cloud on every question
    ask_button.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

demo.launch()
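
# Note: demo.launch(share=True) would additionally expose the app through a
# temporary public Gradio URL (handy when running in a notebook).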