File size: 3,988 Bytes
24c530c
 
 
d365d0d
 
 
 
24c530c
 
 
 
 
 
 
2c4ecd7
24c530c
d365d0d
24c530c
d365d0d
24c530c
d365d0d
24c530c
 
 
 
 
 
d365d0d
 
24c530c
 
 
 
 
d365d0d
24c530c
 
d365d0d
24c530c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c4ecd7
 
 
 
24c530c
 
 
 
 
 
 
 
 
 
 
 
 
 
d365d0d
 
 
 
 
24c530c
 
 
 
d365d0d
24c530c
 
 
 
 
d365d0d
2c4ecd7
d365d0d
24c530c
2c4ecd7
d365d0d
 
 
 
 
 
 
 
 
24c530c
d365d0d
 
24c530c
 
d365d0d
 
 
24c530c
 
d365d0d
 
 
24c530c
d365d0d
24c530c
 
 
 
d365d0d
 
24c530c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import fitz  # PyMuPDF
import os
import re
from io import BytesIO
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

# Load models once
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
qa_pipeline = pipeline("question-answering", model="deepset/tinyroberta-squad2")

# Globals to hold data
all_chunks = []
chunk_sources = []
chunk_embeddings = None
combined_text = ""

def extract_text_from_pdfs(pdf_files):
    """Extract per-page text from the uploaded PDF files.

    Side effects: rebuilds the module-level ``combined_text`` (used later by
    the word cloud) and clears ``chunk_sources``.

    Args:
        pdf_files: iterable of uploaded file objects exposing a ``.name``
            filesystem path (as provided by ``gr.File``).

    Returns:
        list[tuple[str, str]]: ``(page_text, "filename - Page N")`` pairs for
        every page containing non-whitespace text.
    """
    global combined_text, chunk_sources
    texts = []
    parts = []
    chunk_sources = []

    for file in pdf_files:
        # Context manager guarantees the document handle is closed even if a
        # page raises — the original leaked open fitz documents.
        with fitz.open(file.name) as doc:
            for i, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    texts.append((text, f"{os.path.basename(file.name)} - Page {i+1}"))
                    parts.append(text)

    # Single join instead of quadratic `+=` concatenation; each part keeps the
    # leading space the original produced, so combined_text is byte-identical.
    combined_text = "".join(f" {t}" for t in parts)
    return texts

def split_and_embed(texts_with_sources):
    """Chunk each (text, source) pair and embed every chunk.

    Populates the module-level ``all_chunks``, ``chunk_sources`` and
    ``chunk_embeddings``; the embeddings are ``None`` when no chunks were
    produced.

    Args:
        texts_with_sources: iterable of ``(text, source_label)`` pairs as
            returned by ``extract_text_from_pdfs``.
    """
    global all_chunks, chunk_sources, chunk_embeddings

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    sources = []
    for text, source in texts_with_sources:
        for piece in splitter.create_documents([text]):
            chunks.append(piece.page_content)
            sources.append(source)

    all_chunks = chunks
    chunk_sources = sources
    # Encode all chunks in one batch; skip the model call entirely when empty.
    chunk_embeddings = (
        embed_model.encode(all_chunks, convert_to_numpy=True) if all_chunks else None
    )

def generate_wordcloud():
    """Render a word cloud of all extracted PDF text.

    Returns:
        BytesIO | None: a PNG image buffer, or ``None`` when no usable text
        has been extracted (empty text, or text with no alphabetic words).
    """
    global combined_text
    if not combined_text.strip():
        return None

    # Keep only letters and whitespace before counting word frequencies.
    cleaned = re.sub(r"[^a-zA-Z\s]", "", combined_text.lower())
    word_freq = Counter(cleaned.split())
    # Guard: generate_from_frequencies raises ValueError on an empty mapping
    # (e.g. the PDFs contained only digits/symbols), which would crash the UI.
    if not word_freq:
        return None

    wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(word_freq)

    fig, ax = plt.subplots()
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")

    buf = BytesIO()
    # Save via the figure handle rather than pyplot's "current figure" global,
    # so concurrent/pyplot-touching code can't redirect the output.
    fig.savefig(buf, format="png")
    plt.close(fig)
    buf.seek(0)
    return buf

def answer_question(question):
    """Answer a question against the indexed PDF chunks.

    Retrieves the three chunks most similar to the question by cosine
    similarity, runs the extractive QA pipeline over their concatenation, and
    formats the answer, de-duplicated sources, and mean-similarity confidence
    as Markdown.

    Args:
        question: the user's question string.

    Returns:
        str: Markdown answer, or a plain status message when no index exists,
        no relevant content is found, or the model call fails.
    """
    global all_chunks, chunk_sources, chunk_embeddings
    if not all_chunks or chunk_embeddings is None:
        return "Please upload and index PDFs first."

    question_vec = embed_model.encode([question], convert_to_numpy=True)
    scores = cosine_similarity(question_vec, chunk_embeddings)[0]
    # Indices of the three highest-scoring chunks, best first.
    best = scores.argsort()[::-1][:3]

    context = "\n\n".join(all_chunks[i] for i in best)
    if not context.strip():
        return "No relevant content found in PDFs."

    try:
        result = qa_pipeline(question=question, context=context)
    except Exception:
        return "Error generating answer from the model."
    answer = result.get("answer", "No answer found.")

    sources = "\n".join({chunk_sources[i] for i in best})
    confidence = np.mean([scores[i] for i in best]) * 100
    return f"**Answer:** {answer}\n\n**Sources:**\n{sources}\n\n**Confidence:** {confidence:.2f}%"

# --- Gradio UI: one row for upload/indexing, one row for Q&A ---
with gr.Blocks() as demo:
    gr.Markdown("# PDF Chatbot")
    gr.Markdown("Upload PDFs, extract text, then ask questions.")

    with gr.Row():
        # Multiple-PDF upload; indexing only happens when the button is clicked.
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
        extract_btn = gr.Button("Extract & Index")
        wc_img = gr.Image(label="Word Cloud")

    with gr.Row():
        question_input = gr.Textbox(lines=2, placeholder="Ask your question here...")
        ask_btn = gr.Button("Get Answer")
        answer_out = gr.Markdown()

    def extract_and_show_wordcloud(files):
        """Extract text from the PDFs, build chunk embeddings, and return the word-cloud image (or None)."""
        texts = extract_text_from_pdfs(files)
        split_and_embed(texts)
        return generate_wordcloud()

    # Wire buttons: indexing feeds the word-cloud image; questions feed the Markdown answer.
    extract_btn.click(extract_and_show_wordcloud, inputs=[pdf_input], outputs=[wc_img])
    ask_btn.click(answer_question, inputs=[question_input], outputs=[answer_out])

# NOTE(review): launch() blocks and runs on import; consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
demo.launch()