# app.py
from pypdf import PdfReader
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Global state to persist embeddings and chunks
index = None
chunks = []
# Step 1: Extract text from uploaded PDFs
def extract_text_from_pdfs(files):
    all_text = ""
    for file in files:
        # Gradio may hand back plain file paths or tempfile-like objects, depending on version
        path = file if isinstance(file, str) else file.name
        reader = PdfReader(path)
        for page in reader.pages:
            text = page.extract_text()
            if text:
                all_text += text + "\n"
    return all_text
# Step 2: Chunk text into overlapping word windows
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    result = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        result.append(chunk)
    return result
# Step 3: Embed and store chunks
def create_index(text_chunks):
    global index, chunks
    chunks = text_chunks
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(len(embeddings[0]))
    index.add(np.array(embeddings))
# Step 4: Retrieve top relevant chunks
def get_top_chunks(query, k=3):
    query_vec = model.encode([query])
    k = min(k, len(chunks))  # don't request more neighbors than stored chunks
    D, I = index.search(np.array(query_vec), k)
    return [chunks[i] for i in I[0]]
# Step 5: Fake LLM response (replace with a real API call if needed)
def call_llm(context, question):
    return f"Answer (simulated): Based on context:\n\n{context}\n\nQuestion: {question}"
# Step 6: Gradio main function
def rag_pipeline(files, question):
    text = extract_text_from_pdfs(files)
    if not text.strip():
        return "No text could be extracted from the uploaded PDFs."
    text_chunks = chunk_text(text)
    create_index(text_chunks)
    top_chunks = get_top_chunks(question)
    context = "\n".join(top_chunks)
    answer = call_llm(context, question)
    return answer
# Step 7: Gradio UI
demo = gr.Interface(
    fn=rag_pipeline,
    inputs=[
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
        gr.Textbox(lines=2, label="Ask a question")
    ],
    outputs="text",
    title="RAG PDF Chatbot",
    description="Upload PDFs and ask questions based on their content"
)
if __name__ == "__main__":
    demo.launch()
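If this is deployed as a Hugging Face Space, a requirements.txt along these lines is also needed; the package names follow from the imports above, and version pins are left to the reader:

# requirements.txt
gradio
pypdf
sentence-transformers
faiss-cpu
numpy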