Spaces:
Sleeping
Sleeping
File size: 2,264 Bytes
e9edc62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# app.py
from pypdf import PdfReader
import gradio as gr
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Load embedding model once at import time so every request reuses it.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Global state to persist embeddings and chunks
index = None  # FAISS index; built lazily by create_index(), None until then
chunks = []   # text chunks, positionally aligned with the index's vectors
# Step 1: Extract text from uploaded PDFs
def extract_text_from_pdfs(files):
    """Concatenate the extracted text of every page of every uploaded PDF.

    Args:
        files: iterable of uploaded files — either plain filesystem path
            strings or file-like wrappers exposing the path via ``.name``
            (Gradio has used both across versions).

    Returns:
        str: all page texts joined by newlines, with a trailing newline;
        empty string when no text could be extracted.
    """
    pages = []
    for file in files:
        # Newer Gradio versions pass plain path strings; older ones pass
        # tempfile wrappers with a ``.name`` attribute. Support both.
        path = file if isinstance(file, str) else file.name
        reader = PdfReader(path)
        for page in reader.pages:
            text = page.extract_text()
            # extract_text() may return None/"" for image-only pages
            if text:
                pages.append(text)
    # join is O(n) vs. the quadratic `all_text += ...` pattern
    return "\n".join(pages) + "\n" if pages else ""
# Step 2: Chunk text
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping word-based chunks.

    Args:
        text: source text; words are whitespace-delimited.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared between consecutive chunks.

    Returns:
        list[str]: chunks in order; empty list for empty/whitespace text.

    Raises:
        ValueError: if overlap >= chunk_size — previously this either
            silently returned [] (negative range step, data loss) or
            raised an opaque "range() arg 3 must not be zero".
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap  # hoisted loop-invariant window advance
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
# Step 3: Embed and store chunks
def create_index(text_chunks):
    """Embed text_chunks and build a flat L2 FAISS index over them.

    Mutates the module-level globals ``index`` and ``chunks``.

    Args:
        text_chunks: non-empty list of text strings to embed.

    Raises:
        ValueError: if text_chunks is empty — previously this crashed
            with an obscure IndexError on ``embeddings[0]``.
    """
    global index, chunks
    if not text_chunks:
        raise ValueError("text_chunks must not be empty")
    chunks = text_chunks
    # FAISS requires contiguous float32 input; IndexFlatL2 is exact search.
    embeddings = np.asarray(model.encode(chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
# Step 4: Retrieve top relevant chunks
def get_top_chunks(query, k=3):
    """Return up to k stored chunks most similar to the query.

    Args:
        query: question text to embed and search for.
        k: maximum number of chunks to return.

    Returns:
        list[str]: matching chunks, best match first (may be fewer than k).

    Raises:
        ValueError: if no index has been built yet — previously this
            crashed with AttributeError on ``None.search``.
    """
    if index is None:
        raise ValueError("No index built yet - process PDFs before querying")
    query_vec = np.asarray(model.encode([query]), dtype="float32")
    D, I = index.search(query_vec, k)
    # FAISS pads results with -1 when fewer than k vectors are stored;
    # without the bounds check, -1 silently wraps to the LAST chunk.
    return [chunks[i] for i in I[0] if 0 <= i < len(chunks)]
# Step 5: Fake LLM response (replace with real API call if needed)
def call_llm(context, question):
    """Simulate an LLM answer by echoing the retrieved context and question."""
    template = "Answer (simulated): Based on context:\n\n{}\n\nQuestion: {}"
    return template.format(context, question)
# Step 6: Gradio main function
def rag_pipeline(files, question):
    """End-to-end RAG pipeline: extract, chunk, index, retrieve, answer.

    Args:
        files: uploaded PDF files from the Gradio File component.
        question: user question text.

    Returns:
        str: the (simulated) answer, or a human-readable message when no
        usable input was provided (previously these cases crashed deep
        inside create_index and surfaced as an opaque Gradio error).
    """
    if not files:
        return "Please upload at least one PDF."
    text = extract_text_from_pdfs(files)
    text_chunks = chunk_text(text)
    if not text_chunks:
        # e.g. scanned/image-only PDFs where extract_text() yields nothing
        return "No extractable text was found in the uploaded PDFs."
    create_index(text_chunks)
    top_chunks = get_top_chunks(question)
    context = "\n".join(top_chunks)
    return call_llm(context, question)
# Step 7: Gradio UI — one form mapping (files, question) -> answer text
demo = gr.Interface(
    fn=rag_pipeline,
    inputs=[
        # Restrict uploads to PDFs; allow several files per request.
        gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs"),
        gr.Textbox(lines=2, label="Ask a question")
    ],
    outputs="text",
    title="RAG PDF Chatbot",
    description="Upload PDFs and ask questions based on their content"
)
# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|