Spaces:
Sleeping
Sleeping
import fitz | |
import tempfile | |
import gradio as gr | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.docstore.document import Document | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.chains import RetrievalQA | |
from langchain_community.llms import HuggingFacePipeline | |
from transformers import pipeline | |
# Load a PDF and split its text into chunks
def load_pdf_chunks(file_path, chunk_size=500, chunk_overlap=50):
    """Extract the text of a PDF and split it into LangChain documents.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).
        chunk_size: ``chunk_size`` forwarded to the text splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the text splitter.

    Returns:
        A list of ``Document`` objects, one per non-blank chunk, each carrying
        ``file_path`` under the ``"source"`` metadata key. The list is empty
        when the PDF yields no non-blank text.
    """
    # Open the document in a ``with`` block so the underlying file handle is
    # released even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Whitespace-only chunks carry no retrievable content, so drop them.
    return [
        Document(page_content=chunk, metadata={"source": file_path})
        for chunk in splitter.split_text(text)
        if chunk.strip()
    ]
# Set up the RAG pipeline
def setup_rag(
    documents,
    *,
    embedding_model="all-MiniLM-L6-v2",
    llm_model="google/flan-t5-base",
    k=4,
    fetch_k=8,
    lambda_mult=0.5,
    max_length=128,
):
    """Build a retrieval-augmented QA chain over ``documents``.

    The documents are embedded into a FAISS index, exposed through an MMR
    retriever, and combined with a ``text2text-generation`` pipeline in a
    ``RetrievalQA`` chain that also returns its source documents.

    Args:
        documents: Non-empty list of LangChain ``Document`` objects to index.
        embedding_model: Model name handed to ``HuggingFaceEmbeddings``.
        llm_model: Model name handed to the ``transformers`` pipeline.
        k: ``k`` entry of the retriever's ``search_kwargs``.
        fetch_k: ``fetch_k`` entry of the retriever's ``search_kwargs``.
        lambda_mult: ``lambda_mult`` entry of the retriever's ``search_kwargs``.
        max_length: ``max_length`` handed to the generation pipeline.

    Returns:
        The configured ``RetrievalQA`` chain.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # Fail early with a clear message instead of an opaque error from deep
    # inside the vector-store construction when there is nothing to index.
    if not documents:
        raise ValueError("Cannot build a RAG chain from an empty document list.")

    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    vectorstore = FAISS.from_documents(documents, embeddings)
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": k, "fetch_k": fetch_k, "lambda_mult": lambda_mult},
    )

    gen_pipeline = pipeline(
        "text2text-generation", model=llm_model, max_length=max_length
    )
    llm = HuggingFacePipeline(pipeline=gen_pipeline)

    return RetrievalQA.from_chain_type(
        llm=llm, retriever=retriever, return_source_documents=True
    )
# Module-level RAG chain shared by the Gradio callbacks; stays ``None`` until
# a PDF has been uploaded and indexed successfully.
qa_chain = None


def upload_pdf(file):
    """Index the uploaded PDF and store the resulting chain in ``qa_chain``.

    Args:
        file: The value Gradio passes for the file input: ``None`` when
            nothing was selected, otherwise a path string or an object
            exposing the path as ``.name``.

    Returns:
        A status message for the UI. ``qa_chain`` is only replaced when the
        PDF was indexed successfully.
    """
    global qa_chain

    # The button can be clicked before any file has been chosen.
    if file is None:
        return "Please select a PDF file first!"

    # Accept both a plain path string and a temp-file style wrapper whose
    # path lives in ``.name`` (which one arrives depends on the Gradio
    # version/configuration -- TODO confirm for the deployed version).
    pdf_path = getattr(file, "name", file)

    docs = load_pdf_chunks(pdf_path)
    # Scanned/image-only PDFs produce no text; indexing an empty list would
    # fail, so report it instead and keep any previously built chain.
    if not docs:
        return "No extractable text found in this PDF."

    qa_chain = setup_rag(docs)
    return "PDF uploaded and indexed!"
def query_rag(question):
    """Answer ``question`` using the chain built from the uploaded PDF.

    Args:
        question: The question text entered in the UI.

    Returns:
        The chain's ``"result"`` string, or a short prompt message when no
        PDF has been indexed yet or the question is blank.
    """
    if qa_chain is None:
        return "Upload a PDF first!"

    # Skip the retrieval/generation work for a blank question.
    if not question or not question.strip():
        return "Please enter a question."

    # Calling the chain object directly is deprecated in current LangChain;
    # ``invoke`` is the supported entry point and takes the same input dict.
    result = qa_chain.invoke({"query": question})
    return result["result"]
# Gradio UI
# Gradio layout: an upload row with its status box, followed by a
# question/answer row with its trigger button.
# NOTE(review): the nesting under each ``gr.Row()`` was reconstructed from a
# source whose indentation had been flattened -- confirm against the app.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 RAG App with MMR + PDF Upload (Hugging Face Demo)")

    # --- Upload section ---------------------------------------------------
    with gr.Row():
        file = gr.File(label="Upload a PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Upload and Index")
    status = gr.Textbox(label="Status")
    upload_btn.click(fn=upload_pdf, inputs=file, outputs=status)

    # --- Question/answer section -------------------------------------------
    with gr.Row():
        question = gr.Textbox(label="Enter your question")
        answer = gr.Textbox(label="Answer")
    answer_btn = gr.Button("Answer")
    answer_btn.click(fn=query_rag, inputs=question, outputs=answer)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()