Jesudian committed on
Commit
bc4be0a
·
verified ·
1 Parent(s): 0fa1096

Upload 3 files

Browse files

Required files are added to the repo

Files changed (3) hide show
  1. README.md +14 -5
  2. app.py +63 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,13 +1,22 @@
1
  ---
2
- title: RAG Chatbot
3
- emoji: 📚
4
  colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: Retrieval-Augmented Generation with MMR and PDF Upload
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: RAG with MMR + PDF Upload
3
+ emoji: 📄
4
  colorFrom: blue
5
+ colorTo: indigo
6
  sdk: gradio
7
  sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ # 🧠 Retrieval-Augmented Generation with MMR and PDF Upload
14
+
15
+ This Gradio demo allows you to:
16
+
17
+ - Upload a PDF document
18
+ - Chunk the content and embed using `MiniLM`
19
+ - Store and search chunks using FAISS with **Maximal Marginal Relevance (MMR)**
20
+ - Answer questions using `FLAN-T5`
21
+
22
+ > Powered by LangChain + HuggingFace + Gradio + FAISS
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+ import tempfile
3
+ import gradio as gr
4
+
5
+ from langchain_community.embeddings import HuggingFaceEmbeddings
6
+ from langchain_community.vectorstores import FAISS
7
+ from langchain.docstore.document import Document
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain.chains import RetrievalQA
10
+ from langchain_community.llms import HuggingFacePipeline
11
+ from transformers import pipeline
12
+
# Load and chunk PDF
def load_pdf_chunks(file_path, chunk_size=500, chunk_overlap=50):
    """Extract the text of a PDF and split it into LangChain documents.

    Args:
        file_path: Path of the PDF; handed to ``fitz.open``.
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the same splitter.

    Returns:
        A list of ``Document`` objects, one per non-blank chunk, each carrying
        ``file_path`` under the ``"source"`` metadata key. The list is empty
        when the PDF yields no text.
    """
    # Open the document as a context manager so its file handle is released
    # even when text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        text = "\n".join(page.get_text() for page in doc)

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return [
        Document(page_content=chunk, metadata={"source": file_path})
        for chunk in splitter.split_text(text)
        if chunk.strip()  # drop whitespace-only chunks
    ]
20
+
# Setup RAG pipeline
def setup_rag(documents):
    """Build a retrieval-augmented QA chain over ``documents``.

    The documents are embedded with ``all-MiniLM-L6-v2`` into a FAISS index,
    retrieved with MMR search, and answered by a ``google/flan-t5-base``
    text2text-generation pipeline.

    Args:
        documents: Non-empty list of LangChain ``Document`` objects to index.

    Returns:
        A ``RetrievalQA`` chain configured with ``return_source_documents=True``.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # Fail early with a clear message: an empty list (e.g. from a PDF with no
    # extractable text) would otherwise surface as an opaque error from
    # inside the FAISS index construction.
    if not documents:
        raise ValueError("Cannot build a RAG pipeline from an empty document list.")

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(documents, embeddings)
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 4, "fetch_k": 8, "lambda_mult": 0.5},
    )
    gen_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_length=128)
    llm = HuggingFacePipeline(pipeline=gen_pipeline)
    return RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
30
+
# Global RAG chain (updated on upload)
qa_chain = None


def upload_pdf(file):
    """Index an uploaded PDF and install the resulting chain in ``qa_chain``.

    Args:
        file: The uploaded file: either an object exposing the path through
            ``.name`` or a plain path string. ``None`` when nothing was
            selected.

    Returns:
        A status message for the UI. ``qa_chain`` is replaced only when
        indexing succeeds; otherwise any previously built chain is kept.
    """
    global qa_chain

    # The button can be clicked before any file has been chosen.
    if file is None:
        return "Upload a PDF first!"

    pdf_path = getattr(file, "name", file)
    docs = load_pdf_chunks(pdf_path)

    # Image-only or empty PDFs produce no chunks; building an index from
    # nothing would fail, so report it instead.
    if not docs:
        return "No extractable text found in this PDF."

    qa_chain = setup_rag(docs)
    return "PDF uploaded and indexed!"
40
+
def query_rag(question):
    """Answer ``question`` with the currently indexed PDF.

    Args:
        question: The user's question text.

    Returns:
        The chain's ``"result"`` string, or a short prompt when no PDF has
        been indexed yet or the question is blank.
    """
    if qa_chain is None:
        return "Upload a PDF first!"

    # Do not spend a model call on an empty query.
    if not question or not question.strip():
        return "Please enter a question."

    # ``invoke`` is the supported entry point; calling the chain object
    # directly goes through the deprecated ``Chain.__call__``.
    result = qa_chain.invoke({"query": question})
    return result["result"]
46
+
# Gradio UI
# NOTE(review): the row nesting below is reconstructed from a flattened diff
# view; confirm which widgets belong inside each gr.Row().
with gr.Blocks() as demo:
    # Heading emoji restored from its mis-encoded form.
    gr.Markdown("## 🧠 RAG App with MMR + PDF Upload (Hugging Face Demo)")

    # Upload section: choose a PDF, index it, and show the outcome.
    with gr.Row():
        file = gr.File(label="Upload a PDF", file_types=[".pdf"])
        upload_btn = gr.Button("Upload and Index")
    status = gr.Textbox(label="Status")
    upload_btn.click(upload_pdf, inputs=file, outputs=status)

    # Question section: ask about the indexed PDF.
    with gr.Row():
        question = gr.Textbox(label="Enter your question")
        answer = gr.Textbox(label="Answer")
    answer_btn = gr.Button("Answer")
    answer_btn.click(query_rag, inputs=question, outputs=answer)

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ sentence-transformers
5
+ transformers
6
+ faiss-cpu
7
+ pymupdf