Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

File size: 7,477 Bytes

# import os
# import streamlit as st
# import fitz  # PyMuPDF
# import logging
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import SentenceTransformerEmbeddings
# from langchain_community.llms import HuggingFacePipeline
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain_community.document_loaders import TextLoader

# # --- Configuration ---
# st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
# st.title("📚 RAG-based PDF Chatbot")
# device = "cpu"

# # --- Logging ---
# logging.basicConfig(level=logging.INFO)

# # --- Load LLM ---
# @st.cache_resource
# def load_model():
#     checkpoint = "MBZUAI/LaMini-T5-738M"
#     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
#     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
#     return HuggingFacePipeline(pipeline=pipe)

# # --- Extract PDF Text ---
# def read_pdf(file):
#     try:
#         doc = fitz.open(stream=file.read(), filetype="pdf")
#         text = ""
#         for page in doc:
#             text += page.get_text()
#         return text.strip()
#     except Exception as e:
#         logging.error(f"Failed to extract text: {e}")
#         return ""

# # --- Process Answer ---
# def process_answer(question, full_text):
#     # Save the full_text to a temporary file
#     with open("temp_text.txt", "w") as f:
#         f.write(full_text)

#     loader = TextLoader("temp_text.txt")
#     docs = loader.load()

#     # Chunk the documents with increased size and overlap
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
#     splits = text_splitter.split_documents(docs)

#     # Load embeddings
#     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

#     # Create Chroma in-memory vector store
#     db = Chroma.from_documents(splits, embedding=embeddings)
#     retriever = db.as_retriever()

#     # Set up the model
#     llm = load_model()

#     # Create a custom prompt
#     prompt_template = PromptTemplate(
#     input_variables=["context", "question"],
#     template="""
#     You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.
     
#     Context:
#     {context}
     
#     Question:
#     {question}
     
#     Important Instructions:
#     - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
#     - Do NOT summarize or paraphrase.
#     - If the information is not in the context, say "Not found in the document."
     
#     Answer:
#     """)
     

#     # Retrieval QA with custom prompt
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=retriever,
#         chain_type="stuff",
#         chain_type_kwargs={"prompt": prompt_template}
#     )

#     # Return the answer using the retrieval QA chain
#     return qa_chain.run(question)

# # --- UI Layout ---
# with st.sidebar:
#     st.header("📄 Upload PDF")
#     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

# # --- Main Interface ---
# if uploaded_file:
#     st.success(f"You uploaded: {uploaded_file.name}")
#     full_text = read_pdf(uploaded_file)

#     if full_text:
#         st.subheader("📁 PDF Preview")
#         with st.expander("View Extracted Text"):
#             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

#         st.subheader("💬 Ask a Question")
#         user_question = st.text_input("Type your question about the PDF content")

#         if user_question:
#             with st.spinner("Thinking..."):
#                 answer = process_answer(user_question, full_text)
#                 st.markdown("### 🤖 Answer")
#                 st.write(answer)

#         with st.sidebar:
#             st.markdown("---")
#             st.markdown("**💡 Suggestions:**")
#             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"") 
#         with st.expander("💡 Suggestions", expanded=True):
#             st.markdown("""
#             - "Summarize this document"
#             - "Give a quick summary"
#             - "What are the main points?"
#             - "Explain this document in short"
#             """)

#     else:
#         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
# else:
#     st.info("Upload a PDF to begin.")



import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceHub
import os

# Set Hugging Face API Token
# setdefault keeps a token that is already present in the environment (for
# example one injected as a deployment secret); the placeholder below is only
# written when the variable is missing, so a real credential is never
# overwritten by this line.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "your_huggingfacehub_api_token_here")

# Prompt used by the QA chain. It has two placeholders: {context} and
# {question}. The template text starts and ends with a newline.
custom_prompt = PromptTemplate(
    template=(
        "\n"
        "You are a helpful assistant. Use the context below to answer the question.\n"
        'If the answer is not in the context, say "I don\'t know."\n'
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question:\n"
        "{question}\n"
        "\n"
        "Answer:\n"
    ),
    input_variables=["context", "question"],
)

# Load PDF and split into chunks

from langchain_community.document_loaders import PyPDFLoader
import tempfile

def load_and_split_pdf(uploaded_file):
    """Load an uploaded PDF and split its contents into text chunks.

    PyPDFLoader is given a filesystem path, so the uploaded bytes are first
    copied into a named temporary ``.pdf`` file. That file is removed again
    as soon as loading has finished (or failed), so uploads do not
    accumulate on disk.

    Args:
        uploaded_file: File-like object whose ``read()`` returns the PDF
            bytes (here, the value returned by ``st.file_uploader``).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` with
        ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    # delete=False: the file must survive being closed so the loader can
    # reopen it by path; cleanup is therefore done explicitly below.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp_file:
            tmp_file.write(uploaded_file.read())
        # The handle is closed (and flushed) here, before the loader opens it.
        documents = PyPDFLoader(tmp_file.name).load()
    finally:
        os.remove(tmp_file.name)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return text_splitter.split_documents(documents)
    
# Build vectorstore from document chunks
def build_vectorstore(chunks):
    """Embed the given document chunks and index them in a FAISS store.

    Args:
        chunks: Documents to index, as returned by ``load_and_split_pdf``.

    Returns:
        The FAISS vector store built from ``chunks``.
    """
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    return FAISS.from_documents(chunks, embedding=embedder)

# Build QA chain
def build_qa_chain(vectorstore):
    """Create a RetrievalQA chain over ``vectorstore``.

    The chain uses a Hugging Face Hub hosted Mistral-7B-Instruct model as the
    LLM, the vector store's default retriever, the "stuff" chain type, and
    the module-level ``custom_prompt``.

    Args:
        vectorstore: Vector store exposing ``as_retriever()``.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    generation_settings = {"temperature": 0.2, "max_length": 512}
    hub_llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        model_kwargs=generation_settings,
    )
    doc_retriever = vectorstore.as_retriever()
    return RetrievalQA.from_chain_type(
        llm=hub_llm,
        retriever=doc_retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": custom_prompt},
    )

# Streamlit App
st.set_page_config(page_title="Accurate PDF Chatbot", layout="centered")
st.title("PDF QA Chatbot - RAG Powered")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    # Streamlit re-executes this script on every interaction (including each
    # submitted question). The QA chain is therefore kept in session state
    # and rebuilt only when a different file is uploaded, instead of
    # re-parsing and re-embedding the same PDF for every question.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("qa_file_key") != file_key:
        with st.spinner("Reading and processing PDF..."):
            chunks = load_and_split_pdf(uploaded_file)
            # A PDF without extractable text yields no chunks; an index
            # cannot be built from an empty list, so record "no chain".
            if chunks:
                st.session_state["qa_chain"] = build_qa_chain(build_vectorstore(chunks))
            else:
                st.session_state["qa_chain"] = None
            # The key is stored last so a failed build is retried on the next run.
            st.session_state["qa_file_key"] = file_key

    qa_chain = st.session_state["qa_chain"]

    if qa_chain is None:
        st.error("No text could be extracted from this PDF. Try another file.")
    else:
        st.success("PDF processed. Ask your question below.")

        question = st.text_input("Ask a question from the PDF:")

        if question:
            with st.spinner("Searching answer..."):
                answer = qa_chain.run(question)
                st.markdown(f"**Answer:** {answer}")