# import os
# import streamlit as st
# import fitz  # PyMuPDF
# import logging
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import SentenceTransformerEmbeddings
# from langchain_community.llms import HuggingFacePipeline
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain_community.document_loaders import TextLoader
#
# # --- Configuration ---
# st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
# st.title("📚 RAG-based PDF Chatbot")
# device = "cpu"
#
# # --- Logging ---
# logging.basicConfig(level=logging.INFO)
#
# # --- Load LLM ---
# @st.cache_resource
# def load_model():
#     checkpoint = "MBZUAI/LaMini-T5-738M"
#     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
#     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer,
#                     max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
#     return HuggingFacePipeline(pipeline=pipe)
#
# # --- Extract PDF Text ---
# def read_pdf(file):
#     try:
#         doc = fitz.open(stream=file.read(), filetype="pdf")
#         text = ""
#         for page in doc:
#             text += page.get_text()
#         return text.strip()
#     except Exception as e:
#         logging.error(f"Failed to extract text: {e}")
#         return ""
#
# # --- Process Answer ---
# def process_answer(question, full_text):
#     # Save the full text to a temporary file so TextLoader can read it
#     with open("temp_text.txt", "w") as f:
#         f.write(full_text)
#     loader = TextLoader("temp_text.txt")
#     docs = loader.load()
#
#     # Chunk the documents with increased size and overlap
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
#     splits = text_splitter.split_documents(docs)
#
#     # Load embeddings
#     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")
#
#     # Create Chroma in-memory vector store
#     db = Chroma.from_documents(splits, embedding=embeddings)
#     retriever = db.as_retriever()
#
#     # Set up the model
#     llm = load_model()
#
#     # Create a custom prompt
#     prompt_template = PromptTemplate(
#         input_variables=["context", "question"],
#         template="""
# You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.
#
# Context:
# {context}
#
# Question:
# {question}
#
# Important Instructions:
# - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
# - Do NOT summarize or paraphrase.
# - If the information is not in the context, say "Not found in the document."
#
# Answer:
# """)
#
#     # Retrieval QA with custom prompt
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=retriever,
#         chain_type="stuff",
#         chain_type_kwargs={"prompt": prompt_template}
#     )
#
#     # Return the answer using the retrieval QA chain
#     return qa_chain.run(question)
#
# # --- UI Layout ---
# with st.sidebar:
#     st.header("📄 Upload PDF")
#     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])
#
# # --- Main Interface ---
# if uploaded_file:
#     st.success(f"You uploaded: {uploaded_file.name}")
#     full_text = read_pdf(uploaded_file)
#     if full_text:
#         st.subheader("📁 PDF Preview")
#         with st.expander("View Extracted Text"):
#             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))
#         st.subheader("💬 Ask a Question")
#         user_question = st.text_input("Type your question about the PDF content")
#         if user_question:
#             with st.spinner("Thinking..."):
#                 answer = process_answer(user_question, full_text)
#                 st.markdown("### 🤖 Answer")
#                 st.write(answer)
#         with st.sidebar:
#             st.markdown("---")
#             st.markdown("**💡 Suggestions:**")
#             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"")
#             with st.expander("💡 Suggestions", expanded=True):
#                 st.markdown("""
#                 - "Summarize this document"
#                 - "Give a quick summary"
#                 - "What are the main points?"
#                 - "Explain this document in short"
#                 """)
#     else:
#         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
# else:
#     st.info("Upload a PDF to begin.")

import os
import tempfile

import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFaceHub

# Set the Hugging Face API token. Replace the placeholder with your own token,
# or (better) export HUGGINGFACEHUB_API_TOKEN in the environment instead of
# hardcoding it in source.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "your_huggingfacehub_api_token_here"

# Custom prompt: restricts the model to answering from the retrieved context
custom_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful assistant. Use the context below to answer the question.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question: {question}

Answer:
"""
)

# Load the uploaded PDF and split it into overlapping chunks
def load_and_split_pdf(uploaded_file):
    # PyPDFLoader needs a file path, so persist the upload to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.read())
        tmp_file_path = tmp_file.name

    loader = PyPDFLoader(tmp_file_path)
    documents = loader.load()
    os.unlink(tmp_file_path)  # remove the temporary file once loaded

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = text_splitter.split_documents(documents)
    return chunks

# Build a FAISS vector store from the document chunks
def build_vectorstore(chunks):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db

# Build the retrieval QA chain
def build_qa_chain(vectorstore):
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.2, "max_length": 512},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        chain_type_kwargs={"prompt": custom_prompt},
    )
    return qa_chain

# Streamlit app
st.set_page_config(page_title="Accurate PDF Chatbot", layout="centered")
st.title("PDF QA Chatbot - RAG Powered")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    with st.spinner("Reading and processing PDF..."):
        chunks = load_and_split_pdf(uploaded_file)
        vectorstore = build_vectorstore(chunks)
        qa_chain = build_qa_chain(vectorstore)
    st.success("PDF processed. Ask your question below.")

    question = st.text_input("Ask a question from the PDF:")
    if question:
        with st.spinner("Searching answer..."):
            answer = qa_chain.run(question)
        st.markdown(f"**Answer:** {answer}")
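
# Suggested optimization (a sketch, not wired into the UI above): Streamlit
# reruns this entire script on every interaction, so each new question re-reads,
# re-splits, and re-embeds the PDF. Caching the chain on the file's raw bytes
# avoids that repeated work. `get_qa_chain` is a hypothetical helper reusing
# the functions defined above; call it with `uploaded_file.getvalue()` in place
# of the inline chain construction.
import io

@st.cache_resource(show_spinner=False)
def get_qa_chain(pdf_bytes: bytes):
    # Identical uploads hash to the same cache entry, so the vector store is
    # built only once per document.
    chunks = load_and_split_pdf(io.BytesIO(pdf_bytes))
    vectorstore = build_vectorstore(chunks)
    return build_qa_chain(vectorstore)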
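
# --- Running the app ---
# A sketch of the setup, assuming this script is saved as app.py; the package
# names below are the usual companions of the imports used here (pin versions
# that match your LangChain release):
#
#   pip install streamlit langchain langchain-community faiss-cpu \
#       sentence-transformers pypdf huggingface_hub
#   streamlit run app.py
#
# HuggingFaceHub calls the hosted Hugging Face Inference API, so a valid
# HUGGINGFACEHUB_API_TOKEN must be set before questions can be answered.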