Spaces:

pradeepsengarr
/

Bot_RAG

Sleeping

File size: 4,757 Bytes

# import os
# import streamlit as st
# import fitz  # PyMuPDF
# import logging
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import SentenceTransformerEmbeddings
# from langchain_community.llms import HuggingFacePipeline
# from langchain.chains import RetrievalQA
# from langchain.prompts import PromptTemplate
# from langchain_community.document_loaders import TextLoader

# # --- Configuration ---
# st.set_page_config(page_title="📚 RAG PDF Chatbot", layout="wide")
# st.title("📚 RAG-based PDF Chatbot")
# device = "cpu"

# # --- Logging ---
# logging.basicConfig(level=logging.INFO)

# # --- Load LLM ---
# @st.cache_resource
# def load_model():
#     checkpoint = "MBZUAI/LaMini-T5-738M"
#     tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
#     pipe = pipeline('text2text-generation', model=model, tokenizer=tokenizer, max_length=1024, do_sample=True, temperature=0.3, top_k=50, top_p=0.95)
#     return HuggingFacePipeline(pipeline=pipe)

# # --- Extract PDF Text ---
# def read_pdf(file):
#     try:
#         doc = fitz.open(stream=file.read(), filetype="pdf")
#         text = ""
#         for page in doc:
#             text += page.get_text()
#         return text.strip()
#     except Exception as e:
#         logging.error(f"Failed to extract text: {e}")
#         return ""

# # --- Process Answer ---
# def process_answer(question, full_text):
#     # Save the full_text to a temporary file
#     with open("temp_text.txt", "w") as f:
#         f.write(full_text)

#     loader = TextLoader("temp_text.txt")
#     docs = loader.load()

#     # Split the documents into overlapping chunks for retrieval
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=300)
#     splits = text_splitter.split_documents(docs)

#     # Load embeddings
#     embeddings = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

#     # Create Chroma in-memory vector store
#     db = Chroma.from_documents(splits, embedding=embeddings)
#     retriever = db.as_retriever()

#     # Set up the model
#     llm = load_model()

#     # Create a custom prompt
#     prompt_template = PromptTemplate(
#     input_variables=["context", "question"],
#     template="""
#     You are a helpful assistant. Carefully analyze the given context and extract direct answers ONLY from it.
     
#     Context:
#     {context}
     
#     Question:
#     {question}
     
#     Important Instructions:
#     - If the question asks for a URL (e.g., LinkedIn link), provide the exact URL as it appears.
#     - Do NOT summarize or paraphrase.
#     - If the information is not in the context, say "Not found in the document."
     
#     Answer:
#     """)
     

#     # Retrieval QA with custom prompt
#     qa_chain = RetrievalQA.from_chain_type(
#         llm=llm,
#         retriever=retriever,
#         chain_type="stuff",
#         chain_type_kwargs={"prompt": prompt_template}
#     )

#     # Return the answer using the retrieval QA chain
#     return qa_chain.run(question)

# # --- UI Layout ---
# with st.sidebar:
#     st.header("📄 Upload PDF")
#     uploaded_file = st.file_uploader("Choose a PDF", type=["pdf"])

# # --- Main Interface ---
# if uploaded_file:
#     st.success(f"You uploaded: {uploaded_file.name}")
#     full_text = read_pdf(uploaded_file)

#     if full_text:
#         st.subheader("📁 PDF Preview")
#         with st.expander("View Extracted Text"):
#             st.write(full_text[:3000] + ("..." if len(full_text) > 3000 else ""))

#         st.subheader("💬 Ask a Question")
#         user_question = st.text_input("Type your question about the PDF content")

#         if user_question:
#             with st.spinner("Thinking..."):
#                 answer = process_answer(user_question, full_text)
#                 st.markdown("### 🤖 Answer")
#                 st.write(answer)

#         with st.sidebar:
#             st.markdown("---")
#             st.markdown("**💡 Suggestions:**")
#             st.caption("Try: \"Summarize this document\" or \"What is the key idea?\"") 
#         with st.expander("💡 Suggestions", expanded=True):
#             st.markdown("""
#             - "Summarize this document"
#             - "Give a quick summary"
#             - "What are the main points?"
#             - "Explain this document in short"
#             """)

#     else:
#         st.error("⚠️ No text could be extracted from the PDF. Try another file.")
# else:
#     st.info("Upload a PDF to begin.")