import os
import logging

import torch
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Setup
logging.basicConfig(level=logging.INFO)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

persist_directory = "db"
uploaded_files_dir = "uploaded_files"
os.makedirs(uploaded_files_dir, exist_ok=True)

# Load the LaMini-T5 model and tokenizer once at startup so they are
# shared across all requests
checkpoint = "MBZUAI/LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


def data_ingestion():
    """Load uploaded PDFs, split them into chunks, and index them in Chroma."""
    try:
        documents = []
        for filename in os.listdir(uploaded_files_dir):
            if filename.endswith(".pdf"):
                file_path = os.path.join(uploaded_files_dir, filename)
                loader = PDFMinerLoader(file_path)
                docs = loader.load()
                # Keep only pages that actually contain extracted text
                for doc in docs:
                    if hasattr(doc, 'page_content') and len(doc.page_content.strip()) > 0:
                        documents.append(doc)

        if not documents:
            st.error("No valid text extracted from uploaded PDFs.")
            return

        # Split into overlapping chunks so retrieval preserves local context
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
        texts = splitter.split_documents(documents)

        # Embed the chunks and persist them to the local Chroma store
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
        db.persist()
        st.success("Documents ingested and stored successfully.")

    except Exception as e:
        st.error(f"Error during data ingestion: {str(e)}")


def qa_llm():
    """Build a RetrievalQA chain backed by the persisted Chroma store."""
    pipe = pipeline(
        'text2text-generation',
        model=base_model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        device=0 if torch.cuda.is_available() else -1
    )
    llm = HuggingFacePipeline(pipeline=pipe)
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    retriever = db.as_retriever()
    # "stuff" concatenates the retrieved chunks directly into the prompt
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa


def process_query(query):
    """Wrap the user question in a role prompt and run it through the chain."""
    try:
        qa = qa_llm()
        tailored_prompt = f"""
        You are an expert chatbot designed to assist Chartered Accountants (CAs)
        in the field of audits. Your goal is to provide accurate and comprehensive
        answers to any questions related to audit policies, procedures, and
        accounting standards, based on the uploaded PDF documents.

        User question: {query}
        """
        result = qa({"query": tailored_prompt})
        return result["result"]
    except Exception as e:
        return f"Error: {str(e)}"


# Streamlit UI
st.set_page_config(page_title="CA Audit Chatbot", layout="centered")
st.title("📚 Chartered Accountant Audit Assistant")
st.markdown("Upload a PDF file and ask audit-related questions. "
            "This AI assistant will answer based on the document content.")

# File uploader: save the PDF into the ingestion directory
uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
if uploaded_file is not None:
    save_path = os.path.join(uploaded_files_dir, uploaded_file.name)
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    st.success("PDF uploaded successfully!")

    if st.button("Ingest Document"):
        data_ingestion()

# Query input
user_query = st.text_input("Ask a question about the audit document:")
if user_query:
    response = process_query(user_query)
    st.markdown("### 📌 Answer:")
    st.write(response)
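
# Usage note (a minimal sketch; the filename app.py is an assumption, not
# something the script itself fixes): a Streamlit app is launched from the
# command line rather than with `python`, e.g.
#
#     streamlit run app.py
#
# Ingestion writes the Chroma index under ./db, so "Ingest Document" only
# needs to be rerun after new PDFs are uploaded; queries reuse the
# persisted index.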