import os
import subprocess
import sys
import asyncio

import streamlit as st
from lxml import etree
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from keybert import KeyBERT
from sentence_transformers import CrossEncoder
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# The next two imports are only needed by the commented-out alternatives below.
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter

# Windows fix for asyncio compatibility
if sys.platform.startswith('win') and sys.version_info >= (3, 8):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Cross-encoder backing the (currently commented-out) reranking step below
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Module-level handle to the FAISS index so the agent tools can reach it
vectorstore_global = None

# Load environment variables (e.g., OPENAI_API_KEY) from a .env file
def load_environment():
    load_dotenv()

# PDF to XML conversion via poppler's pdftohtml
def convert_pdf_to_xml(pdf_file, xml_path):
    os.makedirs("temp", exist_ok=True)
    pdf_path = os.path.join("temp", pdf_file.name)
    with open(pdf_path, 'wb') as f:
        f.write(pdf_file.getbuffer())
    subprocess.run(["pdftohtml", "-xml", pdf_path, xml_path], check=True)
    return xml_path

# Extract per-page text from the pdftohtml XML output
def extract_text_from_xml(xml_path, document_name):
    tree = etree.parse(xml_path)
    text_chunks = []
    for page in tree.xpath("//page"):
        page_num = int(page.get("number", 0))
        texts = [text.text for text in page.xpath('.//text') if text.text]
        combined_text = '\n'.join(texts)
        text_chunks.append({"text": combined_text, "page": page_num, "document": document_name})
    return text_chunks

# Route each uploaded file to the appropriate extractor
def get_uploaded_text(uploaded_files):
    raw_text = []
    print(f"Total uploaded files: {len(uploaded_files)}")
    for uploaded_file in uploaded_files:
        document_name = uploaded_file.name
        if document_name.endswith(".pdf"):
            xml_path = os.path.join("temp", document_name.replace(".pdf", ".xml"))
            text_chunks = extract_text_from_xml(convert_pdf_to_xml(uploaded_file, xml_path), document_name)
            raw_text.extend(text_chunks)
        elif document_name.endswith((".html", ".htm")):
            soup = BeautifulSoup(uploaded_file.getvalue(), 'lxml')
            raw_text.append({"text": soup.get_text(), "page": None, "document": document_name})
        elif document_name.endswith(".txt"):
            content = uploaded_file.getvalue().decode("utf-8")
            raw_text.append({"text": content, "page": None, "document": document_name})
    return raw_text

# Split extracted text into overlapping chunks, preserving page/document metadata
def get_text_chunks(raw_text):
    splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
    final_chunks = []
    for chunk in raw_text:
        for split_text in splitter.split_text(chunk["text"]):
            final_chunks.append({"text": split_text, "page": chunk["page"], "document": chunk["document"]})
    return final_chunks

# Vectorstore initialization
def get_vectorstore(text_chunks):
    if not text_chunks:
        raise ValueError("text_chunks is empty. Cannot initialize FAISS vectorstore.")
    # Alternative embedding backends:
    # model_name = "BAAI/bge-large-en-v1.5"
    # encode_kwargs = {'normalize_embeddings': True}
    # embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        encode_kwargs={"normalize_embeddings": True}
    )
    texts = [chunk["text"] for chunk in text_chunks]
    metadatas = [{"page": chunk["page"], "document": chunk["document"]} for chunk in text_chunks]
    return FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
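
# A hypothetical dev helper (an assumption, not part of the original app flow):
# runs the ingestion pipeline on local files outside Streamlit for quick testing.
# The pipeline expects uploaded-file-like objects exposing .name, .getbuffer()
# and .getvalue(), so a small shim stands in for Streamlit's UploadedFile.
# PDFs still require pdftohtml to be installed.
def _ingest_local_files(paths):
    class _LocalFile:
        def __init__(self, path):
            self.name = os.path.basename(path)
            with open(path, "rb") as fh:
                self._data = fh.read()

        def getbuffer(self):
            return self._data

        def getvalue(self):
            return self._data

    # e.g. _ingest_local_files(["manual.pdf"]) -> list of chunk dicts
    return get_text_chunks(get_uploaded_text([_LocalFile(p) for p in paths]))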

def set_global_vectorstore(vectorstore):
    global vectorstore_global
    vectorstore_global = vectorstore

kw_model = KeyBERT()

def faiss_search_with_keywords(query):
    global vectorstore_global
    if vectorstore_global is None:
        raise ValueError("FAISS vectorstore is not initialized.")
    # Extract keywords from the query and retrieve with the refined query
    keywords = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
    refined_query = " ".join([keyword[0] for keyword in keywords])
    retriever = vectorstore_global.as_retriever(search_kwargs={"k": 13})
    docs = retriever.get_relevant_documents(refined_query)
    return '\n\n'.join([f"[Page {doc.metadata.get('page', 'Unknown')}] {doc.page_content}" for doc in docs])

def self_reasoning(query, context):
    llm = ChatOpenAI(model="gpt-4", temperature=0.3)
    # llm = ChatGroq(temperature=0.3, model_name="llama3-8b-8192")
    reasoning_prompt = f"""
You are an AI assistant that analyzes the context provided to answer the user's query comprehensively and clearly.
Answer in a concise, factual way using the terminology from the context.
Avoid extra explanation unless explicitly asked.
If asked for the page number, you MUST mention the page number.

### Example 1:
**Question:** What is the purpose of the MODTRAN GUI?
**Context:** [Page 10 of the document] The MODTRAN GUI helps users set parameters and visualize the model's output.
**Answer:** The MODTRAN GUI assists users in parameter setup and output visualization. You can find the answer at Page 10 of the document provided.

### Example 2:
**Question:** How do you run MODTRAN on Linux? Answer with page number.
**Context:** [Page 15 of the document] On Linux systems, MODTRAN can be run using the `mod6c` binary via terminal.
**Answer:** Use the `mod6c` binary via terminal. (Page 15)

### Now answer:
**Question:** {query}
**Context:** {context}
**Answer:**
"""
    response = llm.predict(reasoning_prompt)
    return response

def faiss_search_with_reasoning(query):
    global vectorstore_global
    if vectorstore_global is None:
        raise ValueError("FAISS vectorstore is not initialized.")
    retriever = vectorstore_global.as_retriever(search_kwargs={"k": 13})
    docs = retriever.get_relevant_documents(query)
    # Optional rerank using the cross-encoder loaded above:
    # pairs = [(query, doc.page_content) for doc in docs]
    # scores = reranker.predict(pairs)
    # reranked_docs = sorted(zip(scores, docs), key=lambda x: x[0], reverse=True)
    # docs = [doc for _, doc in reranked_docs[:5]]
    context = '\n\n'.join([f"[Page {doc.metadata.get('page', 'Unknown')}] {doc.page_content.strip()}" for doc in docs])
    return self_reasoning(query, context)

faiss_keyword_tool = Tool(
    name="FAISS Keyword Search",
    func=faiss_search_with_keywords,
    description="Searches FAISS with a keyword-based approach to retrieve context."
)

faiss_reasoning_tool = Tool(
    name="FAISS Reasoning Search",
    func=faiss_search_with_reasoning,
    description="Searches FAISS with detailed reasoning to retrieve context."
)
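
# A hypothetical smoke test (an assumption, not part of the app flow): exercises
# both retrieval paths directly, bypassing the agent. Assumes a vectorstore was
# already built, e.g. via get_vectorstore(...).
def _smoke_test_retrieval(vectorstore, query="How do you run MODTRAN on Linux?"):
    set_global_vectorstore(vectorstore)
    print(faiss_search_with_keywords(query))   # raw keyword-refined context
    print(faiss_search_with_reasoning(query))  # LLM-synthesized answer (calls OpenAI)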

# Agent initialization
def initialize_chatbot_agent():
    llm = ChatOpenAI(model="gpt-4", temperature=0.3)
    # llm = ChatGroq(temperature=0.3, model_name="llama3-8b-8192")
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    tools = [faiss_keyword_tool, faiss_reasoning_tool]
    agent = initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        memory=memory,
        verbose=False,
        handle_parsing_errors=True,
    )
    return agent

# Query handler
def handle_user_query(query, agent):
    response = agent.run(query)
    return response

# Main Streamlit app
def main():
    load_environment()

    if "agent" not in st.session_state:
        st.session_state.agent = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    if "vectorstore" not in st.session_state:
        st.session_state.vectorstore = None

    st.header("Chat with MODTRAN Documents :satellite:")
    user_question = st.text_input("Ask a question about your uploaded files:")

    with st.sidebar:
        uploaded_files = st.file_uploader("Upload PDF, HTML, or MODTRAN output files:", accept_multiple_files=True)
        if st.button("Process") and uploaded_files:
            with st.spinner("Processing..."):
                raw_text = get_uploaded_text(uploaded_files)
                print(f"Total extracted sections: {len(raw_text)}")
                if raw_text:
                    print("Example section:", raw_text[0])
                text_chunks = get_text_chunks(raw_text)
                # Keep the vectorstore in session_state: Streamlit re-executes this
                # script on every interaction, which resets module-level globals.
                st.session_state.vectorstore = get_vectorstore(text_chunks)
                st.session_state.agent = initialize_chatbot_agent()
                st.success("Files processed successfully!")

    if st.session_state.agent and user_question:
        # Re-register the persisted vectorstore so the tools can see it after a rerun
        set_global_vectorstore(st.session_state.vectorstore)
        response = handle_user_query(user_question, st.session_state.agent)
        st.session_state.chat_history.append({"user": user_question, "bot": response})

    for chat in st.session_state.chat_history:
        st.write(f"**You:** {chat['user']}")
        st.write(f"**Bot:** {chat['bot']}")

if __name__ == "__main__":
    main()
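
# Typical invocation (assumptions: poppler-utils' pdftohtml is on PATH for PDF
# uploads, and OPENAI_API_KEY is set in a .env file next to this script):
#
#   streamlit run <this_script>.py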