Jatin Mehra committed
Commit 1dc0983 · 1 Parent(s): ba76b7d

Refactor PDF processing and embedding creation; update chunking to include metadata

Files changed (2)
  1. app.py +16 -15
  2. preprocessing.py +49 -33
app.py CHANGED

@@ -2,7 +2,7 @@ import os
 import dotenv
 import pickle
 import uuid
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException, BackgroundTasks, Request
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
@@ -16,8 +16,7 @@ from preprocessing import (
     build_faiss_index,
     retrieve_similar_chunks,
     agentic_rag,
-    tools,
-    memory
+    tools
 )
 from sentence_transformers import SentenceTransformer
 import shutil
@@ -88,8 +87,8 @@ def load_session(session_id, model_name="meta-llama/llama-4-scout-17b-16e-instru
     # Recreate non-pickled objects
     if data.get("chunks") and data.get("file_path") and os.path.exists(data["file_path"]):
         # Recreate model, embeddings and index
-        model = SentenceTransformer('all-MiniLM-L6-v2')
-        embeddings = create_embeddings(data["chunks"], model)
+        model = SentenceTransformer('BAAI/bge-large-en-v1.5')
+        embeddings, _ = create_embeddings(data["chunks"], model)  # Unpack tuple
         index = build_faiss_index(embeddings)
 
         # Recreate LLM
@@ -165,13 +164,15 @@ async def upload_pdf(
         raise ValueError("GROQ_API_KEY is not set in the environment variables")
 
     # Process the PDF
-    text = process_pdf_file(file_path)
-    chunks = chunk_text(text, max_length=1500)
+    documents = process_pdf_file(file_path)  # Returns list of Document objects
+    chunks = chunk_text(documents, max_length=1000)  # Updated to handle documents
 
     # Create embeddings
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    embeddings = create_embeddings(chunks, model)
-    index = build_faiss_index(embeddings)
+    model = SentenceTransformer('BAAI/bge-large-en-v1.5')  # Updated embedding model
+    embeddings, chunks_with_metadata = create_embeddings(chunks, model)  # Unpack tuple
+
+    # Build FAISS index
+    index = build_faiss_index(embeddings)  # Pass only embeddings array
 
     # Initialize LLM
     llm = model_selection(model_name)
@@ -180,7 +181,7 @@ async def upload_pdf(
     session_data = {
         "file_path": file_path,
         "file_name": file.filename,
-        "chunks": chunks,
+        "chunks": chunks_with_metadata,  # Store chunks with metadata
         "model": model,
         "index": index,
         "llm": llm,
@@ -224,16 +225,15 @@ async def chat(request: ChatRequest):
             session["index"],
             session["chunks"],
             session["model"],
-            k=3
+            k=10
         )
-        context = "\n".join([chunk for chunk, _ in similar_chunks])
 
         # Generate response using agentic_rag
         response = agentic_rag(
             session["llm"],
             tools,
             query=request.query,
-            context=context,
+            context_chunks=similar_chunks,  # Pass the list of tuples
             Use_Tavily=request.use_search
         )
 
@@ -244,12 +244,13 @@ async def chat(request: ChatRequest):
         return {
             "status": "success",
            "answer": response["output"],
-            "context_used": [{"text": chunk, "score": float(score)} for chunk, score in similar_chunks]
+            "context_used": [{"text": chunk, "score": float(score)} for chunk, score, _ in similar_chunks]
         }
 
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
 
+
 # Route to get chat history
 @app.post("/chat-history")
 async def get_chat_history(request: SessionRequest):
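To make the new interfaces in app.py easier to follow, here is a minimal sketch of the data shapes the chat route now works with; the sample values and metadata keys are illustrative, and the layouts mirror the preprocessing.py changes shown below.

# Chunk records stored in the session after upload (produced by chunk_text/create_embeddings):
chunk = {"text": "First paragraph of the PDF...", "metadata": {"source": "doc.pdf", "page": 0}}

# retrieve_similar_chunks now returns (text, distance, metadata) triples:
similar_chunks = [("First paragraph of the PDF...", 0.42, {"source": "doc.pdf", "page": 0})]

# The chat route forwards the raw triples to agentic_rag and reports text/score pairs to the client:
context_used = [{"text": text, "score": float(score)} for text, score, _ in similar_chunks]

Because the score is a FAISS distance, smaller values mean more relevant chunks, which is why agentic_rag sorts ascending before assembling the context.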
preprocessing.py CHANGED

@@ -25,49 +25,70 @@ def estimate_tokens(text):
     return len(text) // 4
 
 def process_pdf_file(file_path):
-    """Load a PDF file and extract its text."""
+    """Load a PDF file and extract its text with metadata."""
     if not os.path.exists(file_path):
         raise FileNotFoundError(f"The file {file_path} does not exist.")
     loader = PyMuPDFLoader(file_path)
     documents = loader.load()
-    text = "".join(doc.page_content for doc in documents)
-    return text
+    return documents  # Return list of Document objects with metadata
 
-def chunk_text(text, max_length=1500):
-    """Split text into chunks based on paragraphs, respecting max_length."""
-    paragraphs = text.split("\n\n")
+def chunk_text(documents, max_length=1000):
+    """Split documents into chunks with metadata."""
     chunks = []
-    current_chunk = ""
-    for paragraph in paragraphs:
-        if len(current_chunk) + len(paragraph) <= max_length:
-            current_chunk += paragraph + "\n\n"
-        else:
-            chunks.append(current_chunk.strip())
-            current_chunk = paragraph + "\n\n"
-    if current_chunk:
-        chunks.append(current_chunk.strip())
+    for doc in documents:
+        text = doc.page_content
+        metadata = doc.metadata
+        paragraphs = text.split("\n\n")
+        current_chunk = ""
+        current_metadata = metadata.copy()
+        for paragraph in paragraphs:
+            if estimate_tokens(current_chunk + paragraph) <= max_length // 4:
+                current_chunk += paragraph + "\n\n"
+            else:
+                chunks.append({"text": current_chunk.strip(), "metadata": current_metadata})
+                current_chunk = paragraph + "\n\n"
+        if current_chunk:
+            chunks.append({"text": current_chunk.strip(), "metadata": current_metadata})
     return chunks
 
-def create_embeddings(texts, model):
-    """Create embeddings for a list of texts using the provided model."""
+def create_embeddings(chunks, model):
+    """Create embeddings for a list of chunk texts."""
+    texts = [chunk["text"] for chunk in chunks]
     embeddings = model.encode(texts, show_progress_bar=True, convert_to_tensor=True)
-    return embeddings.cpu().numpy()
+    return embeddings.cpu().numpy(), chunks
 
 def build_faiss_index(embeddings):
-    """Build a FAISS index from embeddings for similarity search."""
+    """Build a FAISS HNSW index from embeddings for similarity search."""
     dim = embeddings.shape[1]
-    index = faiss.IndexFlatL2(dim)
+    index = faiss.IndexHNSWFlat(dim, 32)  # 32 = number of neighbors in HNSW graph
+    index.hnsw.efConstruction = 200  # Higher = better quality, slower build
+    index.hnsw.efSearch = 50  # Higher = better accuracy, slower search
     index.add(embeddings)
     return index
 
-def retrieve_similar_chunks(query, index, texts, model, k=3, max_chunk_length=3500):
+def retrieve_similar_chunks(query, index, chunks, model, k=10, max_chunk_length=1000):
     """Retrieve top k similar chunks to the query from the FAISS index."""
     query_embedding = model.encode([query], convert_to_tensor=True).cpu().numpy()
     distances, indices = index.search(query_embedding, k)
-    return [(texts[i][:max_chunk_length], distances[0][j]) for j, i in enumerate(indices[0])]
+    return [(chunks[i]["text"][:max_chunk_length], distances[0][j], chunks[i]["metadata"]) for j, i in enumerate(indices[0])]
 
-def agentic_rag(llm, tools, query, context, Use_Tavily=False):
-    # Define the prompt template for the agent
+def agentic_rag(llm, tools, query, context_chunks, Use_Tavily=False):
+    # Sort chunks by relevance (lower distance = more relevant)
+    context_chunks = sorted(context_chunks, key=lambda x: x[1])  # Sort by distance
+    context = ""
+    total_tokens = 0
+    max_tokens = 7000  # Leave room for prompt and response
+
+    # Aggregate relevant chunks until token limit is reached
+    for chunk, _, _ in context_chunks:  # Unpack three elements
+        chunk_tokens = estimate_tokens(chunk)
+        if total_tokens + chunk_tokens <= max_tokens:
+            context += chunk + "\n\n"
+            total_tokens += chunk_tokens
+        else:
+            break
+
+    # Define prompt template
     search_instructions = (
         "Use the search tool if the context is insufficient to answer the question or you are unsure. Give source links if you use the search tool."
         if Use_Tavily
@@ -80,35 +101,30 @@ def agentic_rag(llm, tools, query, context, Use_Tavily=False):
     Instructions:
     1. Use the provided context to answer the user's question.
     2. Provide a clear answer, if you don't know the answer, say 'I don't know'.
+    3. Prioritize information from the most relevant context chunks.
     """),
         ("human", "Context: {context}\n\nQuestion: {input}"),
         MessagesPlaceholder(variable_name="chat_history"),
         MessagesPlaceholder(variable_name="agent_scratchpad"),
     ])
-
-    # Only use tools when Tavily is enabled
-    agent_tools = tools if Use_Tavily else []
 
+    agent_tools = tools if Use_Tavily else []
     try:
-        # Create the agent and executor with appropriate tools
         agent = create_tool_calling_agent(llm, agent_tools, prompt)
         agent_executor = AgentExecutor(agent=agent, tools=agent_tools, memory=memory, verbose=True)
-
-        # Execute the agent
         return agent_executor.invoke({
-            "input": query,
+            "input": query,
             "context": context,
             "search_instructions": search_instructions
         })
     except Exception as e:
         print(f"Error during agent execution: {str(e)}")
-        # Fallback to direct LLM call without agent framework
         fallback_prompt = ChatPromptTemplate.from_messages([
             ("system", "You are a helpful assistant. Use the provided context to answer the user's question."),
             ("human", "Context: {context}\n\nQuestion: {input}")
        ])
         response = llm.invoke(fallback_prompt.format(context=context, input=query))
-    return {"output": response.content}
+        return {"output": response.content}
 
 if __name__ == "__main__":
     # Process PDF and prepare index
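Taken together, the refactored helpers chain into a single retrieval pipeline. A minimal usage sketch, assuming a local sample.pdf, a configured GROQ_API_KEY, and the project's model_selection helper for building the LLM (file name, query, and model choice are illustrative, not part of this diff):

from sentence_transformers import SentenceTransformer
from preprocessing import (
    process_pdf_file,
    chunk_text,
    create_embeddings,
    build_faiss_index,
    retrieve_similar_chunks,
    agentic_rag,
    tools,
)

documents = process_pdf_file("sample.pdf")         # list of Document objects with metadata
chunks = chunk_text(documents, max_length=1000)    # list of {"text": ..., "metadata": ...} dicts
model = SentenceTransformer("BAAI/bge-large-en-v1.5")
embeddings, chunks_with_metadata = create_embeddings(chunks, model)
index = build_faiss_index(embeddings)              # HNSW index over the embedding matrix

query = "What is this document about?"
similar = retrieve_similar_chunks(query, index, chunks_with_metadata, model, k=10)
llm = model_selection(model_name)                  # project helper used in app.py; configure as in your setup
response = agentic_rag(llm, tools, query=query, context_chunks=similar, Use_Tavily=False)
print(response["output"])

The switch from IndexFlatL2 to IndexHNSWFlat trades exact search for approximate nearest-neighbour search, which is why the new efConstruction and efSearch knobs appear in build_faiss_index.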