Spaces:

Rohit1412
/

gemma3-27b-RAG

Running

App Files Files Community

Rohit1412 committed on Mar 16

Commit

d5ba7eb

verified ·

1 Parent(s): 4d1faa0

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -32

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import gradio as gr
 import torch
 from sentence_transformers import SentenceTransformer, util
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import PyPDF2
 import os
 import time
@@ -13,12 +15,21 @@ logger = logging.getLogger(__name__)
 # Load models
 retriever_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-gen_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
-gen_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
 # Cache for document embeddings
 embedding_cache = {}
 def extract_text_from_pdf(pdf_file):
     """Extract text from a PDF file, returning a list of page texts."""
     pages = []
@@ -34,7 +45,7 @@ def extract_text_from_pdf(pdf_file):
         pages.append(f"Error reading PDF: {str(e)}")
     return pages
-def chunk_text(text, chunk_size=1500):
     """Split text into chunks of approximately chunk_size characters."""
     words = text.split()
     chunks = []
@@ -46,7 +57,7 @@ def chunk_text(text, chunk_size=1500):
             current_chunk = []
             current_length = 0
         current_chunk.append(word)
-        current_length += len(word) + 1  # +1 for space
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
@@ -64,7 +75,7 @@ def get_document_embeddings(documents):
     return torch.stack(embeddings)
 def rag_pipeline(question, pdf_files):
-    """Optimized RAG pipeline with improved prompting and fallback."""
     start_time = time.time()
     documents = []
@@ -95,45 +106,51 @@ def rag_pipeline(question, pdf_files):
     # Retrieve top 3 chunks using cosine similarity
     cos_scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
-    top_results = torch.topk(cos_scores, k=min(5, len(documents)))
     retrieved_context = ""
     for score, idx in zip(top_results.values, top_results.indices):
         retrieved_context += f"- {documents[idx]} (score: {score:.2f})\n"
-    # Log retrieved context for debugging
     logger.info(f"Retrieved context:\n{retrieved_context}")
-    # Improved prompt with fallback
-    if retrieved_context.strip():
-        prompt = (
-            f"Based on the following context, provide a concise and accurate answer to the question.\n\n"
-            f"Context:\n{retrieved_context}\n\n"
-            f"Question: {question}\n\n"
-            f"Answer:"
         )
-    else:
-        prompt = (
-            f"No relevant context found. Provide a general answer to the question based on your knowledge.\n\n"
-            f"Question: {question}\n\n"
-            f"Answer:"
         )
-    # Generate answer with more tokens
-    inputs = gen_tokenizer(prompt, return_tensors="pt")
-    outputs = gen_model.generate(**inputs, max_new_tokens=1500, num_beams=2)
-    answer = gen_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Log processing time
     logger.info(f"Processing time: {time.time() - start_time:.2f} seconds")
-    return answer if answer else "Unable to generate a meaningful response."
 # Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# Improved Lightweight Local RAG Pipeline with PDF Input")
     gr.Markdown(
-        "Upload one or more PDF files (or leave blank for default AI/Data Science documents), enter your question, "
-        "and get an answer generated using an optimized retrieval step (all-MiniLM-L6-v2) and a small "
-        "generator model (flan-t5-small). Designed for 2 vCPUs and 16GB RAM."
     )
     with gr.Row():
         with gr.Column():
@@ -145,4 +162,4 @@ with gr.Blocks() as demo:
     submit_button.click(fn=rag_pipeline, inputs=[question_input, pdf_input], outputs=response_output)
-demo.launch()

 import gradio as gr
 import torch
 from sentence_transformers import SentenceTransformer, util
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
 import PyPDF2
 import os
 import time
 # Load models
 retriever_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # sentence-embedding model used for retrieval
+gen_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")  # tokenizer paired with the generator below
+gen_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype=torch.float16)  # NOTE(review): weights are loaded as float16 while the UI text advertises a 2 vCPU host — confirm half-precision generate() works on CPU
 # Cache for document embeddings
 embedding_cache = {}  # starts empty; how it is keyed and filled is not visible in this hunk
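+# Callable wrapper around Phi-1, passed to LangChain's LLMChain as `llm`.
+# NOTE(review): this is a plain class, not a subclass of a LangChain LLM base type — confirm LLMChain accepts it.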
+class Phi1LLM:  # maps a prompt string to generated text via the module-level gen_tokenizer / gen_model
+    def __call__(self, prompt, **kwargs):  # **kwargs is accepted but never read in the body
+        inputs = gen_tokenizer(prompt, return_tensors="pt")  # tokenize the prompt into PyTorch tensors
+        outputs = gen_model.generate(**inputs, max_new_tokens=150, num_beams=2)  # beam search with 2 beams, at most 150 new tokens
+        return gen_tokenizer.decode(outputs[0], skip_special_tokens=True)  # NOTE(review): decodes the whole first sequence; for a causal LM this presumably still begins with the prompt text — confirm
+phi1_llm = Phi1LLM()  # single module-level instance, passed as llm= to the LLMChain constructors further down
 def extract_text_from_pdf(pdf_file):
     """Extract text from a PDF file, returning a list of page texts."""
     pages = []
         pages.append(f"Error reading PDF: {str(e)}")
     return pages
+def chunk_text(text, chunk_size=500):
     """Split text into chunks of approximately chunk_size characters."""
     words = text.split()
     chunks = []
             current_chunk = []
             current_length = 0
         current_chunk.append(word)
+        current_length += len(word) + 1
     if current_chunk:
         chunks.append(" ".join(current_chunk))
     return chunks
     return torch.stack(embeddings)
 def rag_pipeline(question, pdf_files):
+    """RAG pipeline with multi-step thinking using Phi-1 and LangChain."""
     start_time = time.time()
     documents = []
     # Retrieve top 3 chunks using cosine similarity
     cos_scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
+    top_results = torch.topk(cos_scores, k=min(3, len(documents)))
     retrieved_context = ""
     for score, idx in zip(top_results.values, top_results.indices):
         retrieved_context += f"- {documents[idx]} (score: {score:.2f})\n"
     logger.info(f"Retrieved context:\n{retrieved_context}")
+    # Step 1: Initial Answer
+    initial_prompt = PromptTemplate(
+        input_variables=["context", "question"],
+        template=(
+            "Using the following context, provide a concise answer to the question:\n\n"
+            "Context:\n{context}\n\n"
+            "Question: {question}\n\n"
+            "Answer:"
         )
+    )
+    initial_chain = LLMChain(llm=phi1_llm, prompt=initial_prompt)
+    initial_answer = initial_chain.run(context=retrieved_context, question=question)
+    # Step 2: Refine Answer
+    refine_prompt = PromptTemplate(
+        input_variables=["context", "question", "initial_answer"],
+        template=(
+            "Given the context and initial answer, refine and improve the response to the question:\n\n"
+            "Context:\n{context}\n\n"
+            "Question: {question}\n\n"
+            "Initial Answer: {initial_answer}\n\n"
+            "Refined Answer:"
         )
+    )
+    refine_chain = LLMChain(llm=phi1_llm, prompt=refine_prompt)
+    refined_answer = refine_chain.run(context=retrieved_context, question=question, initial_answer=initial_answer)
+    logger.info(f"Initial answer: {initial_answer}")
+    logger.info(f"Refined answer: {refined_answer}")
     logger.info(f"Processing time: {time.time() - start_time:.2f} seconds")
+    return refined_answer if refined_answer else "Unable to generate a meaningful response."
 # Gradio UI
 with gr.Blocks() as demo:
+    gr.Markdown("# RAG Pipeline with microsoft/phi-1 and Multi-Step Thinking")
     gr.Markdown(
+        "Upload PDFs (or use default AI/Data Science docs), ask a question, "
+        "and get refined answers using Phi-1 with multi-step reasoning on 2 vCPUs and 16GB RAM."
     )
     with gr.Row():
         with gr.Column():
     submit_button.click(fn=rag_pipeline, inputs=[question_input, pdf_input], outputs=response_output)
+demo.launch(share=True, debug=True)