Upload 6 files
initiated the project
- .gitattributes +1 -0
- README.md +85 -7
- app.py +56 -0
- clinical_guidelines.pdf +3 -0
- rag_pipeline.py +78 -0
- requirements.txt +9 -0
- utils.py +72 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+clinical_guidelines.pdf filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,91 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Healthcare Guidelines RAG Chatbot
+emoji: 📄
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.29.0
 app_file: app.py
 pinned: false
-
+license: apache-2.0
 ---
 
+# Healthcare Guidelines RAG Chatbot
+
+This application provides a Retrieval-Augmented Generation (RAG) system for healthcare clinical guidelines. It uses FAISS for efficient similarity search and a Hugging Face model for generating responses.
+
+## Features
+
+- PDF document processing and chunking
+- FAISS-based semantic search
+- Response generation using FLAN-T5
+- Gradio web interface
+- Strict grounding in the provided guidelines
+- Safety measures and disclaimers
+
+## Setup
+
+### Option 1: Local Setup
+
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+2. Place your clinical guidelines PDF in the project directory as `clinical_guidelines.pdf`.
+
+### Option 2: Docker Setup
+
+1. Build the Docker image:
+```bash
+docker build -t healthcare-rag-chatbot .
+```
+
+2. Run the container:
+```bash
+docker run -p 7860:7860 -v $(pwd)/data:/app/data healthcare-rag-chatbot
+```
+
+The `-v` flag mounts your local `data` directory into the container, so you can update the clinical guidelines PDF without rebuilding the image.
+
+## Usage
+
+1. Run the application:
+```bash
+# If using the local setup:
+python app.py
+
+# If using Docker, the application starts automatically with the container.
+```
+
+2. Open your browser and navigate to the provided local URL (typically http://127.0.0.1:7860).
+
+3. Enter your question about the clinical guidelines in the input box.
+
+## Important Notes
+
+- The system only provides information that is explicitly stated in the provided guidelines.
+- All responses include a medical disclaimer.
+- The system responds with "This information is not available in the current guidelines" when uncertain.
+
+## Deployment
+
+To deploy on Hugging Face Spaces:
+
+1. Create a new Space on Hugging Face
+2. Connect your GitHub repository
+3. Select Gradio as the SDK
+4. The app will deploy automatically
+
+## Safety Measures
+
+- All responses are strictly grounded in the provided guidelines.
+- A medical disclaimer is included with every response.
+- The system explicitly states when information is not available.
+- No medical advice is provided without proper context.
+
+## License
+
+This project is for educational purposes only. Always consult healthcare professionals for medical advice.
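For reference, the pipeline can also be driven from Python without the Gradio UI. A minimal sketch, assuming `clinical_guidelines.pdf` is in place and the files in this commit are importable; the example question is hypothetical:

```python
# Minimal sketch: query the pipeline directly, bypassing the Gradio UI.
from rag_pipeline import HealthcareRAG

rag = HealthcareRAG()  # builds the FAISS index on first run, loads it afterwards

# Hypothetical example question
result = rag.query("What is the recommended blood pressure target for adults?")
print(result["response"])
print("--- retrieved context ---")
for chunk in result["retrieved_chunks"]:
    print(chunk[:200])
```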
app.py
ADDED
@@ -0,0 +1,56 @@
+import gradio as gr
+from rag_pipeline import HealthcareRAG
+
+# Initialize the RAG system once at startup
+rag = HealthcareRAG()
+
+def process_query(query: str) -> tuple:
+    """Process user query and return response with retrieved chunks."""
+    result = rag.query(query)
+
+    # Format retrieved chunks for display
+    chunks_text = "\n\n---\n\n".join(result["retrieved_chunks"])
+
+    return result["response"], chunks_text
+
+# Create Gradio interface
+with gr.Blocks(title="Healthcare Guidelines Assistant") as demo:
+    gr.Markdown("""
+    # Healthcare Guidelines Assistant
+
+    This assistant provides information based on clinical guidelines.
+
+    **DISCLAIMER**: This information is for educational purposes only and not medical advice.
+    Always consult with healthcare professionals for medical decisions.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            query_input = gr.Textbox(
+                label="Your Question",
+                placeholder="Enter your question about clinical guidelines...",
+                lines=3
+            )
+            submit_btn = gr.Button("Submit")
+
+    with gr.Row():
+        with gr.Column():
+            response_output = gr.Textbox(
+                label="Response",
+                lines=5
+            )
+            chunks_output = gr.Textbox(
+                label="Retrieved Guidelines",
+                lines=10
+            )
+
+    submit_btn.click(
+        fn=process_query,
+        inputs=query_input,
+        outputs=[response_output, chunks_output]
+    )
+
+if __name__ == "__main__":
+    demo.launch()
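One note on the launch call above: Gradio binds to 127.0.0.1 by default, which is unreachable through the `docker run -p 7860:7860` mapping described in the README. A sketch of a container-friendly variant, using standard `launch()` parameters:

```python
# Sketch: container-friendly launch (see the Docker steps in the README).
# Binding to 0.0.0.0 makes the app reachable through the published port;
# the same effect can be had via the GRADIO_SERVER_NAME environment variable.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
```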
clinical_guidelines.pdf
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74e988ffe0603263ce131d043a187302d5845fbc4ff30aa74f9190ce99ff5016
+size 4917213
rag_pipeline.py
ADDED
@@ -0,0 +1,78 @@
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+import torch
+from utils import DocumentProcessor
+import os
+import json
+from typing import List, Dict
+
+class HealthcareRAG:
+    def __init__(self,
+                 model_name: str = "google/flan-t5-base",
+                 index_path: str = "faiss_index.bin",
+                 pdf_path: str = "clinical_guidelines.pdf"):
+
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
+
+        self.doc_processor = DocumentProcessor()
+
+        # Initialize or load the FAISS index. The chunks are persisted next to
+        # the index (the original left self.chunks empty on reload, which broke
+        # retrieval); the .chunks.json sidecar file name is a local choice.
+        chunks_path = index_path + ".chunks.json"
+        if os.path.exists(index_path) and os.path.exists(chunks_path):
+            self.index = self.doc_processor.load_index(index_path)
+            with open(chunks_path, "r", encoding="utf-8") as f:
+                self.chunks = json.load(f)
+        else:
+            self.chunks, self.index = self.doc_processor.process_document(pdf_path, index_path)
+            with open(chunks_path, "w", encoding="utf-8") as f:
+                json.dump(self.chunks, f)
+
+    def generate_response(self, query: str, retrieved_chunks: List[str]) -> str:
+        """Generate response using the LLM."""
+        if not retrieved_chunks:
+            return "This information is not available in the current guidelines."
+
+        # Prepare context and prompt
+        context = "\n".join(retrieved_chunks)
+        prompt = f"""Based on the following clinical guidelines, answer the question.
+If the information is not explicitly stated in the guidelines, respond with "This information is not available in the current guidelines."
+
+Guidelines:
+{context}
+
+Question: {query}
+
+Answer:"""
+
+        # Generate response
+        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(self.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_length=200,
+            num_beams=4,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True
+        )
+
+        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Add disclaimer
+        if response and response != "This information is not available in the current guidelines.":
+            response += "\n\nDISCLAIMER: This information is for educational purposes only and not medical advice."
+
+        return response
+
+    def query(self, user_query: str) -> Dict[str, str]:
+        """Process user query and return response with retrieved chunks."""
+        # Retrieve relevant chunks
+        retrieved_chunks = self.doc_processor.retrieve_chunks(
+            user_query,
+            self.index,
+            self.chunks
+        )
+
+        # Generate response
+        response = self.generate_response(user_query, retrieved_chunks)
+
+        return {
+            "response": response,
+            "retrieved_chunks": retrieved_chunks
+        }
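One caveat in `generate_response`: with the default `k=5` and chunks of up to 500 words, the prompt can exceed the 1024-token truncation limit, and since truncation drops the tail, the question itself may be cut off. A sketch of a token-budget guard; `fit_chunks` is a hypothetical helper, not part of this commit:

```python
# Hypothetical helper: drop trailing (lower-ranked) chunks so the prompt stays
# under budget, keeping room for the question and instructions at the end.
def fit_chunks(tokenizer, chunks, budget: int = 900):
    kept, used = [], 0
    for chunk in chunks:
        n = len(tokenizer.encode(chunk, add_special_tokens=False))
        if used + n > budget:
            break
        kept.append(chunk)
        used += n
    return kept
```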
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+gradio==4.19.2
+sentence-transformers==2.5.1
+faiss-cpu==1.7.4
+PyMuPDF==1.23.26
+transformers==4.38.2
+torch==2.2.1
+numpy==1.26.4
+pandas==2.2.1
+python-dotenv==1.0.1
utils.py
ADDED
@@ -0,0 +1,72 @@
+import fitz  # PyMuPDF
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from typing import List, Tuple
+
+class DocumentProcessor:
+    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
+        self.model = SentenceTransformer(model_name)
+        self.chunk_size = 500  # words (chunking splits on whitespace, not tokens)
+        self.chunk_overlap = 50  # words
+
+    def extract_text_from_pdf(self, pdf_path: str) -> str:
+        """Extract text from a PDF file."""
+        doc = fitz.open(pdf_path)
+        text = ""
+        for page in doc:
+            text += page.get_text()
+        doc.close()
+        return text
+
+    def chunk_text(self, text: str) -> List[str]:
+        """Split text into overlapping chunks."""
+        words = text.split()
+        chunks = []
+
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
+            chunk = " ".join(words[i:i + self.chunk_size])
+            chunks.append(chunk)
+
+        return chunks
+
+    def create_embeddings(self, chunks: List[str]) -> np.ndarray:
+        """Create embeddings for text chunks."""
+        return self.model.encode(chunks)
+
+    def build_faiss_index(self, embeddings: np.ndarray) -> faiss.Index:
+        """Build and return a FAISS index."""
+        dimension = embeddings.shape[1]
+        index = faiss.IndexFlatL2(dimension)
+        index.add(embeddings.astype('float32'))
+        return index
+
+    def save_index(self, index: faiss.Index, path: str):
+        """Save FAISS index to disk."""
+        faiss.write_index(index, path)
+
+    def load_index(self, path: str) -> faiss.Index:
+        """Load FAISS index from disk."""
+        return faiss.read_index(path)
+
+    def process_document(self, pdf_path: str, index_path: str) -> Tuple[List[str], faiss.Index]:
+        """Process document and create FAISS index."""
+        # Extract and chunk text
+        text = self.extract_text_from_pdf(pdf_path)
+        chunks = self.chunk_text(text)
+
+        # Create embeddings and index
+        embeddings = self.create_embeddings(chunks)
+        index = self.build_faiss_index(embeddings)
+
+        # Save index
+        self.save_index(index, index_path)
+
+        return chunks, index
+
+    def retrieve_chunks(self, query: str, index: faiss.Index, chunks: List[str], k: int = 5) -> List[str]:
+        """Retrieve the most relevant chunks for a query."""
+        query_embedding = self.model.encode([query])
+        distances, indices = index.search(query_embedding.astype('float32'), k)
+
+        # FAISS pads results with -1 when the index holds fewer than k vectors
+        return [chunks[i] for i in indices[0] if i != -1]
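For completeness, `DocumentProcessor` can be exercised on its own, e.g. to rebuild the index offline. A minimal sketch assuming the file names used elsewhere in this commit; the query string is hypothetical:

```python
# Sketch: build the index offline and run a retrieval sanity check.
from utils import DocumentProcessor

dp = DocumentProcessor()
chunks, index = dp.process_document("clinical_guidelines.pdf", "faiss_index.bin")
print(f"indexed {len(chunks)} chunks, embedding dimension {index.d}")

# Hypothetical query string
top = dp.retrieve_chunks("contraindications for beta-blockers", index, chunks, k=3)
for chunk in top:
    print(chunk[:120])
```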