Spaces:

waghib
/

RAG-for-Diagnostic-Reasoning-for-Clinical-Notes

Sleeping

App Files Files Community

waghib committed on Apr 5

Commit

d0ba7fb

verified ·

1 Parent(s): c50df20

Updated app.py to make its syntax compatible with Spaces.

Browse files

Files changed (1) hide show

app.py +129 -56

app.py CHANGED Viewed

@@ -6,12 +6,13 @@ import torch
 import streamlit as st
 from dotenv import load_dotenv
 from langchain_groq import ChatGroq
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
-from langchain.docstore.document import Document
-from langchain.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
 import numpy as np
 from sentence_transformers import util
 import time
@@ -19,15 +20,37 @@ import time
 # Set device for model (CUDA if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load environment variables
 load_dotenv()
 # Set up the clinical assistant LLM
-groq_api_key = os.getenv('GROQ_API_KEY')
-if not groq_api_key:
-    raise ValueError("API Key is not set in the secrets.")
-llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile")
 # Set up embeddings for clinical context (Bio_ClinicalBERT)
 embeddings = HuggingFaceEmbeddings(
@@ -38,38 +61,74 @@ embeddings = HuggingFaceEmbeddings(
 def load_clinical_data():
     """Load both flowcharts and patient cases"""
     docs = []
-    # Load diagnosis flowcharts
-    for fpath in glob.glob("./Diagnosis_flowchart/*.json"):
-        with open(fpath) as f:
-            data = json.load(f)
-            content = f"""
-            DIAGNOSTIC FLOWCHART: {Path(fpath).stem}
-            Diagnostic Path: {data['diagnostic']}
-            Key Criteria: {data['knowledge']}
-            """
             docs.append(Document(
-                page_content=content,
-                metadata={"source": fpath, "type": "flowchart"}
             ))
-    # Load patient cases
-    for category_dir in glob.glob("./Finished/*"):
-        if os.path.isdir(category_dir):
-            for case_file in glob.glob(f"{category_dir}/*.json"):
-                with open(case_file) as f:
-                    case_data = json.load(f)
-                    notes = "\n".join(
-                        f"{k}: {v}" for k, v in case_data.items() if k.startswith("input")
-                    )
-                    docs.append(Document(
-                        page_content=f"""
-                        PATIENT CASE: {Path(case_file).stem}
-                        Category: {Path(category_dir).name}
-                        Notes: {notes}
-                        """,
-                        metadata={"source": case_file, "type": "patient_case"}
-                    ))
     return docs
 def build_vectorstore():
@@ -88,31 +147,45 @@ def get_vectorstore():
 def run_rag_chat(query, vectorstore):
     """Run the Retrieval-Augmented Generation (RAG) for clinical questions"""
-    retriever = vectorstore.as_retriever()
-    prompt_template = ChatPromptTemplate.from_template("""
-    You are a clinical assistant AI. Based on the following clinical context, provide a reasoned and medically sound answer to the question.
-    <context>
-    {context}
-    </context>
-    Question: {input}
-    Answer:
-    """)
-    retrieved_docs = retriever.invoke(query, k=3)
-    retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
-    chain = create_retrieval_chain(
-        retriever,
-        create_stuff_documents_chain(llm, prompt_template)
-    )
-    response = chain.invoke({"input": query, "context": retrieved_context})
-    return response
 def calculate_hit_rate(retriever, query, expected_docs, k=3):
     """Calculate the hit rate for top-k retrieved documents"""

 import streamlit as st
 from dotenv import load_dotenv
 from langchain_groq import ChatGroq
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+from langchain_core.prompts import ChatPromptTemplate
 from langchain.chains import create_retrieval_chain
+from langchain.chains.combine_documents import create_stuff_documents_chain
 import numpy as np
 from sentence_transformers import util
 import time
 # Set device for model (CUDA if available)
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Load environment variables - works for both local and Hugging Face Spaces
 load_dotenv()
 # Set up the clinical assistant LLM
+# Try to get API key from Hugging Face Spaces secrets first, then fall back to .env file
+try:
+    # For Hugging Face Spaces
+    from huggingface_hub.inference_api import InferenceApi
+    import os
+    groq_api_key = os.environ.get('GROQ_API_KEY')
+    # If not found in environment, try to get from st.secrets (Streamlit Cloud/Spaces)
+    if not groq_api_key and hasattr(st, 'secrets') and 'GROQ_API_KEY' in st.secrets:
+        groq_api_key = st.secrets['GROQ_API_KEY']
+    if not groq_api_key:
+        st.warning("API Key is not set in the secrets. Using a placeholder for UI demonstration.")
+        # For UI demonstration without API key
+        class MockLLM:
+            def invoke(self, prompt):
+                return {"answer": "This is a placeholder response. Please set up your GROQ_API_KEY to get real responses."}
+        llm = MockLLM()
+    else:
+        llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama-3.3-70b-versatile")
+except Exception as e:
+    st.error(f"Error setting up LLM: {str(e)}")
+    class MockLLM:
+        def invoke(self, prompt):
+            return {"answer": f"Error setting up LLM: {str(e)}. Please check your API key configuration."}
+    llm = MockLLM()
 # Set up embeddings for clinical context (Bio_ClinicalBERT)
 embeddings = HuggingFaceEmbeddings(
 def load_clinical_data():
     """Load both flowcharts and patient cases"""
     docs = []
+    # Get the absolute path to the current script
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    # Try to handle potential errors with file loading
+    try:
+        # Load diagnosis flowcharts
+        flowchart_dir = os.path.join(current_dir, "Diagnosis_flowchart")
+        if os.path.exists(flowchart_dir):
+            for fpath in glob.glob(os.path.join(flowchart_dir, "*.json")):
+                try:
+                    with open(fpath, 'r', encoding='utf-8') as f:
+                        data = json.load(f)
+                        content = f"""
+                        DIAGNOSTIC FLOWCHART: {Path(fpath).stem}
+                        Diagnostic Path: {data.get('diagnostic', 'N/A')}
+                        Key Criteria: {data.get('knowledge', 'N/A')}
+                        """
+                        docs.append(Document(
+                            page_content=content,
+                            metadata={"source": fpath, "type": "flowchart"}
+                        ))
+                except Exception as e:
+                    st.warning(f"Error loading flowchart file {fpath}: {str(e)}")
+        else:
+            st.warning(f"Flowchart directory not found at {flowchart_dir}")
+        # Load patient cases
+        finished_dir = os.path.join(current_dir, "Finished")
+        if os.path.exists(finished_dir):
+            for category_dir in glob.glob(os.path.join(finished_dir, "*")):
+                if os.path.isdir(category_dir):
+                    for case_file in glob.glob(os.path.join(category_dir, "*.json")):
+                        try:
+                            with open(case_file, 'r', encoding='utf-8') as f:
+                                case_data = json.load(f)
+                                notes = "\n".join(
+                                    f"{k}: {v}" for k, v in case_data.items() if k.startswith("input")
+                                )
+                                docs.append(Document(
+                                    page_content=f"""
+                                    PATIENT CASE: {Path(case_file).stem}
+                                    Category: {Path(category_dir).name}
+                                    Notes: {notes}
+                                    """,
+                                    metadata={"source": case_file, "type": "patient_case"}
+                                ))
+                        except Exception as e:
+                            st.warning(f"Error loading case file {case_file}: {str(e)}")
+        else:
+            st.warning(f"Finished directory not found at {finished_dir}")
+        # If no documents were loaded, add a sample document for testing
+        if not docs:
+            st.warning("No clinical data files found. Using sample data for demonstration.")
             docs.append(Document(
+                page_content="""SAMPLE CLINICAL DATA: This is sample data for demonstration purposes.
+                This application requires clinical data files to be present in the correct directories.
+                Please ensure the Diagnosis_flowchart and Finished directories exist with proper JSON files.""",
+                metadata={"source": "sample", "type": "sample"}
             ))
+    except Exception as e:
+        st.error(f"Error loading clinical data: {str(e)}")
+        # Add a fallback document
+        docs.append(Document(
+            page_content="Error loading clinical data. This is a fallback document for demonstration purposes.",
+            metadata={"source": "error", "type": "error"}
+        ))
     return docs
 def build_vectorstore():
 def run_rag_chat(query, vectorstore):
     """Run the Retrieval-Augmented Generation (RAG) for clinical questions"""
+    try:
+        retriever = vectorstore.as_retriever()
+        prompt_template = ChatPromptTemplate.from_template("""
+        You are a clinical assistant AI. Based on the following clinical context, provide a reasoned and medically sound answer to the question.
+        <context>
+        {context}
+        </context>
+        Question: {input}
+        Answer:
+        """)
+        retrieved_docs = retriever.invoke(query, k=3)
+        retrieved_context = "\n".join([doc.page_content for doc in retrieved_docs])
+        # Create document chain first
+        document_chain = create_stuff_documents_chain(llm, prompt_template)
+        # Then create retrieval chain
+        chain = create_retrieval_chain(retriever, document_chain)
+        # Invoke the chain
+        response = chain.invoke({"input": query})
+        # Add retrieved documents to response for transparency
+        response["context"] = retrieved_docs
+        return response
+    except Exception as e:
+        st.error(f"Error in RAG processing: {str(e)}")
+        # Return a fallback response
+        return {
+            "answer": f"I encountered an error processing your query: {str(e)}",
+            "context": [],
+            "input": query
+        }
 def calculate_hit_rate(retriever, query, expected_docs, k=3):
     """Calculate the hit rate for top-k retrieved documents"""