Spaces:

techindia2025
/

medbot_2

Running on Zero

App Files Files Community

techindia2025 committed on May 22

Commit

c66e1bd

verified ·

1 Parent(s): 7dd1c93

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -100

app.py CHANGED Viewed

@@ -1,12 +1,7 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import spaces
-from langchain_community.llms import HuggingFacePipeline
-from langchain_core.prompts import PromptTemplate
-from langchain.chains import LLMChain
-from langchain_core.runnables import RunnableWithMessageHistory
-from langchain.memory import ConversationBufferMemory
 # Model configuration
 LLAMA_MODEL = "meta-llama/Llama-2-7b-chat-hf"
@@ -38,105 +33,110 @@ Patient information: {patient_info}
 <|im_start|>assistant
 """
-# Track conversation turns
 conversation_turns = 0
 patient_data = []
-# Create a GPU-decorated function for model loading
-@spaces.GPU
-def load_models():
-    print("Loading Llama-2 model...")
-    llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)
-    llama_model = AutoModelForCausalLM.from_pretrained(
-        LLAMA_MODEL,
-        torch_dtype=torch.float16,
-        device_map="auto"
-    )
-    # Create a pipeline for LangChain
-    llama_pipeline = pipeline(
-        "text-generation",
-        model=llama_model,
-        tokenizer=llama_tokenizer,
-        max_new_tokens=512,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True
-    )
-    llama_llm = HuggingFacePipeline(pipeline=llama_pipeline)
-    print("Llama-2 model loaded successfully!")
-    print("Loading Meditron model...")
-    meditron_tokenizer = AutoTokenizer.from_pretrained(MEDITRON_MODEL)
-    meditron_model = AutoModelForCausalLM.from_pretrained(
-        MEDITRON_MODEL,
-        torch_dtype=torch.float16,
-        device_map="auto"
-    )
-    # Create a pipeline for Meditron
-    meditron_pipeline = pipeline(
-        "text-generation",
-        model=meditron_model,
-        tokenizer=meditron_tokenizer,
-        max_new_tokens=256,
-        temperature=0.7,
-        top_p=0.9,
-        do_sample=True
-    )
-    meditron_llm = HuggingFacePipeline(pipeline=meditron_pipeline)
-    print("Meditron model loaded successfully!")
-    return llama_llm, meditron_llm, llama_tokenizer, meditron_tokenizer
-# Load models
-llama_llm, meditron_llm, llama_tokenizer, meditron_tokenizer = load_models()
-# Create LangChain conversation with memory
-memory = ConversationBufferMemory(return_messages=True)
-# Create a template for the Meditron model
-meditron_template = PromptTemplate(
-    input_variables=["patient_info"],
-    template=MEDITRON_PROMPT
-)
-meditron_chain = LLMChain(
-    llm=meditron_llm,
-    prompt=meditron_template,
-    verbose=True
-)
 @spaces.GPU
 def generate_response(message, history):
     global conversation_turns, patient_data
     conversation_turns += 1
-    # Store patient message
     patient_data.append(message)
-    # Format the prompt with system instructions
-    if conversation_turns >= 4:
-        # Add summarization instruction after 4 turns
-        prompt = f"{SYSTEM_PROMPT}\n\nNow summarize what you've learned and suggest when professional care may be needed.\n\n{message}"
-    else:
-        prompt = f"{SYSTEM_PROMPT}\n\n{message}"
     # Build the prompt with proper Llama-2 formatting
-    formatted_prompt = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n"
-    # Add conversation history
-    for user_msg, assistant_msg in history:
-        formatted_prompt += f"{user_msg} [/INST] {assistant_msg} </s><s>[INST] "
-    # Add the current user input
-    formatted_prompt += f"{message} [/INST] "
-    # Generate response using Llama model
-    inputs = llama_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
     with torch.no_grad():
-        outputs = llama_llm.pipeline.model.generate(
-            inputs.input_ids,
-            attention_mask=inputs.attention_mask,
             max_new_tokens=512,
             temperature=0.7,
             top_p=0.9,
@@ -153,20 +153,8 @@ def generate_response(message, history):
         # Collect full patient conversation
         full_patient_info = "\n".join(patient_data) + "\n\nSummary: " + llama_response
-        # Get medicine suggestions using Meditron
-        inputs = meditron_tokenizer(MEDITRON_PROMPT.format(patient_info=full_patient_info), return_tensors="pt").to("cuda")
-        with torch.no_grad():
-            outputs = meditron_llm.pipeline.model.generate(
-                inputs.input_ids,
-                attention_mask=inputs.attention_mask,
-                max_new_tokens=256,
-                temperature=0.7,
-                top_p=0.9,
-                do_sample=True
-            )
-        medicine_suggestions = meditron_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
         # Format final response
         final_response = (

 import gradio as gr
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces
 # Model configuration
 LLAMA_MODEL = "meta-llama/Llama-2-7b-chat-hf"
 <|im_start|>assistant
 """
# Model and tokenizer handles. They start out empty and are filled in by
# load_models_if_needed() the first time it runs.
llama_model = llama_tokenizer = None
meditron_model = meditron_tokenizer = None

# Conversation bookkeeping that generate_response() updates on every call.
# NOTE(review): both names live at module level, so all chat sessions served
# by this process appear to share one turn counter and one list of patient
# messages -- confirm that is intended before serving more than one user.
conversation_turns = 0
patient_data = []
def build_llama2_prompt(system_prompt, history, user_input):
    """Format a system prompt, earlier turns and the new message for Llama-2 chat.

    The system prompt is wrapped in ``<<SYS>>`` markers inside the first
    ``[INST]`` block. Every earlier turn is closed with ``</s>`` and the next
    one reopened with ``<s>[INST]``. The prompt ends right after ``[/INST]``
    so that the model's continuation is the assistant reply.

    Args:
        system_prompt: Instruction text placed inside the ``<<SYS>>`` section.
        history: Iterable of ``(user_msg, assistant_msg)`` pairs, oldest
            first. ``None`` or an empty sequence means there are no earlier
            turns. Either element of a pair may be ``None``; it is rendered
            as an empty string.
        user_input: The message the model should answer now.

    Returns:
        The complete prompt string, ending in ``"[/INST] "``.
    """
    def _text(value):
        # Interpolating None directly would put the literal word "None" into
        # the conversation the model reads.
        return "" if value is None else value

    # NOTE(review): the prompt starts with a literal "<s>"; if the caller's
    # tokenizer also prepends a BOS token, the sequence may begin with two of
    # them -- confirm against the tokenizer settings.
    parts = [f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"]

    # Add conversation history
    parts.extend(
        f"{_text(user_msg)} [/INST] {_text(assistant_msg)} </s><s>[INST] "
        for user_msg, assistant_msg in (history or ())
    )

    # Add the current user input
    parts.append(f"{user_input} [/INST] ")
    return "".join(parts)
@spaces.GPU
def load_models_if_needed():
    """Fill the module-level model and tokenizer globals on first use.

    A model and its tokenizer are loaded only while that model's global is
    still ``None``, so a later call skips any model that is already present.
    Weights are requested in float16 with ``device_map="auto"``.

    NOTE(review): this function is itself wrapped in ``@spaces.GPU``. If that
    decorator runs the body in a separate worker process, the globals assigned
    here would not be visible to the caller -- confirm against the ZeroGPU
    documentation.
    """
    global llama_model, llama_tokenizer, meditron_model, meditron_tokenizer

    # Keyword arguments shared by both from_pretrained() calls for the weights.
    weight_options = {"torch_dtype": torch.float16, "device_map": "auto"}

    if llama_model is None:
        print("Loading Llama-2 model...")
        llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)
        llama_model = AutoModelForCausalLM.from_pretrained(
            LLAMA_MODEL, **weight_options
        )
        print("Llama-2 model loaded successfully!")

    if meditron_model is None:
        print("Loading Meditron model...")
        meditron_tokenizer = AutoTokenizer.from_pretrained(MEDITRON_MODEL)
        meditron_model = AutoModelForCausalLM.from_pretrained(
            MEDITRON_MODEL, **weight_options
        )
        print("Meditron model loaded successfully!")
@spaces.GPU
def get_meditron_suggestions(patient_info):
    """Ask the Meditron model for medicine and remedy suggestions.

    Args:
        patient_info: Text substituted into the ``{patient_info}`` slot of
            ``MEDITRON_PROMPT``.

    Returns:
        The decoded continuation only: the prompt tokens are sliced off the
        generated sequence and special tokens are skipped during decoding.
    """
    load_models_if_needed()  # Ensure models are loaded

    encoded = meditron_tokenizer(
        MEDITRON_PROMPT.format(patient_info=patient_info),
        return_tensors="pt",
    )

    # Move inputs to the same device as the model
    if torch.cuda.is_available():
        encoded = {
            name: tensor.to(meditron_model.device)
            for name, tensor in encoded.items()
        }

    input_ids = encoded["input_ids"]
    with torch.no_grad():
        generated = meditron_model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            # The EOS id doubles as the padding id for generation.
            pad_token_id=meditron_tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_length = input_ids.shape[1]
    return meditron_tokenizer.decode(
        generated[0][prompt_length:], skip_special_tokens=True
    )
 @spaces.GPU
 def generate_response(message, history):
+    """Generate a response using both models."""
     global conversation_turns, patient_data
+    # Load models if needed
+    load_models_if_needed()
+    # Track conversation turns
     conversation_turns += 1
+    # Store the entire conversation for reference
     patient_data.append(message)
     # Build the prompt with proper Llama-2 formatting
+    prompt = build_llama2_prompt(SYSTEM_PROMPT, history, message)
+    # Add summarization instruction after 4 turns
+    if conversation_turns >= 4:
+        prompt = prompt.replace("[/INST] ", "[/INST] Now summarize what you've learned and suggest when professional care may be needed. ")
+    inputs = llama_tokenizer(prompt, return_tensors="pt")
+    # Move inputs to the same device as the model
+    if torch.cuda.is_available():
+        inputs = {k: v.to(llama_model.device) for k, v in inputs.items()}
+    # Generate the Llama-2 response
     with torch.no_grad():
+        outputs = llama_model.generate(
+            inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
             max_new_tokens=512,
             temperature=0.7,
             top_p=0.9,
         # Collect full patient conversation
         full_patient_info = "\n".join(patient_data) + "\n\nSummary: " + llama_response
+        # Get medicine suggestions
+        medicine_suggestions = get_meditron_suggestions(full_patient_info)
         # Format final response
         final_response = (