techindia2025 committed on
Commit 7dd1c93 · verified · 1 Parent(s): b6f3058

Update app.py

Files changed (1)
  1. app.py +101 -60
app.py CHANGED
@@ -1,16 +1,17 @@
- from langchain.chains import ConversationChain, LLMChain
- from langchain.prompts import PromptTemplate
- from langchain.llms import HuggingFacePipeline
- from langchain.memory import ConversationBufferMemory
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
- import torch
  import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import spaces
+ from langchain_community.llms import HuggingFacePipeline
+ from langchain_core.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain_core.runnables import RunnableWithMessageHistory
+ from langchain.memory import ConversationBufferMemory

  # Model configuration
  LLAMA_MODEL = "meta-llama/Llama-2-7b-chat-hf"
  MEDITRON_MODEL = "epfl-llm/meditron-7b"

- # System prompts
  SYSTEM_PROMPT = """You are a professional virtual doctor. Your goal is to collect detailed information about the user's health condition, symptoms, medical history, medications, lifestyle, and other relevant data.
  Ask 1-2 follow-up questions at a time to gather more details about:
  - Detailed description of symptoms
@@ -37,55 +38,61 @@ Patient information: {patient_info}
  <|im_start|>assistant
  """

- print("Loading Llama-2 model...")
- # Create LangChain wrapper for Llama-2
- llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)
- llama_model = AutoModelForCausalLM.from_pretrained(
-     LLAMA_MODEL,
-     torch_dtype=torch.float16,
-     device_map="auto"
- )
+ # Track conversation turns
+ conversation_turns = 0
+ patient_data = []

- # Create a pipeline for LangChain
- llama_pipeline = pipeline(
-     "text-generation",
-     model=llama_model,
-     tokenizer=llama_tokenizer,
-     max_new_tokens=512,
-     temperature=0.7,
-     top_p=0.9,
-     do_sample=True
- )
- llama_llm = HuggingFacePipeline(pipeline=llama_pipeline)
- print("Llama-2 model loaded successfully!")
+ # Create a GPU-decorated function for model loading
+ @spaces.GPU
+ def load_models():
+     print("Loading Llama-2 model...")
+     llama_tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)
+     llama_model = AutoModelForCausalLM.from_pretrained(
+         LLAMA_MODEL,
+         torch_dtype=torch.float16,
+         device_map="auto"
+     )
+
+     # Create a pipeline for LangChain
+     llama_pipeline = pipeline(
+         "text-generation",
+         model=llama_model,
+         tokenizer=llama_tokenizer,
+         max_new_tokens=512,
+         temperature=0.7,
+         top_p=0.9,
+         do_sample=True
+     )
+     llama_llm = HuggingFacePipeline(pipeline=llama_pipeline)
+     print("Llama-2 model loaded successfully!")
+
+     print("Loading Meditron model...")
+     meditron_tokenizer = AutoTokenizer.from_pretrained(MEDITRON_MODEL)
+     meditron_model = AutoModelForCausalLM.from_pretrained(
+         MEDITRON_MODEL,
+         torch_dtype=torch.float16,
+         device_map="auto"
+     )
+     # Create a pipeline for Meditron
+     meditron_pipeline = pipeline(
+         "text-generation",
+         model=meditron_model,
+         tokenizer=meditron_tokenizer,
+         max_new_tokens=256,
+         temperature=0.7,
+         top_p=0.9,
+         do_sample=True
+     )
+     meditron_llm = HuggingFacePipeline(pipeline=meditron_pipeline)
+     print("Meditron model loaded successfully!")
+
+     return llama_llm, meditron_llm, llama_tokenizer, meditron_tokenizer

- print("Loading Meditron model...")
- meditron_tokenizer = AutoTokenizer.from_pretrained(MEDITRON_MODEL)
- meditron_model = AutoModelForCausalLM.from_pretrained(
-     MEDITRON_MODEL,
-     torch_dtype=torch.float16,
-     device_map="auto"
- )
- # Create a pipeline for Meditron
- meditron_pipeline = pipeline(
-     "text-generation",
-     model=meditron_model,
-     tokenizer=meditron_tokenizer,
-     max_new_tokens=256,
-     temperature=0.7,
-     top_p=0.9,
-     do_sample=True
- )
- meditron_llm = HuggingFacePipeline(pipeline=meditron_pipeline)
- print("Meditron model loaded successfully!")
+ # Load models
+ llama_llm, meditron_llm, llama_tokenizer, meditron_tokenizer = load_models()

  # Create LangChain conversation with memory
  memory = ConversationBufferMemory(return_messages=True)
- conversation = ConversationChain(
-     llm=llama_llm,
-     memory=memory,
-     verbose=True
- )

  # Create a template for the Meditron model
  meditron_template = PromptTemplate(
@@ -98,10 +105,7 @@ meditron_chain = LLMChain(
      verbose=True
  )

- # Track conversation turns
- conversation_turns = 0
- patient_data = []
-
+ @spaces.GPU
  def generate_response(message, history):
      global conversation_turns, patient_data
      conversation_turns += 1
@@ -116,16 +120,53 @@ def generate_response(message, history):
      else:
          prompt = f"{SYSTEM_PROMPT}\n\n{message}"

-     # Generate response using LangChain conversation
-     llama_response = conversation.predict(input=prompt)
+     # Build the prompt with proper Llama-2 formatting
+     formatted_prompt = f"<s>[INST] <<SYS>>\n{SYSTEM_PROMPT}\n<</SYS>>\n\n"
+
+     # Add conversation history
+     for user_msg, assistant_msg in history:
+         formatted_prompt += f"{user_msg} [/INST] {assistant_msg} </s><s>[INST] "
+
+     # Add the current user input
+     formatted_prompt += f"{message} [/INST] "
+
+     # Generate response using Llama model
+     inputs = llama_tokenizer(formatted_prompt, return_tensors="pt").to("cuda")
+
+     with torch.no_grad():
+         outputs = llama_llm.pipeline.model.generate(
+             inputs.input_ids,
+             attention_mask=inputs.attention_mask,
+             max_new_tokens=512,
+             temperature=0.7,
+             top_p=0.9,
+             do_sample=True,
+             pad_token_id=llama_tokenizer.eos_token_id
+         )
+
+     # Decode and extract Llama-2's response
+     full_response = llama_tokenizer.decode(outputs[0], skip_special_tokens=False)
+     llama_response = full_response.split('[/INST]')[-1].split('</s>')[0].strip()

      # After 4 turns, add medicine suggestions from Meditron
      if conversation_turns >= 4:
          # Collect full patient conversation
          full_patient_info = "\n".join(patient_data) + "\n\nSummary: " + llama_response

-         # Get medicine suggestions using LangChain
-         medicine_suggestions = meditron_chain.run(patient_info=full_patient_info)
+         # Get medicine suggestions using Meditron
+         inputs = meditron_tokenizer(MEDITRON_PROMPT.format(patient_info=full_patient_info), return_tensors="pt").to("cuda")
+
+         with torch.no_grad():
+             outputs = meditron_llm.pipeline.model.generate(
+                 inputs.input_ids,
+                 attention_mask=inputs.attention_mask,
+                 max_new_tokens=256,
+                 temperature=0.7,
+                 top_p=0.9,
+                 do_sample=True
+             )
+
+         medicine_suggestions = meditron_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

          # Format final response
          final_response = (
@@ -151,4 +192,4 @@ demo = gr.ChatInterface(
  )

  if __name__ == "__main__":
-     demo.launch()
+ demo.launch()
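
For reference, the core change in this commit is replacing the ConversationChain call with manual Llama-2 chat formatting. Below is a minimal standalone sketch of that prompt assembly; build_llama2_prompt is a hypothetical helper (not part of the commit) that mirrors the loop added in generate_response, so the [INST]/<<SYS>> layout can be inspected without loading a model.

# Minimal sketch, not part of the commit: rebuilds the Llama-2 chat prompt
# the same way the new generate_response does.
def build_llama2_prompt(system_prompt, history, message):
    prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    # Prior (user, assistant) turns, as provided by gr.ChatInterface history
    for user_msg, assistant_msg in history:
        prompt += f"{user_msg} [/INST] {assistant_msg} </s><s>[INST] "
    # Current user input; the model's reply follows the final [/INST]
    prompt += f"{message} [/INST] "
    return prompt

# Example: one prior turn plus a new user message.
print(build_llama2_prompt(
    "You are a professional virtual doctor.",
    [("I have a headache.", "How long has it lasted?")],
    "About two days.",
))

As in the commit, the generated reply is then recovered by splitting the decoded output on '[/INST]' and taking the text before the closing '</s>'.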