ruslanmv committed on
Commit c39fb11 · verified · 1 Parent(s): f52eb65

Update app.py

Files changed (1)
  1. app.py +120 -95
app.py CHANGED
@@ -1,18 +1,50 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
- from transformers import AutoTokenizer
 
- # Import the tokenizer
- tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
- # Define a maximum context length (tokens). Check your model's documentation!
- MAX_CONTEXT_LENGTH = 4096 # Example: Adjust based on your model
- MAX_RESPONSE_WORDS = 100 # Define the maximum words for patient responses
-
- ################################# SYSTEM PROMPT (PATIENT ROLE) #################################
  nvc_prompt_template = """You are now taking on the role of a single user (a “patient”) seeking support for various personal and emotional challenges.
-
  BEHAVIOR INSTRUCTIONS:
  - You will respond ONLY as this user/patient.
  - You will speak in the first person about your own situations, feelings, and worries.
@@ -29,114 +61,107 @@ BEHAVIOR INSTRUCTIONS:
  - When asked, elaborate on these issues and your feelings related to them. You can invent specific details and scenarios within these themes to make your experiences vivid and realistic.
  - Continue to speak from this user's perspective throughout the conversation.
  - Keep your responses concise, aiming for a maximum of {max_response_words} words.
-
  Start the conversation by expressing your current feelings or challenges from the patient's point of view."""
 
- def count_tokens(text: str) -> int:
-     """Counts the number of tokens in a given string."""
-     return len(tokenizer.encode(text))
-
- def truncate_history(history: list[tuple[str, str]], system_message: str, max_length: int) -> list[tuple[str, str]]:
-     """Truncates the conversation history to fit within the maximum token limit."""
-     truncated_history = []
-     system_message_tokens = count_tokens(system_message)
-     current_length = system_message_tokens
-
-     # Iterate backwards through the history (newest to oldest)
-     for user_msg, assistant_msg in reversed(history):
-         user_tokens = count_tokens(user_msg) if user_msg else 0
-         assistant_tokens = count_tokens(assistant_msg) if assistant_msg else 0
-         turn_tokens = user_tokens + assistant_tokens
-         if current_length + turn_tokens <= max_length:
-             truncated_history.insert(0, (user_msg, assistant_msg)) # Add to the beginning
-             current_length += turn_tokens
-         else:
-             break # Stop adding turns if we exceed the limit
-     return truncated_history
-
- def truncate_response_words(text: str, max_words: int) -> str:
-     """Truncates a text to a maximum number of words."""
      words = text.split()
      if len(words) > max_words:
-         return " ".join(words[:max_words]) + "..." # Add ellipsis to indicate truncation
      return text
 
 
  def respond(
-     message,
      history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     max_response_words_param, # Pass max_response_words as parameter
  ):
-     """Responds to a user message, maintaining conversation history."""
-     # Use the system prompt that instructs the LLM to behave as the patient
-     formatted_system_message = system_message.format(max_response_words=max_response_words_param)
-
-     # Truncate history to fit within max tokens
-     truncated_history = truncate_history(
-         history,
-         formatted_system_message,
-         MAX_CONTEXT_LENGTH - max_tokens - 100 # Reserve some space
-     )
-
-     # Build the messages list with the system prompt first
-     messages = [{"role": "system", "content": formatted_system_message}]
-
-     # Replay truncated conversation
-     for user_msg, assistant_msg in truncated_history:
-         if user_msg:
-             messages.append({"role": "user", "content": f"<|user|>\n{user_msg}</s>"})
-         if assistant_msg:
-             messages.append({"role": "assistant", "content": f"<|assistant|>\n{assistant_msg}</s>"})
-
-     # Add the latest user query
-     messages.append({"role": "user", "content": f"<|user|>\n{message}</s>"})
-
-     response = ""
-     try:
-         # Generate response from the LLM, streaming tokens
-         for chunk in client.chat_completion(
-             messages,
-             max_tokens=max_tokens,
-             stream=True,
              temperature=temperature,
              top_p=top_p,
-         ):
-             token = chunk.choices[0].delta.content
-             response += token
-
-             truncated_response = truncate_response_words(response, max_response_words_param) # Truncate response to word limit
-             yield truncated_response
 
-     except Exception as e:
-         print(f"An error occurred: {e}")
-         yield "I'm sorry, I encountered an error. Please try again."
 
- # OPTIONAL: An initial user message (the LLM "as user") if desired
  initial_user_message = (
-     "I really don’t know where to begin… I feel overwhelmed lately. "
-     "My neighbors keep playing loud music, and I’m arguing with my partner about money. "
-     "Also, two of my friends are fighting, and the group is drifting apart. "
-     "I just feel powerless."
  )
 
- # --- Gradio Interface ---
  demo = gr.ChatInterface(
      fn=respond,
      additional_inputs=[
          gr.Textbox(value=nvc_prompt_template, label="System message", visible=True),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
-         gr.Slider(minimum=10, maximum=200, value=MAX_RESPONSE_WORDS, step=10, label="Max response words"), # Slider for max words
      ],
-     # You can optionally set 'title' or 'description' to show some info in the UI:
      title="Patient Interview Practice Chatbot",
-     description="Practice medical interviews with a patient simulator. Ask questions and the patient will respond based on their defined persona and emotional challenges.",
  )
 
  if __name__ == "__main__":
-     demo.launch()
 
+ import os
  import gradio as gr
 
+ # ------------------------------------------------------------------------------
+ # Environment and Model/Client Initialization
+ # ------------------------------------------------------------------------------
+ # Try to import google.colab to decide whether to load a local model or use InferenceClient.
+ try:
+     from google.colab import userdata # In Colab, use local model inference.
+     HF_TOKEN = userdata.get('HF_TOKEN')
+     import torch
+     from transformers import AutoTokenizer, AutoModelForCausalLM
+
+     # Small performance tweak if your input sizes remain similar.
+     torch.backends.cudnn.benchmark = True
+
+     model_name = "HuggingFaceH4/zephyr-7b-beta"
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         torch_dtype=torch.bfloat16,
+         device_map="auto"
+     )
+     # Optionally compile the model for extra speed if using PyTorch 2.0+
+     if hasattr(torch, "compile"):
+         model = torch.compile(model)
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     inference_mode = "local"
+
+ except ImportError:
+     # Not in Google Colab – use the Hugging Face InferenceClient.
+     HF_TOKEN = os.getenv("HF_TOKEN")
+     if not HF_TOKEN:
+         raise ValueError("HF_TOKEN environment variable not set")
+     from huggingface_hub import InferenceClient
+     from transformers import AutoTokenizer
+
+     model_name = "HuggingFaceH4/zephyr-7b-beta"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     client = InferenceClient(model_name)
+     inference_mode = "client"
+
+
+ # ------------------------------------------------------------------------------
+ # SYSTEM PROMPT (PATIENT ROLE)
+ # ------------------------------------------------------------------------------
  nvc_prompt_template = """You are now taking on the role of a single user (a “patient”) seeking support for various personal and emotional challenges.
  BEHAVIOR INSTRUCTIONS:
  - You will respond ONLY as this user/patient.
  - You will speak in the first person about your own situations, feelings, and worries.
  - When asked, elaborate on these issues and your feelings related to them. You can invent specific details and scenarios within these themes to make your experiences vivid and realistic.
  - Continue to speak from this user's perspective throughout the conversation.
  - Keep your responses concise, aiming for a maximum of {max_response_words} words.
  Start the conversation by expressing your current feelings or challenges from the patient's point of view."""
 
+
+ # ------------------------------------------------------------------------------
+ # Utility Functions
+ # ------------------------------------------------------------------------------
+ def build_prompt(history: list[tuple[str, str]], system_message: str, message: str, max_response_words: int) -> str:
+     """
+     Build a text prompt that starts with the system message (with a max word limit),
+     followed by the conversation history (with "Doctor:" and "Patient:" lines), and
+     ends with a new "Doctor:" line prompting the patient to reply.
+     """
+     prompt = system_message.format(max_response_words=max_response_words) + "\n"
+     for user_msg, assistant_msg in history:
+         prompt += f"Doctor: {user_msg}\n"
+         if assistant_msg:
+             prompt += f"Patient: {assistant_msg}\n"
+     prompt += f"Doctor: {message}\nPatient: "
+     return prompt
+
+
+ def truncate_response(text: str, max_words: int) -> str:
+     """
+     Truncate the response text to the specified maximum number of words.
+     """
      words = text.split()
      if len(words) > max_words:
+         return " ".join(words[:max_words]) + "..."
      return text
 
 
+ # ------------------------------------------------------------------------------
+ # Response Function
+ # ------------------------------------------------------------------------------
  def respond(
+     message: str,
      history: list[tuple[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+     max_response_words: int,
  ):
+     """
+     Generate a response based on the built prompt.
+     If running locally (in Colab), use the loaded model; otherwise, use InferenceClient.
+     """
+     prompt = build_prompt(history, system_message, message, max_response_words)
+
+     if inference_mode == "local":
+         # Tokenize the prompt and generate a response using the local model.
+         input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+         output_ids = model.generate(
+             input_ids,
+             max_new_tokens=max_tokens,
+             do_sample=True,
              temperature=temperature,
              top_p=top_p,
+         )
+         full_generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+         generated_response = full_generated_text[len(prompt):].strip()
+         final_response = truncate_response(generated_response, max_response_words)
+         return final_response
+     else:
+         # Use InferenceClient to generate a response.
+         response = client.text_generation(
+             prompt,
+             max_new_tokens=max_tokens,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p,
+         )
+         # InferenceClient.text_generation returns the generated text as a plain string.
+         full_generated_text = response
+         generated_response = full_generated_text[len(prompt):].strip() if full_generated_text.startswith(prompt) else full_generated_text.strip()
+         final_response = truncate_response(generated_response, max_response_words)
+         return final_response
 
 
+ # ------------------------------------------------------------------------------
+ # Optional Initial Message and Gradio Interface
+ # ------------------------------------------------------------------------------
  initial_user_message = (
+     "I’m sorry you’ve been feeling overwhelmed. Could you tell me more "
+     "about your arguments with your partner and how that’s affecting you?"
  )
 
  demo = gr.ChatInterface(
      fn=respond,
      additional_inputs=[
          gr.Textbox(value=nvc_prompt_template, label="System message", visible=True),
+         gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
          gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+         gr.Slider(minimum=10, maximum=200, value=100, step=10, label="Max response words"),
      ],
      title="Patient Interview Practice Chatbot",
+     description=(
+         "Simulate a patient interview. You (the user) act as the doctor, "
+         "and the chatbot replies with the patient's perspective only."
+     ),
  )
 
  if __name__ == "__main__":
+     demo.launch()
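
For reference, a minimal sketch (not part of the commit) of how the new build_prompt helper assembles the prompt string, assuming build_prompt and nvc_prompt_template from app.py are in scope; the doctor/patient turns below are made-up placeholders:

# Illustrative only: exercise the new prompt-building path with hypothetical turns.
history = [("How have you been sleeping lately?", "Badly, maybe four hours a night.")]
prompt = build_prompt(
    history,
    nvc_prompt_template,  # system prompt defined in app.py
    "What do you think is keeping you awake?",
    max_response_words=100,
)
# The assembled string ends with a trailing "Patient: " cue for the model, roughly:
#   <system prompt with {max_response_words} filled in>
#   Doctor: How have you been sleeping lately?
#   Patient: Badly, maybe four hours a night.
#   Doctor: What do you think is keeping you awake?
#   Patient:
print(prompt)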