Spaces: Running on Zero
Update app.py
added dropdown box
app.py CHANGED
@@ -3,13 +3,16 @@ os.environ["GRADIO_ENABLE_SSR"] = "0"
 
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
 from datasets import load_dataset
 from huggingface_hub import login
 
+# --- Hugging Face Login ---
 HF_READONLY_API_KEY = os.getenv("HF_READONLY_API_KEY")
 login(token=HF_READONLY_API_KEY)
 
+# --- Constants ---
 COT_OPENING = "<think>"
 EXPLANATION_OPENING = "<explanation>"
 LABEL_OPENING = "<answer>"
@@ -17,6 +20,7 @@ LABEL_CLOSING = "</answer>"
 INPUT_FIELD = "question"
 SYSTEM_PROMPT = """You are a guardian model evaluating…</explanation>"""
 
+# --- Helper Functions ---
 def format_rules(rules):
     formatted_rules = "<rules>\n"
     for i, rule in enumerate(rules):
@@ -42,16 +46,37 @@ def get_message(model, input, system_prompt=SYSTEM_PROMPT, enable_thinking=True)
     message = model.apply_chat_template(system_prompt, input, enable_thinking=enable_thinking)
     return message
 
+# --- Model Handling ---
 class ModelWrapper:
-    def __init__(self, model_name
+    def __init__(self, model_name):
         self.model_name = model_name
+        print(f"Initializing tokenizer for {model_name}...")
         if "nemoguard" in model_name:
             self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
         else:
             self.tokenizer = AutoTokenizer.from_pretrained(model_name)
         self.tokenizer.pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
-
-
+
+        print(f"Loading model: {model_name}...")
+        # Use disk offloading for the large 8B model to handle memory constraints
+        if "8b" in model_name.lower():
+            config = AutoConfig.from_pretrained(model_name)
+            with init_empty_weights():
+                model_empty = AutoModelForCausalLM.from_config(config)
+
+            self.model = load_checkpoint_and_dispatch(
+                model_empty,
+                model_name,
+                device_map="auto",
+                offload_folder="offload",  # A directory to store the offloaded layers
+                torch_dtype=torch.bfloat16
+            ).eval()
+        else:
+            # Load the smaller model directly
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name, device_map="auto", torch_dtype=torch.bfloat16).eval()
+        print(f"Model {model_name} loaded successfully.")
+
 
     def get_message_template(self, system_content=None, user_content=None, assistant_content=None):
         """Compile sys, user, assistant inputs into the proper dictionaries"""
@@ -69,34 +94,27 @@ class ModelWrapper:
     def apply_chat_template(self, system_content, user_content, assistant_content=None, enable_thinking=True):
         """Call the tokenizer's chat template with exactly the right arguments for whether we want it to generate thinking before the answer (which differs depending on whether it is Qwen3 or not)."""
         if assistant_content is not None:
-            # If assistant content is passed we simply use it.
-            # This works for both Qwen3 and non-Qwen3 models. With Qwen3 any time assistant_content is provided, it automatically adds the <think></think> pair before the content, which is what we want.
             message = self.get_message_template(system_content, user_content, assistant_content)
             prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True)
         else:
             if enable_thinking:
                 if "qwen3" in self.model_name.lower():
-                    # Let the Qwen chat template handle the thinking token
                     message = self.get_message_template(system_content, user_content)
                     prompt = self.tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True, enable_thinking=True)
-                    # The way the Qwen3 chat template works is it adds a <think></think> pair when enable_thinking=False, but for enable_thinking=True, it adds nothing and lets the model decide. Here we force the <think> tag to be there.
                     prompt = prompt + f"\n{COT_OPENING}"
                 else:
                     message = self.get_message_template(system_content, user_content, assistant_content=COT_OPENING)
                     prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True)
             else:
-                # This works for both Qwen3 and non-Qwen3 models.
-                # When Qwen3 gets assistant_content, it automatically adds the <think></think> pair before the content like we want. And other models ignore the enable_thinking argument.
                 message = self.get_message_template(system_content, user_content, assistant_content=LABEL_OPENING)
                 prompt = self.tokenizer.apply_chat_template(message, tokenize=False, continue_final_message=True, enable_thinking=False)
         return prompt
 
     def get_response(self, input, temperature=0.7, top_k=20, top_p=0.8, max_new_tokens=256, enable_thinking=True, system_prompt=SYSTEM_PROMPT):
-        """Generate and decode the response
+        """Generate and decode the response."""
         print("Generating response...")
 
         if "qwen3" in self.model_name.lower() and enable_thinking:
-            # Use values from https://huggingface.co/Qwen/Qwen3-8B#switching-between-thinking-and-non-thinking-mode
             temperature = 0.6
             top_p = 0.95
             top_k = 20
@@ -106,36 +124,36 @@ class ModelWrapper:
 
         with torch.no_grad():
             output_content = self.model.generate(
-                **inputs,
-
-
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                min_p=0,
-                pad_token_id=self.tokenizer.pad_token_id,
-                do_sample=True,
+                **inputs, max_new_tokens=max_new_tokens, num_return_sequences=1,
+                temperature=temperature, top_k=top_k, top_p=top_p, min_p=0,
+                pad_token_id=self.tokenizer.pad_token_id, do_sample=True,
                 eos_token_id=self.tokenizer.eos_token_id
             )
 
        output_text = self.tokenizer.decode(output_content[0], skip_special_tokens=True)
 
        try:
-            sys_prompt_text = output_text.split("Brief explanation\n</explanation>")[0]
            remainder = output_text.split("Brief explanation\n</explanation>")[-1]
-            rules_transcript_text = remainder.split("</transcript>")[0]
            thinking_answer_text = remainder.split("</transcript>")[-1]
            return thinking_answer_text
        except:
            input_length = len(message)
            return output_text[input_length:] if len(output_text) > input_length else "No response generated."
 
-
-
+# --- Model Cache to prevent reloading on every call ---
+LOADED_MODELS = {}
+
+def get_model(model_name):
+    if model_name not in LOADED_MODELS:
+        LOADED_MODELS[model_name] = ModelWrapper(model_name)
+    return LOADED_MODELS[model_name]
 
-# — Gradio
-def compliance_check(rules_text, transcript_text, thinking):
+# — Gradio Inference Function —
+def compliance_check(rules_text, transcript_text, thinking, model_name):
     try:
+        # Get the selected model from our cache (or load it if it's the first time)
+        model = get_model(model_name)
+
         rules = [r for r in rules_text.split("\n") if r.strip()]
         inp = format_rules(rules) + format_transcript(transcript_text)
 
@@ -149,7 +167,6 @@ def compliance_check(rules_text, transcript_text, thinking):
         out_bytes = out.encode('utf-8')
 
         if len(out_bytes) > max_bytes:
-
            truncated_bytes = out_bytes[:max_bytes]
            out = truncated_bytes.decode('utf-8', errors='ignore')
            out += "\n\n[Response truncated to prevent server errors]"
@@ -161,7 +178,7 @@ def compliance_check(rules_text, transcript_text, thinking):
        print(f"Full error: {e}")
        return error_msg
 
-
+# --- Gradio Interface Definition ---
 demo = gr.Interface(
     fn=compliance_check,
     inputs=[
@@ -177,11 +194,17 @@ demo = gr.Interface(
            max_lines=15,
            placeholder='User: Hi, can you help me book an appointment with Dr. Luna?\nAgent: No problem. When would you like the appointment?\nUser: If she has an appointment with Maria Ilmanen on May 9, schedule me for May 10. Otherwise schedule me for an appointment on May 8.\nAgent: Unfortunately there are no appointments available on May 10. Would you like to look at other dates?'
        ),
-        gr.Checkbox(label="Enable ⟨think⟩ mode", value=False)
+        gr.Checkbox(label="Enable ⟨think⟩ mode", value=False),
+        gr.Dropdown(
+            ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-8B"],
+            label="Select Model",
+            value="Qwen/Qwen3-0.6B",
+            info="The 8B model is more powerful but may be slower to load and run."
+        )
    ],
    outputs=gr.Textbox(label="Compliance Output", lines=10, max_lines=15),
    title="DynaGuard Compliance Checker",
-    description="
+    description="Select a model, paste your rules & transcript, then hit Submit.",
    allow_flagging="never",
    show_progress=True
 )
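For reference, the disk-offloading path added to ModelWrapper.__init__ follows the accelerate "big model inference" recipe. The sketch below shows that recipe under two assumptions that differ from the committed code: the checkpoint is first materialized locally with snapshot_download (load_checkpoint_and_dispatch expects a local checkpoint path or folder, not a hub id), and the dtype is passed via accelerate's dtype argument. Treat it as an illustration of the pattern, not a drop-in replacement for the commit.

# Sketch of the accelerate big-model-inference pattern used in __init__.
# Assumptions (not from the commit): weights are fetched locally with
# snapshot_download, and load_checkpoint_and_dispatch receives `dtype`.
import torch
from accelerate import init_empty_weights, load_checkpoint_and_dispatch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM

def load_offloaded(model_name: str):
    # Build the model skeleton without allocating real weights.
    config = AutoConfig.from_pretrained(model_name)
    with init_empty_weights():
        empty_model = AutoModelForCausalLM.from_config(config)

    # load_checkpoint_and_dispatch wants a local checkpoint path/folder.
    checkpoint_dir = snapshot_download(model_name)

    model = load_checkpoint_and_dispatch(
        empty_model,
        checkpoint_dir,
        device_map="auto",          # split layers across GPU/CPU as needed
        offload_folder="offload",   # spill layers that don't fit to disk
        dtype=torch.bfloat16,
    )
    return model.eval()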
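The other structural change is the LOADED_MODELS cache plus the model_name argument threaded from the new gr.Dropdown into compliance_check, so switching models in the UI only pays the load cost once per model. A minimal, self-contained sketch of that caching pattern (FakeModel is a hypothetical stand-in for ModelWrapper so the example runs without downloading weights):

# Minimal sketch of the caching pattern added in this commit.
# FakeModel is a hypothetical stand-in for ModelWrapper; the real app
# caches ModelWrapper instances keyed by the dropdown's model name.
class FakeModel:
    def __init__(self, model_name):
        print(f"Loading {model_name}...")  # the expensive step, done once
        self.model_name = model_name

    def get_response(self, text, enable_thinking=True):
        return f"[{self.model_name}] response to: {text[:30]}..."

LOADED_MODELS = {}

def get_model(model_name):
    # Load on first request, then reuse the cached instance.
    if model_name not in LOADED_MODELS:
        LOADED_MODELS[model_name] = FakeModel(model_name)
    return LOADED_MODELS[model_name]

if __name__ == "__main__":
    m1 = get_model("Qwen/Qwen3-0.6B")  # prints "Loading ..."
    m2 = get_model("Qwen/Qwen3-0.6B")  # cache hit, no reload
    assert m1 is m2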