ShenghaoYummy committed 426176a (verified) · 1 parent: f54d28c

Update app.py

Files changed (1):
  1. app.py +60 -32
app.py CHANGED
@@ -15,8 +15,15 @@ def load_model():
 
     # Ensure pad token is set
     if tokenizer.pad_token is None:
+        print("Tokenizer pad_token not set. Setting to eos_token.")
         tokenizer.pad_token = tokenizer.eos_token
+        # It's also good to ensure the model's config reflects this if it's used during generation
+        # model.config.pad_token_id = tokenizer.pad_token_id
+        # (Do this after model is loaded if needed, but usually tokenizer.pad_token_id in generate is enough)
 
+    print(f"Tokenizer pad_token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
+    print(f"Tokenizer eos_token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")
+
     print("Loading base model...")
     base_model = AutoModelForCausalLM.from_pretrained(
         BASE_MODEL_ID,
@@ -29,6 +36,12 @@ def load_model():
     print("Loading PEFT adapter...")
     model = PeftModel.from_pretrained(base_model, MODEL_ID)
 
+    # If you had to set tokenizer.pad_token, ensure the merged model's config is also aware
+    # This is more relevant if not passing pad_token_id directly to generate, but good for consistency
+    if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
+        print(f"Setting model.config.pad_token_id to: {tokenizer.pad_token_id}")
+        model.config.pad_token_id = tokenizer.pad_token_id
+
     print("Model loaded successfully!")
     return model, tokenizer
 
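
The two hunks above keep the tokenizer and the model config in agreement about padding: reuse EOS as PAD when no pad token exists, and mirror its id into model.config. A minimal standalone sketch of the same idea (illustrative only; "gpt2" stands in for BASE_MODEL_ID, whose value is not shown in this diff):

from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; the Space actually loads BASE_MODEL_ID plus a PEFT adapter.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # fall back to EOS as the padding token

if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
    model.config.pad_token_id = tokenizer.pad_token_id  # keep config and tokenizer in sync

print(tokenizer.pad_token_id, model.config.pad_token_id)  # both print the EOS id (50256 for GPT-2)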
 
@@ -42,32 +55,45 @@ def generate(message, history):
     history: List of [user_message, assistant_message] pairs
     returns: assistant's reply (string)
     """
-    # Use ChatML format that your model was trained on
     DEFAULT_SYSTEM_PROMPT = "You are a helpful e-commerce customer service assistant. Provide accurate, helpful, and friendly responses to customer inquiries about products, orders, shipping, returns, and general shopping assistance."
 
-    # Build conversation in ChatML format
     conversation = f"<|system|>\n{DEFAULT_SYSTEM_PROMPT}\n"
+    if history:  # Ensure history is not None or empty before iterating
+        for user_msg, assistant_msg in history:
+            # Ensure messages are strings
+            user_msg_str = str(user_msg) if user_msg is not None else ""
+            assistant_msg_str = str(assistant_msg) if assistant_msg is not None else ""
+            conversation += f"<|user|>\n{user_msg_str}\n<|assistant|>\n{assistant_msg_str}\n"
 
-    # Add history
-    for user_msg, assistant_msg in history:
-        conversation += f"<|user|>\n{user_msg}\n<|assistant|>\n{assistant_msg}\n"
-
-    # Add current message
-    conversation += f"<|user|>\n{message}\n<|assistant|>\n"
+    message_str = str(message) if message is not None else ""
+    conversation += f"<|user|>\n{message_str}\n<|assistant|>\n"
 
-    # Tokenize
+    print(f"--- Constructed Prompt ---\n{conversation}\n--------------------------")
+
     inputs = tokenizer(
         conversation,
         return_tensors="pt",
-        max_length=512,
+        max_length=512,  # Max length of context + new tokens for some models, but here it's input context length
        truncation=True,
-        padding=True
+        padding=True  # Pad to max_length or longest in batch if dynamic
     ).to(model.device)
 
-    # Generate response
+    input_length = inputs["input_ids"].shape[1]
+
+    # Ensure eos_token_id is correctly set for generation
+    # If your model was trained to use <|end|> as an EOS token, its ID should be tokenizer.eos_token_id
+    eos_token_id_to_use = tokenizer.eos_token_id
+    # Example: if <|end|> has a specific ID different from the default eos_token
+    # end_custom_token_id = tokenizer.convert_tokens_to_ids("<|end|>")
+    # if end_custom_token_id != tokenizer.unk_token_id:  # Check if token exists
+    #     eos_token_id_to_use = end_custom_token_id
+    # print(f"Using EOS token ID for generation: {eos_token_id_to_use}")
+
+
     with torch.no_grad():
         outputs = model.generate(
-            **inputs,
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
             max_new_tokens=300,
             do_sample=True,
             temperature=0.8,
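
For reference, the prompt-building code in the hunk above produces a ChatML-style layout like the following (the example history and message here are made up for illustration):

system_prompt = "You are a helpful e-commerce customer service assistant."
history = [("Where is my order?", "Could you share your order number, please?")]
message = "It's #12345."

conversation = f"<|system|>\n{system_prompt}\n"
for user_msg, assistant_msg in history:
    conversation += f"<|user|>\n{user_msg}\n<|assistant|>\n{assistant_msg}\n"
conversation += f"<|user|>\n{message}\n<|assistant|>\n"

print(conversation)
# <|system|>
# You are a helpful e-commerce customer service assistant.
# <|user|>
# Where is my order?
# <|assistant|>
# Could you share your order number, please?
# <|user|>
# It's #12345.
# <|assistant|>

Because the prompt is tokenized with truncation at 512 tokens, input_length records how many tokens belong to the prompt so the decoding step can keep only the newly generated tokens.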
@@ -75,26 +101,28 @@ def generate(message, history):
             top_k=50,
             repetition_penalty=1.1,
             pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
+            eos_token_id=eos_token_id_to_use,  # Use the determined EOS token ID
         )
 
-    # Decode and extract assistant response
-    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    new_tokens = outputs[0][input_length:]
+    generated_reply_part = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
 
-    # Extract only the new assistant response
-    if "<|assistant|>" in full_text:
-        # Get the last assistant response
-        assistant_parts = full_text.split("<|assistant|>")
-        if len(assistant_parts) > 1:
-            reply = assistant_parts[-1].strip()
-            # Remove any trailing tokens
-            if "<|user|>" in reply:
-                reply = reply.split("<|user|>")[0].strip()
-        else:
-            reply = "I apologize, but I couldn't generate a proper response. Please try again."
+    print(f"--- Raw Generated Reply Part (after skip_special_tokens=True) ---\n{generated_reply_part}\n----------------------------------------------------------------")
+
+    end_token_marker = "<|end|>"  # The specific string marker you're looking for
+    first_end_token_pos = generated_reply_part.find(end_token_marker)
+
+    if first_end_token_pos != -1:
+        reply = generated_reply_part[:first_end_token_pos].strip()
     else:
+        reply = generated_reply_part  # Use the whole string if <|end|> isn't found
+
+    # Fallback if the reply is empty after processing
+    if not reply:
+        print("Warning: Reply became empty after processing. Using fallback.")
         reply = "I apologize, but I couldn't generate a proper response. Please try again."
 
+    print(f"--- Final Reply ---\n{reply}\n-------------------")
     return reply
 
 # Build Gradio ChatInterface
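
The decoding path above only looks at tokens generated after the prompt (outputs[0][input_length:]) and trims the text at the first "<|end|>" marker. The same post-processing as a small pure-Python helper (a sketch, not part of the commit):

FALLBACK = "I apologize, but I couldn't generate a proper response. Please try again."

def extract_reply(decoded_new_tokens: str) -> str:
    # Cut at the first <|end|> marker if present; otherwise keep the whole string.
    end_pos = decoded_new_tokens.find("<|end|>")
    reply = decoded_new_tokens[:end_pos].strip() if end_pos != -1 else decoded_new_tokens.strip()
    return reply or FALLBACK  # fall back when nothing usable remains

print(extract_reply("Sure, we accept credit cards and PayPal.<|end|><|user|>"))  # trimmed reply
print(extract_reply(""))                                                         # fallback message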
@@ -110,15 +138,15 @@ demo = (
         "I need help with my order",
         "What payment methods do you accept?"
     ],
-    type="messages",
+    type="messages",  # Ensures history is a list of lists/tuples
     )
-    .queue(api_open=True)  # allow direct HTTP POST to /api/predict
+    .queue(api_open=True)
 )
 
 # Launch the app
 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",  # Allow external access
-        server_port=7860,       # Default Gradio port
-        share=False             # Set to True if you want a public link
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
     )
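
One detail worth double-checking against the pinned Gradio version: with type="messages", recent Gradio releases pass history to the chat function as a list of {"role": ..., "content": ...} dicts rather than [user, assistant] pairs, while generate() above unpacks pairs. If that applies here, a small adapter (an assumption, not part of this commit) restores the pair format before the history loop:

def history_to_pairs(history):
    # Convert messages-format history ([{"role": ..., "content": ...}, ...])
    # into the [user_msg, assistant_msg] pairs that generate() iterates over.
    pairs, pending_user = [], None
    for msg in history or []:
        if msg.get("role") == "user":
            pending_user = msg.get("content", "")
        elif msg.get("role") == "assistant" and pending_user is not None:
            pairs.append((pending_user, msg.get("content", "")))
            pending_user = None
    return pairs

print(history_to_pairs([
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
]))  # -> [('Hi', 'Hello! How can I help?')]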
 
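
With the queue opened via .queue(api_open=True), the Space can also be called programmatically. An untested sketch using gradio_client; the Space id and api_name below are assumptions, so check the Space's "Use via API" page for the exact values:

from gradio_client import Client

client = Client("ShenghaoYummy/your-space-name")  # hypothetical Space id
reply = client.predict(
    "What payment methods do you accept?",  # message
    api_name="/chat",                        # typical endpoint name for gr.ChatInterface
)
print(reply)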