from typing import Dict, List, Any, Optional, Union
import os
import json
import time
import torch
from threading import Thread
import logging
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TextIteratorStreamer,
StoppingCriteriaList,
StoppingCriteria,
BitsAndBytesConfig
)
from peft import PeftModel
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler("lora_inference.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
class ImprovedJSONStoppingCriteria(StoppingCriteria):
"""
Stopping criteria that ensures JSON is complete before stopping.
Only stops generation when a valid, complete JSON object is detected.
"""
def __init__(self, tokenizer):
self.tokenizer = tokenizer
self.generated = ""
self.json_complete = False
def __call__(self, input_ids, scores, **kwargs):
# If we already found complete JSON, stop immediately
if self.json_complete:
return True
# Decode current text
text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
# Skip early if no JSON structure detected
if '{' not in text:
return False
# Don't stop if we don't have at least one closing brace
if '}' not in text:
return False
# Check for complete JSON structure
try:
# First, try to find a valid JSON object
start_pos = text.find('{')
# Progressively validate from the first opening brace
stack = []
end_pos = -1
for i, char in enumerate(text[start_pos:], start_pos):
if char == '{':
stack.append('{')
elif char == '}':
if stack:
stack.pop()
if not stack: # We have balanced braces
end_pos = i
potential_json = text[start_pos:end_pos+1]
# Make sure this is actually valid JSON
# and not just balanced braces
try:
# Parse JSON to validate
parsed = json.loads(potential_json)
# We need to make sure we have all required fields
# For search_web or tool calls, verify arguments are complete
if "calls" in parsed:
for call in parsed.get("calls", []):
# If we have a call with arguments, make sure they're complete
if "arguments" in call:
args = call.get("arguments", "")
# If arguments is a string, it might be JSON itself
if isinstance(args, str) and args.startswith("{"):
# If the argument string starts with { but doesn't have a
# closing }, it's incomplete
if not args.endswith("}"):
return False
# Try to parse the arguments as JSON
try:
json.loads(args)
                                            except json.JSONDecodeError:
# If we can't parse, the JSON is incomplete
return False
# All checks passed - we have valid, complete JSON
self.json_complete = True
return True
                        except json.JSONDecodeError:
# Not valid JSON, continue looking
continue
# Only stop with excessive braces if we already have a valid structure
open_count = text.count('{')
close_count = text.count('}')
if close_count > open_count:
# Check if we have a valid JSON by balancing
fixed_text = text[start_pos:]
stack = []
for i, char in enumerate(fixed_text):
if char == '{':
stack.append('{')
elif char == '}':
if stack:
stack.pop()
if not stack:
try:
potential_json = fixed_text[:i+1]
                                json.loads(potential_json)  # validate only; the parsed value is unused
                                self.json_complete = True
                                return True
                            except json.JSONDecodeError:
                                pass
except Exception:
# Error in parsing or validation, don't stop
pass
return False
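# Usage sketch (illustrative only): the criteria wraps a tokenizer and is
# passed to `generate` through a StoppingCriteriaList. `tok`, `model`, and
# `prompt` are assumed to be an already-loaded tokenizer/model pair and an
# input string; they are not defined in this file.
#
#   criteria = StoppingCriteriaList([ImprovedJSONStoppingCriteria(tok)])
#   inputs = tok(prompt, return_tensors="pt").to(model.device)
#   model.generate(**inputs, stopping_criteria=criteria)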
class ExcessBraceStoppingCriteria(StoppingCriteria):
"""Stop generation if we're generating excessive closing braces"""
def __init__(self, tokenizer):
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs):
text = self.tokenizer.decode(input_ids[0], skip_special_tokens=True)
# Only trigger if we have JSON content
if '{' in text and '}' in text:
# Check if we're generating excessive closing braces
open_count = text.count('{')
close_count = text.count('}')
# If we have more closing than opening braces, stop generation
if close_count > open_count + 3: # Allow a small buffer
return True
return False
def fix_json_output(text):
"""Fix malformed JSON with excessive closing braces."""
if '{' not in text or '}' not in text:
return text
# Count opening and closing braces
open_count = text.count('{')
close_count = text.count('}')
# If balanced or too few closing braces, return as-is
if open_count >= close_count:
return text
# Track JSON depth to find valid JSON object
start_pos = text.find('{')
depth = 0
for i, char in enumerate(text[start_pos:], start_pos):
if char == '{':
depth += 1
elif char == '}':
depth -= 1
if depth == 0:
# Found balanced JSON, return up to this point
return text[:i+1]
# If we can't balance it with depth tracking, simply truncate
return text[:start_pos + text[start_pos:].find('}')+1]
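# Worked example (traces the depth-tracking logic above): with one opening
# brace and three closing braces, the balanced prefix ends at the first '}':
#
#   fix_json_output('{"name": "llm"}}}')  ->  '{"name": "llm"}'
#
# open_count (1) < close_count (3), so the scan reaches depth == 0 at the
# first '}' and the trailing braces are truncated.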
def create_stopping_criteria(tokenizer, stop_tokens):
"""Create stopping criteria from tokens"""
stop_token_ids = []
for stop_token in stop_tokens:
token_ids = tokenizer.encode(stop_token, add_special_tokens=False)
if len(token_ids) > 0:
stop_token_ids.append(token_ids[-1])
return StoppingCriteriaList([StopOnTokens(tokenizer, stop_token_ids)])
class StopOnTokens(StoppingCriteria):
"""Custom stopping criteria for text generation."""
def __init__(self, tokenizer, stop_token_ids):
self.tokenizer = tokenizer
self.stop_token_ids = stop_token_ids
def __call__(self, input_ids, scores, **kwargs):
for stop_id in self.stop_token_ids:
if input_ids[0][-1] == stop_id:
return True
return False
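# Usage sketch (illustrative only): build criteria that stop on the final
# token of each stop string. Note this deliberately matches only the *last*
# token of a multi-token stop string, which is an approximation.
#
#   criteria = create_stopping_criteria(tokenizer, ["}\n", "\n\n"])
#   model.generate(**inputs, stopping_criteria=criteria)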
class EndpointHandler:
def __init__(self, path=""):
"""
Initialize the handler by loading model and tokenizer
Args:
path (str): Path to the model directory (uses environment variable if not provided)
"""
# Get model path from environment or from argument
model_path = path if path else os.environ.get("MODEL_PATH", "")
adapter_path = os.environ.get("ADAPTER_PATH", None)
logger.info(f"Loading model from {model_path}")
# Determine quantization settings from environment
use_8bit = os.environ.get("USE_8BIT", "False").lower() == "true"
use_4bit = os.environ.get("USE_4BIT", "False").lower() == "true"
device = os.environ.get("DEVICE", "auto")
# Load tokenizer
logger.info(f"Loading tokenizer from {model_path}")
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
# Load model with appropriate configuration
if use_4bit:
logger.info("Using 4-bit quantization for inference...")
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4"
)
base_model = AutoModelForCausalLM.from_pretrained(
model_path,
quantization_config=quantization_config,
device_map=device,
low_cpu_mem_usage=True
)
elif use_8bit:
logger.info("Using 8-bit quantization for inference...")
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
            base_model = AutoModelForCausalLM.from_pretrained(
                model_path,
                quantization_config=quantization_config,
                device_map=device,
                low_cpu_mem_usage=True
            )
else:
logger.info("Loading model in float16 precision...")
base_model = AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=torch.float16,
device_map=device,
low_cpu_mem_usage=True
)
# Apply adapter if specified
if adapter_path:
logger.info(f"Loading LoRA adapter from {adapter_path}")
self.model = PeftModel.from_pretrained(base_model, adapter_path)
else:
self.model = base_model
logger.info("No adapter path provided, using base model only")
self.model.eval()
# Try to use torch.compile for additional performance if available
        if int(torch.__version__.split(".")[0]) >= 2 and os.environ.get("USE_COMPILE", "False").lower() == "true":
try:
logger.info("Applying torch.compile for additional optimization...")
self.model = torch.compile(self.model)
logger.info("Model successfully compiled!")
except Exception as e:
logger.warning(f"Could not compile model: {e}")
logger.info("Model and tokenizer loaded successfully!")
def format_conversation(self, messages, add_generation_prompt=True):
"""Format a conversation using the tokenizer's chat template"""
return self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=add_generation_prompt
)
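    # Example (sketch): for a Llama-3-style chat template,
    # format_conversation([{"role": "user", "content": "Hi"}]) returns one
    # prompt string wrapped in the template's special tokens, roughly
    # "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|>...".
    # The exact output depends on the tokenizer's chat_template.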
def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Process inference request
Args:
data (Dict[str, Any]): Request data containing inputs and parameters
Returns:
List[Dict[str, Any]]: List of response dictionaries
"""
start_time = time.time()
# Extract input data and parameters
inputs = data.get("inputs", [])
parameters = data.get("parameters", {})
# Parse generation parameters with defaults
max_new_tokens = parameters.get("max_new_tokens", 512)
temperature = parameters.get("temperature", 0.7)
top_p = parameters.get("top_p", 0.95)
do_sample = parameters.get("do_sample", temperature > 0.1)
stream = parameters.get("stream", False)
json_mode = parameters.get("json_mode", False)
system_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Cutting Knowledge Date: December 2023
Today Date: 16 March 2025
When you receive a tool call response, use the output to format an answer to the original user question.
You are a helpful assistant with tool calling capabilities.<|eot_id|><|start_header_id|>user<|end_header_id|>
Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.
Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.
[
    {
        "type": "function",
        "function": {
            "name": "llm",
            "description": "Access your internal knowledge as an LLM to provide general information, explanations, and guidance without searching the web."
        }
    },
    {
        "type": "function",
        "function": {
            "name": "search_web",
            "description": "Fetch up-to-date, specific, or contextual information that may not be stable or broadly known.",
            "arguments": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "A search query used to find relevant information on the web"
                    }
                },
                "required": ["query"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "calculate",
            "description": "Used for precise mathematical computations.",
            "arguments": {
                "type": "object",
                "properties": {
                    "expression": {
                        "type": "string",
                        "description": "An executable mathematical JavaScript expression"
                    }
                },
                "required": ["expression"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "open_url",
            "description": "Opens or shows a website to the user with the specified URL.",
            "arguments": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL of the website to open or show to the user"
                    }
                },
                "required": ["url"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "fetch_web_content",
            "description": "Fetch the content of a webpage so it can be retrieved, summarized, or analyzed.",
            "arguments": {
                "type": "object",
                "properties": {
                    "url": {
                        "type": "string",
                        "description": "The URL of the content to fetch, get, summarize, or analyze"
                    }
                },
                "required": ["url"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "unsupported_capability",
            "description": "Use this function to indicate that the requested action is not supported or not possible.",
            "arguments": {
                "type": "object",
                "properties": {
                    "capability": {
                        "type": "string",
                        "description": "The capability requested by the user that is not supported"
                    }
                },
                "required": ["capability"]
            }
        }
    }
]
Question:
"""
# Check if input is in various formats and normalize to messages format
if isinstance(inputs, str):
# Create simple chat with user message
messages = [{"role": "user", "content": inputs}]
elif isinstance(inputs, dict) and "messages" in inputs:
# Input is already in chat format
messages = inputs["messages"]
elif isinstance(inputs, list):
# Assume this is a list of message dicts
messages = inputs
else:
# Invalid input format
return [{"error": "Invalid input format. Please provide a string, a list of messages, or a dict with 'messages' key."}]
# Prepare conversation with system prompt if provided
conversation = []
if system_prompt:
conversation.append({"role": "system", "content": system_prompt})
conversation.extend(messages)
# Format the conversation
prompt = self.format_conversation(conversation)
# Tokenize the prompt
inputs_dict = self.tokenizer(prompt, return_tensors="pt")
inputs_dict = {k: v.to(self.model.device) for k, v in inputs_dict.items()}
# Configure generation parameters
generation_config = {
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"top_p": top_p,
"do_sample": do_sample,
"pad_token_id": self.tokenizer.pad_token_id,
}
# Add JSON-specific settings if needed
if json_mode:
stop_tokens = ["\n\n", "\n}", "}\n", "}}", "} }", "}\n]", "}\n{"]
stopping_criteria = create_stopping_criteria(self.tokenizer, stop_tokens)
generation_config["stopping_criteria"] = stopping_criteria
            # Use greedy decoding in JSON mode for deterministic, well-formed
            # output; temperature is clamped as well, although it has no
            # effect once do_sample is False
            temperature = min(temperature, 0.1)
            do_sample = False
            generation_config["do_sample"] = do_sample
            generation_config["temperature"] = temperature
# Record input length for proper decoding
input_length = inputs_dict["input_ids"].shape[1]
generated_text = ""
        stream = False  # Streaming is force-disabled on this endpoint; the request's "stream" flag is ignored
if stream:
# Use streaming for interactive responses
streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_config["streamer"] = streamer
# Start generation in a thread
thread = Thread(target=self.model.generate, kwargs={**inputs_dict, **generation_config})
thread.start()
# Stream the output (for local testing)
for text in streamer:
generated_text += text
# Apply JSON cleaning if needed and json_mode is enabled
if json_mode and '{' in generated_text and '}' in generated_text:
if generated_text.count('}') > generated_text.count('{'):
fixed_text = fix_json_output(generated_text)
if fixed_text != generated_text:
logger.info("Fixed malformed JSON in response")
generated_text = fixed_text
else:
# Non-streaming generation
with torch.no_grad():
outputs = self.model.generate(**inputs_dict, **generation_config)
# Decode the output
generated_ids = outputs[0][input_length:]
generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
# Apply JSON cleaning if needed and json_mode is enabled
if json_mode and '{' in generated_text and '}' in generated_text:
if generated_text.count('}') > generated_text.count('{'):
fixed_text = fix_json_output(generated_text)
if fixed_text != generated_text:
logger.info("Fixed malformed JSON in response")
generated_text = fixed_text
# Calculate processing time
end_time = time.time()
processing_time = end_time - start_time
# Create response dictionary
response = {
"generated_text": generated_text,
"processing_time": processing_time
}
# Include input token count if requested
if parameters.get("return_token_count", False):
response["input_token_count"] = input_length
response["output_token_count"] = len(generated_text.split())
return [response]
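    # Example response shape (derived from the code above):
    #
    #   [{"generated_text": "...",
    #     "processing_time": 1.23,
    #     "input_token_count": 42,     # only if return_token_count is set
    #     "output_token_count": 17}]   # only if return_token_count is set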
# For local testing
if __name__ == "__main__":
# Test the handler
model_path = os.environ.get("MODEL_PATH", "./model")
handler = EndpointHandler(model_path)
# Test with a simple query
test_data = {
"inputs": "Explain the concept of machine learning in simple terms.",
"parameters": {
"max_new_tokens": 100,
"temperature": 0.7,
"system_prompt": "You are a helpful AI assistant."
}
}
response = handler(test_data)
print("\nTest Response:")
print(json.dumps(response, indent=2))
# Test with chat format and JSON mode
test_chat_data = {
"inputs": {
"messages": [
{"role": "user", "content": "Create a JSON object with information about the solar system. Include at least 3 planets with their name, diameter, and distance from the sun."}
]
},
"parameters": {
"max_new_tokens": 512,
"temperature": 0.1,
"json_mode": True,
"system_prompt": "You are a helpful AI assistant that responds in JSON format."
}
}
chat_response = handler(test_chat_data)
print("\nJSON Format Response:")
print(json.dumps(chat_response, indent=2))