""" Main Integration Module for Agent Tuning Optimization Framework This module provides functionality for integrating all components of the framework and running end-to-end experiments. """ import os import json import argparse from typing import List, Dict, Any, Union, Optional, Tuple from models.llm_interface import LLMInterface from data.trajectory_data import Trajectory, TrajectoryDataset, create_synthetic_dataset from training.negative_samples import create_negative_sample_generator from training.synthetic_trajectories import create_synthetic_trajectory_generator from training.agent_tuner import create_agent_tuner from evaluation.evaluators import create_agent_evaluator def run_experiment( experiment_config: Dict[str, Any], output_dir: str ) -> Dict[str, Any]: """ Run an end-to-end experiment with the framework. Args: experiment_config: Experiment configuration output_dir: Directory to save results Returns: Dictionary of experiment results """ print(f"Starting experiment: {experiment_config['name']}") # Create output directory os.makedirs(output_dir, exist_ok=True) # Save experiment configuration with open(f"{output_dir}/experiment_config.json", "w") as f: json.dump(experiment_config, f, indent=2) # Initialize LLM interface print("Initializing LLM interface...") llm_config = experiment_config.get("llm", {}) llm_interface = LLMInterface( model_name=llm_config.get("model_name", "gpt2"), model_type=llm_config.get("model_type", "causal"), device=llm_config.get("device", "cpu"), max_length=llm_config.get("max_length", 512), temperature=llm_config.get("temperature", 0.7) ) # Load or create dataset print("Preparing dataset...") dataset_config = experiment_config.get("dataset", {}) if dataset_config.get("path"): # Load existing dataset dataset = TrajectoryDataset(dataset_config.get("name", "experiment_dataset")) dataset.load_from_json(dataset_config["path"]) else: # Create synthetic dataset dataset = create_synthetic_dataset(dataset_config.get("num_trajectories", 20)) print(f"Dataset loaded with {len(dataset.trajectories)} trajectories") # Generate negative samples print("Generating negative samples...") negative_config = experiment_config.get("negative_samples", {}) if negative_config.get("enabled", True): negative_generator = create_negative_sample_generator( negative_config.get("method", "response_degradation") ) positive_trajectories = dataset.get_trajectories(positive_only=True) negative_trajectories = negative_generator.batch_generate( positive_trajectories, **negative_config.get("params", {}) ) # Add negative trajectories to dataset for trajectory in negative_trajectories: dataset.add_trajectory(trajectory) print(f"Added {len(negative_trajectories)} negative trajectories") # Generate synthetic trajectories print("Generating synthetic trajectories...") synthetic_config = experiment_config.get("synthetic_trajectories", {}) if synthetic_config.get("enabled", True): synthetic_generator = create_synthetic_trajectory_generator( synthetic_config.get("method", "template"), llm_interface if synthetic_config.get("method") in ["llm", "hybrid"] else None ) # Generate from task descriptions task_descriptions = [t.task_description for t in dataset.get_trajectories(positive_only=True)] task_descriptions = list(set(task_descriptions)) # Remove duplicates synthetic_trajectories = synthetic_generator.batch_generate( task_descriptions, **synthetic_config.get("params", {}) ) # Add synthetic trajectories to dataset for trajectory in synthetic_trajectories: dataset.add_trajectory(trajectory) print(f"Added {len(synthetic_trajectories)} synthetic trajectories") # Save the enhanced dataset dataset.save_to_json(f"{output_dir}/enhanced_dataset.json") # Analyze dataset dataset_stats = dataset.analyze_dataset() with open(f"{output_dir}/dataset_stats.json", "w") as f: json.dump(dataset_stats, f, indent=2) # Split dataset for training and evaluation all_trajectories = dataset.get_trajectories() split_idx = int(len(all_trajectories) * 0.8) # 80% for training train_trajectories = all_trajectories[:split_idx] eval_trajectories = all_trajectories[split_idx:] print(f"Split dataset: {len(train_trajectories)} for training, {len(eval_trajectories)} for evaluation") # Tune agent print("Tuning agent...") tuning_config = experiment_config.get("tuning", {}) tuner = create_agent_tuner(tuning_config.get("method", "supervised")) tuned_model, tuning_metrics = tuner.tune( model_name=llm_config.get("model_name", "gpt2"), trajectories=train_trajectories, output_dir=f"{output_dir}/tuned_model", **tuning_config.get("params", {}) ) # Save tuning metrics with open(f"{output_dir}/tuning_metrics.json", "w") as f: # Convert any non-serializable values to strings serializable_metrics = {} for k, v in tuning_metrics.items(): if isinstance(v, (int, float, str, bool, list, dict)) or v is None: serializable_metrics[k] = v else: serializable_metrics[k] = str(v) json.dump(serializable_metrics, f, indent=2) # Create tuned model interface tuned_llm_interface = LLMInterface( model_name=f"{output_dir}/tuned_model", model_type=llm_config.get("model_type", "causal"), device=llm_config.get("device", "cpu"), max_length=llm_config.get("max_length", 512), temperature=llm_config.get("temperature", 0.7) ) # Evaluate agent print("Evaluating agent...") eval_config = experiment_config.get("evaluation", {}) evaluator = create_agent_evaluator(eval_config.get("method", "quality")) eval_results = evaluator.evaluate( llm_interface=tuned_llm_interface, test_trajectories=eval_trajectories, **eval_config.get("params", {}) ) # Visualize evaluation results evaluator.visualize_results( results=eval_results, output_dir=f"{output_dir}/evaluation" ) # Save evaluation results with open(f"{output_dir}/evaluation_results.json", "w") as f: # Create a simplified version without large data simplified_results = {} if "aggregated" in eval_results: simplified_results["aggregated"] = eval_results["aggregated"] if "metrics" in eval_results: # Include only essential metrics simplified_results["metrics"] = [ {k: v for k, v in m.items() if k not in ["generated_responses"]} for m in eval_results["metrics"] ] json.dump(simplified_results, f, indent=2) # Comparative evaluation (if configured) if eval_config.get("comparative", {}).get("enabled", False): print("Performing comparative evaluation...") # Create baseline model interface baseline_llm_interface = LLMInterface( model_name=llm_config.get("model_name", "gpt2"), model_type=llm_config.get("model_type", "causal"), device=llm_config.get("device", "cpu"), max_length=llm_config.get("max_length", 512), temperature=llm_config.get("temperature", 0.7) ) # Create comparative evaluator comparative_evaluator = create_agent_evaluator("comparative") # Evaluate and compare comparative_results = comparative_evaluator.evaluate( llm_interfaces={ "baseline": baseline_llm_interface, "tuned": tuned_llm_interface }, test_trajectories=eval_trajectories, **eval_config.get("comparative", {}).get("params", {}) ) # Visualize comparative results comparative_evaluator.visualize_results( results=comparative_results, output_dir=f"{output_dir}/comparative" ) # Save comparative results with open(f"{output_dir}/comparative_results.json", "w") as f: # Create a simplified version simplified_comparative = { "comparative": comparative_results.get("comparative", {}) } json.dump(simplified_comparative, f, indent=2) print(f"Experiment completed. Results saved to {output_dir}") return { "dataset_stats": dataset_stats, "tuning_metrics": tuning_metrics, "evaluation_results": eval_results } def main(): """Main function for running the framework from command line.""" parser = argparse.ArgumentParser(description="Agent Tuning Optimization Framework") parser.add_argument("--config", type=str, required=True, help="Path to experiment configuration file") parser.add_argument("--output", type=str, default="./experiment_results", help="Directory to save results") args = parser.parse_args() # Load experiment configuration with open(args.config, "r") as f: experiment_config = json.load(f) # Run experiment run_experiment(experiment_config, args.output) if __name__ == "__main__": main()