"""Fine-tune a causal language model with LoRA adapters on an instruction dataset.

The script reads a JSON-lines file whose records carry ``instruction`` and
``response`` fields, renders each record into a single prompt string,
tokenizes it to a fixed length, wraps the base model with LoRA adapters and
trains it with the Hugging Face ``Trainer``.
"""

from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

model_id = "microsoft/phi-3-mini-4k-instruct"
dataset_path = "../0_data_gen/instruct_dataset.jsonl"

# Load the custom dataset.
data = load_dataset("json", data_files=dataset_path)

# Tokenization.
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
    # padding="max_length" below needs a pad token; fall back to EOS when the
    # checkpoint does not define one.
    tokenizer.pad_token = tokenizer.eos_token


def tokenize(example):
    """Render one dataset record as a prompt and tokenize it.

    Args:
        example: Mapping with ``instruction`` and ``response`` entries.

    Returns:
        The tokenizer output for the rendered prompt, truncated and padded
        to exactly 512 tokens.
    """
    # NOTE(review): the prompt carries no end-of-turn / EOS marker after the
    # response -- confirm this matches the chat template the model expects.
    prompt = f"<|user|>{example['instruction']}<|assistant|>{example['response']}"
    return tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512,
    )


train_split = data["train"]
# Drop the raw text columns so only tokenizer outputs reach the collator.
tokenized = train_split.map(tokenize, remove_columns=train_split.column_names)

# Load the base model and attach the LoRA adapters.
model = AutoModelForCausalLM.from_pretrained(model_id)
# NOTE(review): no target_modules is given, so PEFT must know a default for
# this architecture -- verify with the installed PEFT version.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
)
model = get_peft_model(model, peft_config)

# Training.
training_args = TrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    save_total_limit=1,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=torch.cuda.is_available(),
)

# The tokenized rows have no "labels" column, so without a collator the model
# would return no loss. With mlm=False the collator copies input_ids into
# labels and masks padding positions out of the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator,
)
trainer.train()