Upload 6 files
- 0_data_gen/app.py +14 -0
- 0_data_gen/generate_dataset.py +22 -0
- 1_train/train.py +35 -0
- 2_space/README.md +6 -0
- 2_space/app.py +14 -0
- 2_space/requirements.txt +5 -0
0_data_gen/app.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+
+model_id = "Hodely/AmInSide1.0"  # Replace with your actual repo
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+def chat(user_input):
+    result = pipe(user_input, max_new_tokens=200, temperature=0.7, do_sample=True)
+    return result[0]['generated_text']
+
+gr.Interface(fn=chat, inputs="text", outputs="text", title="🧠 AmInSide1.0").launch()
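One note on the chat function above: the transformers text-generation pipeline returns the prompt concatenated with the completion, so the Space will echo the user's input back. A minimal sketch of one way to return only the new text, using the pipeline's return_full_text flag:

def chat(user_input):
    result = pipe(
        user_input,
        max_new_tokens=200,
        temperature=0.7,
        do_sample=True,
        return_full_text=False,  # drop the echoed prompt, keep only the completion
    )
    return result[0]["generated_text"]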
0_data_gen/generate_dataset.py
ADDED
@@ -0,0 +1,22 @@
+from transformers import pipeline
+import json
+
+generator = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct", tokenizer="mistralai/Mistral-7B-Instruct")
+
+seed_prompts = [
+    "Dame 10 preguntas filosóficas con respuestas profundas.",
+    "Genera 5 ejemplos tipo ChatGPT con tono sarcástico pero sabio.",
+    "Crea 10 instrucciones para IA educativa con respuestas creativas."
+]
+
+output = []
+for prompt in seed_prompts:
+    result = generator(prompt, max_new_tokens=512)[0]["generated_text"]
+    # You can split and clean the output here; for now we keep it simple
+    output.append({"instruction": prompt, "response": result})
+
+with open("instruct_dataset.jsonl", "w", encoding="utf-8") as f:
+    for example in output:
+        f.write(json.dumps(example, ensure_ascii=False) + "\n")
+
+print("✅ Dataset generated.")
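The comment in the loop leaves the cleanup step open; as generated, each stored response still begins with the seed prompt itself. A minimal sketch of how the loop body could strip it before writing the JSONL (the clean helper is a hypothetical name, not part of the commit):

def clean(prompt, generated):
    # Remove the echoed prompt so only the model's continuation is stored.
    text = generated[len(prompt):] if generated.startswith(prompt) else generated
    return text.strip()

for prompt in seed_prompts:
    result = generator(prompt, max_new_tokens=512)[0]["generated_text"]
    output.append({"instruction": prompt, "response": clean(prompt, result)})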
1_train/train.py
ADDED
@@ -0,0 +1,35 @@
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
+from peft import get_peft_model, LoraConfig, TaskType
+import torch
+
+model_id = "microsoft/phi-3-mini-4k-instruct"
+dataset_path = "../0_data_gen/instruct_dataset.jsonl"
+
+# Load the custom dataset
+data = load_dataset("json", data_files=dataset_path)
+
+# Tokenization
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+def tokenize(example):
+    return tokenizer(f"<|user|>{example['instruction']}<|assistant|>{example['response']}", truncation=True, padding="max_length", max_length=512)
+tokenized = data["train"].map(tokenize)
+
+# Load model + PEFT
+model = AutoModelForCausalLM.from_pretrained(model_id)
+peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.05)
+model = get_peft_model(model, peft_config)
+
+# Training
+training_args = TrainingArguments(
+    output_dir="./model",
+    per_device_train_batch_size=2,
+    num_train_epochs=3,
+    save_total_limit=1,
+    logging_steps=10,
+    learning_rate=2e-4,
+    fp16=torch.cuda.is_available()
+)
+
+trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)
+trainer.train()
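As committed, tokenize produces input_ids and attention_mask but no labels, so Trainer has nothing to compute a loss against. A hedged sketch of one way to handle it, copying input_ids into labels for a plain causal-LM objective and saving the LoRA adapter afterwards (the ./model/adapter path is an assumption):

def tokenize(example):
    tokens = tokenizer(
        f"<|user|>{example['instruction']}<|assistant|>{example['response']}",
        truncation=True, padding="max_length", max_length=512,
    )
    tokens["labels"] = tokens["input_ids"].copy()  # causal LM: the model shifts targets internally
    return tokens

trainer.train()
model.save_pretrained("./model/adapter")      # persists only the LoRA adapter weights
tokenizer.save_pretrained("./model/adapter")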
2_space/README.md
ADDED
@@ -0,0 +1,6 @@
+# AmInSide1.0
+This is the language model created by Hodely. It was trained on examples generated by another model and fine-tuned to combine intelligence, humor, and creativity.
+
+🔹 Trained with LoRA
+🔹 Base: Phi-3 / Mistral
+🔹 Fine-tuned in an instruct style on a custom dataset
2_space/app.py
ADDED
@@ -0,0 +1,14 @@
+from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+
+model_id = "Hodely/AmInSide1.0"  # Replace with your actual repo
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id)
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+
+def chat(user_input):
+    result = pipe(user_input, max_new_tokens=200, temperature=0.7, do_sample=True)
+    return result[0]['generated_text']
+
+gr.Interface(fn=chat, inputs="text", outputs="text", title="🧠 AmInSide1.0").launch()
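If the Hodely/AmInSide1.0 repo ends up holding only the LoRA adapter produced by 1_train/train.py rather than merged weights, AutoModelForCausalLM.from_pretrained will not apply it. A sketch of loading the adapter on top of its base model with peft, assuming the adapter was saved with save_pretrained and the tokenizer files are also present in the repo:

from peft import AutoPeftModelForCausalLM

# Reads the adapter config, pulls the base model it was trained on, and attaches the adapter.
model = AutoPeftModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)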
2_space/requirements.txt
ADDED
@@ -0,0 +1,5 @@
+transformers
+gradio
+peft
+datasets
+accelerate
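transformers also needs a deep-learning backend that this requirements.txt does not list, and a Gradio Space will not install one implicitly. A likely addition, left unpinned as an assumption:

torch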