# Page banner captured along with the source (not program code): "Spaces: Sleeping"
import json
import logging
import math
import os

import evaluate  # type: ignore
import gradio as gr
from datasets import load_dataset, concatenate_datasets  # list_datasets, load_from_disk
from datasets import load_dataset, DownloadConfig
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    pipeline,
)
from translate import Translator
# Base model: one checkpoint name shared by the model and its tokenizer.
MODEL_KEY = "EleutherAI/gpt-neo-125M"

model = AutoModelForCausalLM.from_pretrained(MODEL_KEY)
tokenizer = AutoTokenizer.from_pretrained(MODEL_KEY)

# Text-generation pipeline wrapping the model and tokenizer loaded above.
generator = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
)
# Domain and style tag for each supported dataset, keyed by dataset name.
# generate_text() places the tag at the start of the prompt it builds.
context_map = {
    "imdb": "Dom: Cine | Estilo: Opinión",
    "daily_dialog": "Dom: Conversación | Estilo: Diálogo diario",
    "go_emotions": "Dom: Emociones | Estilo: Clasificación emocional",
    "wikitext": "Dom: Enciclopedia | Estilo: Conocimiento general",
}

# Dataset names available for selection, in the declaration order of
# context_map (iterating a dict yields its keys, so .keys() is not needed).
available_datasets = list(context_map)
# Función para generar texto | |
def generate_text(dataset_name, sample_index, max_length): | |
dataset = load_dataset(dataset_name, split="train[:1%]") # Ligero | |
if sample_index >= len(dataset): | |
return "Índice fuera de rango." | |
example = dataset[sample_index] | |
text = example.get("text") or example.get("utterance") or example.get("content") or str(example) | |
context = context_map.get(dataset_name, "Dom: Desconocido | Estilo: Desconocido") | |
prompt = f"{context} | Entrada: {text}" | |
output = generator(prompt, max_length=int(max_length), num_return_sequences=1)[0]["generated_text"] | |
return output | |
# Translation
def translate_text(text, lang):
    """Translate ``text`` into the language identified by ``lang``.

    Args:
        text: Text to translate.
        lang: Target language code, passed to ``Translator(to_lang=...)``.

    Returns:
        The translated text; an empty string when ``text`` is empty, ``None``
        or whitespace only; or a string of the form ``"Error: <message>"``
        when building the translator or translating raises.
    """
    # Nothing to translate: skip the translator entirely.
    if not text or not text.strip():
        return ""

    try:
        # Built inside the try so construction failures are reported through
        # the same "Error: ..." path as translation failures.
        return Translator(to_lang=lang).translate(text)
    except Exception as e:
        # Deliberately broad: the result is shown in an output box, so any
        # failure is surfaced as text instead of propagating.
        return f"Error: {e}"
# Gradio interface: one tab to generate text from a dataset example and one
# tab to translate arbitrary text.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 MultiDomain Text Generator + Translator")

    with gr.Tab("Generar desde dataset"):
        dataset_name = gr.Dropdown(choices=available_datasets, value="imdb", label="Elige dataset")
        # The range starts at 0 so the default value (0) lies inside it and
        # the first examples of the dataset can be selected.
        sample_index = gr.Slider(minimum=0, maximum=200, step=1, label="Índice del ejemplo", value=0)
        max_len = gr.Slider(label="Longitud máxima", minimum=50, maximum=1024, step=4, value=104)
        output_text = gr.Textbox(label="Texto generado")
        btn_generate = gr.Button("Generar texto")
        # Inputs are passed positionally: (dataset_name, sample_index, max_length).
        btn_generate.click(generate_text, inputs=[dataset_name, sample_index, max_len], outputs=output_text)

    with gr.Tab("Traducir texto"):
        input_text = gr.Textbox(label="Texto a traducir")
        lang = gr.Textbox(label="Código de idioma destino", value="en")
        output_translation = gr.Textbox(label="Texto traducido")
        btn_translate = gr.Button("Traducir")
        # Inputs are passed positionally: (text, lang).
        btn_translate.click(translate_text, inputs=[input_text, lang], outputs=output_translation)

demo.launch()