Spaces:

BugZoid
/

text-humanizer

Running

File size: 5,463 Bytes

d665c22
fcb0322
 
8bf558e
fcb0322
 
 
d665c22
fcb0322
 
 
 
 
 
 
 
b564db7
fcb0322
 
987baef
223938e
 
 
 
 
 
 
 
 
 
 
fcb0322
 
 
223938e
 
fcb0322
 
 
223938e
fcb0322
 
 
 
 
8bf558e
223938e
fcb0322
223938e
 
 
 
fcb0322
 
223938e
 
987baef
fcb0322
 
 
 
223938e
 
8bf558e
 
223938e
8bf558e
 
fcb0322
8bf558e
fcb0322
8bf558e
fcb0322
 
 
 
 
8bf558e
223938e
fcb0322
223938e
 
 
 
 
 
fcb0322
 
223938e
 
987baef
fcb0322
 
987baef
8bf558e
fcb0322
8bf558e
223938e
 
fcb0322
 
 
 
 
 
8bf558e
fcb0322
 
 
 
8bf558e
 
 
223938e
 
 
 
 
fcb0322
 
 
 
 
987baef
fcb0322
 
 
 
 
 
 
223938e
fcb0322
 
 
 
 
 
 
 
223938e
fcb0322
 
223938e
fcb0322

import streamlit as st
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration, 
    T5Tokenizer
)

# One-time model bootstrap: the 'models_loaded' key in st.session_state marks
# that the tokenizers and models below have already been stored there.
if 'models_loaded' not in st.session_state:
    # Main rewriting model and its tokenizer (t5-base checkpoint)
    t5_checkpoint = "t5-base"
    st.session_state["t5_tokenizer"] = T5Tokenizer.from_pretrained(t5_checkpoint)
    st.session_state["t5_model"] = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)

    # Model and tokenizer used for the paraphrasing pass
    bart_checkpoint = "facebook/bart-large-cnn"
    st.session_state["paraphrase_tokenizer"] = AutoTokenizer.from_pretrained(bart_checkpoint)
    st.session_state["paraphrase_model"] = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint)

    # The flag is written last, so it is only set if every load above succeeded
    st.session_state["models_loaded"] = True

def ensure_minimum_length(text, original_text):
    """
    Pad ``text`` so it has at least as many words as ``original_text``.

    Words are whitespace-separated tokens as produced by ``str.split()``.
    When ``text`` is shorter, the trailing words of ``original_text`` are
    appended, exactly as many as are missing. Text that is already long
    enough is returned unchanged.

    Args:
        text: Generated text to pad.
        original_text: Reference text whose word count is the minimum.

    Returns:
        ``text`` itself when it is long enough; otherwise ``text`` followed
        by a single space and the missing trailing words of
        ``original_text`` joined by single spaces (just the words when
        ``text`` is empty).
    """
    original_words = original_text.split()
    missing_words = len(original_words) - len(text.split())
    if missing_words <= 0:
        return text

    # Pad with whole words rather than characters: slicing the raw string
    # can yield word fragments or pure whitespace, and whitespace-only
    # padding never raises the word count.
    padding = " ".join(original_words[-missing_words:])
    return f"{text} {padding}" if text else padding

def paraphrase_text(text, original_text):
    """
    Apply paraphrasing to the input text using the BART model.

    The tokenizer and model are read from ``st.session_state``
    (``paraphrase_tokenizer`` / ``paraphrase_model``).

    Args:
        text: Text to paraphrase.
        original_text: Reference text. Its word count is used as the
            minimum generation length and as the minimum word count of
            the returned text.

    Returns:
        The decoded model output, passed through ``ensure_minimum_length``
        so it has at least as many words as ``original_text``.
    """
    max_length = 1024
    # Use the original's word count as the minimum generation length, but
    # never let it exceed max_length: a very long original would otherwise
    # request length constraints that cannot both be satisfied.
    min_length = min(len(original_text.split()), max_length)

    tokenizer = st.session_state.paraphrase_tokenizer
    model = st.session_state.paraphrase_model

    inputs = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True
    )

    outputs = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,  # Minimum length derived from the original text
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        repetition_penalty=1.2,
        length_penalty=2.0  # Raised to favour longer outputs
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return ensure_minimum_length(result, original_text)

def humanize_text(text):
    """
    Humanize the input text using the T5 model.

    The tokenizer and model are read from ``st.session_state``
    (``t5_tokenizer`` / ``t5_model``). The text is embedded in a fixed
    Portuguese instruction prompt before being passed to the model.

    Args:
        text: Text to rewrite.

    Returns:
        The decoded model output, passed through ``ensure_minimum_length``
        so it has at least as many words as ``text``.
    """
    max_length = 1024
    # Use the input's word count as the minimum generation length, but never
    # let it exceed max_length: a very long input would otherwise request
    # length constraints that cannot both be satisfied.
    min_length = min(len(text.split()), max_length)

    prompt = (
        f"reescreva o seguinte texto em português de forma mais natural e humana, "
        f"mantendo todas as informações e expandindo com detalhes relevantes: {text}"
    )

    tokenizer = st.session_state.t5_tokenizer
    model = st.session_state.t5_model

    input_ids = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_length,
        truncation=True
    ).input_ids

    outputs = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,  # Minimum length derived from the input text
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
        num_beams=5,
        no_repeat_ngram_size=3,
        repetition_penalty=1.2,
        length_penalty=2.0  # Raised to favour longer outputs
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return ensure_minimum_length(result, text)

# UI components: browser-tab metadata, the page title, and a short
# introduction (user-facing copy is in Portuguese).
# NOTE(review): Streamlit documents set_page_config as having to run before
# other Streamlit commands — confirm nothing added earlier in the script
# renders an element first.
st.set_page_config(page_title="Advanced Text Humanizer", page_icon="🤖")

st.title("🤖 → 🧑 Humanizador de Texto Avançado")
st.markdown("""
Este aplicativo transforma textos robotizados em linguagem mais natural e humana, 
mantendo todas as informações originais e garantindo que o texto final seja pelo menos 
do mesmo tamanho que o original.
""")

# Multi-line input for the text to be rewritten. The value is read again
# further down by the sidebar word count and by the "Humanizar" button handler.
input_text = st.text_area(
    "Cole seu texto de robô aqui:",
    height=150,
    help="Cole seu texto aqui para transformá-lo em uma versão mais natural e humana."
)

# Sidebar: processing toggles plus a quick word count of the pasted text
with st.sidebar:
    st.header("Configurações Avançadas")
    use_paraphrase = st.checkbox("Ativar Paráfrase", value=True)
    show_original = st.checkbox("Mostrar Texto Original", value=False)

    # Text statistics are only shown once some text has been entered
    if input_text:
        original_word_count = len(input_text.split())
        st.write("Informações do texto:")
        st.write(f"Palavras no original: {original_word_count}")

# Process button: runs the humanize pass (and optionally the paraphrase pass)
# on the pasted text and renders the result; any failure during processing is
# reported in the page instead of aborting the script run.
if st.button("Humanizar", type="primary"):
    # Whitespace-only input contains no words to rewrite, so treat it the
    # same as an empty box instead of sending it to the models.
    if not input_text.strip():
        st.warning("⚠️ Por favor, cole um texto de robô primeiro!")
    else:
        with st.spinner("Processando o texto..."):
            try:
                # First humanization pass
                humanized_text = humanize_text(input_text)

                # Optional paraphrasing pass; the original input is passed
                # along so the result is held to the original's word count
                if use_paraphrase:
                    final_text = paraphrase_text(humanized_text, input_text)
                else:
                    final_text = humanized_text

                # Display results
                st.success("✨ Texto humanizado:")
                if show_original:
                    st.text("Texto original:")
                    st.info(input_text)
                    st.write(f"Palavras no original: {len(input_text.split())}")
                st.markdown("**Resultado:**")
                st.write(final_text)
                st.write(f"Palavras no resultado: {len(final_text.split())}")

            except Exception as e:
                # Top-level UI boundary: surface the error message to the user
                st.error(f"❌ Ocorreu um erro durante o processamento: {e}")

# Footer: a horizontal rule followed by a centered credit line.
# unsafe_allow_html=True is passed together with inline <div>/<small> markup;
# the markup is a static literal with no user input interpolated into it.
st.markdown("---")
st.markdown(
    """
    <div style='text-align: center'>
        <small>Desenvolvido com ❤️ usando Streamlit e Transformers</small>
    </div>
    """,
    unsafe_allow_html=True
)