Spaces:

aeresd
/

test_1

Sleeping

File size: 6,025 Bytes

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px

# Step 1: Emoji translation model (fine-tuned)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


@st.cache_resource(show_spinner=False)
def _load_emoji_model(model_id: str):
    """Load the emoji-translation tokenizer and model once and reuse them.

    Streamlit re-executes this script from the top on every user interaction,
    so an uncached load would re-read the checkpoint each time a widget
    changes. ``st.cache_resource`` keeps a single shared instance instead.

    ``show_spinner=False`` stops the cache decorator from drawing a spinner:
    this loader runs before ``st.set_page_config``, which must be the first
    Streamlit command that touches the page.

    Args:
        model_id: Hugging Face Hub id of the fine-tuned causal language model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is on CUDA when available
        (float16) or on CPU (float32), and is switched to eval mode.
    """
    use_cuda = torch.cuda.is_available()
    # NOTE(security): trust_remote_code=True executes Python code shipped in
    # the model repository; only keep it for repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    ).to("cuda" if use_cuda else "cpu")
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_model(emoji_model_id)

# Step 2: Offensive text classification model options
# Each pair is (name shown in the sidebar selectbox, Hugging Face Hub model id).
_MODEL_CHOICES = (
    ("Toxic-BERT", "unitary/toxic-bert"),
    ("Roberta Offensive", "cardiffnlp/twitter-roberta-base-offensive"),
    ("BERT Emotion", "bhadresh-savani/bert-base-go-emotion"),
)
# Insertion order is preserved, so the selectbox lists the models in this order.
model_options = dict(_MODEL_CHOICES)

# Page configuration
st.set_page_config(
    layout="wide",
    page_icon="🚨",
    page_title="Emoji Offensive Text Detector",
)

# Initialize history: a list of per-analysis dicts stored in session state so
# it is still there when the script is re-run.
if "history" not in st.session_state:
    st.session_state["history"] = []

# Classification function
def classify_emoji_text(text: str):
    """Translate emoji-laden text, classify the translation, and record it.

    The text is wrapped in an ``Input: ...\\nOutput:`` prompt and passed to the
    module-level ``emoji_model`` (sampling disabled, at most 64 new tokens).
    Whatever follows the last ``Output:`` marker in the decoded sequence is
    treated as the translation and fed to the module-level ``classifier``.

    Side effect: one record with the keys ``text``, ``translated``, ``label``,
    ``score`` and ``suggestion`` is appended to ``st.session_state.history``.

    Args:
        text: Raw user text, possibly containing emojis.

    Returns:
        A ``(translated, label, score, suggestion)`` tuple, where ``label`` and
        ``score`` come from the classifier's first result.
    """
    encoded_prompt = emoji_tokenizer(
        f"Input: {text}\nOutput:", return_tensors="pt"
    ).to(emoji_model.device)
    with torch.no_grad():
        generated = emoji_model.generate(
            **encoded_prompt, max_new_tokens=64, do_sample=False
        )
    full_text = emoji_tokenizer.decode(generated[0], skip_special_tokens=True)
    # The decoded sequence still contains the prompt; keep only the text after
    # the last "Output:" marker (or everything, if the marker is absent).
    translated = full_text.rpartition("Output:")[2].strip()

    top_prediction = classifier(translated)[0]
    label, score = top_prediction["label"], top_prediction["score"]
    # NOTE(review): this wording is used for every label, including ones that
    # may not indicate offensive content -- confirm against each model's labels.
    suggestion = (
        f"The sentence was flagged as '{label}' due to potentially offensive content. "
        "Consider replacing emotionally charged or abusive terms."
    )

    record = dict(
        text=text,
        translated=translated,
        label=label,
        score=score,
        suggestion=suggestion,
    )
    st.session_state.history.append(record)
    return translated, label, score, suggestion

# Sidebar settings
st.sidebar.header("Settings")
selected_model = st.sidebar.selectbox("Classification Model", list(model_options))
selected_model_id = model_options[selected_model]


@st.cache_resource
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for ``model_id`` once and reuse it.

    Streamlit re-executes the script on every interaction; without caching the
    pipeline (and its model weights) would be rebuilt on each rerun. The cache
    is keyed by ``model_id``, so each selectable model is loaded at most once
    and switching back to a previously used model is immediate.

    Args:
        model_id: Hugging Face Hub id of the classification model.

    Returns:
        A ``transformers`` text-classification pipeline, on GPU 0 when CUDA is
        available and on CPU otherwise.
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


# Module-level name read by classify_emoji_text at call time.
classifier = _load_classifier(selected_model_id)

# Main page title
st.title("🚨 Emoji Offensive Text Detector & Analysis")

# Input and classification section: a wider column for typed text (col1) next
# to a narrower one for image upload (col2), in a 2:1 width ratio.
st.markdown("## Input or Upload Text for Classification")
text_column_weight, image_column_weight = 2, 1
col1, col2 = st.columns([text_column_weight, image_column_weight])

# Left column: free-text input, analyzed when the button is pressed.
with col1:
    user_input = st.text_area(
        "Enter sentence with emojis:",
        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
        height=150
    )
    if st.button("Analyze Text"):
        if not user_input.strip():
            # Nothing to translate: skip the models so that no meaningless
            # record is appended to the history.
            st.warning("Please enter some text to analyze.")
        else:
            with st.spinner("Processing..."):
                # Top-level UI boundary: report any model or tokenizer failure
                # to the user instead of aborting the page render.
                try:
                    translated, label, score, suggestion = classify_emoji_text(user_input)
                    st.markdown("### Translated Sentence:")
                    st.code(translated)
                    st.markdown(f"**Prediction:** {label}")
                    st.markdown(f"**Confidence:** {score:.2%}")
                    st.markdown("**Model Explanation:**")
                    st.info(suggestion)
                except Exception as e:
                    st.error(f"Error: {e}")

# Right column: image upload -> OCR -> same translation/classification path.
with col2:
    st.markdown("### Or Upload an Image")
    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded_file:
        # Same UI-boundary handling as the text path: a broken image or an OCR
        # failure is shown as an error message instead of aborting the render.
        try:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image", use_column_width=True)

            # The uploaded file stays attached to the widget across reruns, and
            # Streamlit re-executes the script on every interaction. Remember
            # the last processed upload so OCR and classification run (and the
            # history grows) only once per upload and selected model.
            # Name and size identify the upload; a different file with the
            # same name and size would be treated as already processed.
            upload_key = (uploaded_file.name, uploaded_file.size, selected_model_id)
            if (
                "ocr_result" not in st.session_state
                or st.session_state["ocr_result"]["key"] != upload_key
            ):
                with st.spinner("Running OCR..."):
                    ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
                    # Skip classification when OCR found no text at all.
                    analysis = classify_emoji_text(ocr_text) if ocr_text else None
                st.session_state["ocr_result"] = {
                    "key": upload_key,
                    "ocr_text": ocr_text,
                    "analysis": analysis,
                }

            ocr_result = st.session_state["ocr_result"]
            st.markdown("#### OCR Extracted Text:")
            st.code(ocr_result["ocr_text"])
            if ocr_result["analysis"] is None:
                st.warning("No text could be extracted from the image.")
            else:
                translated, label, score, suggestion = ocr_result["analysis"]
                st.markdown("#### Translated:")
                st.code(translated)
                st.markdown(f"**Prediction:** {label}")
                st.markdown(f"**Confidence:** {score:.2%}")
                st.markdown("**Model Explanation:**")
                st.info(suggestion)
        except Exception as e:
            st.error(f"Error: {e}")

st.markdown("---")

# Analysis dashboard
st.markdown("## Analysis Dashboard")
if not st.session_state.history:
    st.info("No data available. Please analyze some text first.")
else:
    df = pd.DataFrame(st.session_state.history)

    # One bullet (with two sub-lines) per analyzed input, oldest first.
    st.markdown("### History Records")
    for record in st.session_state.history:
        st.markdown(
            f"- **Input:** `{record['text']}` | **Label:** {record['label']} | **Confidence:** {record['score']:.2%}"
        )
        st.markdown(f"  - Translated: `{record['translated']}`")
        st.markdown(f"  - Suggestion: {record['suggestion']} ")

    # Radar chart
    # NOTE(review): these scores are hard-coded constants; they are not
    # computed from the history above.
    radar_categories = ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"]
    radar_scores = [0.7, 0.4, 0.3, 0.5, 0.6]
    radar_df = pd.DataFrame({"Category": radar_categories, "Score": radar_scores})
    radar_fig = px.line_polar(
        radar_df,
        theta='Category',
        r='Score',
        title="Risk Radar by Category",
        line_close=True,
        color_discrete_sequence=['black'],
    )
    st.plotly_chart(radar_fig)

    # Analyze words related to each offensive category
    st.markdown("### Top Offensive Terms by Category")
    for cat in df['label'].unique():
        st.markdown(f"**{cat}**")
        in_category = df[df['label'] == cat]

        # For every whitespace-separated token of the original input, keep the
        # highest confidence among this category's records that contain it.
        best_score = {}
        for sentence, confidence in zip(in_category['text'], in_category['score']):
            for token in sentence.split():
                best_score[token] = max(best_score.get(token, 0), confidence)

        ranked = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)
        top_five, remainder = ranked[:5], ranked[5:]

        # The five highest-scoring tokens are always visible ...
        for token, confidence in top_five:
            st.markdown(f"- `{token}` ({confidence:.2%})")
        # ... and any further ones are tucked into a collapsible section.
        if remainder:
            with st.expander("Show more"):
                for token, confidence in remainder:
                    st.markdown(f"- `{token}` ({confidence:.2%})")