File size: 6,025 Bytes
5464ca6
 
5a8b969
98b3199
 
 
 
444b661
5981972
5464ca6
 
 
 
 
5a8b969
5464ca6
 
444b661
5981972
5a8b969
 
 
 
 
444b661
5981972
5a8b969
 
5981972
98b3199
 
5a8b969
5981972
5a8b969
5981972
5464ca6
 
dc1bdc8
 
5981972
444b661
5981972
5464ca6
 
5981972
 
 
 
dc1bdc8
5981972
 
 
 
 
 
 
 
98b3199
5981972
 
 
aa53269
5981972
 
 
 
 
aa53269
5981972
 
 
 
 
 
aa53269
 
d6593c8
 
 
 
 
5981972
 
98b3199
5981972
 
 
 
 
 
 
98b3199
5981972
444b661
aa53269
5981972
 
aa53269
98b3199
aa53269
5981972
aa53269
5981972
aa53269
5981972
 
aa53269
5981972
 
 
 
68c3cca
aa53269
 
5981972
 
aa53269
 
5981972
aa53269
5981972
 
 
 
 
aa53269
5981972
aa53269
5981972
 
aa53269
32f4abb
 
 
 
 
5981972
32f4abb
 
aa53269
5981972
d6593c8
 
5981972
 
 
d6593c8
 
 
 
 
 
 
 
 
 
 
 
5981972
d6593c8
 
aa53269
5981972
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px

# Step 1: Emoji translation model (fine-tuned)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


# show_spinner=False keeps this cached call from emitting any UI element, so
# it is safe to run before st.set_page_config() further down the script.
@st.cache_resource(show_spinner=False)
def _load_emoji_translator(model_id: str):
    """Load the emoji-translation tokenizer and causal LM, cached across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, the checkpoint would be instantiated again on each rerun.

    Args:
        model_id: Hugging Face Hub id of the fine-tuned checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is switched to eval mode
        and moved to CUDA when available, otherwise it stays on CPU.
    """
    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        # float16 on GPU, float32 on CPU.
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    ).to("cuda" if use_cuda else "cpu")
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_translator(emoji_model_id)

# Step 2: Offensive text classification model options
# Each pair is (label shown in the sidebar, Hugging Face model id); the order
# here is the order in which the choices are listed.
_CLASSIFIER_CHOICES = (
    ("Toxic-BERT", "unitary/toxic-bert"),
    ("Roberta Offensive", "cardiffnlp/twitter-roberta-base-offensive"),
    ("BERT Emotion", "bhadresh-savani/bert-base-go-emotion"),
)
model_options = dict(_CLASSIFIER_CHOICES)

# Page configuration
st.set_page_config(
    page_title="Emoji Offensive Text Detector",
    page_icon="🚨",
    layout="wide",
)

# Initialize history: one list per browser session, created only if absent.
st.session_state.setdefault("history", [])

# Classification function
def classify_emoji_text(text: str):
    """Translate emoji-laden text, classify the translation, and log the result.

    The text is wrapped in an ``Input: ...\\nOutput:`` prompt and completed by
    the emoji model with greedy decoding (at most 64 new tokens). Whatever
    follows the last ``Output:`` marker in the decoded sequence is passed to
    the module-level ``classifier``; its first result supplies the label and
    score. A record of the run is appended to ``st.session_state.history``.

    Args:
        text: Raw user text, possibly containing emojis.

    Returns:
        A ``(translated, label, score, suggestion)`` tuple.
    """
    encoded = emoji_tokenizer(
        f"Input: {text}\nOutput:", return_tensors="pt"
    ).to(emoji_model.device)
    with torch.no_grad():
        generated = emoji_model.generate(**encoded, max_new_tokens=64, do_sample=False)
    full_text = emoji_tokenizer.decode(generated[0], skip_special_tokens=True)
    # The decoded sequence still contains the prompt; keep only the tail
    # after the final "Output:" marker.
    translated = full_text.rpartition("Output:")[2].strip()

    top_prediction = classifier(translated)[0]
    label, score = top_prediction["label"], top_prediction["score"]
    suggestion = (
        f"The sentence was flagged as '{label}' due to potentially offensive content."
        " Consider replacing emotionally charged or abusive terms."
    )

    # Key order matters: it becomes the column order of the dashboard frame.
    record = {
        "text": text,
        "translated": translated,
        "label": label,
        "score": score,
        "suggestion": suggestion,
    }
    st.session_state.history.append(record)
    return translated, label, score, suggestion

# Sidebar settings
st.sidebar.header("Settings")
selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
selected_model_id = model_options[selected_model]


@st.cache_resource(show_spinner="Loading classification model...")
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for ``model_id``, cached per id.

    Streamlit re-executes the script on every interaction; caching avoids
    rebuilding the pipeline on each rerun and keeps one instance per model
    the user has selected so far.

    Args:
        model_id: Hugging Face Hub id of the classification model.

    Returns:
        A ``transformers`` text-classification pipeline placed on GPU 0 when
        CUDA is available, otherwise on CPU.
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


classifier = _load_classifier(selected_model_id)

# Main page title
st.title("🚨 Emoji Offensive Text Detector & Analysis")


def _render_prediction(label: str, score: float, suggestion: str) -> None:
    """Show the label, confidence and explanation shared by both input paths."""
    st.markdown(f"**Prediction:** {label}")
    st.markdown(f"**Confidence:** {score:.2%}")
    st.markdown("**Model Explanation:**")
    st.info(suggestion)


# Input and classification section
st.markdown("## Input or Upload Text for Classification")
col1, col2 = st.columns([2, 1])

with col1:
    user_input = st.text_area(
        "Enter sentence with emojis:",
        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
        height=150
    )
    if st.button("Analyze Text"):
        if not user_input.strip():
            # Nothing to translate; avoid classifying model output for an
            # empty prompt and logging it to the history.
            st.warning("Please enter some text to analyze.")
        else:
            with st.spinner("Processing..."):
                try:
                    translated, label, score, suggestion = classify_emoji_text(user_input)
                except Exception as e:
                    st.error(f"Error: {e}")
                else:
                    st.markdown("### Translated Sentence:")
                    st.code(translated)
                    _render_prediction(label, score, suggestion)

with col2:
    st.markdown("### Or Upload an Image")
    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded_file:
        try:
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image", use_column_width=True)

            # The uploader keeps returning the same file on every rerun, so
            # analysing unconditionally would repeat OCR + classification and
            # append a duplicate history row each time any widget changes.
            # Re-analyse only when the upload or the chosen classifier differs.
            upload_key = (
                getattr(uploaded_file, "file_id", None),
                uploaded_file.name,
                uploaded_file.size,
                selected_model_id,
            )
            ocr_result = st.session_state.get("ocr_result")
            if ocr_result is None or ocr_result["key"] != upload_key:
                with st.spinner("Running OCR..."):
                    ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
                    # Skip classification when OCR found no text at all.
                    outcome = classify_emoji_text(ocr_text) if ocr_text else None
                ocr_result = {"key": upload_key, "ocr_text": ocr_text, "outcome": outcome}
                st.session_state["ocr_result"] = ocr_result
        except Exception as e:
            # Same boundary handling as the text path: report, don't crash.
            st.error(f"Error: {e}")
        else:
            st.markdown("#### OCR Extracted Text:")
            st.code(ocr_result["ocr_text"])
            if ocr_result["outcome"] is None:
                st.warning("No text was detected in the image.")
            else:
                translated, label, score, suggestion = ocr_result["outcome"]
                st.markdown("#### Translated:")
                st.code(translated)
                _render_prediction(label, score, suggestion)

st.markdown("---")

# Analysis dashboard
st.markdown("## Analysis Dashboard")
history = st.session_state.history
if not history:
    st.info("No data available. Please analyze some text first.")
else:
    df = pd.DataFrame(history)

    # One bullet per analysed input, with translation and suggestion nested.
    st.markdown("### History Records")
    for entry in history:
        headline = (
            f"- **Input:** `{entry['text']}` | **Label:** {entry['label']}"
            f" | **Confidence:** {entry['score']:.2%}"
        )
        st.markdown(headline)
        st.markdown(f"  - Translated: `{entry['translated']}`")
        st.markdown(f"  - Suggestion: {entry['suggestion']} ")

    # Radar chart. NOTE: the scores are fixed literals, not computed from the
    # history above.
    risk_by_category = {
        "Insult": 0.7,
        "Abuse": 0.4,
        "Discrimination": 0.3,
        "Hate Speech": 0.5,
        "Vulgarity": 0.6,
    }
    radar_df = pd.DataFrame({
        "Category": list(risk_by_category),
        "Score": list(risk_by_category.values()),
    })
    radar_fig = px.line_polar(
        radar_df,
        r="Score",
        theta="Category",
        line_close=True,
        title="Risk Radar by Category",
        color_discrete_sequence=["black"],
    )
    st.plotly_chart(radar_fig)

    # Per predicted label: whitespace-separated tokens of the original inputs,
    # each ranked by the highest confidence of any input containing it.
    st.markdown("### Top Offensive Terms by Category")
    for cat in df["label"].unique():
        st.markdown(f"**{cat}**")
        peak_score = {}
        for rec in df[df["label"] == cat].itertuples(index=False):
            for token in rec.text.split():
                peak_score[token] = max(peak_score.get(token, 0), rec.score)
        ranked = sorted(peak_score.items(), key=lambda pair: pair[1], reverse=True)

        # First five inline, the remainder (if any) behind an expander.
        leading, overflow = ranked[:5], ranked[5:]
        for token, value in leading:
            st.markdown(f"- `{token}` ({value:.2%})")
        if overflow:
            with st.expander("Show more"):
                for token, value in overflow:
                    st.markdown(f"- `{token}` ({value:.2%})")