from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px
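
# NOTE: pytesseract is only a wrapper; the Tesseract OCR engine (including the
# "chi_sim" language data used below) must be installed on the host separately.
# Launch the app with `streamlit run <this_file>.py`.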

# Rule-based keyword lists (Chinese) used by dynamic_scoring; rough English
# glosses are given per category.
OFFENSIVE_CATEGORIES = {
    "Insult": ["蠢货", "白痴", "废物"],             # "moron", "idiot", "useless trash"
    "Abuse": ["去死", "打死", "宰了你"],            # "go die", "beat to death", "I'll kill you"
    "Discrimination": ["女司机", "娘娘腔", "黑鬼"],  # sexist / effeminacy / racial slurs
    "HateSpeech": ["灭族", "屠杀", "灭绝"],          # "wipe out a people", "massacre", "exterminate"
    "Vulgarity": ["艹", "sb", "尼玛"]               # common Chinese profanities
}

# Fine-tuned Qwen1.5 model that rewrites emoji-laden text into plain text
# before classification.
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model = AutoModelForCausalLM.from_pretrained(
    emoji_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
emoji_model.eval()
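
# NOTE: Streamlit re-executes this script on every interaction, so the model
# above is reloaded each time. A minimal sketch (assuming a Streamlit version
# that provides st.cache_resource) would wrap the loading in a cached helper, e.g.:
#
#     @st.cache_resource
#     def load_emoji_model(model_id: str):
#         tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#         mdl = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
#         return tok, mdl.eval()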

# Classifier checkpoints selectable from the sidebar.
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}
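
# NOTE: these checkpoints emit different label sets (toxicity types,
# offensive / not-offensive, and GoEmotions emotion names respectively), so both
# the "Prediction" shown in the UI and the label matching inside dynamic_scoring
# depend on which model is selected.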


def dynamic_scoring(text: str, classifier):
    """Combine keyword hits and per-word classifier scores into per-category
    risk scores normalized to [0, 1]."""
    scores = {k: 0.0 for k in OFFENSIVE_CATEGORIES}

    # Rule-based component: each keyword hit adds a fixed weight to its category.
    for category, keywords in OFFENSIVE_CATEGORIES.items():
        for kw in keywords:
            if kw in text:
                scores[category] += 0.3

    # Model-based component: score each whitespace-separated token and add it to
    # the matching category. This only contributes when the classifier's label
    # names happen to match the category keys above; note also that .split()
    # does not segment unspaced Chinese text.
    words = text.split()
    for word in words:
        try:
            res = classifier(word)[0]
            if res["label"] in scores:
                scores[res["label"]] += res["score"] * 0.7
        except Exception:
            pass

    # Normalize so the highest-scoring category is 1.0 (guard against all-zero scores).
    max_score = max(scores.values()) or 1
    return {k: round(v / max_score, 2) for k, v in scores.items()}
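
# Worked example (assuming the selected classifier's labels do not match the
# category keys): for "你是蠢货" ("you are a moron"), only the keyword "蠢货"
# fires, so Insult scores 0.3 before normalization and the function returns
# {"Insult": 1.0, "Abuse": 0.0, "Discrimination": 0.0, "HateSpeech": 0.0, "Vulgarity": 0.0}.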


def classify_emoji_text(text: str):
    """Translate emoji text to plain text with the fine-tuned Qwen model,
    classify the result with the selected classifier (a module-level global
    created in the sidebar), and record everything in the session history."""
    prompt = f"输入:{text}\n输出:"  # "Input: ... / Output:" prompt format used by the fine-tuned model
    input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    translated_text = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()

    result = classifier(translated_text)[0]
    label = result["label"]
    score = result["score"]
    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases."

    category_scores = dynamic_scoring(translated_text, classifier)

    st.session_state.history.append({
        "text": text,
        "translated": translated_text,
        "label": label,
        "score": score,
        "reason": reasoning,
        "scores": category_scores
    })
    return translated_text, label, score, reasoning, category_scores


def generate_radar_chart(scores_dict: dict):
    """Build a radar (polar line) chart from per-category scores in [0, 1]."""
    radar_df = pd.DataFrame({
        "Category": list(scores_dict.keys()),
        "Score": list(scores_dict.values())
    })

    fig = px.line_polar(
        radar_df,
        r='Score',
        theta='Category',
        line_close=True,
        color_discrete_sequence=['#FF6B6B'],
        title="🛡️ Multi-Dimensional Offensive Analysis"
    )
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                tickvals=[0, 0.3, 0.7, 1],
                ticktext=["Safe", "Caution", "Risk", "Danger"]
            )),
        showlegend=False
    )
    return fig


st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")

with st.sidebar:
    st.header("🧠 Configuration")
    selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
    selected_model_id = model_options[selected_model]
    classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)

if "history" not in st.session_state:
    st.session_state.history = []


st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")

st.subheader("1. Input & Classification")
default_text = "你是🐷"  # default example: "you are a 🐷" (emoji used as an insult)
text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)

if st.button("🚦 Analyze Text"):
    with st.spinner("🔍 Processing..."):
        try:
            translated, label, score, reason, category_scores = classify_emoji_text(text)

            st.markdown("**Translated sentence:**")
            st.code(translated, language="text")

            st.plotly_chart(generate_radar_chart(category_scores))
        except Exception as e:
            st.error(f"❌ Analysis failed: {e}")

st.markdown("---")
st.subheader("2. Image OCR & Classification")
uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file:
    image = Image.open(uploaded_file)
    st.image(image, caption="Uploaded Screenshot", use_column_width=True)
    with st.spinner("🧠 Extracting text via OCR..."):
        ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
    if ocr_text:
        st.markdown("**Extracted Text:**")
        st.code(ocr_text)
        # classify_emoji_text returns five values; category_scores is unused here.
        translated, label, score, reason, category_scores = classify_emoji_text(ocr_text)
        st.markdown("**Translated sentence:**")
        st.code(translated, language="text")
        st.markdown(f"**Prediction:** {label}")
        st.markdown(f"**Confidence Score:** {score:.2%}")
        st.markdown("**Model Explanation:**")
        st.info(reason)
    else:
        st.info("⚠️ No text detected in the image.")

st.markdown("---")
st.subheader("3. Violation Analysis Dashboard")
if st.session_state.history:
    df = pd.DataFrame(st.session_state.history)

    st.markdown("### 🧾 Offensive Terms & Suggestions")
    for item in st.session_state.history:
        st.markdown(f"- 🔹 **Input:** {item['text']}")
        st.markdown(f"  - ✨ **Translated:** {item['translated']}")
        st.markdown(f"  - ❗ **Label:** {item['label']} with **{item['score']:.2%}** confidence")
        st.markdown(f"  - 🔧 **Suggestion:** {item['reason']}")

    # Radar chart for the most recent analysis, based on the per-category
    # scores computed by dynamic_scoring.
    latest_scores = st.session_state.history[-1]["scores"]
    radar_df = pd.DataFrame({
        "Category": list(latest_scores.keys()),
        "Score": list(latest_scores.values())
    })
    radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
    radar_fig.update_traces(line_color='black')
    st.plotly_chart(radar_fig)

    st.markdown("### 🧬 Word-level Offensive Correlation")

    # Score each whitespace-separated token of the most recent translated text.
    last_translated_text = st.session_state.history[-1]["translated"]
    words = last_translated_text.split()

    word_scores = []
    for word in words:
        try:
            res = classifier(word)[0]
            word_scores.append({
                "Word": word,
                "Label": res["label"],
                "Score": res["score"]
            })
        except Exception:
            continue

    if word_scores:
        word_df = pd.DataFrame(word_scores)
        word_df = word_df.sort_values(by="Score", ascending=False).reset_index(drop=True)

        # Show the top tokens by default; the toggle reveals the full table.
        max_display = 5
        show_more = st.toggle("Show more words", value=False)
        display_df = word_df if show_more else word_df.head(max_display)

        st.markdown(
            display_df.to_html(index=False, border=0),
            unsafe_allow_html=True
        )
    else:
        st.info("❕ No word-level analysis available.")
else:
    st.info("⚠️ No classification data available yet.")