from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px
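
# st.set_page_config must be the first Streamlit call in the script, so it
# runs before the (slow) model loading below.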
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")

emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"

@st.cache_resource
def load_emoji_model(model_id: str):
    # Cache the tokenizer and model with st.cache_resource so the checkpoint
    # is loaded once per session rather than on every widget interaction.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    ).to("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    return tokenizer, model

emoji_tokenizer, emoji_model = load_emoji_model(emoji_model_id)
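
# Candidate downstream classifiers, selectable in the sidebar. Note that
# "BERT Emotion" is a GoEmotions tagger, so its labels are emotions
# (e.g. "anger") rather than a binary offensive/not-offensive verdict.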
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}
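
# Per-session record of analyzed inputs; the dashboard below reads from it.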
if "history" not in st.session_state:
    st.session_state.history = []
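
# Two-step detection: (1) the fine-tuned Qwen model rewrites emoji-laden text
# as plain text; (2) the classifier chosen in the sidebar scores the rewrite.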
def classify_emoji_text(text: str):
    # Step 1: prompt the emoji model to translate the input into plain text.
    prompt = f"Input: {text}\nOutput:"
    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    translated = decoded.split("Output:")[-1].strip()

    # Step 2: classify the translated sentence with the module-level
    # `classifier` (defined below, before this function is first called).
    result = classifier(translated)[0]
    label = result["label"]
    score = result["score"]
    suggestion = (
        f"The sentence was flagged as '{label}' due to potentially offensive content."
        " Consider replacing emotionally charged or abusive terms."
    )

    st.session_state.history.append({
        "text": text,
        "translated": translated,
        "label": label,
        "score": score,
        "suggestion": suggestion
    })
    return translated, label, score, suggestion
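
# Sidebar: pick the downstream classifier. The pipeline is cached per model id,
# so switching back to a previously used model is instant.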
st.sidebar.header("Settings")
selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
selected_model_id = model_options[selected_model]

@st.cache_resource
def load_classifier(model_id: str):
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1
    )

classifier = load_classifier(selected_model_id)

st.title("🚨 Emoji Offensive Text Detector & Analysis")

st.markdown("## Input or Upload Text for Classification")
col1, col2 = st.columns([2, 1])

with col1:
    # The default value is a Chinese example in which emojis stand in for
    # offensive words (e.g. 🐎 "horse" as a profane homophone).
    user_input = st.text_area(
        "Enter sentence with emojis:",
        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
        height=150
    )
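
    # Run the translate-then-classify pipeline when the user clicks the button.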
    if st.button("Analyze Text"):
        with st.spinner("Processing..."):
            try:
                translated, label, score, suggestion = classify_emoji_text(user_input)
                st.markdown("### Translated Sentence:")
                st.code(translated)
                st.markdown(f"**Prediction:** {label}")
                st.markdown(f"**Confidence:** {score:.2%}")
                st.markdown("**Model Explanation:**")
                st.info(suggestion)
            except Exception as e:
                st.error(f"Error: {e}")
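
# Right column: OCR path. Text extracted from an uploaded image goes through
# the same translate-then-classify pipeline as typed input.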
with col2:
    st.markdown("### Or Upload an Image")
    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded_file:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)
        with st.spinner("Running OCR..."):
            # Requires the Tesseract language packs for simplified Chinese
            # and English (lang="chi_sim+eng").
            ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
        st.markdown("#### OCR Extracted Text:")
        st.code(ocr_text)
        if ocr_text:
            translated, label, score, suggestion = classify_emoji_text(ocr_text)
            st.markdown("#### Translated:")
            st.code(translated)
            st.markdown(f"**Prediction:** {label}")
            st.markdown(f"**Confidence:** {score:.2%}")
            st.markdown("**Model Explanation:**")
            st.info(suggestion)
        else:
            st.warning("No text detected in the image.")

st.markdown("---")

st.markdown("## Analysis Dashboard")
if st.session_state.history:
    df = pd.DataFrame(st.session_state.history)
    st.markdown("### History Records")
    for item in st.session_state.history:
        st.markdown(
            f"- **Input:** `{item['text']}` | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}"
        )
        st.markdown(f"  - Translated: `{item['translated']}`")
        st.markdown(f"  - Suggestion: {item['suggestion']}")
    radar_df = pd.DataFrame({
        "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
        "Score": [0.7, 0.4, 0.3, 0.5, 0.6]
    })
    radar_fig = px.line_polar(
        radar_df,
        r='Score',
        theta='Category',
        line_close=True,
        title="Risk Radar by Category",
        color_discrete_sequence=['black']
    )
    st.plotly_chart(radar_fig)
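
    # Whitespace tokenization below is a rough heuristic: it works for
    # space-delimited text, but an unsegmented Chinese sentence will be
    # treated as a single "word".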
    st.markdown("### Top Offensive Terms by Category")
    categories = df['label'].unique()
    for cat in categories:
        st.markdown(f"**{cat}**")

        # For each word, keep the highest confidence seen among this
        # category's sentences that contain it.
        word_scores = {}
        for _, row in df[df['label'] == cat].iterrows():
            for w in row['text'].split():
                word_scores[w] = max(word_scores.get(w, 0), row['score'])
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)

        for w, s in sorted_words[:5]:
            st.markdown(f"- `{w}` ({s:.2%})")

        if len(sorted_words) > 5:
            with st.expander("Show more"):
                for w, s in sorted_words[5:]:
                    st.markdown(f"- `{w}` ({s:.2%})")
else:
    st.info("No data available. Please analyze some text first.")