from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px
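# NOTE: pytesseract is only a Python wrapper; the Tesseract OCR engine itself, plus the
# chi_sim language data used for the image-upload path below, must be installed on the host.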
# Step 1: Emoji translation model (fine-tuned)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
emoji_model = AutoModelForCausalLM.from_pretrained(
    emoji_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
).to("cuda" if torch.cuda.is_available() else "cpu")
emoji_model.eval()
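# NOTE: Streamlit reruns this script from top to bottom on every interaction, so the model
# above is reloaded each time. Wrapping the loading code in a function decorated with
# st.cache_resource (available in recent Streamlit releases) would keep a single copy in
# memory; that is a possible optimization, not something the current code does.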
# Step 2: Offensive text classification model options
model_options = {
"Toxic-BERT": "unitary/toxic-bert",
"Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
"BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}
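# Each entry is a Hugging Face Hub model id. The models expose different label sets
# (e.g. the go-emotion model predicts emotions rather than offensiveness), so the
# predicted `label` shown in the UI depends on which option is selected.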
# Page configuration
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
# Initialize history
if "history" not in st.session_state:
    st.session_state.history = []
# Classification function
def classify_emoji_text(text: str):
    """Translate emoji-laden text to plain text, classify it, and log the result."""
    # Ask the fine-tuned model to rewrite the emoji text as plain text.
    prompt = f"Input: {text}\nOutput:"
    input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    translated = decoded.split("Output:")[-1].strip()
    # Classify the translated sentence with the model chosen in the sidebar.
    result = classifier(translated)[0]
    label = result["label"]
    score = result["score"]
    suggestion = (
        f"The sentence was flagged as '{label}' due to potentially offensive content."
        " Consider replacing emotionally charged or abusive terms."
    )
    st.session_state.history.append({
        "text": text,
        "translated": translated,
        "label": label,
        "score": score,
        "suggestion": suggestion
    })
    return translated, label, score, suggestion
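# classify_emoji_text references the module-level `classifier`, which is created in the
# sidebar section below. Streamlit executes the script top to bottom, and the classifier is
# built before the button block that calls this function, so the name resolves at call time.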
# Sidebar settings
st.sidebar.header("Settings")
selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
selected_model_id = model_options[selected_model]
classifier = pipeline(
"text-classification",
model=selected_model_id,
device=0 if torch.cuda.is_available() else -1
)
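# The classification pipeline is rebuilt on every rerun, and a newly selected model is
# downloaded from the Hugging Face Hub on first use; as with the emoji model above, caching
# one pipeline per model id (e.g. with st.cache_resource) would avoid repeated construction.
# This is an optional optimization, not part of the current behaviour.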
# Main page title
st.title("🚨 Emoji Offensive Text Detector & Analysis")
# Input and classification section
st.markdown("## Input or Upload Text for Classification")
col1, col2 = st.columns([2, 1])
with col1:
    user_input = st.text_area(
        "Enter sentence with emojis:",
        # Example input: a Chinese sentence in which emojis stand in for offensive words.
        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
        height=150
    )
    if st.button("Analyze Text"):
        with st.spinner("Processing..."):
            try:
                translated, label, score, suggestion = classify_emoji_text(user_input)
                st.markdown("### Translated Sentence:")
                st.code(translated)
                st.markdown(f"**Prediction:** {label}")
                st.markdown(f"**Confidence:** {score:.2%}")
                st.markdown("**Model Explanation:**")
                st.info(suggestion)
            except Exception as e:
                st.error(f"Error: {e}")
with col2:
st.markdown("### Or Upload an Image")
uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
if uploaded_file:
image = Image.open(uploaded_file)
st.image(image, caption="Uploaded Image", use_column_width=True)
with st.spinner("Running OCR..."):
ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
st.markdown("#### OCR Extracted Text:")
st.code(ocr_text)
translated, label, score, suggestion = classify_emoji_text(ocr_text)
st.markdown("#### Translated:")
st.code(translated)
st.markdown(f"**Prediction:** {label}")
st.markdown(f"**Confidence:** {score:.2%}")
st.markdown("**Model Explanation:**")
st.info(suggestion)
st.markdown("---")
# Analysis dashboard
st.markdown("## Analysis Dashboard")
if st.session_state.history:
    df = pd.DataFrame(st.session_state.history)
    st.markdown("### History Records")
    for item in st.session_state.history:
        st.markdown(
            f"- **Input:** `{item['text']}` | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}"
        )
        st.markdown(f" - Translated: `{item['translated']}`")
        st.markdown(f" - Suggestion: {item['suggestion']}")
    # Radar chart (note: the scores below are static placeholder values,
    # not computed from the analysis history)
    radar_df = pd.DataFrame({
        "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
        "Score": [0.7, 0.4, 0.3, 0.5, 0.6]
    })
    radar_fig = px.line_polar(
        radar_df,
        r='Score',
        theta='Category',
        line_close=True,
        title="Risk Radar by Category",
        color_discrete_sequence=['black']
    )
    st.plotly_chart(radar_fig)
    # Analyze words related to each offensive category
    st.markdown("### Top Offensive Terms by Category")
    categories = df['label'].unique()
    for cat in categories:
        st.markdown(f"**{cat}**")
        # Collect the max score per word across the texts in this category.
        # Splitting on whitespace only works for space-delimited languages; see the
        # optional Chinese-aware tokenizer sketch at the end of this file.
        word_scores = {}
        for _, row in df[df['label'] == cat].iterrows():
            words = row['text'].split()
            for w in words:
                word_scores[w] = max(word_scores.get(w, 0), row['score'])
        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
        # Display the top 5 terms by default
        for w, s in sorted_words[:5]:
            st.markdown(f"- `{w}` ({s:.2%})")
        # Show the rest in an expander if there are more
        if len(sorted_words) > 5:
            with st.expander("Show more"):
                for w, s in sorted_words[5:]:
                    st.markdown(f"- `{w}` ({s:.2%})")
else:
    st.info("No data available. Please analyze some text first.")
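
# A minimal, optional sketch (assumption: the third-party `jieba` package is available) of a
# tokenizer that also segments Chinese text. The word statistics above rely on str.split(),
# which only separates whitespace-delimited tokens and therefore treats a whole Chinese
# sentence as a single "word". This helper is not wired into the app; it is shown only as a
# possible drop-in replacement for `row['text'].split()`.
try:
    import jieba  # third-party Chinese word segmenter (optional dependency)

    def segment_words(text: str):
        """Segment mixed Chinese/English text into word-level tokens."""
        return [w for w in jieba.lcut(text) if w.strip()]
except ImportError:
    def segment_words(text: str):
        """Fallback when jieba is unavailable: plain whitespace splitting."""
        return text.split()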