from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px
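
# Note: pytesseract is only a thin wrapper around the Tesseract OCR engine. The
# tesseract binary and the "chi_sim" (Simplified Chinese) language data used below
# must be installed separately, e.g. on Debian/Ubuntu:
#   apt-get install tesseract-ocr tesseract-ocr-chi-sim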
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned" |
|
emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True) |
|
emoji_model = AutoModelForCausalLM.from_pretrained( |
|
emoji_model_id, |
|
trust_remote_code=True, |
|
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32 |
|
).to("cuda" if torch.cuda.is_available() else "cpu") |
|
emoji_model.eval() |
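
# Streamlit re-executes this script on every user interaction, so the model above
# is reloaded on each rerun. A cached loader avoids that; a sketch, assuming a
# Streamlit version that provides st.cache_resource (>= 1.18):
#
# @st.cache_resource
# def load_emoji_model():
#     tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
#     model = AutoModelForCausalLM.from_pretrained(emoji_model_id, trust_remote_code=True)
#     return tokenizer, model.eval()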

# Candidate classifiers for the downstream offensive-text check.
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "RoBERTa Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
}

st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")

with st.sidebar:
    st.header("🧭 Navigation")
    section = st.radio("Select Mode:", ["🔍 Text Moderation", "📊 Text Analysis"])

    if section == "🔍 Text Moderation":
        selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
        selected_model_id = model_options[selected_model]
        # device=0 places the pipeline on the first GPU; -1 keeps it on CPU. Note that
        # `classifier` is only defined in moderation mode, the only mode that uses it.
        classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
    elif section == "📊 Text Analysis":
        st.markdown("You can view the violation distribution chart and editing suggestions.")

# Per-session record of every classification, consumed by the analysis dashboard.
if "history" not in st.session_state:
    st.session_state.history = []

def classify_emoji_text(text: str):
    # The fine-tuned model is prompted in Chinese: "输入：" = "Input:", "输出：" = "Output:".
    prompt = f"输入：{text}\n输出："
    input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Keep only the completion after the final "输出：" (output) marker.
    translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()

    result = classifier(translated_text)[0]
    label = result["label"]
    score = result["score"]
    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."

    st.session_state.history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
    return translated_text, label, score, reasoning
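
# For example, with the default input "你是🐷" ("You are 🐷"), the model is expected
# to rewrite the emoji into plain text (e.g. "你是猪", "you are a pig") before the
# classifier labels the result; the exact paraphrase depends on the checkpoint.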

if section == "🔍 Text Moderation":
    st.title("🔍 Offensive Text Classification")
    st.markdown("### ✍️ Input your sentence:")
    default_text = "你是🐷"  # "You are 🐷": the insult is carried by the emoji
    text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)

    if st.button("🚦 Analyze"):
        with st.spinner("🔄 Processing..."):
            try:
                translated, label, score, reason = classify_emoji_text(text)
                st.markdown("### 📝 Translated sentence:")
                st.code(translated, language="text")

                st.markdown(f"### 🎯 Prediction: {label}")
                st.markdown(f"### 📊 Confidence Score: {score:.2%}")
                st.markdown("### 🧠 Model Explanation:")
                st.info(reason)
            except Exception as e:
                st.error(f"❌ An error occurred during processing:\n\n{e}")

    st.markdown("---")
    st.markdown("### 🖼️ Or upload a screenshot of bullet comments:")

    uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Screenshot", use_column_width=True)

        with st.spinner("🧠 Extracting text via OCR..."):
            # Recognize both Simplified Chinese and English text in the screenshot.
            ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng")
            st.markdown("#### 📝 Extracted Text:")
            st.code(ocr_text.strip())

            translated, label, score, reason = classify_emoji_text(ocr_text.strip())
            st.markdown("### 📝 Translated sentence:")
            st.code(translated, language="text")

            st.markdown(f"### 🎯 Prediction: {label}")
            st.markdown(f"### 📊 Confidence Score: {score:.2%}")
            st.markdown("### 🧠 Model Explanation:")
            st.info(reason)

elif section == "📊 Text Analysis":
    st.title("📊 Violation Analysis Dashboard")
    if st.session_state.history:
        df = pd.DataFrame(st.session_state.history)

        st.markdown("### 🧾 Offensive Terms & Suggestions")
        for item in st.session_state.history:
            st.markdown(f"- 🔹 **Input:** {item['text']}")
            st.markdown(f"  - ✨ **Translated:** {item['translated']}")
            st.markdown(f"  - ❗ **Label:** {item['label']} with **{item['score']:.2%}** confidence")
            st.markdown(f"  - 🧠 **Suggestion:** {item['reason']}")

        # Placeholder radar data: these category scores are illustrative constants,
        # not values derived from the classification history.
        radar_df = pd.DataFrame({
            "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
            "Score": [0.7, 0.4, 0.3, 0.5, 0.6]
        })
        radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
        radar_fig.update_traces(line_color='black')
        st.plotly_chart(radar_fig)
    else:
        st.info("⚠️ No classification data available yet.")