test_1 / app.py
aeresd's picture
Update app.py
d6593c8 verified
raw
history blame
6.03 kB
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px
# Step 1: Emoji translation model (fine-tuned)
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


# show_spinner=False: this loader runs before st.set_page_config(), so it must
# not draw anything on the page itself.
@st.cache_resource(show_spinner=False)
def _load_emoji_translator(model_id: str):
    """Load the emoji-translation tokenizer and causal LM once per process.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded objects avoids re-reading the checkpoint
    and moving it to the device on each rerun.

    Args:
        model_id: Hugging Face Hub id of the checkpoint to load.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is switched to eval mode
        and placed on CUDA in float16 when CUDA is available, otherwise on
        CPU in float32.
    """
    use_cuda = torch.cuda.is_available()
    # NOTE(review): trust_remote_code=True lets transformers execute Python
    # code shipped in the model repository — confirm the repo is trusted.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    ).to("cuda" if use_cuda else "cpu")
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_translator(emoji_model_id)
# Step 2: Offensive text classification model options.
# Maps the display name shown in the sidebar selectbox to the Hugging Face
# model id handed to the text-classification pipeline.
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion",
}
# Page configuration: browser-tab title and icon, wide layout.
st.set_page_config(
    page_title="Emoji Offensive Text Detector",
    page_icon="🚨",
    layout="wide",
)

# Initialize history: one list per session that collects every analysis
# result so the dashboard further down can display them.
if "history" not in st.session_state:
    st.session_state["history"] = []
# Classification function
def classify_emoji_text(text: str):
    """Translate an emoji-laden sentence to plain text, then classify it.

    The sentence is wrapped in an ``Input: ...\\nOutput:`` prompt and
    completed greedily by ``emoji_model`` (at most 64 new tokens). The
    completion is passed to the module-level ``classifier`` pipeline and the
    first result it returns is used.

    Side effect: a record with keys ``text``, ``translated``, ``label``,
    ``score`` and ``suggestion`` is appended to ``st.session_state.history``.

    Args:
        text: Raw user sentence (may contain emojis).

    Returns:
        A ``(translated, label, score, suggestion)`` tuple, where ``label``
        and ``score`` come straight from the classifier result.
    """
    prompt = f"Input: {text}\nOutput:"
    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64, do_sample=False)

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation: splitting the full text on "Output:"
    # would discard the real translation whenever the generated text itself
    # contains that marker.
    translated = emoji_tokenizer.decode(
        output_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    result = classifier(translated)[0]
    label = result["label"]
    score = result["score"]

    # NOTE(review): the wording below always says "flagged", whatever the
    # label is; the label vocabulary depends on the selected model — confirm
    # whether benign labels should get a different message.
    suggestion = (
        f"The sentence was flagged as '{label}' due to potentially offensive content."
        " Consider replacing emotionally charged or abusive terms."
    )

    st.session_state.history.append({
        "text": text,
        "translated": translated,
        "label": label,
        "score": score,
        "suggestion": suggestion,
    })
    return translated, label, score, suggestion
# Sidebar settings
st.sidebar.header("Settings")
selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
selected_model_id = model_options[selected_model]


@st.cache_resource
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for *model_id*, once per id.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the pipeline from being rebuilt on each rerun. Every model id that
    has been selected stays cached, so switching back to it is immediate.

    Args:
        model_id: Hugging Face Hub id of the classification model.

    Returns:
        A ``transformers`` text-classification pipeline, on GPU 0 when CUDA
        is available and on CPU otherwise.
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


classifier = _load_classifier(selected_model_id)
def _render_result(translated, label, score, suggestion, heading):
    """Draw one classification result (translation, label, score, advice) under *heading*."""
    st.markdown(heading)
    st.code(translated)
    st.markdown(f"**Prediction:** {label}")
    st.markdown(f"**Confidence:** {score:.2%}")
    st.markdown("**Model Explanation:**")
    st.info(suggestion)


# Main page title
st.title("🚨 Emoji Offensive Text Detector & Analysis")

# Input and classification section
st.markdown("## Input or Upload Text for Classification")
col1, col2 = st.columns([2, 1])

# Left column: free-text input, analysed only when the button is clicked.
with col1:
    user_input = st.text_area(
        "Enter sentence with emojis:",
        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
        height=150,
    )
    if st.button("Analyze Text"):
        if not user_input.strip():
            # Nothing to translate or classify; skip the models and the history entry.
            st.warning("Please enter some text to analyze.")
        else:
            with st.spinner("Processing..."):
                try:
                    text_result = classify_emoji_text(user_input)
                    _render_result(*text_result, heading="### Translated Sentence:")
                except Exception as e:
                    st.error(f"Error: {e}")

# Right column: image upload, OCR, then the same classification.
with col2:
    st.markdown("### Or Upload an Image")
    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded_file:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # The uploader keeps returning the same file on every rerun. Remember
        # the result per (file, classifier) so unrelated interactions do not
        # repeat OCR and classification or append duplicate history records.
        upload_key = (uploaded_file.name, uploaded_file.size, selected_model)
        ocr_entry = st.session_state.get("ocr_cache")
        if ocr_entry is None or ocr_entry["key"] != upload_key:
            ocr_entry = None
            try:
                with st.spinner("Running OCR..."):
                    ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
                # With no recognised text there is nothing to classify.
                ocr_result = classify_emoji_text(ocr_text) if ocr_text else None
            except Exception as e:
                # Same error presentation as the text path.
                st.error(f"Error: {e}")
            else:
                ocr_entry = {"key": upload_key, "text": ocr_text, "result": ocr_result}
                st.session_state["ocr_cache"] = ocr_entry

        if ocr_entry is not None:
            st.markdown("#### OCR Extracted Text:")
            st.code(ocr_entry["text"])
            if ocr_entry["result"] is None:
                st.warning("No text could be extracted from the image.")
            else:
                _render_result(*ocr_entry["result"], heading="#### Translated:")
    elif "ocr_cache" in st.session_state:
        # Uploader was cleared: forget the result so a re-upload is analysed again.
        del st.session_state["ocr_cache"]
st.markdown("---")

# Analysis dashboard: history list, radar chart, and per-label term ranking,
# all derived from the records collected in st.session_state.history.
st.markdown("## Analysis Dashboard")
if not st.session_state.history:
    st.info("No data available. Please analyze some text first.")
else:
    df = pd.DataFrame(st.session_state.history)

    st.markdown("### History Records")
    for record in st.session_state.history:
        st.markdown(
            f"- **Input:** `{record['text']}` | **Label:** {record['label']} | **Confidence:** {record['score']:.2%}"
        )
        st.markdown(f" - Translated: `{record['translated']}`")
        st.markdown(f" - Suggestion: {record['suggestion']} ")

    # Radar chart
    # NOTE(review): these scores are fixed literals, not computed from the
    # history — presumably placeholder data; confirm before relying on it.
    radar_df = pd.DataFrame({
        "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
        "Score": [0.7, 0.4, 0.3, 0.5, 0.6],
    })
    radar_fig = px.line_polar(
        radar_df,
        r='Score',
        theta='Category',
        line_close=True,
        title="Risk Radar by Category",
        color_discrete_sequence=['black'],
    )
    st.plotly_chart(radar_fig)

    # Analyze words related to each offensive category
    st.markdown("### Top Offensive Terms by Category")
    categories = df['label'].unique()
    for cat in categories:
        st.markdown(f"**{cat}**")

        # For every whitespace-separated token of the inputs carrying this
        # label, keep the highest confidence it appeared with.
        # NOTE(review): str.split() only splits on whitespace, so text
        # written without spaces stays a single "word" — confirm intended.
        in_category = df[df['label'] == cat]
        word_scores = {}
        for sentence, confidence in zip(in_category['text'], in_category['score']):
            for token in sentence.split():
                word_scores[token] = max(word_scores.get(token, 0), confidence)

        sorted_words = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
        top_terms, remaining_terms = sorted_words[:5], sorted_words[5:]

        # Top five are always visible; the rest sit behind an expander.
        for token, confidence in top_terms:
            st.markdown(f"- `{token}` ({confidence:.2%})")
        if remaining_terms:
            with st.expander("Show more"):
                for token, confidence in remaining_terms:
                    st.markdown(f"- `{token}` ({confidence:.2%})")