# Hugging Face Space: aeresd/test_1 (status: Sleeping)
# File size: 6,563 Bytes

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st
from PIL import Image
import pytesseract
import pandas as pd
import plotly.express as px

# Step 1: emoji translation model (a custom fine-tuned checkpoint).
emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


# show_spinner=False: this loader runs before st.set_page_config, and a
# spinner is a rendered element, which some Streamlit versions reject ahead
# of the page-config call.
@st.cache_resource(show_spinner=False)
def _load_emoji_model(model_id: str):
    """Load the emoji-translation tokenizer and model once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the checkpoint at module level would re-instantiate it on each
    rerun. Caching it as a resource keeps a single shared instance alive.

    Args:
        model_id: Hugging Face Hub id of the causal language model.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is already moved to the
        selected device and switched to eval mode.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # NOTE(security): trust_remote_code=True executes Python code shipped with
    # the checkpoint repository. Keep it only for repositories you control,
    # and consider pinning a `revision`.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        # Half precision only where a GPU is available; CPU stays in float32.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    ).to(device)
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_model(emoji_model_id)

# Step 2: selectable offensive-text classification models.
# Each pair is (label shown in the sidebar, Hugging Face Hub model id);
# insertion order is the order of the choices in the select box.
model_options = dict(
    (
        ("Toxic-BERT", "unitary/toxic-bert"),
        ("Roberta Offensive", "cardiffnlp/twitter-roberta-base-offensive"),
        ("BERT Emotion", "bhadresh-savani/bert-base-go-emotion"),
    )
)

# Page configuration
st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")


@st.cache_resource(show_spinner="Loading classification model...")
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for ``model_id`` once and reuse it.

    Streamlit re-executes the script on every interaction; without caching,
    the pipeline (and its model weights) would be rebuilt on each rerun.
    One pipeline is cached per distinct model id.

    Args:
        model_id: Hugging Face Hub id of the classification model.

    Returns:
        The ``transformers`` pipeline, placed on GPU 0 when CUDA is available
        and on the CPU otherwise.
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


# Sidebar: model selection
with st.sidebar:
    st.header("🧠 Configuration")
    selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
    selected_model_id = model_options[selected_model]
    classifier = _load_classifier(selected_model_id)

# Make sure the per-session history list exists before anything appends to it.
if not ("history" in st.session_state):
    st.session_state["history"] = []

# Classification helper
def classify_emoji_text(text: str):
    """Translate emoji-laden text, classify the translation, and record it.

    The text is wrapped in the ``输入：…\\n输出：`` prompt, passed to the emoji
    translation model with greedy decoding, and the part of the decoded
    output after the last ``输出：`` marker is taken as the translation. The
    translation is then scored by the currently selected ``classifier`` and
    the outcome is appended to ``st.session_state.history``.

    Args:
        text: Raw user text, possibly containing emojis.

    Returns:
        A ``(translated_text, label, score, reasoning)`` tuple, where
        ``label`` and ``score`` are the first result returned by the
        classifier.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
    """
    # Reject blank input up front instead of letting the generator invent a
    # "translation" for an empty prompt.
    if not text or not text.strip():
        raise ValueError("Input text is empty; nothing to classify.")

    prompt = f"输入：{text}\n输出："
    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64, do_sample=False)
    decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # The decoded sequence echoes the prompt. Keep only what follows the last
    # marker; when the marker is absent, rsplit returns the whole string.
    translated_text = decoded.rsplit("输出：", 1)[-1].strip()

    # truncation=True keeps over-long input (e.g. OCR of a dense screenshot)
    # within the classifier's maximum sequence length instead of failing.
    result = classifier(translated_text, truncation=True)[0]
    label = result["label"]
    score = result["score"]
    # NOTE(review): this wording is used for every label, including ones a
    # model may use for non-offensive text — confirm this is intended.
    reasoning = (
        f"The sentence was flagged as '{label}' due to potentially offensive phrases. "
        "Consider replacing emotionally charged, ambiguous, or abusive terms."
    )

    st.session_state.history.append({
        "text": text,
        "translated": translated_text,
        "label": label,
        "score": score,
        "reason": reasoning,
    })
    return translated_text, label, score, reasoning

# Main page: the input form and the analysis dashboard share one view.
st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")

# Section 1: free-text input and classification.
st.subheader("1. 输入与分类")
default_text = "你是🐷"
text = st.text_area("Enter sentence with emojis:", height=150, value=default_text)

analyze_clicked = st.button("🚦 Analyze Text")
if analyze_clicked:
    with st.spinner("🔍 Processing..."):
        # Any failure, in classification or in rendering, is reported in the
        # page rather than aborting the script run.
        try:
            outcome = classify_emoji_text(text)
            translated, label, score, reason = outcome
            st.markdown("**Translated sentence:**")
            st.code(translated, language="text")
            st.markdown(f"**Prediction:** {label}")
            st.markdown(f"**Confidence Score:** {score:.2%}")
            st.markdown("**Model Explanation:**")
            st.info(reason)
        except Exception as err:
            st.error(f"❌ An error occurred:\n{err}")

# Section 2: image upload, OCR, and classification of the extracted text.
st.markdown("---")
st.subheader("2. 图片 OCR & 分类")
uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])
if uploaded_file:
    # Same error-reporting style as the text section: unreadable images,
    # a missing Tesseract install, or model errors are shown in the page.
    try:
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Screenshot", use_column_width=True)

        # Streamlit reruns the script on every interaction while the upload
        # stays in the widget. Without this guard each rerun would repeat OCR
        # and classification and append a duplicate history entry. The key
        # includes the selected model so switching models re-analyzes the
        # image. Limitation: two different files with the same name and size
        # are treated as the same upload.
        ocr_key = (uploaded_file.name, uploaded_file.size, selected_model)
        ocr_cache = st.session_state.get("ocr_cache")
        if ocr_cache is None or ocr_cache["key"] != ocr_key:
            with st.spinner("🧠 Extracting text via OCR..."):
                ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
                # Only classify (and record in history) when OCR found text.
                outcome = classify_emoji_text(ocr_text) if ocr_text else None
            ocr_cache = {"key": ocr_key, "text": ocr_text, "outcome": outcome}
            st.session_state["ocr_cache"] = ocr_cache

        ocr_text = ocr_cache["text"]
        if ocr_text:
            translated, label, score, reason = ocr_cache["outcome"]
            st.markdown("**Extracted Text:**")
            st.code(ocr_text)
            st.markdown("**Translated sentence:**")
            st.code(translated, language="text")
            st.markdown(f"**Prediction:** {label}")
            st.markdown(f"**Confidence Score:** {score:.2%}")
            st.markdown("**Model Explanation:**")
            st.info(reason)
        else:
            st.info("⚠️ No text detected in the image.")
    except Exception as e:
        st.error(f"❌ An error occurred:\n{e}")

# Section 3: analysis dashboard built from this session's history.
st.markdown("---")
st.subheader("3. Violation Analysis Dashboard")
if st.session_state.history:
    # Replay every classification recorded during this session.
    st.markdown("### 🧾 Offensive Terms & Suggestions")
    for item in st.session_state.history:
        st.markdown(f"- 🔹 **Input:** {item['text']}")
        st.markdown(f"   - ✨ **Translated:** {item['translated']}")
        st.markdown(f"   - ❗ **Label:** {item['label']} with **{item['score']:.2%}** confidence")
        st.markdown(f"   - 🔧 **Suggestion:** {item['reason']}")

    # Radar chart.
    # NOTE(review): the scores below are hard-coded placeholders; they are not
    # derived from the session history or from any classifier output.
    radar_df = pd.DataFrame({
        "Category": ["Insult","Abuse","Discrimination","Hate Speech","Vulgarity"],
        "Score": [0.7,0.4,0.3,0.5,0.6]
    })
    radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
    radar_fig.update_traces(line_color='black')
    st.plotly_chart(radar_fig)

    # Word-level offensiveness analysis.
    st.markdown("### 🧬 Word-level Offensive Correlation")

    # Take the most recent translation and split it on whitespace.
    # NOTE(review): text without spaces (e.g. Chinese) stays a single "word".
    last_translated_text = st.session_state.history[-1]["translated"]
    words = last_translated_text.split()

    # Classify each word on its own. This stays best-effort (one failing word
    # must not break the dashboard), but failures are counted and reported
    # instead of being dropped silently.
    word_scores = []
    failed_words = []
    for word in words:
        try:
            res = classifier(word)[0]
            word_scores.append({
                "Word": word,
                "Label": res["label"],
                "Score": res["score"]
            })
        except Exception:
            failed_words.append(word)

    if failed_words:
        st.warning(f"⚠️ {len(failed_words)} word(s) could not be classified and were skipped.")

    if word_scores:
        word_df = pd.DataFrame(word_scores)
        word_df = word_df.sort_values(by="Score", ascending=False).reset_index(drop=True)

        max_display = 5
        # st.toggle only exists in newer Streamlit releases; fall back to a
        # checkbox with the same label and default on older installs.
        toggle_widget = getattr(st, "toggle", st.checkbox)
        show_more = toggle_widget("Show more words", value=False)

        display_df = word_df if show_more else word_df.head(max_display)
        # Render as a borderless HTML table. The cell values come from user
        # or OCR text, so escaping is kept on explicitly because the markup
        # is injected with unsafe_allow_html=True.
        st.markdown(
            display_df.to_html(index=False, border=0, escape=True),
            unsafe_allow_html=True
        )
    else:
        st.info("❕ No word-level analysis available.")
else:
    st.info("⚠️ No classification data available yet.")