File size: 3,016 Bytes
bd9feeb
 
 
 
9284803
691ee4d
bd9feeb
 
 
 
 
 
 
 
9284803
bd9feeb
 
 
 
 
 
88f8bd0
 
 
b64c976
 
a8b7aaa
b64c976
 
 
 
 
88f8bd0
b64c976
 
 
 
e375d90
b64c976
 
 
 
 
 
bd9feeb
 
 
 
 
 
 
a8b7aaa
9284803
 
 
88f8bd0
b64c976
bd9feeb
b64c976
 
bd9feeb
 
b64c976
 
bd9feeb
88f8bd0
b64c976
 
bd9feeb
b64c976
 
bd9feeb
b64c976
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import streamlit as st

# ✅ Step 1: Emoji translation model (the author's own fine-tuned model)
emoji_model_id = "jenniferhk008/roberta-hfl-emoji-aug3epoch"


# Streamlit re-executes this script from the top on every widget interaction.
# Caching the loaded objects keeps one tokenizer/model per server process
# instead of reloading the weights on each rerun. The spinner is disabled so
# that nothing is rendered before st.set_page_config() runs further down.
@st.cache_resource(show_spinner=False)
def _load_emoji_model(model_id: str):
    """Load the emoji-translation tokenizer and causal LM, ready for inference.

    Args:
        model_id: Hugging Face Hub id of the model to load.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is in eval mode, placed on
        CUDA in float16 when a GPU is available, otherwise on CPU in float32.
    """
    use_cuda = torch.cuda.is_available()
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run while loading; keep it only for repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
    ).to("cuda" if use_cuda else "cpu")
    model.eval()
    return tokenizer, model


emoji_tokenizer, emoji_model = _load_emoji_model(emoji_model_id)

# ✅ Step 2: Selectable offensive-text classification models.
# Keys are the labels offered to the user; values are the model ids handed to
# the text-classification pipeline. Insertion order is the display order.
model_options = {
    "Toxic-BERT": "unitary/toxic-bert",
    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion",
}

# ✅ Page configuration: browser-tab title, icon, and wide layout.
st.set_page_config(
    page_title="Emoji Offensive Text Detector",
    page_icon="🚨",
    layout="wide",
)

# ✅ Page title
st.title("🧠 Emoji-based Offensive Language Classifier")

# Introductory blurb describing the two-stage translate-then-classify flow.
intro_markdown = """
This application translates emojis in a sentence and classifies whether the final sentence is offensive or not using two AI models.
- The **first model** translates emoji or symbolic phrases into standard Chinese text.
- The **second model** performs offensive language detection.
"""
st.markdown(intro_markdown)

# Streamlit sidebar: choose which classification model to use.
selected_model = st.sidebar.selectbox("Choose classification model", list(model_options))
selected_model_id = model_options[selected_model]


# The script is re-executed on every widget interaction; caching avoids
# rebuilding the pipeline (and reloading its weights) on each rerun. Entries
# are keyed by model id, so each selectable model is loaded at most once per
# server process.
@st.cache_resource
def _load_classifier(model_id: str):
    """Build the text-classification pipeline for ``model_id``.

    Args:
        model_id: Hugging Face Hub id of the classification model.

    Returns:
        A ``text-classification`` pipeline on CUDA device 0 when a GPU is
        available, otherwise on CPU (``device=-1``).
    """
    return pipeline(
        "text-classification",
        model=model_id,
        device=0 if torch.cuda.is_available() else -1,
    )


classifier = _load_classifier(selected_model_id)

# ✅ Input area: a heading followed by a text box pre-filled with an example.
default_text = "你是🐷"
st.markdown("### ✍️ Input your sentence:")
text = st.text_area(
    "Enter sentence with emojis:",
    value=default_text,
    height=150,
)

# ✅ Main logic wrapped in a function
def classify_emoji_text(text: str):
    """Translate the emojis in ``text`` and classify the translated sentence.

    The sentence is placed in the "输入:<text>" / "输出:" prompt, the emoji
    model greedily generates up to 64 new tokens, and the generated text is
    passed to the currently selected classifier.

    Args:
        text: Sentence entered by the user, possibly containing emojis.

    Returns:
        A ``(translated_text, label, score)`` tuple, where ``label`` and
        ``score`` come from the first result returned by the classifier.
    """
    prompt = f"输入:{text}\n输出:"
    inputs = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = emoji_model.generate(**inputs, max_new_tokens=64, do_sample=False)

    # A causal LM's generate() returns the prompt tokens followed by the new
    # tokens. Decode only the new ones rather than searching the decoded text
    # for the "输出:" marker: decoding is not guaranteed to reproduce the
    # prompt verbatim, and a missed marker would send the whole prompt
    # scaffold to the classifier.
    generated_ids = output_ids[0][prompt_length:]
    translated_text = emoji_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    result = classifier(translated_text)[0]
    return translated_text, result["label"], result["score"]

# ✅ Trigger button: run translation + classification and render the results.
if st.button("🚦 Analyze"):
    if not text.strip():
        # Nothing to translate or classify, so skip both model calls.
        st.warning("⚠️ Please enter a sentence before analyzing.")
    else:
        with st.spinner("🔍 Processing..."):
            # Top-level UI boundary: any failure in the models or in rendering
            # is shown to the user instead of aborting the script run.
            try:
                translated, label, score = classify_emoji_text(text)
                st.markdown("### 🔄 Translated sentence:")
                st.code(translated, language="text")

                st.markdown(f"### 🎯 Prediction: `{label}`")
                st.markdown(f"### 📊 Confidence Score: `{score:.2%}`")

            except Exception as e:
                st.error(f"❌ An error occurred during processing:\n\n{e}")
else:
    st.info("👈 Please input text and click the button to classify.")