aeresd committed on
Commit
851f89d
·
verified ·
1 Parent(s): d6593c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -93
app.py CHANGED
@@ -6,7 +6,7 @@ import pytesseract
6
  import pandas as pd
7
  import plotly.express as px
8
 
9
- # Step 1: Emoji translation model (fine-tuned)
10
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
11
  emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
12
  emoji_model = AutoModelForCausalLM.from_pretrained(
@@ -16,148 +16,108 @@ emoji_model = AutoModelForCausalLM.from_pretrained(
16
  ).to("cuda" if torch.cuda.is_available() else "cpu")
17
  emoji_model.eval()
18
 
19
- # Step 2: Offensive text classification model options
20
  model_options = {
21
  "Toxic-BERT": "unitary/toxic-bert",
22
  "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
23
  "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
24
  }
25
 
26
- # Page configuration
27
  st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
28
 
29
- # Initialize history
30
  if "history" not in st.session_state:
31
  st.session_state.history = []
32
 
33
- # Classification function
34
  def classify_emoji_text(text: str):
35
- prompt = f"Input: {text}\nOutput:"
36
  input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
37
  with torch.no_grad():
38
  output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
39
  decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
40
- translated = decoded.split("Output:")[-1].strip()
41
 
42
- result = classifier(translated)[0]
43
  label = result["label"]
44
  score = result["score"]
45
- suggestion = (
46
- f"The sentence was flagged as '{label}' due to potentially offensive content."
47
- " Consider replacing emotionally charged or abusive terms."
48
- )
49
 
50
- st.session_state.history.append({
51
- "text": text,
52
- "translated": translated,
53
- "label": label,
54
- "score": score,
55
- "suggestion": suggestion
56
- })
57
- return translated, label, score, suggestion
58
 
59
- # Sidebar settings
60
- st.sidebar.header("Settings")
61
- selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
62
  selected_model_id = model_options[selected_model]
63
- classifier = pipeline(
64
- "text-classification",
65
- model=selected_model_id,
66
- device=0 if torch.cuda.is_available() else -1
67
- )
68
-
69
- # Main page title
70
- st.title("🚨 Emoji Offensive Text Detector & Analysis")
71
 
72
- # Input and classification section
73
- st.markdown("## Input or Upload Text for Classification")
74
- col1, col2 = st.columns([2, 1])
75
 
 
 
 
76
  with col1:
77
- user_input = st.text_area(
78
- "Enter sentence with emojis:",
79
- value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
80
- height=150
81
- )
82
- if st.button("Analyze Text"):
83
- with st.spinner("Processing..."):
84
  try:
85
- translated, label, score, suggestion = classify_emoji_text(user_input)
86
- st.markdown("### Translated Sentence:")
87
- st.code(translated)
88
- st.markdown(f"**Prediction:** {label}")
89
- st.markdown(f"**Confidence:** {score:.2%}")
90
- st.markdown("**Model Explanation:**")
91
- st.info(suggestion)
 
92
  except Exception as e:
93
- st.error(f"Error: {e}")
94
 
95
  with col2:
96
- st.markdown("### Or Upload an Image")
97
- uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
98
  if uploaded_file:
99
  image = Image.open(uploaded_file)
100
  st.image(image, caption="Uploaded Image", use_column_width=True)
101
- with st.spinner("Running OCR..."):
102
  ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
103
- st.markdown("#### OCR Extracted Text:")
104
  st.code(ocr_text)
105
- translated, label, score, suggestion = classify_emoji_text(ocr_text)
106
- st.markdown("#### Translated:")
107
  st.code(translated)
108
- st.markdown(f"**Prediction:** {label}")
109
- st.markdown(f"**Confidence:** {score:.2%}")
110
- st.markdown("**Model Explanation:**")
111
- st.info(suggestion)
112
 
113
  st.markdown("---")
114
 
115
- # Analysis dashboard
116
- st.markdown("## Analysis Dashboard")
117
  if st.session_state.history:
118
  df = pd.DataFrame(st.session_state.history)
119
- st.markdown("### History Records")
120
  for item in st.session_state.history:
121
- st.markdown(
122
- f"- **Input:** `{item['text']}` | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}"
123
- )
124
- st.markdown(f" - Translated: `{item['translated']}`")
125
- st.markdown(f" - Suggestion: {item['suggestion']} ")
126
 
127
- # Radar chart
128
  radar_df = pd.DataFrame({
129
- "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
130
- "Score": [0.7, 0.4, 0.3, 0.5, 0.6]
131
  })
 
132
  radar_fig = px.line_polar(
133
  radar_df,
134
  r='Score',
135
  theta='Category',
136
  line_close=True,
137
- title="Risk Radar by Category",
138
  color_discrete_sequence=['black']
139
  )
140
  st.plotly_chart(radar_fig)
141
-
142
- # Analyze words related to each offensive category
143
- st.markdown("### Top Offensive Terms by Category")
144
- categories = df['label'].unique()
145
- for cat in categories:
146
- st.markdown(f"**{cat}**")
147
- # collect max score per word in texts of this category
148
- word_scores = {}
149
- for _, row in df[df['label'] == cat].iterrows():
150
- words = row['text'].split()
151
- for w in words:
152
- word_scores[w] = max(word_scores.get(w, 0), row['score'])
153
- sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
154
- # display top 5 by default
155
- for w, s in sorted_words[:5]:
156
- st.markdown(f"- `{w}` ({s:.2%})")
157
- # show more if exists
158
- if len(sorted_words) > 5:
159
- with st.expander("Show more"):
160
- for w, s in sorted_words[5:]:
161
- st.markdown(f"- `{w}` ({s:.2%})")
162
  else:
163
- st.info("No data available. Please analyze some text first.")
 
6
  import pandas as pd
7
  import plotly.express as px
8
 
9
+ # Step 1: Emoji translation model (your own fine-tuned model)
10
  emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
11
  emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
12
  emoji_model = AutoModelForCausalLM.from_pretrained(
 
16
  ).to("cuda" if torch.cuda.is_available() else "cpu")
17
  emoji_model.eval()
18
 
19
+ # Step 2: Selectable offensive-text classification models
20
  model_options = {
21
  "Toxic-BERT": "unitary/toxic-bert",
22
  "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
23
  "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
24
  }
25
 
26
+ # Page configuration
27
  st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
28
 
29
+ # Initialize the history records
30
  if "history" not in st.session_state:
31
  st.session_state.history = []
32
 
33
+ # Emoji text translation and classification function
34
  def classify_emoji_text(text: str):
35
+ prompt = f"输入:{text}\n输出:"
36
  input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
37
  with torch.no_grad():
38
  output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
39
  decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
40
+ translated_text = decoded.split("输出:")[-1].strip() if "输出:" in decoded else decoded.strip()
41
 
42
+ result = classifier(translated_text)[0]
43
  label = result["label"]
44
  score = result["score"]
45
+ reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
 
 
 
46
 
47
+ st.session_state.history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
48
+ return translated_text, label, score, reasoning
 
 
 
 
 
 
49
 
50
+ # Page layout
51
+ st.sidebar.header("🧠 Settings")
52
+ selected_model = st.sidebar.selectbox("Choose classification model", list(model_options.keys()))
53
  selected_model_id = model_options[selected_model]
54
+ classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
 
 
 
 
 
 
 
55
 
56
+ # Main page: integrates Text Moderation and Text Analysis
57
+ st.title("🚨 Emoji Offensive Text Detector & Violation Analysis")
 
58
 
59
+ # Input and classification
60
+ st.markdown("## ✍️ 输入或上传文本进行分类")
61
+ col1, col2 = st.columns([2,1])
62
  with col1:
63
+ text = st.text_area("Enter sentence with emojis:", value="你是🐷", height=150)
64
+ if st.button("🚦 Analyze Text"):
65
+ with st.spinner("🔍 Processing..."):
 
 
 
 
66
  try:
67
+ translated, label, score, reason = classify_emoji_text(text)
68
+ st.markdown("### 🔄 Translated sentence:")
69
+ st.code(translated, language="text")
70
+
71
+ st.markdown(f"### 🎯 Prediction: {label}")
72
+ st.markdown(f"### 📊 Confidence Score: {score:.2%}")
73
+ st.markdown("### 🧠 Model Explanation:")
74
+ st.info(reason)
75
  except Exception as e:
76
+ st.error(f"Error during processing: {e}")
77
 
78
  with col2:
79
+ st.markdown("### 🖼️ Or upload a screenshot:")
80
+ uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg","png","jpeg"])
81
  if uploaded_file:
82
  image = Image.open(uploaded_file)
83
  st.image(image, caption="Uploaded Image", use_column_width=True)
84
+ with st.spinner("🧠 Running OCR..."):
85
  ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
86
+ st.markdown("#### 📋 OCR Extracted Text:")
87
  st.code(ocr_text)
88
+ translated, label, score, reason = classify_emoji_text(ocr_text)
89
+ st.markdown("#### 🔄 Translated:")
90
  st.code(translated)
91
+ st.markdown(f"#### 🎯 Prediction: {label}")
92
+ st.markdown(f"#### 📊 Confidence: {score:.2%}")
93
+ st.markdown("#### 🧠 Explanation:")
94
+ st.info(reason)
95
 
96
  st.markdown("---")
97
 
98
+ # Violation analysis dashboard
99
+ st.markdown("## 📊 Violation Analysis Dashboard")
100
  if st.session_state.history:
101
  df = pd.DataFrame(st.session_state.history)
102
+ st.markdown("### 🧾 历史记录详情")
103
  for item in st.session_state.history:
104
+ st.markdown(f"- 🔹 **input:** {item['text']} | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}")
105
+ st.markdown(f" - **Translated:** {item['translated']}")
106
+ st.markdown(f" - **Suggestion:** {item['reason']}")
 
 
107
 
 
108
  radar_df = pd.DataFrame({
109
+ "Category": ["Insult","Abuse","Discrimination","Hate Speech","Vulgarity"],
110
+ "Score": [0.7,0.4,0.3,0.5,0.6]
111
  })
112
+ # Refine the radar chart: draw the line in black
113
  radar_fig = px.line_polar(
114
  radar_df,
115
  r='Score',
116
  theta='Category',
117
  line_close=True,
118
+ title="⚠️ Risk Radar by Category",
119
  color_discrete_sequence=['black']
120
  )
121
  st.plotly_chart(radar_fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  else:
123
+ st.info("⚠️ No data available. Please analyze some text first.")