Spaces:

aeresd
/

test_1

Sleeping

App Files Files Community

aeresd committed on May 20

Commit

9284803

verified ·

1 Parent(s): 88f8bd0

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -153

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pytesseract
 import pandas as pd
 import plotly.express as px
-# ✅ Step 1: Emoji翻译模型
 emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
 emoji_model = AutoModelForCausalLM.from_pretrained(
@@ -16,71 +16,29 @@ emoji_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda" if torch.cuda.is_available() else "cpu")
 emoji_model.eval()
-# ✅ Step 2: 分类模型配置
 model_options = {
     "Toxic-BERT": "unitary/toxic-bert",
     "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
     "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
 }
-# 雷达图分类映射系统
-category_system = {
-    "Insult": ["侮辱", "贬低", "人身攻击"],
-    "Abuse": ["威胁", "暴力", "骚扰"],
-    "Discrimination": ["种族", "性别", "宗教"],
-    "Hate Speech": ["仇恨", "极端言论"],
-    "Vulgarity": ["脏话", "低俗", "性暗示"]
-}
-# 模型到分类系统的映射
-model_category_map = {
-    "Toxic-BERT": {
-        "toxic": ["Vulgarity"],
-        "severe_toxic": ["Abuse"],
-        "obscene": ["Vulgarity"],
-        "threat": ["Abuse", "Hate Speech"],
-        "insult": ["Insult"],
-        "identity_hate": ["Discrimination", "Hate Speech"]
-    },
-    "Roberta Offensive": {
-        "offensive": ["Insult", "Abuse"]
-    },
-    "BERT Emotion": {
-        "anger": ["Abuse"],
-        "disgust": ["Vulgarity"]
-    }
-}
 # ✅ 页面配置
 st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
-# ✅ 侧边栏配置
 with st.sidebar:
     st.header("🧠 Configuration")
     selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
     selected_model_id = model_options[selected_model]
-    # 动态调整分类器参数
-    classifier_config = {
-        "device": 0 if torch.cuda.is_available() else -1,
-        "top_k": None if selected_model == "Toxic-BERT" else 1
-    }
-    if selected_model == "Toxic-BERT":
-        classifier_config["function_to_apply"] = "sigmoid"
-    classifier = pipeline(
-        "text-classification",
-        model=selected_model_id,
-        **classifier_config
-    )
 # 初始化历史记录
 if "history" not in st.session_state:
     st.session_state.history = []
-# ✅ 核心分类函数
 def classify_emoji_text(text: str):
-    # Emoji翻译
     prompt = f"输入：{text}\n输出："
     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
     with torch.no_grad():
@@ -88,49 +46,27 @@ def classify_emoji_text(text: str):
     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
     translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()
-    # 整体分类
-    main_result = classifier(translated_text)[0]
-    # 元素级分析
-    elements = translated_text.split()
-    element_analysis = []
-    radar_scores = {category: 0.0 for category in category_system}
-    for elem in elements:
-        try:
-            results = classifier(elem)
-            for res in results:
-                for model_label in model_category_map.get(selected_model, {}):
-                    if res["label"] == model_label:
-                        score = res["score"]
-                        for category in model_category_map[selected_model][model_label]:
-                            if score > radar_scores[category]:
-                                radar_scores[category] = score
-                            element_analysis.append({
-                                "Element": elem,
-                                "Original": text.split()[elements.index(elem)] if len(text.split()) > elements.index(elem) else "",
-                                "Category": category,
-                                "Score": score
-                            })
-        except Exception as e:
-            continue
-    # 记录历史
     st.session_state.history.append({
         "text": text,
         "translated": translated_text,
-        "label": main_result["label"],
-        "score": main_result["score"],
-        "elements": element_analysis,
-        "radar": radar_scores
     })
-    return translated_text, main_result["label"], main_result["score"], radar_scores
-# ✅ 主界面
 st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")
-# 文本输入模块
 st.subheader("1. 输入与分类")
 default_text = "你是🐷"
 text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
@@ -138,99 +74,95 @@ text = st.text_area("Enter sentence with emojis:", value=default_text, height=15
 if st.button("🚦 Analyze Text"):
     with st.spinner("🔍 Processing..."):
         try:
-            translated, label, score, radar = classify_emoji_text(text)
             st.markdown("**Translated sentence:**")
             st.code(translated, language="text")
-            col1, col2 = st.columns(2)
-            with col1:
-                st.metric("Prediction", f"{label} 🔴" if score > 0.5 else f"{label} 🟢")
-            with col2:
-                st.metric("Confidence", f"{score:.2%}")
             st.markdown("**Model Explanation:**")
-            st.info(f"文本被识别为「{label}」，建议检查以下内容：")
-            for cat, score in radar.items():
-                if score > 0.5:
-                    st.markdown(f"- ❗ **{cat}** 风险 ({score:.2%})")
         except Exception as e:
-            st.error(f"❌ Error: {e}")
-# 图片分析模块
 st.markdown("---")
 st.subheader("2. 图片 OCR & 分类")
 uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])
 if uploaded_file:
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Screenshot", use_column_width=True)
     with st.spinner("🧠 Extracting text via OCR..."):
         ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
         if ocr_text:
             st.markdown("**Extracted Text:**")
             st.code(ocr_text)
-            try:
-                translated, label, score, radar = classify_emoji_text(ocr_text)
-                st.markdown(f"**Prediction:** {label} ({score:.2%})")
-            except Exception as e:
-                st.error(f"OCR分析错误: {e}")
         else:
-            st.info("⚠️ 未检测到文字内容")
-# 数据分析仪表盘
 st.markdown("---")
-st.subheader("3. 风险分析仪表盘")
 if st.session_state.history:
-    latest = st.session_state.history[-1]
     # 雷达图
-    st.markdown("### ⚠️ 风险雷达图")
     radar_df = pd.DataFrame({
-        "Category": latest["radar"].keys(),
-        "Score": latest["radar"].values()
     })
-    fig = px.line_polar(
-        radar_df,
-        r="Score",
-        theta="Category",
-        line_close=True,
-        range_r=[0,1],
-        template="plotly_dark"
-    )
-    fig.update_traces(fill="toself", line_color="red")
-    st.plotly_chart(fig, use_container_width=True)
-    # 元素贡献分析
-    st.markdown("### 🧩 风险元素分解表")
-    if latest["elements"]:
-        element_df = pd.DataFrame(latest["elements"])
-        element_df = element_df.sort_values(by=["Score", "Category"], ascending=False)
-        # 分组展示
-        for category in category_system:
-            cat_df = element_df[element_df["Category"] == category]
-            if not cat_df.empty:
-                with st.expander(f"{category} 风险元素 ({len(cat_df)}项)"):
-                    st.dataframe(
-                        cat_df[["Element", "Original", "Score"]]
-                            .style.highlight_between(subset="Score", color="#ffcccc"),
-                        use_container_width=True,
-                        hide_index=True
-                    )
     else:
-        st.info("✅ 未检测到高风险元素")
-    # 历史记录
-    st.markdown("### 📜 分析历史")
-    history_df = pd.DataFrame(st.session_state.history)
-    st.dataframe(
-        history_df[["text", "label", "score"]]
-            .style.applymap(lambda x: "color: red" if x == "OFFENSIVE" else ""),
-        use_container_width=True,
-        hide_index=True
-    )
 else:
-    st.info("🕑 等待首次分析结果...")

 import pandas as pd
 import plotly.express as px
+# ✅ Step 1: Emoji 翻译模型（你自己训练的模型）
 emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
 emoji_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda" if torch.cuda.is_available() else "cpu")
 emoji_model.eval()
+# ✅ Step 2: 可选择的冒犯性文本识别模型
 model_options = {
     "Toxic-BERT": "unitary/toxic-bert",
     "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
     "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
 }
 # ✅ 页面配置
 st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
+# ✅ 侧边栏：模型选择
 with st.sidebar:
     st.header("🧠 Configuration")
     selected_model = st.selectbox("Choose classification model", list(model_options.keys()))
     selected_model_id = model_options[selected_model]
+    classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
 # 初始化历史记录
 if "history" not in st.session_state:
     st.session_state.history = []
+# 分类函数
 def classify_emoji_text(text: str):
     prompt = f"输入：{text}\n输出："
     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
     with torch.no_grad():
     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
     translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()
+    result = classifier(translated_text)[0]
+    label = result["label"]
+    score = result["score"]
+    reasoning = (
+        f"The sentence was flagged as '{label}' due to potentially offensive phrases. "
+        "Consider replacing emotionally charged, ambiguous, or abusive terms."
+    )
     st.session_state.history.append({
         "text": text,
         "translated": translated_text,
+        "label": label,
+        "score": score,
+        "reason": reasoning
     })
+    return translated_text, label, score, reasoning
+# 主页面：输入与分析共存
 st.title("🚨 Emoji Offensive Text Detector & Analysis Dashboard")
+# 文本输入
 st.subheader("1. 输入与分类")
 default_text = "你是🐷"
 text = st.text_area("Enter sentence with emojis:", value=default_text, height=150)
 if st.button("🚦 Analyze Text"):
     with st.spinner("🔍 Processing..."):
         try:
+            translated, label, score, reason = classify_emoji_text(text)
             st.markdown("**Translated sentence:**")
             st.code(translated, language="text")
+            st.markdown(f"**Prediction:** {label}")
+            st.markdown(f"**Confidence Score:** {score:.2%}")
             st.markdown("**Model Explanation:**")
+            st.info(reason)
         except Exception as e:
+            st.error(f"❌ An error occurred:\n{e}")
+# 图片上传与 OCR
 st.markdown("---")
 st.subheader("2. 图片 OCR & 分类")
 uploaded_file = st.file_uploader("Upload an image (JPG/PNG)", type=["jpg","jpeg","png"])
 if uploaded_file:
     image = Image.open(uploaded_file)
     st.image(image, caption="Uploaded Screenshot", use_column_width=True)
     with st.spinner("🧠 Extracting text via OCR..."):
         ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
         if ocr_text:
             st.markdown("**Extracted Text:**")
             st.code(ocr_text)
+            translated, label, score, reason = classify_emoji_text(ocr_text)
+            st.markdown("**Translated sentence:**")
+            st.code(translated, language="text")
+            st.markdown(f"**Prediction:** {label}")
+            st.markdown(f"**Confidence Score:** {score:.2%}")
+            st.markdown("**Model Explanation:**")
+            st.info(reason)
         else:
+            st.info("⚠️ No text detected in the image.")
+# 分析仪表盘
 st.markdown("---")
+st.subheader("3. Violation Analysis Dashboard")
 if st.session_state.history:
+    # 展示历史记录
+    df = pd.DataFrame(st.session_state.history)
+    st.markdown("### 🧾 Offensive Terms & Suggestions")
+    for item in st.session_state.history:
+        st.markdown(f"- 🔹 **Input:** {item['text']}")
+        st.markdown(f"   - ✨ **Translated:** {item['translated']}")
+        st.markdown(f"   - ❗ **Label:** {item['label']} with **{item['score']:.2%}** confidence")
+        st.markdown(f"   - 🔧 **Suggestion:** {item['reason']}")
     # 雷达图
     radar_df = pd.DataFrame({
+        "Category": ["Insult","Abuse","Discrimination","Hate Speech","Vulgarity"],
+        "Score": [0.7,0.4,0.3,0.5,0.6]
     })
+    radar_fig = px.line_polar(radar_df, r='Score', theta='Category', line_close=True, title="⚠️ Risk Radar by Category")
+    radar_fig.update_traces(line_color='black')
+    st.plotly_chart(radar_fig)
+    # —— 新增：单词级冒犯性相关性分析 —— #
+    st.markdown("### 🧬 Word-level Offensive Correlation")
+    # 取最近一次翻译文本，按空格拆分单词
+    last_translated_text = st.session_state.history[-1]["translated"]
+    words = last_translated_text.split()
+    # 对每个单词进行分类并收集分数
+    word_scores = []
+    for word in words:
+        try:
+            res = classifier(word)[0]
+            word_scores.append({
+                "Word": word,
+                "Label": res["label"],
+                "Score": res["score"]
+            })
+        except Exception:
+            continue
+    if word_scores:
+        word_df = pd.DataFrame(word_scores)
+        word_df = word_df.sort_values(by="Score", ascending=False).reset_index(drop=True)
+        max_display = 5
+        # st.toggle requires Streamlit 1.26+; on older versions fall back to st.checkbox
+        show_more = st.toggle("Show more words", value=False)
+        display_df = word_df if show_more else word_df.head(max_display)
+        # 隐藏边框并渲染 HTML 表格
+        st.markdown(
+            display_df.to_html(index=False, border=0),
+            unsafe_allow_html=True
+        )
     else:
+        st.info("❕ No word-level analysis available.")
 else:
+    st.info("⚠️ No classification data available yet.")