Spaces:

aeresd
/

test_1

Sleeping

App Files Files Community

aeresd committed on May 20

Commit

851f89d

verified ·

1 Parent(s): d6593c8

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -93

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pytesseract
 import pandas as pd
 import plotly.express as px
-# Step 1: Emoji translation model (fine-tuned)
 emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
 emoji_model = AutoModelForCausalLM.from_pretrained(
@@ -16,148 +16,108 @@ emoji_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda" if torch.cuda.is_available() else "cpu")
 emoji_model.eval()
-# Step 2: Offensive text classification model options
 model_options = {
     "Toxic-BERT": "unitary/toxic-bert",
     "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
     "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
 }
-# Page configuration
 st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
-# Initialize history
 if "history" not in st.session_state:
     st.session_state.history = []
-# Classification function
 def classify_emoji_text(text: str):
-    prompt = f"Input: {text}\nOutput:"
     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
     with torch.no_grad():
         output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    translated = decoded.split("Output:")[-1].strip()
-    result = classifier(translated)[0]
     label = result["label"]
     score = result["score"]
-    suggestion = (
-        f"The sentence was flagged as '{label}' due to potentially offensive content."
-        " Consider replacing emotionally charged or abusive terms."
-    )
-    st.session_state.history.append({
-        "text": text,
-        "translated": translated,
-        "label": label,
-        "score": score,
-        "suggestion": suggestion
-    })
-    return translated, label, score, suggestion
-# Sidebar settings
-st.sidebar.header("Settings")
-selected_model = st.sidebar.selectbox("Classification Model", list(model_options.keys()))
 selected_model_id = model_options[selected_model]
-classifier = pipeline(
-    "text-classification",
-    model=selected_model_id,
-    device=0 if torch.cuda.is_available() else -1
-)
-# Main page title
-st.title("🚨 Emoji Offensive Text Detector & Analysis")
-# Input and classification section
-st.markdown("## Input or Upload Text for Classification")
-col1, col2 = st.columns([2, 1])
 with col1:
-    user_input = st.text_area(
-        "Enter sentence with emojis:",
-        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
-        height=150
-    )
-    if st.button("Analyze Text"):
-        with st.spinner("Processing..."):
             try:
-                translated, label, score, suggestion = classify_emoji_text(user_input)
-                st.markdown("### Translated Sentence:")
-                st.code(translated)
-                st.markdown(f"**Prediction:** {label}")
-                st.markdown(f"**Confidence:** {score:.2%}")
-                st.markdown("**Model Explanation:**")
-                st.info(suggestion)
             except Exception as e:
-                st.error(f"Error: {e}")
 with col2:
-    st.markdown("### Or Upload an Image")
-    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
     if uploaded_file:
         image = Image.open(uploaded_file)
         st.image(image, caption="Uploaded Image", use_column_width=True)
-        with st.spinner("Running OCR..."):
             ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
-            st.markdown("#### OCR Extracted Text:")
             st.code(ocr_text)
-            translated, label, score, suggestion = classify_emoji_text(ocr_text)
-            st.markdown("#### Translated:")
             st.code(translated)
-            st.markdown(f"**Prediction:** {label}")
-            st.markdown(f"**Confidence:** {score:.2%}")
-            st.markdown("**Model Explanation:**")
-            st.info(suggestion)
 st.markdown("---")
-# Analysis dashboard
-st.markdown("## Analysis Dashboard")
 if st.session_state.history:
     df = pd.DataFrame(st.session_state.history)
-    st.markdown("### History Records")
     for item in st.session_state.history:
-        st.markdown(
-            f"- **Input:** `{item['text']}` | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}"
-        )
-        st.markdown(f"  - Translated: `{item['translated']}`")
-        st.markdown(f"  - Suggestion: {item['suggestion']} ")
-    # Radar chart
     radar_df = pd.DataFrame({
-        "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
-        "Score": [0.7, 0.4, 0.3, 0.5, 0.6]
     })
     radar_fig = px.line_polar(
         radar_df,
         r='Score',
         theta='Category',
         line_close=True,
-        title="Risk Radar by Category",
         color_discrete_sequence=['black']
     )
     st.plotly_chart(radar_fig)
-    # Analyze words related to each offensive category
-    st.markdown("### Top Offensive Terms by Category")
-    categories = df['label'].unique()
-    for cat in categories:
-        st.markdown(f"**{cat}**")
-        # collect max score per word in texts of this category
-        word_scores = {}
-        for _, row in df[df['label'] == cat].iterrows():
-            words = row['text'].split()
-            for w in words:
-                word_scores[w] = max(word_scores.get(w, 0), row['score'])
-        sorted_words = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)
-        # display top 5 by default
-        for w, s in sorted_words[:5]:
-            st.markdown(f"- `{w}` ({s:.2%})")
-        # show more if exists
-        if len(sorted_words) > 5:
-            with st.expander("Show more"):
-                for w, s in sorted_words[5:]:
-                    st.markdown(f"- `{w}` ({s:.2%})")
 else:
-    st.info("No data available. Please analyze some text first.")

 import pandas as pd
 import plotly.express as px
+# Step 1: Emoji 翻译模型（你自己训练的模型）
 emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"
 emoji_tokenizer = AutoTokenizer.from_pretrained(emoji_model_id, trust_remote_code=True)
 emoji_model = AutoModelForCausalLM.from_pretrained(
 ).to("cuda" if torch.cuda.is_available() else "cpu")
 emoji_model.eval()
+# Step 2: 可选择的冒犯性文本识别模型
 model_options = {
     "Toxic-BERT": "unitary/toxic-bert",
     "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
     "BERT Emotion": "bhadresh-savani/bert-base-go-emotion"
 }
+# 页面配置
 st.set_page_config(page_title="Emoji Offensive Text Detector", page_icon="🚨", layout="wide")
+# 初始化历史记录
 if "history" not in st.session_state:
     st.session_state.history = []
+# Emoji 文本翻译与分类函数
 def classify_emoji_text(text: str):
+    prompt = f"输入：{text}\n输出："
     input_ids = emoji_tokenizer(prompt, return_tensors="pt").to(emoji_model.device)
     with torch.no_grad():
         output_ids = emoji_model.generate(**input_ids, max_new_tokens=64, do_sample=False)
     decoded = emoji_tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    translated_text = decoded.split("输出：")[-1].strip() if "输出：" in decoded else decoded.strip()
+    result = classifier(translated_text)[0]
     label = result["label"]
     score = result["score"]
+    reasoning = f"The sentence was flagged as '{label}' due to potentially offensive phrases. Consider replacing emotionally charged, ambiguous, or abusive terms."
+    st.session_state.history.append({"text": text, "translated": translated_text, "label": label, "score": score, "reason": reasoning})
+    return translated_text, label, score, reasoning
+# 页面布局
+st.sidebar.header("🧠 Settings")
+selected_model = st.sidebar.selectbox("Choose classification model", list(model_options.keys()))
 selected_model_id = model_options[selected_model]
+classifier = pipeline("text-classification", model=selected_model_id, device=0 if torch.cuda.is_available() else -1)
+# 主页面：集成 Text Moderation 和 Text Analysis
+st.title("🚨 Emoji Offensive Text Detector & Violation Analysis")
+# 输入与分类
+st.markdown("## ✍️ 输入或上传文本进行分类")
+col1, col2 = st.columns([2,1])
 with col1:
+    text = st.text_area("Enter sentence with emojis:", value="你是🐷", height=150)
+    if st.button("🚦 Analyze Text"):
+        with st.spinner("🔍 Processing..."):
             try:
+                translated, label, score, reason = classify_emoji_text(text)
+                st.markdown("### 🔄 Translated sentence:")
+                st.code(translated, language="text")
+                st.markdown(f"### 🎯 Prediction: {label}")
+                st.markdown(f"### 📊 Confidence Score: {score:.2%}")
+                st.markdown("### 🧠 Model Explanation:")
+                st.info(reason)
             except Exception as e:
+                st.error(f"❌ Error during processing: {e}")
 with col2:
+    st.markdown("### 🖼️ Or upload a screenshot:")
+    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg","png","jpeg"])
     if uploaded_file:
         image = Image.open(uploaded_file)
         st.image(image, caption="Uploaded Image", use_column_width=True)
+        with st.spinner("🧠 Running OCR..."):
             ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
+            st.markdown("#### 📋 OCR Extracted Text:")
             st.code(ocr_text)
+            translated, label, score, reason = classify_emoji_text(ocr_text)
+            st.markdown("#### 🔄 Translated:")
             st.code(translated)
+            st.markdown(f"#### 🎯 Prediction: {label}")
+            st.markdown(f"#### 📊 Confidence: {score:.2%}")
+            st.markdown("#### 🧠 Explanation:")
+            st.info(reason)
 st.markdown("---")
+# 违规分析仪表盘
+st.markdown("## 📊 Violation Analysis Dashboard")
 if st.session_state.history:
     df = pd.DataFrame(st.session_state.history)
+    st.markdown("### 🧾 历史记录详情")
     for item in st.session_state.history:
+        st.markdown(f"- 🔹 **input:** {item['text']} | **Label:** {item['label']} | **Confidence:** {item['score']:.2%}")
+        st.markdown(f"  - **Translated:** {item['translated']}")
+        st.markdown(f"  - **Suggestion:** {item['reason']}")
     radar_df = pd.DataFrame({
+        "Category": ["Insult","Abuse","Discrimination","Hate Speech","Vulgarity"],
+        "Score": [0.7,0.4,0.3,0.5,0.6]
     })
+    # 优化雷达图，设置线条为黑色
     radar_fig = px.line_polar(
         radar_df,
         r='Score',
         theta='Category',
         line_close=True,
+        title="⚠️ Risk Radar by Category",
         color_discrete_sequence=['black']
     )
     st.plotly_chart(radar_fig)
 else:
+    st.info("⚠️ No data available. Please analyze some text first.")