Spaces:

aeresd
/

test_1

Sleeping

App Files Files Community

test_1 / app.py

aeresd

Update app.py

d6593c8 verified 4 months ago

raw

history blame

6.03 kB

	from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
	import torch
	import streamlit as st
	from PIL import Image
	import pytesseract
	import pandas as pd
	import plotly.express as px

	# Step 1: Emoji translation model (fine-tuned)
	emoji_model_id = "JenniferHJF/qwen1.5-emoji-finetuned"


	# Streamlit re-executes this script on every interaction; caching the loader
	# keeps the tokenizer and model in memory instead of reloading them each time.
	# The spinner is disabled because st.set_page_config is called later in the
	# script, and older Streamlit releases require it to be the first Streamlit
	# element emitted.
	@st.cache_resource(show_spinner=False)
	def _load_emoji_model(model_id: str):
	    """Load the emoji-translation tokenizer and causal LM for ``model_id``.

	    The model is moved to CUDA (as float16) when a GPU is available and kept
	    on the CPU (as float32) otherwise, then switched to eval mode.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    use_cuda = torch.cuda.is_available()
	    # NOTE(review): trust_remote_code=True lets the Hub repository run its own
	    # Python code at load time; keep it only while the repository is trusted.
	    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
	    model = AutoModelForCausalLM.from_pretrained(
	        model_id,
	        trust_remote_code=True,
	        torch_dtype=torch.float16 if use_cuda else torch.float32,
	    ).to("cuda" if use_cuda else "cpu")
	    model.eval()
	    return tokenizer, model


	emoji_tokenizer, emoji_model = _load_emoji_model(emoji_model_id)

	# Step 2: Selectable text-classification models.
	# Keys are the display names offered to the user; values are the model
	# identifiers handed to the classification pipeline.
	model_options = {
	    "Toxic-BERT": "unitary/toxic-bert",
	    "Roberta Offensive": "cardiffnlp/twitter-roberta-base-offensive",
	    "BERT Emotion": "bhadresh-savani/bert-base-go-emotion",
	}

	# Page configuration: browser-tab title, icon and wide layout.
	st.set_page_config(
	    page_title="Emoji Offensive Text Detector",
	    page_icon="🚨",
	    layout="wide",
	)

	# Create the per-session list of past classification results on first run only,
	# so it survives the script re-executions triggered by widget interaction.
	if "history" not in st.session_state:
	    st.session_state["history"] = []

	# Classification function
	def classify_emoji_text(text: str):
	    """Translate emoji-laden text, classify the translation, and record it.

	    The text is wrapped in an ``Input: ...\\nOutput:`` prompt and completed by
	    the module-level ``emoji_model`` (greedy decoding, at most 64 new tokens).
	    Whatever follows the last ``Output:`` marker in the decoded sequence is
	    treated as the translation and passed to the module-level ``classifier``.

	    Side effect: appends a dict with the keys ``text``, ``translated``,
	    ``label``, ``score`` and ``suggestion`` to ``st.session_state.history``.

	    Args:
	        text: Raw user text, possibly containing emojis.

	    Returns:
	        A ``(translated, label, score, suggestion)`` tuple, where ``label``
	        and ``score`` come from the classifier's first result.
	    """
	    encoded = emoji_tokenizer(f"Input: {text}\nOutput:", return_tensors="pt")
	    encoded = encoded.to(emoji_model.device)
	    with torch.no_grad():
	        generated = emoji_model.generate(**encoded, max_new_tokens=64, do_sample=False)
	    full_text = emoji_tokenizer.decode(generated[0], skip_special_tokens=True)
	    # The decoded sequence is split on the prompt marker; keep only the tail.
	    translated = full_text.rpartition("Output:")[2].strip()

	    top_result = classifier(translated)[0]
	    label, score = top_result["label"], top_result["score"]
	    # NOTE(review): this message is built regardless of the label or score, so
	    # it reads "flagged" even when the classifier's label is a benign one.
	    suggestion = (
	        f"The sentence was flagged as '{label}' due to potentially offensive content."
	        " Consider replacing emotionally charged or abusive terms."
	    )

	    record = {
	        "text": text,
	        "translated": translated,
	        "label": label,
	        "score": score,
	        "suggestion": suggestion,
	    }
	    st.session_state.history.append(record)
	    return translated, label, score, suggestion

	# Sidebar settings
	st.sidebar.header("Settings")
	selected_model = st.sidebar.selectbox("Classification Model", list(model_options))
	selected_model_id = model_options[selected_model]


	# Streamlit re-executes this script on every interaction; caching by model id
	# builds each pipeline once and reuses it on later reruns. Every model id that
	# has been selected stays cached, so memory grows with the number of distinct
	# models chosen.
	@st.cache_resource
	def _load_classifier(model_id: str):
	    """Build the text-classification pipeline for ``model_id``.

	    The pipeline runs on GPU 0 when CUDA is available, otherwise on the CPU.
	    """
	    return pipeline(
	        "text-classification",
	        model=model_id,
	        device=0 if torch.cuda.is_available() else -1,
	    )


	classifier = _load_classifier(selected_model_id)

	# Main page title
	st.title("🚨 Emoji Offensive Text Detector & Analysis")


	def _render_result(translated, label, score, suggestion, heading):
	    """Display one classification result beneath the markdown ``heading``."""
	    st.markdown(heading)
	    st.code(translated)
	    st.markdown(f"Prediction: {label}")
	    st.markdown(f"Confidence: {score:.2%}")
	    st.markdown("Model Explanation:")
	    st.info(suggestion)


	# Input and classification section
	st.markdown("## Input or Upload Text for Classification")
	col1, col2 = st.columns([2, 1])

	with col1:
	    user_input = st.text_area(
	        "Enter sentence with emojis:",
	        value="春竹你🐎是不是💩了,窩🌿泥🐎SB",
	        height=150,
	    )
	    if st.button("Analyze Text"):
	        if not user_input.strip():
	            # Nothing to translate or classify; skip the model calls entirely.
	            st.warning("Please enter some text to analyze.")
	        else:
	            with st.spinner("Processing..."):
	                try:
	                    result = classify_emoji_text(user_input)
	                    _render_result(*result, heading="### Translated Sentence:")
	                except Exception as e:
	                    st.error(f"Error: {e}")

	with col2:
	    st.markdown("### Or Upload an Image")
	    uploaded_file = st.file_uploader("Image (JPG/PNG)", type=["jpg", "jpeg", "png"])
	    # NOTE(review): this branch runs on every script rerun while a file stays
	    # uploaded, so the same image is OCR'd and classified again each time.
	    if uploaded_file:
	        # Same error handling as the text path: an unreadable image or an OCR
	        # failure is reported in the page instead of aborting the script run.
	        try:
	            image = Image.open(uploaded_file)
	            # NOTE(review): newer Streamlit releases deprecate use_column_width;
	            # confirm against the Streamlit version this app is pinned to.
	            st.image(image, caption="Uploaded Image", use_column_width=True)
	            with st.spinner("Running OCR..."):
	                ocr_text = pytesseract.image_to_string(image, lang="chi_sim+eng").strip()
	            st.markdown("#### OCR Extracted Text:")
	            st.code(ocr_text)
	            if ocr_text:
	                result = classify_emoji_text(ocr_text)
	                _render_result(*result, heading="#### Translated:")
	            else:
	                # OCR produced no text, so there is nothing to classify.
	                st.warning("No text was detected in the image.")
	        except Exception as e:
	            st.error(f"Error: {e}")

	st.markdown("---")


	def _top_terms_by_label(records):
	    """Rank the whitespace-separated terms of each label by their best score.

	    Args:
	        records: Iterable of history dicts with ``text``, ``label`` and
	            ``score`` keys.

	    Returns:
	        A dict mapping each label (in order of first appearance) to a list of
	        ``(term, score)`` pairs sorted by descending score, where ``score`` is
	        the highest score of any record with that label containing the term.
	    """
	    best_scores = {}
	    for record in records:
	        per_label = best_scores.setdefault(record["label"], {})
	        # NOTE(review): str.split() only separates on whitespace, so text written
	        # without spaces is treated as a single term.
	        for term in record["text"].split():
	            per_label[term] = max(per_label.get(term, 0), record["score"])
	    return {
	        label: sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
	        for label, scores in best_scores.items()
	    }


	# Analysis dashboard
	st.markdown("## Analysis Dashboard")
	if st.session_state.history:
	    st.markdown("### History Records")
	    for item in st.session_state.history:
	        # Plain "|" separators: "\|" inside a normal string literal is an
	        # invalid escape sequence and triggers a warning on recent Pythons.
	        st.markdown(
	            f"- Input: `{item['text']}` | Label: {item['label']} | Confidence: {item['score']:.2%}"
	        )
	        st.markdown(f" - Translated: `{item['translated']}`")
	        st.markdown(f" - Suggestion: {item['suggestion']} ")

	    # Radar chart
	    # NOTE(review): these scores are hard-coded and are not derived from the
	    # recorded history.
	    radar_df = pd.DataFrame({
	        "Category": ["Insult", "Abuse", "Discrimination", "Hate Speech", "Vulgarity"],
	        "Score": [0.7, 0.4, 0.3, 0.5, 0.6],
	    })
	    radar_fig = px.line_polar(
	        radar_df,
	        r='Score',
	        theta='Category',
	        line_close=True,
	        title="Risk Radar by Category",
	        color_discrete_sequence=['black'],
	    )
	    st.plotly_chart(radar_fig)

	    # Analyze words related to each offensive category
	    st.markdown("### Top Offensive Terms by Category")
	    for cat, sorted_words in _top_terms_by_label(st.session_state.history).items():
	        st.markdown(f"{cat}")
	        # display top 5 by default
	        for w, s in sorted_words[:5]:
	            st.markdown(f"- `{w}` ({s:.2%})")
	        # show the remainder, if any, behind an expander
	        if len(sorted_words) > 5:
	            with st.expander("Show more"):
	                for w, s in sorted_words[5:]:
	                    st.markdown(f"- `{w}` ({s:.2%})")
	else:
	    st.info("No data available. Please analyze some text first.")