import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import numpy as np
import json
from datetime import datetime
import logging
import os

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
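
# PromptBasedMultiAgentSystem wires four agents together: a text-classification
# detector, a sentiment model, and two FLAN-T5 pipelines whose behaviour is
# driven by prompt templates loaded from external JSON files (with built-in
# fallbacks when those files are missing).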
class PromptBasedMultiAgentSystem:
    def __init__(self):
        self.detection_agent = None
        self.counter_speech_agent = None
        self.moderation_agent = None
        self.sentiment_agent = None

        # Load prompt configurations
        self.counter_speech_prompts = self.load_prompts("counter_speech_prompts.json")
        self.moderation_prompts = self.load_prompts("moderation_prompts.json")

        self.initialize_agents()

    def load_prompts(self, filename):
        """Load prompts from JSON file with fallback"""
        try:
            if os.path.exists(filename):
                with open(filename, 'r', encoding='utf-8') as f:
                    return json.load(f)
            else:
                logger.warning(f"Prompt file {filename} not found, using built-in prompts")
                return self.get_default_prompts(filename)
        except Exception as e:
            logger.error(f"Error loading prompts from {filename}: {e}")
            return self.get_default_prompts(filename)

    def get_default_prompts(self, filename):
        """Default prompts as fallback"""
        if "counter_speech" in filename:
            return {
                "counter_speech_prompts": {
                    "high_risk": {
                        "system_prompt": "You are an expert educator specializing in counter-speech and conflict de-escalation.",
                        "user_prompt_template": "Generate a respectful, educational counter-speech response to address harmful content while promoting understanding. Original text (Risk: {risk_level}, Confidence: {confidence}%, Sentiment: {sentiment}): \"{original_text}\"\n\nCounter-speech response:",
                    },
                    "general_template": {
                        "fallback_responses": [
                            "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue. How might we work together on the concerns you've raised?",
                            "I appreciate your perspective. Sometimes our strongest feelings can be expressed in ways that bring people together. What specific positive changes would you like to see?",
                            "Your engagement with this topic is clear. When we channel that energy into inclusive dialogue, we often find solutions that work for everyone."
                        ]
                    }
                }
            }
        else:
            return {
                "moderation_prompts": {
                    "comprehensive_analysis": {
                        "system_prompt": "You are an expert content moderation specialist analyzing text for safety and compliance.",
                        "user_prompt_template": "Analyze this text for potential violations: \"{text}\"\n\nProvide: 1) Safety assessment 2) Violation categories 3) Severity level 4) Confidence score 5) Recommended action\n\nAnalysis:",
                    }
                }
            }

    def initialize_agents(self):
        """Initialize all AI agents"""
        logger.info("🤖 Initializing Prompt-Based Multi-Agent System...")
        self.setup_detection_agent()
        self.setup_counter_speech_agent()
        self.setup_moderation_agent()
        self.setup_sentiment_agent()
        logger.info("✅ All agents initialized successfully!")
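
    # The detector expects a fine-tuned checkpoint in ./model (shipped with the
    # Space). If it cannot be loaded, the public unitary/toxic-bert checkpoint
    # is used as a fallback so the app still starts.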
    def setup_detection_agent(self):
        """Initialize the hate speech detection agent"""
        try:
            logger.info("🔍 Loading Detection Agent (Fine-tuned DistilBERT)...")
            model_path = "./model"
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
                torch_dtype=torch.float32
            )
            self.detection_agent = pipeline(
                "text-classification",
                model=model,
                tokenizer=tokenizer,
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("✅ Detection Agent loaded successfully")
        except Exception as e:
            logger.error(f"❌ Detection Agent failed: {e}")
            logger.info("🔄 Using fallback detection model...")
            self.detection_agent = pipeline(
                "text-classification",
                model="unitary/toxic-bert",
                return_all_scores=True
            )

    def setup_counter_speech_agent(self):
        """Initialize counter-speech generation agent with prompts"""
        try:
            logger.info("💬 Loading Counter-Speech Agent with Custom Prompts...")
            # Using FLAN-T5 which is excellent at following instructions
            self.counter_speech_agent = pipeline(
                "text2text-generation",
                model="google/flan-t5-base",
                max_length=200,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("✅ Counter-Speech Agent loaded (FLAN-T5 with custom prompts)")
        except Exception as e:
            logger.error(f"❌ Counter-Speech Agent failed: {e}")
            self.counter_speech_agent = None

    def setup_moderation_agent(self):
        """Initialize content moderation agent with prompts"""
        try:
            logger.info("🛡️ Loading Moderation Agent with Custom Prompts...")
            # Using FLAN-T5 for structured moderation analysis
            self.moderation_agent = pipeline(
                "text2text-generation",
                model="google/flan-t5-base",
                max_length=300,
                do_sample=False,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("✅ Moderation Agent loaded (FLAN-T5 with analysis prompts)")
        except Exception as e:
            logger.error(f"❌ Moderation Agent failed: {e}")
            self.moderation_agent = None

    def setup_sentiment_agent(self):
        """Initialize sentiment analysis agent"""
        try:
            logger.info("📊 Loading Sentiment Agent...")
            self.sentiment_agent = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
                return_all_scores=True,
                device=0 if torch.cuda.is_available() else -1
            )
            logger.info("✅ Sentiment Agent loaded (Twitter-RoBERTa)")
        except Exception as e:
            logger.error(f"❌ Sentiment Agent failed: {e}")
            self.sentiment_agent = None

    def detect_hate_speech(self, text):
        """Detection Agent: Analyze text for hate speech"""
        if not text or not text.strip():
            return {
                "status": "❌ Please enter some text to analyze.",
                "prediction": "No input",
                "confidence": 0.0,
                "all_scores": {},
                "risk_level": "Unknown",
                "is_hate_speech": False
            }

        try:
            results = self.detection_agent(text.strip())
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

            all_scores = {}
            max_score = 0
            predicted_label = "UNKNOWN"
            for result in results:
                label = result["label"]
                score = result["score"]
                all_scores[label] = {
                    "score": score,
                    "percentage": f"{score*100:.2f}%",
                    "confidence": f"{score:.4f}"
                }
                if score > max_score:
                    max_score = score
                    predicted_label = label
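
            # Different checkpoints expose different label names (e.g. "LABEL_1",
            # "toxic", "hate"), so the harmful class is inferred by keyword
            # matching on the winning label rather than by a fixed label id.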
            # Determine hate speech status
            hate_keywords = ["HATE", "TOXIC", "NEGATIVE", "HARMFUL", "1", "LABEL_1"]
            is_hate_speech = False
            risk_level = "Low"

            if any(keyword in predicted_label.upper() for keyword in hate_keywords):
                if max_score > 0.8:
                    is_hate_speech = True
                    risk_level = "High"
                    status = f"🚨 High confidence hate speech detected! ({predicted_label}: {max_score:.2%})"
                elif max_score > 0.6:
                    is_hate_speech = True
                    risk_level = "Medium"
                    status = f"⚠️ Potential hate speech detected ({predicted_label}: {max_score:.2%})"
                else:
                    risk_level = "Low-Medium"
                    status = f"⚡ Low confidence detection ({predicted_label}: {max_score:.2%})"
            else:
                risk_level = "Low"
                status = f"✅ No hate speech detected ({predicted_label}: {max_score:.2%})"

            return {
                "status": status,
                "prediction": predicted_label,
                "confidence": max_score,
                "all_scores": all_scores,
                "risk_level": risk_level,
                "is_hate_speech": is_hate_speech
            }
        except Exception as e:
            logger.error(f"Detection error: {e}")
            return {
                "status": f"❌ Detection error: {str(e)}",
                "prediction": "Error",
                "confidence": 0.0,
                "all_scores": {},
                "risk_level": "Unknown",
                "is_hate_speech": False
            }

    def analyze_sentiment(self, text):
        """Sentiment Agent: Analyze emotional tone"""
        if not self.sentiment_agent or not text.strip():
            return {"sentiment": "neutral", "confidence": 0.0}

        try:
            results = self.sentiment_agent(text.strip())
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    results = results[0]

            best_sentiment = max(results, key=lambda x: x['score'])
            return {
                "sentiment": best_sentiment['label'].lower(),
                "confidence": best_sentiment['score'],
                "all_sentiments": {r['label']: r['score'] for r in results}
            }
        except Exception as e:
            logger.error(f"Sentiment analysis error: {e}")

        return {"sentiment": "neutral", "confidence": 0.0}

    def moderate_content_with_prompts(self, text, detection_result, sentiment_result):
        """Moderation Agent: Structured analysis using prompts"""
        if not self.moderation_agent or not text.strip():
            return {"analysis": "Unable to perform moderation analysis", "confidence": 0.0}

        try:
            # Get the appropriate moderation prompt
            moderation_config = self.moderation_prompts.get("moderation_prompts", {})
            analysis_config = moderation_config.get("comprehensive_analysis", {})

            # Construct the analysis prompt
            system_prompt = analysis_config.get("system_prompt", "Analyze this text for safety concerns.")
            user_prompt_template = analysis_config.get("user_prompt_template", "Analyze: {text}")

            # Fill in the template
            full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(text=text)}"

            # Generate analysis
            result = self.moderation_agent(full_prompt, max_length=250, do_sample=False)
            if result and len(result) > 0:
                analysis_text = result[0]['generated_text']

                # Parse the analysis for key information
                confidence = self.extract_confidence_from_analysis(analysis_text)
                safety_level = self.extract_safety_level_from_analysis(analysis_text)

                return {
                    "analysis": analysis_text,
                    "confidence": confidence,
                    "safety_level": safety_level,
                    "prompt_used": "comprehensive_analysis"
                }
        except Exception as e:
            logger.error(f"Moderation analysis error: {e}")

        # Fallback analysis
        return {
            "analysis": f"Basic assessment: Risk level {detection_result.get('risk_level', 'unknown')}, requires review if confidence > 70%",
            "confidence": detection_result.get('confidence', 0.0),
            "safety_level": "review_needed" if detection_result.get('confidence', 0) > 0.7 else "acceptable"
        }

    def generate_counter_speech_with_prompts(self, text, detection_result, sentiment_result):
        """Counter-Speech Agent: Generate response using custom prompts"""
        if not detection_result.get("is_hate_speech", False):
            return "✨ This text promotes positive communication. Great job maintaining respectful dialogue!"

        risk_level = detection_result.get("risk_level", "Low").lower()
        confidence = detection_result.get("confidence", 0.0) * 100
        sentiment = sentiment_result.get("sentiment", "neutral")
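
        # Prompt configurations are keyed by risk level ("high_risk",
        # "medium_risk", "low_risk") in counter_speech_prompts.json; if no
        # matching entry exists, the "general_template" section is used.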
        # Get appropriate prompts based on risk level
        counter_speech_config = self.counter_speech_prompts.get("counter_speech_prompts", {})

        # Select prompt based on risk level
        if risk_level == "high":
            prompt_config = counter_speech_config.get("high_risk", {})
        elif risk_level == "medium":
            prompt_config = counter_speech_config.get("medium_risk", {})
        else:
            prompt_config = counter_speech_config.get("low_risk", {})

        # If no specific config, use general template
        if not prompt_config:
            prompt_config = counter_speech_config.get("general_template", {})

        if self.counter_speech_agent and prompt_config:
            try:
                # Construct the prompt
                system_prompt = prompt_config.get("system_prompt", "Generate a respectful counter-speech response.")
                user_prompt_template = prompt_config.get(
                    "user_prompt_template",
                    "Generate a counter-speech response for: {original_text}"
                )

                # Fill in the template
                full_prompt = f"{system_prompt}\n\n{user_prompt_template.format(original_text=text, risk_level=risk_level, confidence=confidence, sentiment=sentiment)}"

                # Generate counter-speech
                result = self.counter_speech_agent(full_prompt, max_length=150, do_sample=True, temperature=0.7)
                if result and len(result) > 0:
                    generated_text = result[0]['generated_text']

                    # Clean up the response
                    if "Counter-speech response:" in generated_text:
                        generated_text = generated_text.split("Counter-speech response:")[-1].strip()
                    elif "response:" in generated_text.lower():
                        # Take the text after the last "response:" marker without lowercasing the reply
                        idx = generated_text.lower().rfind("response:")
                        generated_text = generated_text[idx + len("response:"):].strip()

                    return f"🤖 **AI-Generated Counter-Speech** ({risk_level.title()} Risk): {generated_text}"
            except Exception as e:
                logger.error(f"Counter-speech generation error: {e}")

        # Fallback to template responses
        fallback_responses = counter_speech_config.get("general_template", {}).get("fallback_responses", [
            "Thank you for sharing your thoughts. Building strong communities works best when we focus on shared values and constructive dialogue."
        ])
        import random
        return f"📝 **Template Response** ({risk_level.title()} Risk): {random.choice(fallback_responses)}"

    def extract_confidence_from_analysis(self, analysis_text):
        """Extract confidence score from moderation analysis"""
        import re

        # Look for confidence patterns like "85%" or "confidence: 0.85"
        patterns = [
            r'(\d+)%',
            r'confidence[:\s]+(\d*\.?\d+)',
            r'(\d*\.?\d+)\s*confidence'
        ]
        for pattern in patterns:
            match = re.search(pattern, analysis_text.lower())
            if match:
                value = float(match.group(1))
                return value / 100 if value > 1 else value

        return 0.5  # Default moderate confidence
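
    # Maps the free-text moderation output onto coarse safety buckets
    # (harmful / concerning / safe / review_needed) via keyword matching.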
    def extract_safety_level_from_analysis(self, analysis_text):
        """Extract safety assessment from moderation analysis"""
        analysis_lower = analysis_text.lower()
        if any(word in analysis_lower for word in ['harmful', 'high risk', 'remove', 'violation']):
            return "harmful"
        elif any(word in analysis_lower for word in ['concerning', 'medium risk', 'review', 'warning']):
            return "concerning"
        elif any(word in analysis_lower for word in ['safe', 'low risk', 'acceptable', 'approve']):
            return "safe"
        else:
            return "review_needed"
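
    # Orchestration: detection and sentiment run first; their outputs feed the
    # prompt-based moderation and counter-speech agents.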
    def comprehensive_analysis(self, text):
        """Run all agents with prompt-based analysis"""
        start_time = datetime.now()

        # Run core agents
        detection_result = self.detect_hate_speech(text)
        sentiment_result = self.analyze_sentiment(text)

        # Run prompt-based agents
        moderation_result = self.moderate_content_with_prompts(text, detection_result, sentiment_result)
        counter_speech = self.generate_counter_speech_with_prompts(text, detection_result, sentiment_result)

        processing_time = (datetime.now() - start_time).total_seconds()

        return {
            "detection": detection_result,
            "sentiment": sentiment_result,
            "moderation": moderation_result,
            "counter_speech": counter_speech,
            "processing_time": processing_time,
            "timestamp": datetime.now().isoformat()
        }

# Initialize the system
logger.info("🚀 Starting Prompt-Based Multi-Agent System...")
agent_system = PromptBasedMultiAgentSystem()
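
# Gradio callback: the returned values must line up, in order, with the output
# components wired to analyze_btn near the bottom of the file
# (detection_output, counter_speech_output, agent_summary, all_agents_output).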
def analyze_text_with_prompts(text):
    """Main analysis function using prompt-based agents"""
    if not text or not text.strip():
        return (
            "❌ Please enter some text to analyze.",
            "No analysis performed.",
            "No input provided",
            {}
        )

    # Run comprehensive analysis with prompts
    results = agent_system.comprehensive_analysis(text)

    # Extract results for display
    detection_status = results["detection"]["status"]
    detection_scores = results["detection"]["all_scores"]
    counter_speech = results["counter_speech"]

    # Create detailed agent summary
    agent_summary = f"""
🔍 **Detection Agent**: {results['detection']['risk_level']} risk ({results['detection']['confidence']:.2%} confidence)
📊 **Sentiment Agent**: {results['sentiment']['sentiment'].title()} ({results['sentiment']['confidence']:.2%} confidence)
🛡️ **Moderation Agent**: {results['moderation'].get('safety_level', 'unknown').title()} safety level ({results['moderation'].get('confidence', 0):.2%} confidence)
💬 **Counter-Speech Agent**: {"Custom prompt-based" if "AI-Generated" in counter_speech else "Template-based"} response
⏱️ **Processing Time**: {results['processing_time']:.3f} seconds

📋 **Moderation Analysis**: {results['moderation'].get('analysis', 'No detailed analysis available')[:200]}...
"""

    # Compile comprehensive agent data
    all_agent_data = {
        "Detection_Analysis": {
            "scores": detection_scores,
            "risk_level": results['detection']['risk_level'],
            "is_hate_speech": results['detection']['is_hate_speech']
        },
        "Sentiment_Analysis": {
            "primary_sentiment": results['sentiment']['sentiment'],
            "all_sentiments": results["sentiment"].get("all_sentiments", {})
        },
        "Moderation_Analysis": {
            "safety_assessment": results['moderation'].get('safety_level', 'unknown'),
            "detailed_analysis": results['moderation'].get('analysis', ''),
            "confidence": results['moderation'].get('confidence', 0),
            "prompt_used": results['moderation'].get('prompt_used', 'fallback')
        },
        "Counter_Speech": {
            "response": counter_speech,
            "generation_method": "AI-Generated" if "AI-Generated" in counter_speech else "Template-based"
        },
        "System_Info": {
            "timestamp": results["timestamp"],
            "processing_time_seconds": results["processing_time"],
            "prompt_files_loaded": {
                "counter_speech": bool(agent_system.counter_speech_prompts),
                "moderation": bool(agent_system.moderation_prompts)
            }
        }
    }

    # Detection scores are already included in all_agent_data, so only four values are returned.
    return detection_status, counter_speech, agent_summary, all_agent_data

def reload_prompts():
    """Reload prompt files for testing"""
    try:
        agent_system.counter_speech_prompts = agent_system.load_prompts("counter_speech_prompts.json")
        agent_system.moderation_prompts = agent_system.load_prompts("moderation_prompts.json")
        return "✅ Prompts reloaded successfully!"
    except Exception as e:
        return f"❌ Error reloading prompts: {e}"

def get_prompt_info():
    """Get information about loaded prompts"""
    counter_prompts = len(agent_system.counter_speech_prompts.get("counter_speech_prompts", {}))
    moderation_prompts = len(agent_system.moderation_prompts.get("moderation_prompts", {}))
    return {
        "counter_speech_prompt_categories": counter_prompts,
        "moderation_prompt_categories": moderation_prompts,
        "prompt_files_status": {
            "counter_speech_prompts.json": "✅ Loaded" if counter_prompts > 0 else "❌ Not found",
            "moderation_prompts.json": "✅ Loaded" if moderation_prompts > 0 else "❌ Not found"
        }
    }

# Create the Gradio interface
with gr.Blocks(
    title="Prompt-Based Multi-Agent Hate Speech Detection System",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1400px !important;
    }
    .prompt-info {
        background: linear-gradient(90deg, #f0f9ff 0%, #e0f2fe 100%);
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #0284c7;
    }
    .agent-summary {
        background: linear-gradient(90deg, #fefce8 0%, #fef3c7 100%);
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #f59e0b;
    }
    """
) as demo:
    gr.Markdown("""
    # 🤖 Prompt-Based Multi-Agent Hate Speech Detection System

    **Advanced AI Agent Collaboration with Custom Prompts**

    🔍 **Detection Agent** - Your fine-tuned DistilBERT model
    💬 **Counter-Speech Agent** - FLAN-T5 with custom prompt engineering
    🛡️ **Moderation Agent** - Structured analysis using specialized prompts
    📊 **Sentiment Agent** - Twitter-RoBERTa for emotional context

    *Each agent uses carefully crafted prompts from external JSON files for optimal performance.*
    """)

    with gr.Tab("🤖 Prompt-Based Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                text_input = gr.Textbox(
                    label="Enter text for comprehensive prompt-based analysis",
                    placeholder="Enter text here to see how prompt-engineered AI agents collaborate...",
                    lines=5,
                    max_lines=15
                )
                with gr.Row():
                    analyze_btn = gr.Button("🚀 Run Prompt-Based Analysis", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
                    reload_btn = gr.Button("🔄 Reload Prompts", variant="secondary")

                gr.Examples(
                    examples=[
                        ["This is a wonderful day to collaborate and learn from each other!"],
                        ["I appreciate everyone's different perspectives and backgrounds."],
                        ["Let's work together to build a more inclusive community."],
                        ["Thank you for sharing your experience. I'd love to understand your viewpoint better."],
                        ["The diversity in our group makes our discussions much richer and more meaningful."],
                        ["I respectfully disagree, but I value your right to express your opinion."]
                    ],
                    inputs=text_input,
                    label="📝 Try these examples with prompt-based agents:"
                )

        with gr.Row():
            with gr.Column():
                detection_output = gr.Textbox(
                    label="🎯 Primary Detection Result",
                    interactive=False,
                    lines=3
                )
                agent_summary = gr.Textbox(
                    label="🤖 Prompt-Based Agent Summary",
                    interactive=False,
                    lines=8,
                    elem_classes=["agent-summary"]
                )
            with gr.Column():
                counter_speech_output = gr.Textbox(
                    label="💬 Prompt-Generated Counter-Speech",
                    interactive=False,
                    lines=6
                )
                reload_status = gr.Textbox(
                    label="🔄 Prompt Reload Status",
                    interactive=False,
                    lines=2
                )

        with gr.Row():
            all_agents_output = gr.JSON(
                label="📊 Complete Prompt-Based Multi-Agent Analysis",
                visible=True
            )

    with gr.Tab("📝 Prompt Management"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                ## 📝 Counter-Speech Prompts

                The system uses specialized prompts for different risk levels:

                ### 🚨 High Risk Prompts
                - **Purpose**: Address clear hate speech with educational responses
                - **Tone**: Firm but respectful, educational focus
                - **Length**: 50-100 words
                - **Goal**: De-escalation and education

                ### ⚠️ Medium Risk Prompts
                - **Purpose**: Handle potentially problematic content
                - **Tone**: Gentle guidance, supportive
                - **Length**: 40-80 words
                - **Goal**: Reflection and improvement

                ### ⚡ Low Risk Prompts
                - **Purpose**: Encourage even better communication
                - **Tone**: Positive reinforcement
                - **Length**: 30-60 words
                - **Goal**: Enhancement and encouragement
                """)
            with gr.Column():
                gr.Markdown("""
                ## 🛡️ Moderation Prompts

                Structured analysis prompts for comprehensive assessment:

                ### 🔍 Comprehensive Analysis
                - **Safety Assessment**: SAFE/CONCERNING/HARMFUL
                - **Violation Categories**: Specific policy areas
                - **Severity Levels**: LOW/MEDIUM/HIGH
                - **Confidence Scoring**: 0-100% certainty
                - **Contextual Factors**: Cultural and situational

                ### 📊 Specialized Analysis Types
                - **Hate Speech Focus**: Protected group targeting
                - **Toxicity Assessment**: Discourse quality impact
                - **Context Analysis**: Cultural and situational factors
                - **Action Recommendations**: Specific moderation steps
                """)

        with gr.Row():
            prompt_info_output = gr.JSON(
                label="📋 Current Prompt Configuration",
                value=get_prompt_info()
            )

        gr.Markdown("""
        ## 📁 Prompt File Structure

        To customize the system behavior, create these JSON files:

        ### `counter_speech_prompts.json`
        ```json
        {
            "counter_speech_prompts": {
                "high_risk": {
                    "system_prompt": "You are an expert educator...",
                    "user_prompt_template": "Generate response for: {original_text}..."
                }
            }
        }
        ```

        ### `moderation_prompts.json`
        ```json
        {
            "moderation_prompts": {
                "comprehensive_analysis": {
                    "system_prompt": "You are a content moderation expert...",
                    "user_prompt_template": "Analyze: {text}..."
                }
            }
        }
        ```

        **Benefits of External Prompts:**
        - 🎯 **Fine-tuned control** over agent behavior
        - 🔄 **Easy iteration** without code changes
        - 📊 **A/B testing** of different prompt strategies
        - 🎨 **Domain-specific customization** for different platforms
        - 📈 **Performance optimization** through prompt engineering
        """)

    with gr.Tab("🔧 System Architecture"):
        gr.Markdown("""
        ## 🏗️ Prompt-Based Agent Architecture

        ### 🔄 Agent Collaboration Flow
        ```
        Input Text
        ├── Detection Agent → Risk Classification (DistilBERT)
        ├── Sentiment Agent → Emotional Context (RoBERTa)
        ├── Moderation Agent → Structured Analysis (FLAN-T5 + Prompts)
        └── Counter-Speech Agent → Educational Response (FLAN-T5 + Prompts)
                ↑
                Uses custom prompts and outputs from all other agents
        ```

        ### 📝 Prompt Engineering Advantages

        #### 🎯 **Precision Control**
        - **Task-specific instructions** for each scenario
        - **Tone and style guidelines** for appropriate responses
        - **Length and format specifications** for consistency
        - **Context integration** from multiple agent outputs

        #### 🔄 **Iterative Improvement**
        - **Hot-swappable prompts** without system restart
        - **A/B testing capabilities** for prompt effectiveness
        - **Performance metrics** tracking for optimization
        - **Domain adaptation** for different use cases

        #### 🛡️ **Quality Assurance**
        - **Bias mitigation** through careful prompt design
        - **Safety guardrails** built into prompt structure
        - **Consistency enforcement** across all responses
        - **Cultural sensitivity** considerations

        ### 🚀 Production Benefits
        - **🎨 Customizable**: Adapt to different platforms and communities
        - **📈 Scalable**: Easy to add new prompt categories
        - **🔧 Maintainable**: Update behavior without code deployment
        - **📊 Measurable**: Track prompt performance and effectiveness
        - **🌍 Localizable**: Different prompts for different regions/cultures

        ### ⚠️ Deployment Considerations

        #### 🔒 Security
        - **Prompt injection protection** for user inputs
        - **Content filtering** on generated responses
        - **Rate limiting** to prevent abuse
        - **Audit logging** for compliance

        #### 📊 Monitoring
        - **Response quality metrics** tracking
        - **User feedback integration** for continuous improvement
        - **Error rate monitoring** across different prompt types
        - **Performance benchmarking** against baseline models

        #### 👥 Human Oversight
        - **Expert review processes** for prompt updates
        - **Community feedback loops** for prompt effectiveness
        - **Escalation pathways** for edge cases
        - **Regular bias audits** and prompt refinement
        """)

    # Event handlers: analyze_text_with_prompts returns four values, mapped in
    # order to the four output components below.
    analyze_btn.click(
        fn=analyze_text_with_prompts,
        inputs=text_input,
        outputs=[detection_output, counter_speech_output, agent_summary, all_agents_output]
    )
    clear_btn.click(
        fn=lambda: ("", "", "", "", {}),
        outputs=[text_input, detection_output, counter_speech_output, agent_summary, all_agents_output]
    )
    reload_btn.click(
        fn=reload_prompts,
        outputs=reload_status
    )

# Launch configuration
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_api=False,
        share=False
    )