# Scraped page header (not part of the module): Spaces status "Sleeping"; file size 10,153 bytes; revision b2d89cf. The viewer's line-number gutter (1-317) was extraction residue and has been dropped.
"""
Shared utilities for the ML suite.
This module provides shared functions used across the ML suite components:
- Email text analysis using heuristics (adapted from app.py)
- Text cleaning and normalization
- Timestamp and logging utilities
- HTML processing for email content extraction
These utilities ensure consistent processing across different components of the ML suite.
"""
import re
import os
import urllib.parse
import datetime
import html
from typing import Dict, List, Tuple, Optional, Any, Union
from bs4 import BeautifulSoup
# --- Email Heuristic Analysis ---
# Keywords that suggest an email is marketing/promotional/unsubscribable
UNSUBSCRIBE_KEYWORDS_FOR_AI_HEURISTICS = [
'unsubscribe', 'opt-out', 'opt out', 'stop receiving', 'manage preferences',
'email preferences', 'subscription', 'marketing', 'newsletter', 'promotional',
'offer', 'sale', 'discount', 'deal', 'coupon', 'promo code', 'promotion',
'limited time', 'subscribe', 'update preferences', 'mailing list',
'no longer wish to receive', 'manage subscriptions', 'manage your subscriptions'
]
# Keywords that suggest promotional content
PROMO_KEYWORDS_FOR_AI_HEURISTICS = [
'limited time', 'exclusive', 'offer', 'sale', 'discount', 'deal', 'coupon',
'promo code', 'promotion', 'savings', 'special offer', 'limited offer',
'buy now', 'shop now', 'order now', 'click here', 'purchase', 'buy',
'free shipping', 'free trial', 'new arrival', 'new product', 'flash sale'
]
# Common formatting patterns in promotional emails
FORMATTING_PATTERNS_FOR_AI_HEURISTICS = [
r'\*+\s*[A-Z]+\s*\*+', # ***TEXT***
r'\*\*[^*]+\*\*', # **TEXT**
r'!{2,}', # Multiple exclamation marks
r'\$\d+(\.\d{2})?(\s+off|\s+discount|%\s+off)', # Price patterns
r'\d+%\s+off', # Percentage discounts
r'SAVE\s+\d+%', # SAVE XX%
r'SAVE\s+\$\d+', # SAVE $XX
r'HURRY', # Urgency words
r'LIMITED TIME',
r'LAST CHANCE',
r'ENDING SOON'
]
def analyze_email_heuristics_for_ai(subject_text: str, snippet_text: str, list_unsubscribe_header: Optional[str] = None) -> Dict[str, bool]:
"""
Analyze email subject and body (snippet) text to determine if it's likely promotional/unsubscribable.
This function is adapted from the original heuristic analysis in app.py but modified
to be self-contained and not rely on Flask's app context. It examines the subject
and body for patterns common in promotional emails and subscription-based content.
Args:
subject_text: The subject line of the email
snippet_text: A snippet of the email body text
list_unsubscribe_header: Optional List-Unsubscribe header value
Returns:
Dict of boolean flags indicating different heuristic results:
{
'has_unsubscribe_text': bool, # Contains unsubscribe keywords
'has_promotional_keywords': bool, # Contains promotional keywords
'has_promotional_formatting': bool, # Contains typical promotional formatting
'has_list_unsubscribe_header': bool, # Has List-Unsubscribe header
'likely_unsubscribable': bool # Overall assessment
}
"""
# Ensure inputs are strings
subject_text = str(subject_text).lower() if subject_text else ""
snippet_text = str(snippet_text).lower() if snippet_text else ""
combined_text = f"{subject_text} {snippet_text}".lower()
# Initialize result with default values
result = {
'has_unsubscribe_text': False,
'has_promotional_keywords': False,
'has_promotional_formatting': False,
'has_list_unsubscribe_header': False,
'likely_unsubscribable': False
}
# Check for unsubscribe keywords
for keyword in UNSUBSCRIBE_KEYWORDS_FOR_AI_HEURISTICS:
if keyword.lower() in combined_text:
result['has_unsubscribe_text'] = True
break
# Check for promotional keywords
for keyword in PROMO_KEYWORDS_FOR_AI_HEURISTICS:
if keyword.lower() in combined_text:
result['has_promotional_keywords'] = True
break
# Check for promotional formatting patterns
combined_text_original_case = f"{subject_text} {snippet_text}" if subject_text and snippet_text else ""
for pattern in FORMATTING_PATTERNS_FOR_AI_HEURISTICS:
if re.search(pattern, combined_text_original_case, re.IGNORECASE):
result['has_promotional_formatting'] = True
break
# Check for List-Unsubscribe header
if list_unsubscribe_header:
result['has_list_unsubscribe_header'] = True
# Overall assessment: likely unsubscribable if any of the criteria are met
# For training data preparation, we want to be somewhat inclusive in what we label as potentially unsubscribable
result['likely_unsubscribable'] = any([
result['has_unsubscribe_text'],
(result['has_promotional_keywords'] and result['has_promotional_formatting']),
result['has_list_unsubscribe_header']
])
return result
# --- Text Cleaning Utilities ---
def clean_html_text(html_content: str) -> str:
"""
Clean HTML content and extract readable text.
Args:
html_content: Raw HTML content string
Returns:
Cleaned plain text extracted from HTML
"""
if not html_content:
return ""
try:
# Create BeautifulSoup object
soup = BeautifulSoup(html_content, 'html.parser')
# Remove script and style elements
for script_or_style in soup(['script', 'style', 'head', 'title', 'meta', '[document]']):
script_or_style.decompose()
# Get text content
text = soup.get_text()
# Clean up text: replace multiple newlines, spaces, etc.
text = re.sub(r'\n+', '\n', text)
text = re.sub(r' +', ' ', text)
text = text.strip()
return text
except Exception:
# If parsing fails, try to extract text with regex (fallback)
text = re.sub(r'<[^>]*>', ' ', html_content)
text = html.unescape(text)
text = re.sub(r'\s+', ' ', text)
return text.strip()
def normalize_spaces(text: str) -> str:
"""
Normalize whitespace in text.
Args:
text: Input text
Returns:
Text with normalized whitespace
"""
if not text:
return ""
# Replace newlines, tabs with spaces
text = re.sub(r'[\n\r\t]+', ' ', text)
# Collapse multiple spaces into one
text = re.sub(r' +', ' ', text)
return text.strip()
def normalize_urls(text: str) -> str:
"""
Replace URLs with a placeholder to reduce noise in training data.
Args:
text: Input text
Returns:
Text with URLs replaced by a placeholder
"""
if not text:
return ""
# URL regex pattern
url_pattern = r'(https?://[^\s]+)|(www\.[^\s]+\.[^\s]+)'
# Replace URLs with placeholder
return re.sub(url_pattern, '[URL]', text)
def clean_text_for_model(text: str, max_length: Optional[int] = None) -> str:
"""
Clean and normalize text for model input.
Args:
text: Input text (can be HTML or plain text)
max_length: Optional maximum length to truncate to
Returns:
Cleaned text ready for model input
"""
if not text:
return ""
# Check if input is likely HTML
if re.search(r'<\w+[^>]*>.*?</\w+>', text, re.DOTALL):
text = clean_html_text(text)
# Normalize whitespace
text = normalize_spaces(text)
# Replace URLs with placeholder
text = normalize_urls(text)
# Truncate if needed
if max_length and len(text) > max_length:
text = text[:max_length]
return text
# --- Timestamp and Path Utilities ---
def get_current_timestamp() -> str:
"""Returns ISO format timestamp for current time."""
return datetime.datetime.now().isoformat()
def get_current_timestamp_log_prefix() -> str:
"""Returns a formatted timestamp string for log entries."""
return f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]"
def ensure_directory_exists(directory_path: str) -> bool:
"""
Ensure that a directory exists, creating it if necessary.
Args:
directory_path: Path to the directory
Returns:
True if directory exists or was created, False on error
"""
try:
os.makedirs(directory_path, exist_ok=True)
return True
except Exception:
return False
# --- Email Header Analysis ---
def extract_email_addresses(header_value: str) -> List[str]:
"""
Extract email addresses from a header value.
Args:
header_value: Raw header value containing email addresses
Returns:
List of extracted email addresses
"""
if not header_value:
return []
# Basic email regex pattern
email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+'
return re.findall(email_pattern, header_value)
def parse_list_unsubscribe_header(header_value: str) -> Dict[str, Any]:
"""
Parse the List-Unsubscribe header to extract URLs and email addresses.
Args:
header_value: Raw List-Unsubscribe header value
Returns:
Dict with extracted URLs and email addresses
"""
if not header_value:
return {"urls": [], "emails": []}
result = {"urls": [], "emails": []}
# Split by comma and process each value
for item in header_value.split(','):
item = item.strip()
# Handle <mailto:...> format
if item.startswith('<mailto:') and item.endswith('>'):
email = item[8:-1] # Remove <mailto: and >
result["emails"].append(email)
# Handle <http...> format
elif item.startswith('<http') and item.endswith('>'):
url = item[1:-1] # Remove < and >
result["urls"].append(url)
return result |