"""
Shared utilities for the ML suite.

This module provides shared functions used across the ML suite components:
- Email text analysis using heuristics (adapted from app.py)
- Text cleaning and normalization
- Timestamp and logging utilities
- HTML processing for email content extraction

These utilities ensure consistent processing across different components of the ML suite.
"""

import re
import os
import urllib.parse
import datetime
import html
from typing import Dict, List, Tuple, Optional, Any, Union
from bs4 import BeautifulSoup


# --- Email Heuristic Analysis ---

# Keywords that suggest an email is marketing/promotional/unsubscribable
UNSUBSCRIBE_KEYWORDS_FOR_AI_HEURISTICS = [
    'unsubscribe', 'opt-out', 'opt out', 'stop receiving', 'manage preferences', 
    'email preferences', 'subscription', 'marketing', 'newsletter', 'promotional',
    'offer', 'sale', 'discount', 'deal', 'coupon', 'promo code', 'promotion',
    'limited time', 'subscribe', 'update preferences', 'mailing list',
    'no longer wish to receive', 'manage subscriptions', 'manage your subscriptions'
]

# Keywords that suggest promotional content
PROMO_KEYWORDS_FOR_AI_HEURISTICS = [
    'limited time', 'exclusive', 'offer', 'sale', 'discount', 'deal', 'coupon',
    'promo code', 'promotion', 'savings', 'special offer', 'limited offer',
    'buy now', 'shop now', 'order now', 'click here', 'purchase', 'buy',
    'free shipping', 'free trial', 'new arrival', 'new product', 'flash sale'
]

# Common formatting patterns in promotional emails
FORMATTING_PATTERNS_FOR_AI_HEURISTICS = [
    r'\*+\s*[A-Z]+\s*\*+',  # ***TEXT***
    r'\*\*[^*]+\*\*',       # **TEXT**
    r'!{2,}',               # Multiple exclamation marks
    r'\$\d+(\.\d{2})?(\s+off|\s+discount|%\s+off)',  # Price patterns
    r'\d+%\s+off',          # Percentage discounts
    r'SAVE\s+\d+%',         # SAVE XX%
    r'SAVE\s+\$\d+',        # SAVE $XX
    r'HURRY',               # Urgency words
    r'LIMITED TIME',
    r'LAST CHANCE',
    r'ENDING SOON'
]
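
# Taken together these patterns are meant to catch strings such as "***SALE***",
# "SAVE 20%", "$10 off", or "HURRY!!" (illustrative examples only; they are matched
# case-insensitively in analyze_email_heuristics_for_ai below).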


def analyze_email_heuristics_for_ai(subject_text: str, snippet_text: str, list_unsubscribe_header: Optional[str] = None) -> Dict[str, bool]:
    """
    Analyze email subject and body (snippet) text to determine if it's likely promotional/unsubscribable.
    
    This function is adapted from the original heuristic analysis in app.py but modified
    to be self-contained and not rely on Flask's app context. It examines the subject
    and body for patterns common in promotional emails and subscription-based content.
    
    Args:
        subject_text: The subject line of the email
        snippet_text: A snippet of the email body text
        list_unsubscribe_header: Optional List-Unsubscribe header value
        
    Returns:
        Dict of boolean flags indicating different heuristic results:
        {
            'has_unsubscribe_text': bool,  # Contains unsubscribe keywords
            'has_promotional_keywords': bool,  # Contains promotional keywords
            'has_promotional_formatting': bool,  # Contains typical promotional formatting
            'has_list_unsubscribe_header': bool,  # Has List-Unsubscribe header
            'likely_unsubscribable': bool  # Overall assessment
        }
    """
    # Coerce inputs to strings and lowercase them for keyword matching
    subject_text = str(subject_text).lower() if subject_text else ""
    snippet_text = str(snippet_text).lower() if snippet_text else ""
    combined_text = f"{subject_text} {snippet_text}"
    
    # Initialize result with default values
    result = {
        'has_unsubscribe_text': False,
        'has_promotional_keywords': False,
        'has_promotional_formatting': False,
        'has_list_unsubscribe_header': False,
        'likely_unsubscribable': False
    }
    
    # Check for unsubscribe keywords
    for keyword in UNSUBSCRIBE_KEYWORDS_FOR_AI_HEURISTICS:
        if keyword.lower() in combined_text:
            result['has_unsubscribe_text'] = True
            break
    
    # Check for promotional keywords
    for keyword in PROMO_KEYWORDS_FOR_AI_HEURISTICS:
        if keyword.lower() in combined_text:
            result['has_promotional_keywords'] = True
            break
    
    # Check for promotional formatting patterns (matched case-insensitively against
    # the combined text; this also runs when only one of subject/snippet is present)
    for pattern in FORMATTING_PATTERNS_FOR_AI_HEURISTICS:
        if re.search(pattern, combined_text, re.IGNORECASE):
            result['has_promotional_formatting'] = True
            break
    
    # Check for List-Unsubscribe header
    if list_unsubscribe_header:
        result['has_list_unsubscribe_header'] = True
    
    # Overall assessment: likely unsubscribable if any of the criteria are met
    # For training data preparation, we want to be somewhat inclusive in what we label as potentially unsubscribable
    result['likely_unsubscribable'] = any([
        result['has_unsubscribe_text'],
        (result['has_promotional_keywords'] and result['has_promotional_formatting']),
        result['has_list_unsubscribe_header']
    ])
    
    return result
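

# Illustrative usage sketch (the inputs below are made-up examples, not values from
# this project): promotional keywords alone are not enough to flag a message; they
# must co-occur with promotional formatting, whereas unsubscribe wording or a
# List-Unsubscribe header is sufficient on its own.
#
#     flags = analyze_email_heuristics_for_ai(
#         subject_text="LAST CHANCE: 20% off everything!!",
#         snippet_text="Shop now, or unsubscribe to stop receiving these emails.",
#         list_unsubscribe_header="<https://example.com/unsubscribe>",
#     )
#     # flags['likely_unsubscribable'] -> True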


# --- Text Cleaning Utilities ---

def clean_html_text(html_content: str) -> str:
    """
    Clean HTML content and extract readable text.
    
    Args:
        html_content: Raw HTML content string
        
    Returns:
        Cleaned plain text extracted from HTML
    """
    if not html_content:
        return ""
    
    try:
        # Create BeautifulSoup object
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Remove script and style elements
        for script_or_style in soup(['script', 'style', 'head', 'title', 'meta', '[document]']):
            script_or_style.decompose()
        
        # Get text content
        text = soup.get_text()
        
        # Clean up text: replace multiple newlines, spaces, etc.
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r' +', ' ', text)
        text = text.strip()
        
        return text
    except Exception:
        # If parsing fails, try to extract text with regex (fallback)
        text = re.sub(r'<[^>]*>', ' ', html_content)
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()


def normalize_spaces(text: str) -> str:
    """
    Normalize whitespace in text.
    
    Args:
        text: Input text
        
    Returns:
        Text with normalized whitespace
    """
    if not text:
        return ""
    
    # Replace newlines, tabs with spaces
    text = re.sub(r'[\n\r\t]+', ' ', text)
    # Collapse multiple spaces into one
    text = re.sub(r' +', ' ', text)
    return text.strip()


def normalize_urls(text: str) -> str:
    """
    Replace URLs with a placeholder to reduce noise in training data.
    
    Args:
        text: Input text
        
    Returns:
        Text with URLs replaced by a placeholder
    """
    if not text:
        return ""
    
    # URL regex pattern
    url_pattern = r'(https?://[^\s]+)|(www\.[^\s]+\.[^\s]+)'
    
    # Replace URLs with placeholder
    return re.sub(url_pattern, '[URL]', text)


def clean_text_for_model(text: str, max_length: Optional[int] = None) -> str:
    """
    Clean and normalize text for model input.
    
    Args:
        text: Input text (can be HTML or plain text)
        max_length: Optional maximum length to truncate to
        
    Returns:
        Cleaned text ready for model input
    """
    if not text:
        return ""
    
    # Check if input is likely HTML
    if re.search(r'<\w+[^>]*>.*?</\w+>', text, re.DOTALL):
        text = clean_html_text(text)
    
    # Normalize whitespace
    text = normalize_spaces(text)
    
    # Replace URLs with placeholder
    text = normalize_urls(text)
    
    # Truncate if needed
    if max_length and len(text) > max_length:
        text = text[:max_length]
    
    return text
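

# Illustrative pipeline (example input is a fabricated snippet): HTML is detected
# heuristically, tags are stripped via clean_html_text, whitespace is collapsed,
# and URLs are replaced by the '[URL]' placeholder before optional truncation.
#
#     raw = '<p>Big <b>SALE</b> today! Visit https://shop.example.com now.</p>'
#     clean_text_for_model(raw, max_length=512)
#     # -> 'Big SALE today! Visit [URL] now.' (approximately; exact whitespace
#     #    depends on the HTML parser)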


# --- Timestamp and Path Utilities ---

def get_current_timestamp() -> str:
    """Returns ISO format timestamp for current time."""
    return datetime.datetime.now().isoformat()


def get_current_timestamp_log_prefix() -> str:
    """Returns a formatted timestamp string for log entries."""
    return f"[{datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]"


def ensure_directory_exists(directory_path: str) -> bool:
    """
    Ensure that a directory exists, creating it if necessary.
    
    Args:
        directory_path: Path to the directory
        
    Returns:
        True if directory exists or was created, False on error
    """
    try:
        os.makedirs(directory_path, exist_ok=True)
        return True
    except Exception:
        return False


# --- Email Header Analysis ---

def extract_email_addresses(header_value: str) -> List[str]:
    """
    Extract email addresses from a header value.
    
    Args:
        header_value: Raw header value containing email addresses
        
    Returns:
        List of extracted email addresses
    """
    if not header_value:
        return []
    
    # Basic email regex pattern
    email_pattern = r'[\w.+-]+@[\w-]+\.[\w.-]+'
    return re.findall(email_pattern, header_value)


def parse_list_unsubscribe_header(header_value: str) -> Dict[str, Any]:
    """
    Parse the List-Unsubscribe header to extract URLs and email addresses.
    
    Args:
        header_value: Raw List-Unsubscribe header value
        
    Returns:
        Dict with extracted URLs and email addresses
    """
    if not header_value:
        return {"urls": [], "emails": []}
    
    result = {"urls": [], "emails": []}
    
    # Split by comma and process each value
    for item in header_value.split(','):
        item = item.strip()
        
        # Handle <mailto:...> format
        if item.startswith('<mailto:') and item.endswith('>'):
            email = item[8:-1]  # Remove <mailto: and >
            result["emails"].append(email)
        
        # Handle <http...> format
        elif item.startswith('<http') and item.endswith('>'):
            url = item[1:-1]  # Remove < and >
            result["urls"].append(url)
    
    return result
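

if __name__ == "__main__":
    # Minimal smoke-test sketch; the header and email text below are fabricated
    # examples in the standard List-Unsubscribe format, not real data.
    sample_header = "<mailto:unsubscribe@example.com>, <https://example.com/unsub?id=123>"

    print("Parsed List-Unsubscribe:", parse_list_unsubscribe_header(sample_header))
    # Expected shape: {'urls': ['https://example.com/unsub?id=123'],
    #                  'emails': ['unsubscribe@example.com']}
    print("Extracted addresses:", extract_email_addresses(sample_header))

    flags = analyze_email_heuristics_for_ai(
        subject_text="LAST CHANCE: 20% off everything!!",
        snippet_text="Shop now. Click unsubscribe to stop receiving these emails.",
        list_unsubscribe_header=sample_header,
    )
    print("Heuristic flags:", flags)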