# unsubscriber/ml_suite/config.py
"""
Centralized configuration for Gmail Unsubscriber AI Suite.
This module defines all configuration parameters for the ML components, including:
- Directory paths for models, datasets, and task status
- Hugging Face cache configuration
- Model specifications
- Data preparation parameters
- Training hyperparameters
- User data collection and personalization parameters
All directories are automatically created when this module is imported.
"""
import os
# --- Base Path Configuration ---
ML_SUITE_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(ML_SUITE_DIR)
# --- Cache and Model Storage ---
MODELS_DIR = os.path.join(ML_SUITE_DIR, "models")
BASE_TRANSFORMER_CACHE_DIR = os.path.join(MODELS_DIR, "base_transformer_cache")
# FINE_TUNED_MODEL_DIR = os.path.join(MODELS_DIR, "fine_tuned_unsubscriber") # Old model
FINE_TUNED_MODEL_DIR = os.path.join(PROJECT_ROOT, "final_optimized_model") # New trained model
# Point Hugging Face at a project-local cache. These variables must be set before
# `transformers` is imported anywhere in the process, which is why they live in this module.
os.environ['HF_HOME'] = BASE_TRANSFORMER_CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = BASE_TRANSFORMER_CACHE_DIR  # deprecated alias of HF_HOME in newer transformers releases; kept for compatibility
os.environ['HF_DATASETS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'datasets')
os.environ['HF_METRICS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'metrics')
# --- Dataset Storage ---
DATASETS_DIR = os.path.join(ML_SUITE_DIR, "datasets")
RAW_DATASETS_DIR = os.path.join(DATASETS_DIR, "raw")
EXTRACTED_DATASETS_DIR = os.path.join(DATASETS_DIR, "extracted")
PROCESSED_DATASETS_DIR = os.path.join(DATASETS_DIR, "processed")
PREPARED_DATA_FILE = os.path.join(PROCESSED_DATASETS_DIR, "unsubscriber_training_data.csv")
DATA_COLUMNS_SCHEMA = ['text', 'label'] # Schema for the training CSV
# --- Task Status Storage ---
TASK_STATUS_DIR = os.path.join(ML_SUITE_DIR, "task_status")
DATA_PREP_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "data_preparation_status.json")
MODEL_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "model_training_status.json")
PERSONALIZED_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "personalized_training_status.json")
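
# Illustrative sketch (an assumption, not the app's actual writer): background tasks
# are expected to persist progress to the JSON status files above; a minimal atomic
# update looks like this, so a crash mid-write never leaves a half-written file.
def _write_task_status(status_file, status):
    """Atomically write a task-status dict to `status_file` via a temp file."""
    import json
    import tempfile

    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(status_file), suffix=".tmp")
    with os.fdopen(fd, "w") as f:
        json.dump(status, f, indent=2)
    os.replace(tmp_path, status_file)  # atomic rename over the old status file
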
# --- User Data Collection and Personalization ---
USER_DATA_DIR = os.path.join(ML_SUITE_DIR, "user_data")
USER_FEEDBACK_DIR = os.path.join(USER_DATA_DIR, "feedback")
USER_MODELS_DIR = os.path.join(USER_DATA_DIR, "models")
USER_DATASETS_DIR = os.path.join(USER_DATA_DIR, "datasets")
# User feedback collection configuration
USER_FEEDBACK_FILE = os.path.join(USER_FEEDBACK_DIR, "user_feedback.csv")
FEEDBACK_COLUMNS_SCHEMA = ['email_id', 'text', 'predicted_label', 'predicted_confidence', 'user_feedback', 'timestamp', 'session_id']
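
# Illustrative sketch (assumption; the real feedback collector may differ): append one
# row matching FEEDBACK_COLUMNS_SCHEMA to the feedback CSV, writing the header once.
def _append_feedback_row(row):
    """Append a single feedback record (a dict keyed by the schema) to the CSV."""
    import csv

    write_header = not os.path.exists(USER_FEEDBACK_FILE)
    with open(USER_FEEDBACK_FILE, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FEEDBACK_COLUMNS_SCHEMA)
        if write_header:
            writer.writeheader()
        writer.writerow(row)
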
# Personalized model configuration
PERSONALIZED_MODEL_DIR_TEMPLATE = os.path.join(USER_MODELS_DIR, "{user_id}")
PERSONALIZED_MODEL_FILE_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model.pt")
PERSONALIZED_MODEL_INFO_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model_info.json")
PERSONALIZED_DATASET_FILE_TEMPLATE = os.path.join(USER_DATASETS_DIR, "{user_id}_training_data.csv")
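
# Illustrative sketch (assumption): the templates above are plain `str.format` patterns
# keyed on user_id; resolving them for one user looks like this.
def _personalized_paths(user_id):
    """Resolve all per-user personalization paths for the given user_id."""
    return {
        "model_dir": PERSONALIZED_MODEL_DIR_TEMPLATE.format(user_id=user_id),
        "model_file": PERSONALIZED_MODEL_FILE_TEMPLATE.format(user_id=user_id),
        "model_info": PERSONALIZED_MODEL_INFO_TEMPLATE.format(user_id=user_id),
        "dataset": PERSONALIZED_DATASET_FILE_TEMPLATE.format(user_id=user_id),
    }
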
# Personalization hyperparameters
MIN_FEEDBACK_ENTRIES_FOR_PERSONALIZATION = 10 # Minimum number of user feedback entries required for personalization
PERSONALIZATION_WEIGHT = 0.7 # Weight given to user feedback vs. base model (higher = more personalized)
PERSONALIZATION_EPOCHS = 2 # Number of epochs for fine-tuning a personalized model
# --- Directory Creation (Updated with User Data directories) ---
for dir_path in [MODELS_DIR, BASE_TRANSFORMER_CACHE_DIR, FINE_TUNED_MODEL_DIR,
                 RAW_DATASETS_DIR, EXTRACTED_DATASETS_DIR, PROCESSED_DATASETS_DIR, TASK_STATUS_DIR,
                 USER_DATA_DIR, USER_FEEDBACK_DIR, USER_MODELS_DIR, USER_DATASETS_DIR]:
    os.makedirs(dir_path, exist_ok=True)
# --- Transformer Model Configuration ---
# Choice: DistilBERT offers a good balance of performance and resource efficiency.
# Other candidates: 'bert-base-uncased', 'roberta-base', 'google/electra-small-discriminator'.
# The choice impacts download size, training time, and inference speed.
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"
# --- Data Preparation Parameters ---
# Define sources for public email data. URLs and types guide the preparator.
PUBLIC_DATASETS_INFO = {
    "spamassassin_easy_ham_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2",
        "type": "important_leaning",  # Expected dominant class after heuristic application
        "extract_folder_name": "spamassassin_easy_ham_2003"
    },
    "spamassassin_spam_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2",
        "type": "unsubscribable_leaning",
        "extract_folder_name": "spamassassin_spam_2003"
    },
    # Consider adding more diverse datasets, e.g.:
    # - Enron (requires significant parsing and ethical review for a suitable subset)
    # - Public mailing-list archives (e.g., from the Apache Software Foundation, carefully selected for relevance)
}
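
# Illustrative sketch (an assumption; the actual preparator lives elsewhere in the
# suite): download and unpack one corpus described in PUBLIC_DATASETS_INFO.
def _fetch_public_dataset(name):
    """Download and extract a configured public corpus; return the extract path."""
    import tarfile
    import urllib.request

    info = PUBLIC_DATASETS_INFO[name]
    archive_path = os.path.join(RAW_DATASETS_DIR, os.path.basename(info["url"]))
    if not os.path.exists(archive_path):
        urllib.request.urlretrieve(info["url"], archive_path)
    extract_path = os.path.join(EXTRACTED_DATASETS_DIR, info["extract_folder_name"])
    with tarfile.open(archive_path, "r:bz2") as tar:
        tar.extractall(extract_path)
    return extract_path
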
MIN_TEXT_LENGTH_FOR_TRAINING = 60 # Emails shorter than this (after cleaning) are likely not useful.
MAX_SAMPLES_PER_RAW_DATASET = 7500 # Limits processing time for initial data prep. Can be increased.
EMAIL_SNIPPET_LENGTH_FOR_MODEL = 1024 # Max characters from email body to combine with subject for model input.
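
# Illustrative sketch (assumption): how the two length parameters above are meant to
# be applied during data prep: drop too-short emails, truncate long bodies.
def _prepare_model_text(subject, cleaned_body):
    """Return subject + truncated body snippet, or None if the email is too short."""
    if len(cleaned_body) < MIN_TEXT_LENGTH_FOR_TRAINING:
        return None
    return f"{subject} {cleaned_body[:EMAIL_SNIPPET_LENGTH_FOR_MODEL]}".strip()
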
# --- Training Hyperparameters & Configuration ---
NUM_LABELS = 2 # Binary classification: Unsubscribable vs. Important
LABEL_IMPORTANT_ID = 0
LABEL_UNSUBSCRIBABLE_ID = 1
ID_TO_LABEL_MAP = {LABEL_IMPORTANT_ID: "IMPORTANT", LABEL_UNSUBSCRIBABLE_ID: "UNSUBSCRIBABLE"}
LABEL_TO_ID_MAP = {"IMPORTANT": LABEL_IMPORTANT_ID, "UNSUBSCRIBABLE": LABEL_UNSUBSCRIBABLE_ID}
MAX_SEQ_LENGTH = 512 # Max token sequence length for Transformer. Impacts memory and context window.
TRAIN_BATCH_SIZE = 16 # Batch size for training. Reduced for GTX 1650 (4GB VRAM)
EVAL_BATCH_SIZE = 32 # Batch size for evaluation. Reduced for GTX 1650
NUM_TRAIN_EPOCHS = 8 # Number of full passes through the training data (increased for better learning).
LEARNING_RATE = 1e-5 # AdamW optimizer learning rate, slightly reduced for more stable training.
WEIGHT_DECAY = 0.02 # Regularization parameter.
WARMUP_STEPS_RATIO = 0.15 # Ratio of total training steps for learning rate warmup.
TEST_SPLIT_SIZE = 0.2 # Proportion of data for the evaluation set (increased for better validation).
# Hugging Face Trainer Arguments
EVALUATION_STRATEGY = "epoch" # Evaluate at the end of each epoch.
SAVE_STRATEGY = "epoch" # Save model checkpoint at the end of each epoch.
LOAD_BEST_MODEL_AT_END = True # Reload the best model (based on metric_for_best_model) at the end of training.
METRIC_FOR_BEST_MODEL = "f1_unsub" # Focus on F1 for the "unsubscribable" class.
FP16_TRAINING = True # Enable mixed-precision training if a CUDA GPU is available and supports it.
EARLY_STOPPING_PATIENCE = 3 # Stop training if metric_for_best_model doesn't improve for this many epochs.
EARLY_STOPPING_THRESHOLD = 0.001 # Minimum change to be considered an improvement.
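
# Illustrative sketch (an assumption; the suite's trainer presumably does this wiring):
# how the hyperparameters above map onto a Hugging Face `TrainingArguments`. Keyword
# names match transformers 4.x (`evaluation_strategy` is renamed `eval_strategy` in
# newer releases); the early-stopping values feed `transformers.EarlyStoppingCallback`.
def _build_training_args(output_dir):
    """Construct a TrainingArguments object from this module's hyperparameters."""
    from transformers import TrainingArguments

    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_STEPS_RATIO,
        evaluation_strategy=EVALUATION_STRATEGY,
        save_strategy=SAVE_STRATEGY,
        load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
        metric_for_best_model=METRIC_FOR_BEST_MODEL,
        fp16=FP16_TRAINING,
    )
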
# --- AI User Preferences (Defaults stored in JS, but can be defined here for reference) ---
DEFAULT_AI_ENABLED_ON_SCAN = True
DEFAULT_AI_CONFIDENCE_THRESHOLD = 0.5 # 50%: balanced default between precision and recall
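
# Illustrative sketch (assumption): the default threshold above gates AI suggestions;
# an email is only surfaced as unsubscribable when the model is confident enough.
def _label_for_confidence(unsub_probability):
    """Map the model's unsubscribable-class probability to an effective label."""
    if unsub_probability >= DEFAULT_AI_CONFIDENCE_THRESHOLD:
        return ID_TO_LABEL_MAP[LABEL_UNSUBSCRIBABLE_ID]
    return ID_TO_LABEL_MAP[LABEL_IMPORTANT_ID]
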
# --- API Endpoint Configuration for Backend Integration ---
API_ENDPOINTS = {
"submit_feedback": "/api/ai/feedback",
"get_feedback_stats": "/api/ai/feedback/stats",
"train_personalized": "/api/ai/train_personalized",
"reset_user_data": "/api/ai/user_data/reset",
"export_user_data": "/api/ai/user_data/export",
"import_user_data": "/api/ai/user_data/import"
}
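
# Illustrative sketch (an assumption: a Flask-like backend; the real app may register
# these paths differently). Bind named handlers to the endpoint map above.
def _register_ai_routes(app, handlers):
    """Attach each handler in `handlers` (keyed like API_ENDPOINTS) to its path."""
    for name, path in API_ENDPOINTS.items():
        app.add_url_rule(path, endpoint=name, view_func=handlers[name], methods=["POST"])
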
# --- Advanced Transformer Configuration (2024 Research) ---
# Based on 2024 benchmark reports in which RoBERTa and DistilBERT reach 99%+ accuracy
# on email classification tasks.
TRANSFORMER_MODEL_NAME = "distilbert-base-uncased" # Mirrors PRE_TRAINED_MODEL_NAME above; optimal balance of speed and accuracy
USE_MIXED_PRECISION = True # FP16 training for efficiency (mirrors FP16_TRAINING above)
GRADIENT_ACCUMULATION_STEPS = 4 # Increased for GTX 1650 to simulate larger batch size
MAX_GRAD_NORM = 1.0 # Gradient clipping for stability
LABEL_SMOOTHING_FACTOR = 0.1 # Reduce overconfidence
SAVE_TOTAL_LIMIT = 3 # Keep only best 3 checkpoints
LOGGING_STEPS = 50 # Frequent logging for monitoring
EVAL_STEPS = 100 # Regular evaluation during training
DATALOADER_NUM_WORKERS = 2 # Reduced for GTX 1650 to avoid memory issues
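
# Derived arithmetic (illustrative, not an original setting): with the values above,
# the effective optimizer batch size is TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
# i.e. 16 * 4 = 64, simulating a batch the 4GB GTX 1650 cannot hold in a single step.
EFFECTIVE_TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS  # 64
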