""" | |
Centralized configuration for Gmail Unsubscriber AI Suite. | |
This module defines all configuration parameters for the ML components, including: | |
- Directory paths for models, datasets, and task status | |
- Hugging Face cache configuration | |
- Model specifications | |
- Data preparation parameters | |
- Training hyperparameters | |
- User data collection and personalization parameters | |
All directories are automatically created when this module is imported. | |
""" | |
import os

# --- Base Path Configuration ---
ML_SUITE_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(ML_SUITE_DIR)

# --- Cache and Model Storage ---
MODELS_DIR = os.path.join(ML_SUITE_DIR, "models")
BASE_TRANSFORMER_CACHE_DIR = os.path.join(MODELS_DIR, "base_transformer_cache")
# FINE_TUNED_MODEL_DIR = os.path.join(MODELS_DIR, "fine_tuned_unsubscriber")  # Old model
FINE_TUNED_MODEL_DIR = os.path.join(PROJECT_ROOT, "final_optimized_model")  # New trained model

# Set Hugging Face environment variables to use the project-local cache
os.environ['HF_HOME'] = BASE_TRANSFORMER_CACHE_DIR
os.environ['TRANSFORMERS_CACHE'] = BASE_TRANSFORMER_CACHE_DIR
os.environ['HF_DATASETS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'datasets')
os.environ['HF_METRICS_CACHE'] = os.path.join(BASE_TRANSFORMER_CACHE_DIR, 'metrics')
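# Note (assumption about library behavior, not part of this project): newer `transformers`
# releases deprecate TRANSFORMERS_CACHE in favor of HF_HOME; both are set above so downloads
# stay under the project-local cache directory regardless of the installed version.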
# --- Dataset Storage ---
DATASETS_DIR = os.path.join(ML_SUITE_DIR, "datasets")
RAW_DATASETS_DIR = os.path.join(DATASETS_DIR, "raw")
EXTRACTED_DATASETS_DIR = os.path.join(DATASETS_DIR, "extracted")
PROCESSED_DATASETS_DIR = os.path.join(DATASETS_DIR, "processed")
PREPARED_DATA_FILE = os.path.join(PROCESSED_DATASETS_DIR, "unsubscriber_training_data.csv")
DATA_COLUMNS_SCHEMA = ['text', 'label']  # Schema for the training CSV

# --- Task Status Storage ---
TASK_STATUS_DIR = os.path.join(ML_SUITE_DIR, "task_status")
DATA_PREP_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "data_preparation_status.json")
MODEL_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "model_training_status.json")
PERSONALIZED_TRAIN_STATUS_FILE = os.path.join(TASK_STATUS_DIR, "personalized_training_status.json")

# --- User Data Collection and Personalization ---
USER_DATA_DIR = os.path.join(ML_SUITE_DIR, "user_data")
USER_FEEDBACK_DIR = os.path.join(USER_DATA_DIR, "feedback")
USER_MODELS_DIR = os.path.join(USER_DATA_DIR, "models")
USER_DATASETS_DIR = os.path.join(USER_DATA_DIR, "datasets")

# User feedback collection configuration
USER_FEEDBACK_FILE = os.path.join(USER_FEEDBACK_DIR, "user_feedback.csv")
FEEDBACK_COLUMNS_SCHEMA = ['email_id', 'text', 'predicted_label', 'predicted_confidence', 'user_feedback', 'timestamp', 'session_id']

# Personalized model configuration
PERSONALIZED_MODEL_DIR_TEMPLATE = os.path.join(USER_MODELS_DIR, "{user_id}")
PERSONALIZED_MODEL_FILE_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model.pt")
PERSONALIZED_MODEL_INFO_TEMPLATE = os.path.join(PERSONALIZED_MODEL_DIR_TEMPLATE, "model_info.json")
PERSONALIZED_DATASET_FILE_TEMPLATE = os.path.join(USER_DATASETS_DIR, "{user_id}_training_data.csv")
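# Example (illustrative; "user123" is a placeholder): PERSONALIZED_MODEL_FILE_TEMPLATE.format(user_id="user123")
# resolves to <USER_MODELS_DIR>/user123/model.pt; the info and dataset templates are formatted the same way.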
# Personalization hyperparameters
MIN_FEEDBACK_ENTRIES_FOR_PERSONALIZATION = 10  # Minimum number of user feedback entries required for personalization
PERSONALIZATION_WEIGHT = 0.7  # Weight given to user feedback vs. base model (higher = more personalized)
PERSONALIZATION_EPOCHS = 2  # Number of epochs for fine-tuning a personalized model
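# Illustrative reading only (an assumption; the authoritative semantics live in the personalization
# code): a weight of 0.7 could mean the personalized model's score contributes 70% and the base
# model's score 30% to the final prediction.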
# --- Directory Creation (Updated with User Data directories) ---
for dir_path in [MODELS_DIR, BASE_TRANSFORMER_CACHE_DIR, FINE_TUNED_MODEL_DIR,
                 RAW_DATASETS_DIR, EXTRACTED_DATASETS_DIR, PROCESSED_DATASETS_DIR, TASK_STATUS_DIR,
                 USER_DATA_DIR, USER_FEEDBACK_DIR, USER_MODELS_DIR, USER_DATASETS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# --- Transformer Model Configuration ---
# Choice: DistilBERT offers a good balance of performance and resource efficiency.
# Other candidates: 'bert-base-uncased', 'roberta-base', 'google/electra-small-discriminator'.
# The choice impacts download size, training time, and inference speed.
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

# --- Data Preparation Parameters ---
# Define sources for public email data. URLs and types guide the preparator.
PUBLIC_DATASETS_INFO = {
    "spamassassin_easy_ham_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2",
        "type": "important_leaning",  # Expected dominant class after heuristic application
        "extract_folder_name": "spamassassin_easy_ham_2003"
    },
    "spamassassin_spam_2003": {
        "url": "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2",
        "type": "unsubscribable_leaning",
        "extract_folder_name": "spamassassin_spam_2003"
    },
    # Consider adding more diverse datasets like:
    # - Enron (requires significant parsing and ethical review for a suitable subset)
    # - Public mailing list archives (e.g., from the Apache Software Foundation, carefully selected for relevance)
}
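
# Illustrative sketch only: how a data-preparation step might fetch and unpack one of the corpora
# described above into the raw/extracted directories. The real preparator lives elsewhere in the
# suite; this helper name and its behavior are assumptions, not part of this module's API.
def _example_fetch_public_dataset(dataset_key: str) -> str:
    """Download one corpus archive into RAW_DATASETS_DIR and extract it into EXTRACTED_DATASETS_DIR."""
    import tarfile
    import urllib.request

    info = PUBLIC_DATASETS_INFO[dataset_key]
    archive_path = os.path.join(RAW_DATASETS_DIR, os.path.basename(info["url"]))
    if not os.path.exists(archive_path):
        urllib.request.urlretrieve(info["url"], archive_path)
    extract_dir = os.path.join(EXTRACTED_DATASETS_DIR, info["extract_folder_name"])
    with tarfile.open(archive_path, "r:bz2") as archive:
        archive.extractall(extract_dir)
    return extract_dir
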
MIN_TEXT_LENGTH_FOR_TRAINING = 60  # Emails shorter than this (after cleaning) are likely not useful.
MAX_SAMPLES_PER_RAW_DATASET = 7500  # Limits processing time for initial data prep. Can be increased.
EMAIL_SNIPPET_LENGTH_FOR_MODEL = 1024  # Max characters from the email body to combine with the subject for model input.

# --- Training Hyperparameters & Configuration ---
NUM_LABELS = 2  # Binary classification: Unsubscribable vs. Important
LABEL_IMPORTANT_ID = 0
LABEL_UNSUBSCRIBABLE_ID = 1
ID_TO_LABEL_MAP = {LABEL_IMPORTANT_ID: "IMPORTANT", LABEL_UNSUBSCRIBABLE_ID: "UNSUBSCRIBABLE"}
LABEL_TO_ID_MAP = {"IMPORTANT": LABEL_IMPORTANT_ID, "UNSUBSCRIBABLE": LABEL_UNSUBSCRIBABLE_ID}

MAX_SEQ_LENGTH = 512  # Max token sequence length for the Transformer. Impacts memory and context window.
TRAIN_BATCH_SIZE = 16  # Batch size for training. Reduced for GTX 1650 (4GB VRAM).
EVAL_BATCH_SIZE = 32  # Batch size for evaluation. Reduced for GTX 1650.
NUM_TRAIN_EPOCHS = 8  # Number of full passes through the training data (increased for better learning).
LEARNING_RATE = 1e-5  # AdamW optimizer learning rate, slightly reduced for more stable training.
WEIGHT_DECAY = 0.02  # Regularization parameter.
WARMUP_STEPS_RATIO = 0.15  # Ratio of total training steps used for learning-rate warmup.
TEST_SPLIT_SIZE = 0.2  # Proportion of data held out for the evaluation set (increased for better validation).

# Hugging Face Trainer arguments
EVALUATION_STRATEGY = "epoch"  # Evaluate at the end of each epoch.
SAVE_STRATEGY = "epoch"  # Save a model checkpoint at the end of each epoch.
LOAD_BEST_MODEL_AT_END = True  # Reload the best model (based on metric_for_best_model) at the end of training.
METRIC_FOR_BEST_MODEL = "f1_unsub"  # Focus on F1 for the "unsubscribable" class.
FP16_TRAINING = True  # Enable mixed-precision training if a CUDA GPU is available and supports it.
EARLY_STOPPING_PATIENCE = 3  # Stop training if metric_for_best_model doesn't improve for this many epochs.
EARLY_STOPPING_THRESHOLD = 0.001  # Minimum change to be considered an improvement.
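
# Illustrative sketch only: how the training module might assemble the values above into Hugging
# Face TrainingArguments. The helper name is hypothetical and not part of this module's API; it
# assumes the `transformers` package is installed and that the installed version still accepts the
# `evaluation_strategy` keyword (recent releases rename it to `eval_strategy`).
def _example_training_arguments():
    from transformers import TrainingArguments  # imported lazily so importing this config stays lightweight
    return TrainingArguments(
        output_dir=FINE_TUNED_MODEL_DIR,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        per_device_train_batch_size=TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=EVAL_BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        warmup_ratio=WARMUP_STEPS_RATIO,
        evaluation_strategy=EVALUATION_STRATEGY,
        save_strategy=SAVE_STRATEGY,
        load_best_model_at_end=LOAD_BEST_MODEL_AT_END,
        metric_for_best_model=METRIC_FOR_BEST_MODEL,
        fp16=FP16_TRAINING,  # the real training code should only enable this when a CUDA GPU is present
    )
# EARLY_STOPPING_PATIENCE / EARLY_STOPPING_THRESHOLD map onto transformers.EarlyStoppingCallback,
# which is passed to the Trainer's `callbacks` list rather than to TrainingArguments.
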
# --- AI User Preferences (Defaults stored in JS, but can be defined here for reference) ---
DEFAULT_AI_ENABLED_ON_SCAN = True
DEFAULT_AI_CONFIDENCE_THRESHOLD = 0.5  # (50%) - Balanced threshold for optimal precision/recall
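# Illustrative example: an email whose predicted P(UNSUBSCRIBABLE) is 0.62 clears the 0.5 default
# threshold and would be flagged as unsubscribable; one scoring 0.43 would not. The actual gating
# happens in the application code, not in this module.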
# --- API Endpoint Configuration for Backend Integration ---
API_ENDPOINTS = {
    "submit_feedback": "/api/ai/feedback",
    "get_feedback_stats": "/api/ai/feedback/stats",
    "train_personalized": "/api/ai/train_personalized",
    "reset_user_data": "/api/ai/user_data/reset",
    "export_user_data": "/api/ai/user_data/export",
    "import_user_data": "/api/ai/user_data/import"
}
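
# Illustrative sketch only: resolving one of these route paths against a backend base URL.
# The helper name and the localhost URL in the docstring are assumptions, not part of the suite.
def _example_endpoint_url(base_url: str, endpoint_key: str) -> str:
    """E.g. _example_endpoint_url("http://localhost:5000", "submit_feedback") -> "http://localhost:5000/api/ai/feedback"."""
    return base_url.rstrip("/") + API_ENDPOINTS[endpoint_key]
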
# --- Advanced Transformer Configuration (2024 Research) ---
# Based on 2024 research showing RoBERTa and DistilBERT achieve 99%+ accuracy.
TRANSFORMER_MODEL_NAME = "distilbert-base-uncased"  # Optimal balance of speed and accuracy (mirrors PRE_TRAINED_MODEL_NAME above)
USE_MIXED_PRECISION = True  # FP16 training for efficiency (mirrors FP16_TRAINING above)
GRADIENT_ACCUMULATION_STEPS = 4  # Increased for the GTX 1650 to simulate a larger batch size
MAX_GRAD_NORM = 1.0  # Gradient clipping for stability
LABEL_SMOOTHING_FACTOR = 0.1  # Reduce overconfidence
SAVE_TOTAL_LIMIT = 3  # Keep at most 3 checkpoints on disk
LOGGING_STEPS = 50  # Frequent logging for monitoring
EVAL_STEPS = 100  # Evaluation interval in steps (only applies when evaluating by steps rather than by epoch)
DATALOADER_NUM_WORKERS = 2  # Reduced for the GTX 1650 to avoid memory issues
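# Note: with gradient accumulation the effective optimization batch size is
# TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 16 * 4 = 64, even though only 16 examples
# are resident in GPU memory at any one time.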