DeepMostInnovations committed on
Commit ab29941 · verified · 1 Parent(s): b8e56d4

Create clean_dataset.py

Files changed (1)
  1. clean_dataset.py +447 -0
clean_dataset.py ADDED
@@ -0,0 +1,447 @@
+ import pandas as pd
+ import numpy as np
+ import json
+ import os
+ import argparse
+ import logging
+ from tqdm import tqdm
+ import chardet
+ import csv
+
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler("dataset_cleaner.log"),
+         logging.StreamHandler()
+     ]
+ )
+
+ logger = logging.getLogger(__name__)
+
+ class SaaSDatasetCleaner:
+     """
+     Class for cleaning and validating the SaaS sales conversation dataset.
+     Handles issues resulting from interrupted generations.
+     """
+
+     def __init__(self, input_file, output_file=None, chunk_size=1000, encoding='utf-8', skip_encoding_check=False):
+         """
+         Initialize the cleaner.
+
+         Args:
+             input_file: Path to the input CSV file
+             output_file: Path to save the cleaned dataset (defaults to 'cleaned_' + input_file)
+             chunk_size: Number of rows to process at once
+             encoding: File encoding (defaults to utf-8)
+             skip_encoding_check: Whether to skip encoding detection and line-by-line processing
+         """
+         self.input_file = input_file
+         self.output_file = output_file or f"cleaned_{os.path.basename(input_file)}"
+         self.chunk_size = chunk_size
+         self.encoding = encoding
+         self.skip_encoding_check = skip_encoding_check
+         self.stats = {
+             'total_rows': 0,
+             'valid_rows': 0,
+             'invalid_json': 0,
+             'missing_values': 0,
+             'invalid_embeddings': 0,
+             'duplicates': 0,
+             'encoding_errors': 0,
+             'recovered_rows': 0
+         }
+
+         # If no encoding was supplied and checks are not skipped, detect the encoding
+         if not self.skip_encoding_check and not self.encoding:
+             self.detect_encoding()
+
+         # Get the columns and prepare for processing
+         self.initialize_columns()
+
+     def detect_encoding(self):
+         """Detect the file encoding."""
+         logger.info("Detecting file encoding...")
+
+         # Read a sample of the file to detect encoding
+         with open(self.input_file, 'rb') as f:
+             sample = f.read(min(10000000, os.path.getsize(self.input_file)))  # Read up to 10MB
+
+         result = chardet.detect(sample)
+         self.encoding = result['encoding']
+         confidence = result['confidence']
+
+         logger.info(f"Detected encoding: {self.encoding} with confidence: {confidence:.2f}")
+
+         # If confidence is low, try common encodings
+         if confidence < 0.7:
+             logger.warning("Low confidence in encoding detection. Will try multiple encodings.")
+             self.encoding = None  # Will try multiple encodings later
+
+     def initialize_columns(self):
+         """Initialize column information."""
+         # Try to read the header with different encodings if needed;
+         # if an encoding was supplied (or checks are skipped), use it directly
+         encodings_to_try = [self.encoding or 'utf-8'] if (self.skip_encoding_check or self.encoding) else ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']
+
+         for enc in encodings_to_try:
+             try:
+                 # Try to read just the header
+                 with open(self.input_file, 'r', encoding=enc, errors='replace') as f:
+                     reader = csv.reader(f)
+                     self.columns = next(reader)
+
+                 self.encoding = enc
+                 logger.info(f"Successfully read header with encoding: {enc}")
+
+                 # Identify embedding columns
+                 self.embedding_cols = [col for col in self.columns if col.startswith('embedding_')]
+                 logger.info(f"Found {len(self.embedding_cols)} embedding columns")
+
+                 return
+
+             except Exception as e:
+                 logger.warning(f"Failed to read header with encoding {enc}: {str(e)}")
+
+         # If we get here, all encodings failed
+         logger.error("Could not read column headers with any encoding")
+         self.columns = []
+         self.embedding_cols = []
+
+     def process_line_by_line(self):
+         """Process the file line by line to handle encoding issues."""
+         logger.info("Processing file line by line to handle encoding issues...")
+
+         # Open the output file
+         with open(self.output_file, 'w', encoding='utf-8', newline='') as out_file:
+             writer = None  # Will be initialized after reading the header
+
+             # Process the input file
+             with open(self.input_file, 'rb') as in_file:
+                 # Process line by line
+                 line_count = 0
+                 valid_count = 0
+
+                 for line in tqdm(in_file, desc="Reading lines"):
+                     line_count += 1
+
+                     # Try to decode with multiple encodings
+                     decoded_line = None
+                     for enc in ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']:
+                         try:
+                             decoded_line = line.decode(enc)
+                             break
+                         except UnicodeDecodeError:
+                             continue
+
+                     if decoded_line is None:
+                         # Could not decode with any encoding, skip the line
+                         self.stats['encoding_errors'] += 1
+                         continue
+
+                     # Parse the CSV line
+                     try:
+                         reader = csv.reader([decoded_line])
+                         row = next(reader)
+
+                         # Initialize the writer with headers if this is the first line
+                         if line_count == 1:
+                             writer = csv.writer(out_file)
+                             writer.writerow(row)  # Write headers
+                             continue
+
+                         # Basic validation - check the number of columns
+                         if len(row) != len(self.columns):
+                             logger.debug(f"Line {line_count}: Column count mismatch. Expected {len(self.columns)}, got {len(row)}")
+                             continue
+
+                         # Write the row
+                         writer.writerow(row)
+                         valid_count += 1
+
+                     except Exception as e:
+                         logger.debug(f"Error processing line {line_count}: {str(e)}")
+                         self.stats['encoding_errors'] += 1
+
+         self.stats['total_rows'] = line_count - 1  # Subtract the header row
+         self.stats['recovered_rows'] = valid_count
+
+         logger.info(f"Processed {line_count} lines, recovered {valid_count} valid rows")
+         logger.info(f"Found {self.stats['encoding_errors']} lines with encoding errors")
+
+     def _validate_json_fields(self, df):
+         """Validate and clean JSON fields."""
+         # List of columns that should contain JSON
+         json_columns = ['scenario', 'conversation', 'probability_trajectory']
+
+         for col in json_columns:
+             if col not in df.columns:
+                 continue
+
+             # Create a valid indicator
+             df[f'{col}_valid'] = True
+
+             # Check each value (iterate over index labels so .at is correct for chunked reads)
+             for idx, value in zip(df.index, df[col]):
+                 try:
+                     if pd.isna(value):
+                         df.at[idx, f'{col}_valid'] = False
+                         self.stats['invalid_json'] += 1
+                         continue
+
+                     # Attempt to parse JSON
+                     json.loads(value)
+                 except (TypeError, ValueError):
+                     df.at[idx, f'{col}_valid'] = False
+                     self.stats['invalid_json'] += 1
+
+         # Create an overall valid flag
+         valid_flags = [f'{col}_valid' for col in json_columns if f'{col}_valid' in df.columns]
+         if valid_flags:
+             df['json_valid'] = df[valid_flags].all(axis=1)
+         else:
+             df['json_valid'] = True
+
+         # Clean up the temporary columns
+         for col in json_columns:
+             if f'{col}_valid' in df.columns:
+                 df = df.drop(columns=[f'{col}_valid'])
+
+         return df
+
+     def _validate_embeddings(self, df):
+         """Check if embeddings are valid."""
+         if not self.embedding_cols:
+             return df
+
+         # Check whether the first embedding column has a value as a simple sanity check
+         if 'embedding_0' in df.columns:
+             df['embeddings_valid'] = ~df['embedding_0'].isna()
+         else:
+             df['embeddings_valid'] = True
+
+         # Count invalid embeddings
+         self.stats['invalid_embeddings'] += (~df['embeddings_valid']).sum()
+
+         return df
+
+     def _check_missing_values(self, df):
+         """Check for missing values in important columns."""
+         important_cols = [
+             'company_id', 'company_name', 'product_name', 'conversation_id',
+             'conversation', 'full_text', 'outcome'
+         ]
+
+         # Filter to columns that actually exist
+         important_cols = [col for col in important_cols if col in df.columns]
+
+         if not important_cols:
+             df['missing_important'] = False
+             return df
+
+         # Create a flag for rows with missing important values
+         missing_flags = df[important_cols].isna().any(axis=1)
+         df['missing_important'] = missing_flags
+
+         # Count missing values
+         self.stats['missing_values'] += missing_flags.sum()
+
+         return df
+
+     def _flag_valid_rows(self, df):
+         """Create a single flag for valid rows."""
+         # A row is valid if it has valid JSON, valid embeddings, and no missing important values
+         row_valid = pd.Series(True, index=df.index)
+
+         if 'json_valid' in df.columns:
+             row_valid &= df['json_valid']
+
+         if 'embeddings_valid' in df.columns:
+             row_valid &= df['embeddings_valid']
+
+         if 'missing_important' in df.columns:
+             row_valid &= ~df['missing_important']
+
+         df['row_valid'] = row_valid
+
+         # Update the valid rows count
+         self.stats['valid_rows'] += df['row_valid'].sum()
+
+         return df
+
+     def _remove_duplicates(self, df):
+         """Flag duplicate conversation IDs within the chunk (flagged rows are dropped later)."""
+         if 'conversation_id' in df.columns:
+             # Check for duplicates
+             dup_mask = df.duplicated(subset=['conversation_id'], keep='first')
+             df['is_duplicate'] = dup_mask
+
+             # Count duplicates
+             self.stats['duplicates'] += dup_mask.sum()
+         else:
+             df['is_duplicate'] = False
+
+         return df
+
+     def clean_dataset(self):
+         """
+         Clean the dataset by first fixing encoding issues, then validating the data.
+         """
+         logger.info(f"Starting to clean dataset: {self.input_file}")
+
+         # Check that the input file exists
+         if not os.path.exists(self.input_file):
+             logger.error(f"Input file not found: {self.input_file}")
+             return
+
+         # If we're not skipping encoding checks, process line by line first
+         if not self.skip_encoding_check:
+             self.process_line_by_line()
+             intermediate_file = self.output_file
+             self.output_file = f"validated_{os.path.basename(self.input_file)}"
+         else:
+             logger.info("Skipping encoding check as requested")
+             # Use the input file directly as the intermediate file
+             intermediate_file = self.input_file
+
+             # Count rows in the file for progress tracking
+             with open(intermediate_file, 'r', encoding=self.encoding) as f:
+                 self.stats['total_rows'] = sum(1 for _ in f) - 1  # Subtract the header row
+             self.stats['recovered_rows'] = self.stats['total_rows']
+
+         logger.info(f"Total rows to validate: {self.stats['total_rows']}")
+
+         # Now that we have a cleaned file with proper encoding, validate the data
+         logger.info("Beginning data validation on recovered rows...")
+
+         # Total number of rows for progress tracking
+         total_rows = self.stats['recovered_rows']
+
+         # Process the dataset in chunks
+         try:
+             # Create a chunked reader - the intermediate file has a known encoding at this point.
+             # on_bad_lines='skip' skips malformed lines (replaces error_bad_lines=False from older pandas)
+             reader = pd.read_csv(
+                 intermediate_file,
+                 chunksize=self.chunk_size,
+                 encoding='utf-8',
+                 low_memory=False,      # Avoid dtype warnings
+                 on_bad_lines='skip'    # Skip bad lines (pandas >= 1.3)
+             )
+
+             # Header flag for the first chunk
+             first_chunk = True
+
+             # Process each chunk
+             with tqdm(total=total_rows, desc="Validating data") as pbar:
+                 for chunk_num, chunk in enumerate(reader):
+                     logger.debug(f"Processing chunk {chunk_num + 1}")
+
+                     # Run validation steps
+                     chunk = self._validate_json_fields(chunk)
+                     chunk = self._validate_embeddings(chunk)
+                     chunk = self._check_missing_values(chunk)
+                     chunk = self._remove_duplicates(chunk)
+                     chunk = self._flag_valid_rows(chunk)
+
+                     # Filter to valid, non-duplicate rows only
+                     valid_chunk = chunk[chunk['row_valid'] & ~chunk['is_duplicate']]
+
+                     # Remove the validation columns
+                     for col in ['json_valid', 'embeddings_valid', 'missing_important', 'row_valid', 'is_duplicate']:
+                         if col in valid_chunk.columns:
+                             valid_chunk = valid_chunk.drop(columns=[col])
+
+                     # Write the cleaned chunk
+                     valid_chunk.to_csv(
+                         self.output_file,
+                         mode='w' if first_chunk else 'a',
+                         header=first_chunk,
+                         index=False,
+                         encoding='utf-8'
+                     )
+
+                     # Update the first chunk flag
+                     if first_chunk:
+                         first_chunk = False
+
+                     # Update progress
+                     pbar.update(len(chunk))
+
+             logger.info(f"Dataset cleaning complete. Results saved to {self.output_file}")
+
+             # Print statistics
+             logger.info("Cleaning Statistics:")
+             logger.info(f"- Total rows processed: {self.stats['total_rows']}")
+             logger.info(f"- Rows recovered from encoding issues: {self.stats['recovered_rows']}")
+             logger.info(f"- Encoding errors: {self.stats['encoding_errors']}")
+             logger.info(f"- Valid rows after validation: {self.stats['valid_rows']}")
+             logger.info(f"- Rows with invalid JSON: {self.stats['invalid_json']}")
+             logger.info(f"- Rows with missing values: {self.stats['missing_values']}")
+             logger.info(f"- Rows with invalid embeddings: {self.stats['invalid_embeddings']}")
+             logger.info(f"- Duplicate rows: {self.stats['duplicates']}")
+
+             # Create a summary file
+             with open(f"{self.output_file}_summary.txt", 'w') as f:
+                 f.write("Dataset Cleaning Summary\n")
+                 f.write("========================\n\n")
+                 f.write(f"Input file: {self.input_file}\n")
+                 f.write(f"Output file: {self.output_file}\n\n")
+                 f.write(f"Total rows processed: {self.stats['total_rows']}\n")
+                 f.write(f"Rows recovered from encoding issues: {self.stats['recovered_rows']}\n")
+                 f.write(f"Encoding errors: {self.stats['encoding_errors']}\n")
+                 f.write(f"Valid rows after validation: {self.stats['valid_rows']}\n")
+                 f.write(f"Rows with invalid JSON: {self.stats['invalid_json']}\n")
+                 f.write(f"Rows with missing values: {self.stats['missing_values']}\n")
+                 f.write(f"Rows with invalid embeddings: {self.stats['invalid_embeddings']}\n")
+                 f.write(f"Duplicate rows: {self.stats['duplicates']}\n")
+
+             return self.stats
+
+         except Exception as e:
+             logger.error(f"Error validating dataset: {str(e)}")
+             raise
+
+ def main():
+     """Main function to run the dataset cleaner."""
+     parser = argparse.ArgumentParser(description="Clean and validate SaaS sales conversation dataset")
+     parser.add_argument("input_file", type=str, help="Path to the input CSV file")
+     parser.add_argument("--output_file", type=str, default=None,
+                         help="Path to save cleaned dataset (defaults to 'cleaned_' + input_file)")
+     parser.add_argument("--chunk_size", type=int, default=1000,
+                         help="Number of rows to process at once")
+     parser.add_argument("--encoding", type=str, default='utf-8',
+                         help="File encoding (defaults to utf-8)")
+     parser.add_argument("--skip_encoding_check", action="store_true",
+                         help="Skip encoding detection and line-by-line processing")
+
+     args = parser.parse_args()
+
+     # Create and run the cleaner
+     cleaner = SaaSDatasetCleaner(
+         input_file=args.input_file,
+         output_file=args.output_file,
+         chunk_size=args.chunk_size,
+         encoding=args.encoding,
+         skip_encoding_check=args.skip_encoding_check
+     )
+
+     cleaner.clean_dataset()
+
+ if __name__ == "__main__":
+     main()
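
For quick reference, the script is run from the command line as python clean_dataset.py <input_csv>, with optional --output_file, --chunk_size, --encoding, and --skip_encoding_check flags. Below is a minimal programmatic sketch; the file name raw_conversations.csv and the chunk size are placeholders for illustration, not values taken from the commit.

    from clean_dataset import SaaSDatasetCleaner

    # Placeholder path; point this at the actual dataset CSV.
    cleaner = SaaSDatasetCleaner(
        input_file="raw_conversations.csv",
        chunk_size=5000,               # larger chunks mean fewer passes but more memory
        skip_encoding_check=False,     # keep the byte-level line-by-line recovery pass
    )
    stats = cleaner.clean_dataset()    # writes the cleaned CSV and a *_summary.txt, returns the stats dict
    print(stats)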