# Spaces:
# Running
# Running
""" | |
Data Analyzer class for causal inference pipelines. | |
This module provides the DataAnalyzer class for analyzing datasets | |
and extracting relevant information for causal inference. | |
""" | |
import re
from typing import Dict, List, Any, Optional

import numpy as np
import pandas as pd
class DataAnalyzer:
    """
    Data analyzer for causal inference datasets.

    This class provides methods for analyzing datasets to extract
    relevant information for causal inference, such as variables,
    relationships, and temporal structures. Most of the detection is
    name-based heuristics; none of it runs statistical tests.
    """

    # Keywords searched for in column names (case-insensitive).
    _TIME_KEYWORDS = ('date', 'time', 'year', 'month', 'day', 'period')
    _ID_KEYWORDS = ('id', 'group', 'entity', 'unit')
    _INSTRUMENT_KEYWORDS = ('instrument', 'random', 'assignment', 'iv')

    # Keywords of at most this many characters ('id', 'iv') must equal a
    # whole token of the column name; as plain substrings they also hit
    # unrelated names such as "width", "paid", "active" or "private".
    _WHOLE_TOKEN_MAX_LEN = 2

    # Splits a name into tokens on separators and camelCase boundaries:
    # "customerID" -> customer, ID; "iv_lottery" -> iv, lottery.
    _TOKEN_PATTERN = re.compile(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+')

    def __init__(self, verbose: bool = False):
        """
        Initialize the data analyzer.

        Args:
            verbose: Whether to print verbose information
        """
        self.verbose = verbose

    def analyze_dataset(self, dataset_path: str) -> Dict[str, Any]:
        """
        Analyze a dataset and extract relevant information.

        Any failure while loading or analyzing the file is caught and
        reported through an "error" entry instead of being raised; the
        rest of the result then has the same keys as a successful
        analysis, filled with empty defaults.

        Args:
            dataset_path: Path to the dataset file (read with pandas.read_csv)

        Returns:
            Dictionary with dataset analysis results
        """
        try:
            df = pd.read_csv(dataset_path)
            n_rows, n_cols = df.shape
            analysis = {
                "filepath": dataset_path,
                "n_rows": n_rows,
                "n_cols": n_cols,
                "columns": list(df.columns),
                "column_types": {
                    col: str(dtype) for col, dtype in df.dtypes.items()
                },
                "column_categories": self._categorize_columns(df),
                "temporal_structure": self._check_temporal_structure(df),
                "variable_relationships": self._identify_relationships(df),
                "potential_instruments": self._identify_potential_instruments(df),
                "discontinuities": self._check_discontinuities(df)
            }
        except Exception as e:
            # Deliberate best-effort boundary: callers get a result dict
            # describing the failure rather than an exception.
            if self.verbose:
                print(f"Error analyzing dataset: {str(e)}")
            return self._empty_analysis(dataset_path, str(e))

        if self.verbose:
            print(f"Dataset analysis completed: {n_rows} rows, {n_cols} columns")
        return analysis

    @staticmethod
    def _empty_analysis(dataset_path: str, error: str) -> Dict[str, Any]:
        """
        Build the result returned when a dataset cannot be analyzed.

        The nested dictionaries carry every key a successful analysis
        produces, so callers can index them without checking for errors
        first.

        Args:
            dataset_path: Path that was passed to analyze_dataset
            error: Message describing the failure

        Returns:
            Dictionary with an "error" entry and empty analysis results
        """
        return {
            "error": error,
            "filepath": dataset_path,
            "n_rows": 0,
            "n_cols": 0,
            "columns": [],
            "column_types": {},
            "column_categories": {},
            "temporal_structure": {
                "has_temporal_structure": False,
                "is_panel_data": False,
                "time_variables": [],
                "id_variables": []
            },
            "variable_relationships": {"potential_confounders": []},
            "potential_instruments": [],
            "discontinuities": {
                "has_discontinuities": False,
                "potential_running_variables": []
            }
        }

    @classmethod
    def _matches_keywords(cls, column: Any, keywords) -> bool:
        """
        Tell whether a column name contains any of the given keywords.

        Matching is case-insensitive. Keywords longer than
        _WHOLE_TOKEN_MAX_LEN match anywhere inside the name; shorter ones
        must equal one of the name's tokens.

        Args:
            column: Column label; converted with str() so non-string
                labels (e.g. integer headers) are accepted
            keywords: Lowercase keywords to look for

        Returns:
            True if at least one keyword matches
        """
        name = str(column)
        lowered = name.lower()
        tokens = None  # computed lazily, only if a short keyword is checked
        for keyword in keywords:
            if len(keyword) <= cls._WHOLE_TOKEN_MAX_LEN:
                if tokens is None:
                    tokens = {
                        token.lower()
                        for token in cls._TOKEN_PATTERN.findall(name)
                    }
                if keyword in tokens:
                    return True
            elif keyword in lowered:
                return True
        return False

    def _categorize_columns(self, df: pd.DataFrame) -> Dict[str, str]:
        """
        Categorize columns by data type and number of distinct values.

        Missing values are not counted as a distinct value, so a 0/1
        column with gaps is still reported as 'binary'.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary mapping column names to one of 'binary',
            'continuous', 'categorical' or 'high_cardinality'
        """
        categories = {}
        for col, series in df.items():
            n_unique = series.nunique()  # NaN/None are excluded
            if pd.api.types.is_bool_dtype(series) or n_unique <= 2:
                categories[col] = 'binary'
            elif pd.api.types.is_numeric_dtype(series):
                categories[col] = 'continuous'
            elif n_unique <= 10:
                categories[col] = 'categorical'
            else:
                categories[col] = 'high_cardinality'
        return categories

    def _check_temporal_structure(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for temporal structure in the dataset.

        Detection is based on column names only: time-like and
        identifier-like keywords are searched for in each name.

        Args:
            df: Pandas DataFrame

        Returns:
            Dictionary with temporal structure information
        """
        date_cols = [
            col for col in df.columns
            if self._matches_keywords(col, self._TIME_KEYWORDS)
        ]
        id_cols = [
            col for col in df.columns
            if self._matches_keywords(col, self._ID_KEYWORDS)
        ]
        return {
            "has_temporal_structure": bool(date_cols),
            # Panel data needs both a time dimension and an entity identifier.
            "is_panel_data": bool(date_cols) and bool(id_cols),
            "time_variables": date_cols,
            "id_variables": id_cols
        }

    def _identify_relationships(self, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Identify potential variable relationships.

        Placeholder: the DataFrame is not inspected and no confounders
        are reported. A real implementation would use statistical tests
        or causal discovery.

        Args:
            df: Pandas DataFrame (currently unused)

        Returns:
            Dictionary with relationship information
        """
        return {
            "potential_confounders": []
        }

    def _identify_potential_instruments(self, df: pd.DataFrame) -> List[str]:
        """
        Identify potential instrumental variables.

        Simplified, name-based heuristic: a column is reported when its
        name contains an instrument-related keyword. No statistical
        validity check is performed.

        Args:
            df: Pandas DataFrame

        Returns:
            List of potential instrumental variables
        """
        return [
            col for col in df.columns
            if self._matches_keywords(col, self._INSTRUMENT_KEYWORDS)
        ]

    def _check_discontinuities(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Check for potential discontinuities for RDD.

        Placeholder: the DataFrame is not inspected and no discontinuity
        is ever reported. A real implementation would use statistical
        tests.

        Args:
            df: Pandas DataFrame (currently unused)

        Returns:
            Dictionary with discontinuity information
        """
        return {
            "has_discontinuities": False,
            "potential_running_variables": []
        }